From 22a2d74c0c6dbde6b3503ec51486d6cf5d0c83f1 Mon Sep 17 00:00:00 2001
From: "Ahmed S. Taei"
Date: Tue, 17 Sep 2024 21:06:46 -0700
Subject: [PATCH 001/321] [NVPTX] Emit ld.v4.b16 for loading <4 x bfloat>
 (#109069)

This PR enables emitting a single load instruction for <4 x bfloat>;
otherwise, two ld.b32 loads are generated.
---
 llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 1 +
 llvm/test/CodeGen/NVPTX/vector-loads.ll     | 9 +++++++++
 2 files changed, 10 insertions(+)

diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index c5a40e4308860c..31a5e937adae96 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -6179,6 +6179,7 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
   case MVT::v4i16:
   case MVT::v4i32:
   case MVT::v4f16:
+  case MVT::v4bf16:
   case MVT::v4f32:
   case MVT::v8f16:  // <4 x f16x2>
   case MVT::v8bf16: // <4 x bf16x2>
diff --git a/llvm/test/CodeGen/NVPTX/vector-loads.ll b/llvm/test/CodeGen/NVPTX/vector-loads.ll
index 9322b9e0fe6c82..f582ebc166dd0d 100644
--- a/llvm/test/CodeGen/NVPTX/vector-loads.ll
+++ b/llvm/test/CodeGen/NVPTX/vector-loads.ll
@@ -198,3 +198,12 @@ define void @extv8f16_generic_a4(ptr noalias readonly align 16 %dst, ptr noalias

 !1 = !{i32 0, i32 64}
+
+; CHECK-LABEL: bf16_v4_align_load_store
+define dso_local void @bf16_v4_align_load_store(ptr noundef %0, ptr noundef %1) #0 {
+  ; CHECK: ld.v4.b16
+  ; CHECK: st.v4.b16
+  %3 = load <4 x bfloat>, ptr %1, align 8
+  store <4 x bfloat> %3, ptr %0, align 8
+  ret void
+}

From ddbe6c412bab3fe7a3ffaf6f42c49849a518b4c6 Mon Sep 17 00:00:00 2001
From: Owen Pan
Date: Tue, 17 Sep 2024 21:15:44 -0700
Subject: [PATCH 002/321] [clang-format][NFC] Clean up FormatTestBase and
 Proto/TextProto tests (#108334)

---
 clang/unittests/Format/FormatTestBase.h        | 14 +--
 clang/unittests/Format/FormatTestProto.cpp     |  2 -
 .../unittests/Format/FormatTestTextProto.cpp   | 90 ++++++++-----------
 3 files changed, 45 insertions(+), 61 deletions(-)

diff --git a/clang/unittests/Format/FormatTestBase.h b/clang/unittests/Format/FormatTestBase.h
index 33110ca5d9edfd..9d9472964fd3b4 100644
--- a/clang/unittests/Format/FormatTestBase.h
+++ b/clang/unittests/Format/FormatTestBase.h
@@ -61,23 +61,23 @@ class FormatTestBase : public testing::Test {
     return *Result;
   }

-  FormatStyle getStyleWithColumns(FormatStyle Style, unsigned ColumnLimit) {
+  FormatStyle getStyleWithColumns(FormatStyle Style,
+                                  unsigned ColumnLimit) const {
     Style.ColumnLimit = ColumnLimit;
     return Style;
   }

-  FormatStyle getLLVMStyleWithColumns(unsigned ColumnLimit) {
+  FormatStyle getLLVMStyleWithColumns(unsigned ColumnLimit) const {
    return getStyleWithColumns(getLLVMStyle(), ColumnLimit);
  }

-  FormatStyle getGoogleStyleWithColumns(unsigned ColumnLimit) {
+  FormatStyle getGoogleStyleWithColumns(unsigned ColumnLimit) const {
    return getStyleWithColumns(getGoogleStyle(), ColumnLimit);
  }

-  FormatStyle getTextProtoStyleWithColumns(unsigned ColumnLimit) {
-    FormatStyle Style = getGoogleStyle(FormatStyle::FormatStyle::LK_TextProto);
-    Style.ColumnLimit = ColumnLimit;
-    return Style;
+  FormatStyle getTextProtoStyleWithColumns(unsigned ColumnLimit) const {
+    return getStyleWithColumns(getGoogleStyle(FormatStyle::LK_TextProto),
+                               ColumnLimit);
   }

   bool _verifyFormat(const char *File, int Line, StringRef Expected,
diff --git a/clang/unittests/Format/FormatTestProto.cpp b/clang/unittests/Format/FormatTestProto.cpp
index 5adb532ae4a412..30ce57c545ec76 100644
--- a/clang/unittests/Format/FormatTestProto.cpp
+++ b/clang/unittests/Format/FormatTestProto.cpp @@ -516,8 +516,6 @@ TEST_F(FormatTestProto, AcceptsOperatorAsKeyInOptions) { } TEST_F(FormatTestProto, BreaksEntriesOfSubmessagesContainingSubmessages) { - FormatStyle Style = getGoogleStyle(FormatStyle::LK_TextProto); - Style.ColumnLimit = 60; // The column limit allows for the keys submessage to be put on 1 line, but we // break it since it contains a submessage an another entry. verifyFormat("option (MyProto.options) = {\n" diff --git a/clang/unittests/Format/FormatTestTextProto.cpp b/clang/unittests/Format/FormatTestTextProto.cpp index 23f46202a34637..fd65c9a58db5d8 100644 --- a/clang/unittests/Format/FormatTestTextProto.cpp +++ b/clang/unittests/Format/FormatTestTextProto.cpp @@ -18,9 +18,7 @@ namespace { class FormatTestTextProto : public FormatTestBase { protected: virtual FormatStyle getDefaultStyle() const override { - FormatStyle Style = getGoogleStyle(FormatStyle::LK_TextProto); - Style.ColumnLimit = 60; // To make writing tests easier. - return Style; + return getTextProtoStyleWithColumns(60); } }; @@ -126,7 +124,8 @@ TEST_F(FormatTestTextProto, ImplicitStringLiteralConcatenation) { " 'bbbbb'"); verifyFormat("field_a: \"aaaaa\"\n" " \"bbbbb\""); - FormatStyle Style = getGoogleStyle(FormatStyle::LK_TextProto); + + auto Style = getDefaultStyle(); Style.AlwaysBreakBeforeMultilineStrings = true; verifyFormat("field_a:\n" " 'aaaaa'\n" @@ -359,46 +358,40 @@ TEST_F(FormatTestTextProto, KeepsCommentsIndentedInList) { } TEST_F(FormatTestTextProto, UnderstandsHashComments) { - FormatStyle Style = getGoogleStyle(FormatStyle::LK_TextProto); - Style.ColumnLimit = 60; // To make writing tests easier. - EXPECT_EQ("aaa: 100\n" - "## this is a double-hash comment.\n" - "bb: 100\n" - "## another double-hash comment.\n" - "### a triple-hash comment\n" - "cc: 200\n" - "### another triple-hash comment\n" - "#### a quadriple-hash comment\n" - "dd: 100\n" - "#### another quadriple-hash comment", - format("aaa: 100\n" - "##this is a double-hash comment.\n" - "bb: 100\n" - "## another double-hash comment.\n" - "###a triple-hash comment\n" - "cc: 200\n" - "### another triple-hash comment\n" - "####a quadriple-hash comment\n" - "dd: 100\n" - "#### another quadriple-hash comment", - Style)); + auto Style = getDefaultStyle(); + + verifyFormat("aaa: 100\n" + "## this is a double-hash comment.\n" + "bb: 100\n" + "## another double-hash comment.\n" + "### a triple-hash comment\n" + "cc: 200\n" + "### another triple-hash comment\n" + "#### a quadriple-hash comment\n" + "dd: 100\n" + "#### another quadriple-hash comment", + "aaa: 100\n" + "##this is a double-hash comment.\n" + "bb: 100\n" + "## another double-hash comment.\n" + "###a triple-hash comment\n" + "cc: 200\n" + "### another triple-hash comment\n" + "####a quadriple-hash comment\n" + "dd: 100\n" + "#### another quadriple-hash comment", + Style); // Ensure we support a common pattern for naming sections. 
- EXPECT_EQ("##############\n" - "# section name\n" - "##############", - format("##############\n" - "# section name\n" - "##############", - Style)); - - EXPECT_EQ("///////////////\n" - "// section name\n" - "///////////////", - format("///////////////\n" - "// section name\n" - "///////////////", - Style)); + verifyFormat("##############\n" + "# section name\n" + "##############", + Style); + + verifyFormat("///////////////\n" + "// section name\n" + "///////////////", + Style); } TEST_F(FormatTestTextProto, FormatsExtensions) { @@ -519,8 +512,8 @@ TEST_F(FormatTestTextProto, FormatsRepeatedListInitializers) { " ]\n" "}\n" "key: value"); - FormatStyle Style = getGoogleStyle(FormatStyle::LK_TextProto); - Style.ColumnLimit = 60; // To make writing tests easier. + + auto Style = getDefaultStyle(); Style.Cpp11BracedListStyle = true; verifyFormat("keys: [1]", Style); } @@ -544,7 +537,6 @@ TEST_F(FormatTestTextProto, BreaksConsecutiveStringLiterals) { } TEST_F(FormatTestTextProto, PutsMultipleEntriesInExtensionsOnNewlines) { - FormatStyle Style = getGoogleStyle(FormatStyle::LK_TextProto); verifyFormat("pppppppppp: {\n" " ssssss: \"http://example.com/blahblahblah\"\n" " ppppppp: \"sssss/MMMMMMMMMMMM\"\n" @@ -556,12 +548,10 @@ TEST_F(FormatTestTextProto, PutsMultipleEntriesInExtensionsOnNewlines) { " key: value\n" " }\n" "}", - Style); + getGoogleStyle(FormatStyle::LK_TextProto)); } TEST_F(FormatTestTextProto, BreaksAfterBraceFollowedByClosingBraceOnNextLine) { - FormatStyle Style = getGoogleStyle(FormatStyle::LK_TextProto); - Style.ColumnLimit = 60; verifyFormat("keys: [\n" " data: { item: 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa' }\n" "]"); @@ -571,10 +561,6 @@ TEST_F(FormatTestTextProto, BreaksAfterBraceFollowedByClosingBraceOnNextLine) { } TEST_F(FormatTestTextProto, BreaksEntriesOfSubmessagesContainingSubmessages) { - FormatStyle Style = getGoogleStyle(FormatStyle::LK_TextProto); - Style.ColumnLimit = 60; - // The column limit allows for the keys submessage to be put on 1 line, but we - // break it since it contains a submessage an another entry. verifyFormat("key: valueeeeeeee\n" "keys: {\n" " item: 'aaaaaaaaaaaaaaaa'\n" From 7153a4bbf6d46e58ce32d59220515c5ab9f35691 Mon Sep 17 00:00:00 2001 From: Owen Pan Date: Tue, 17 Sep 2024 21:16:20 -0700 Subject: [PATCH 003/321] [clang-format] Reimplement InsertNewlineAtEOF (#108513) Fixes #108333. 
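As a rough illustration of the user-visible behavior (a hedged sketch; the
option name is real, the snippet is constructed for this note): with
`InsertNewlineAtEOF: true`, formatting even a sub-range of

    namespace {
    int i;
    } // namespace

should still append the missing newline at EOF. Moving the logic from the
token annotator into the lexer (below) makes this hold for ranged formatting
as well, which the new test exercises.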
--- clang/lib/Format/FormatTokenLexer.cpp | 7 +++++++ clang/lib/Format/TokenAnnotator.cpp | 5 ----- clang/unittests/Format/FormatTest.cpp | 6 ++++++ 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/clang/lib/Format/FormatTokenLexer.cpp b/clang/lib/Format/FormatTokenLexer.cpp index e21b5a882b7773..63949b2e26bdc1 100644 --- a/clang/lib/Format/FormatTokenLexer.cpp +++ b/clang/lib/Format/FormatTokenLexer.cpp @@ -100,6 +100,13 @@ ArrayRef FormatTokenLexer::lex() { if (Tokens.back()->NewlinesBefore > 0 || Tokens.back()->IsMultiline) FirstInLineIndex = Tokens.size() - 1; } while (Tokens.back()->isNot(tok::eof)); + if (Style.InsertNewlineAtEOF) { + auto &TokEOF = *Tokens.back(); + if (TokEOF.NewlinesBefore == 0) { + TokEOF.NewlinesBefore = 1; + TokEOF.OriginalColumn = 0; + } + } return Tokens; } diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp index dfa703aed0d34d..aa0d310a355ff6 100644 --- a/clang/lib/Format/TokenAnnotator.cpp +++ b/clang/lib/Format/TokenAnnotator.cpp @@ -3704,11 +3704,6 @@ void TokenAnnotator::annotate(AnnotatedLine &Line) { auto *First = Line.First; First->SpacesRequiredBefore = 1; First->CanBreakBefore = First->MustBreakBefore; - - if (First->is(tok::eof) && First->NewlinesBefore == 0 && - Style.InsertNewlineAtEOF) { - First->NewlinesBefore = 1; - } } // This function heuristically determines whether 'Current' starts the name of a diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp index 5ebf0d7068dd6c..033daa3645db0d 100644 --- a/clang/unittests/Format/FormatTest.cpp +++ b/clang/unittests/Format/FormatTest.cpp @@ -27577,6 +27577,12 @@ TEST_F(FormatTest, InsertNewlineAtEOF) { verifyNoChange("int i;\n", Style); verifyFormat("int i;\n", "int i;", Style); + + constexpr StringRef Code{"namespace {\n" + "int i;\n" + "} // namespace"}; + verifyFormat(Code.str() + '\n', Code, Style, + {tooling::Range(19, 13)}); // line 3 } TEST_F(FormatTest, KeepEmptyLinesAtEOF) { From a8dd8f6302e5fd405de7ed2bbfe195f305279bf8 Mon Sep 17 00:00:00 2001 From: Owen Pan Date: Tue, 17 Sep 2024 21:17:30 -0700 Subject: [PATCH 004/321] [clang-format] Fix a bug in SpacesInParens InConditionalStatements (#108797) Fixes #64416. 
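A sketch of the fixed behavior, with option values taken from the new test
(the YAML form below is assumed, not part of the patch):

    SpacesInParens: Custom
    SpacesInParensOptions:
      Other: true
      InConditionalStatements: false

With this configuration, `if (a)` now keeps unpadded parentheses while
`call( x, y, z );` is still padded; previously the conditional parentheses
could pick up spaces even though InConditionalStatements was false.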
--- clang/lib/Format/TokenAnnotator.cpp | 36 +++++++++++++-------------- clang/unittests/Format/FormatTest.cpp | 6 +++++ 2 files changed, 23 insertions(+), 19 deletions(-) diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp index aa0d310a355ff6..580f183419f78f 100644 --- a/clang/lib/Format/TokenAnnotator.cpp +++ b/clang/lib/Format/TokenAnnotator.cpp @@ -4413,31 +4413,29 @@ bool TokenAnnotator::spaceRequiredBetween(const AnnotatedLine &Line, Right.MatchingParen == &Left && Line.Children.empty()) { return Style.SpaceInEmptyBlock; } - if ((Left.is(tok::l_paren) && Right.is(tok::r_paren)) || - (Left.is(tok::l_brace) && Left.isNot(BK_Block) && - Right.is(tok::r_brace) && Right.isNot(BK_Block))) { - return Style.SpacesInParensOptions.InEmptyParentheses; - } - if (Style.SpacesInParens == FormatStyle::SIPO_Custom && - Style.SpacesInParensOptions.ExceptDoubleParentheses && - Left.is(tok::r_paren) && Right.is(tok::r_paren)) { - auto *InnerLParen = Left.MatchingParen; - if (InnerLParen && InnerLParen->Previous == Right.MatchingParen) { - InnerLParen->SpacesRequiredBefore = 0; - return false; + if (Style.SpacesInParens == FormatStyle::SIPO_Custom) { + if ((Left.is(tok::l_paren) && Right.is(tok::r_paren)) || + (Left.is(tok::l_brace) && Left.isNot(BK_Block) && + Right.is(tok::r_brace) && Right.isNot(BK_Block))) { + return Style.SpacesInParensOptions.InEmptyParentheses; + } + if (Style.SpacesInParensOptions.ExceptDoubleParentheses && + Left.is(tok::r_paren) && Right.is(tok::r_paren)) { + auto *InnerLParen = Left.MatchingParen; + if (InnerLParen && InnerLParen->Previous == Right.MatchingParen) { + InnerLParen->SpacesRequiredBefore = 0; + return false; + } } - } - if (Style.SpacesInParensOptions.InConditionalStatements) { const FormatToken *LeftParen = nullptr; if (Left.is(tok::l_paren)) LeftParen = &Left; else if (Right.is(tok::r_paren) && Right.MatchingParen) LeftParen = Right.MatchingParen; - if (LeftParen) { - if (LeftParen->is(TT_ConditionLParen)) - return true; - if (LeftParen->Previous && isKeywordWithCondition(*LeftParen->Previous)) - return true; + if (LeftParen && (LeftParen->is(TT_ConditionLParen) || + (LeftParen->Previous && + isKeywordWithCondition(*LeftParen->Previous)))) { + return Style.SpacesInParensOptions.InConditionalStatements; } } diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp index 033daa3645db0d..53aa93a7a4fb01 100644 --- a/clang/unittests/Format/FormatTest.cpp +++ b/clang/unittests/Format/FormatTest.cpp @@ -17282,6 +17282,12 @@ TEST_F(FormatTest, ConfigurableSpacesInParens) { Spaces.SpacesInParens = FormatStyle::SIPO_Custom; Spaces.SpacesInParensOptions = {}; Spaces.SpacesInParensOptions.Other = true; + + EXPECT_FALSE(Spaces.SpacesInParensOptions.InConditionalStatements); + verifyFormat("if (a)\n" + " return;", + Spaces); + Spaces.SpacesInParensOptions.InConditionalStatements = true; verifyFormat("do_something( ::globalVar );", Spaces); verifyFormat("call( x, y, z );", Spaces); From 4d18ce1dd2640829c3ad9cbb31e6ff92e2e29438 Mon Sep 17 00:00:00 2001 From: Owen Pan Date: Tue, 17 Sep 2024 21:19:56 -0700 Subject: [PATCH 005/321] [clang-format] Handle C-style cast of qualified type (#108929) Fixes #102874. 
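Illustration, mirroring the new annotator tests below: the closing parenthesis
in both of these is now annotated TT_CastRParen, so both are treated as
C-style casts of (possibly qualified) types:

    return (::Type)(1 + 2);
    return (Namespace::Class)(1 + 2);

Previously only a single unqualified identifier between the parentheses
(the `( x ) (` shape) was recognized.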
--- clang/lib/Format/TokenAnnotator.cpp | 13 ++++++++----- clang/unittests/Format/TokenAnnotatorTest.cpp | 8 ++++++++ 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp index 580f183419f78f..6f09835bad3a83 100644 --- a/clang/lib/Format/TokenAnnotator.cpp +++ b/clang/lib/Format/TokenAnnotator.cpp @@ -2840,11 +2840,14 @@ class AnnotatingParser { if (AfterRParen->isOneOf(tok::identifier, tok::kw_this)) return true; - // Look for a cast `( x ) (`. - if (AfterRParen->is(tok::l_paren) && BeforeRParen->Previous) { - if (BeforeRParen->is(tok::identifier) && - BeforeRParen->Previous->is(tok::l_paren)) { - return true; + // Look for a cast `( x ) (`, where x may be a qualified identifier. + if (AfterRParen->is(tok::l_paren)) { + for (const auto *Prev = BeforeRParen; Prev->is(tok::identifier);) { + Prev = Prev->Previous; + if (Prev->is(tok::coloncolon)) + Prev = Prev->Previous; + if (Prev == LParen) + return true; } } diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp index baa5ab0ac5e456..34c03d668a9a0a 100644 --- a/clang/unittests/Format/TokenAnnotatorTest.cpp +++ b/clang/unittests/Format/TokenAnnotatorTest.cpp @@ -781,6 +781,14 @@ TEST_F(TokenAnnotatorTest, UnderstandsCasts) { EXPECT_TOKEN(Tokens[9], tok::r_paren, TT_Unknown); EXPECT_TOKEN(Tokens[10], tok::minus, TT_BinaryOperator); + Tokens = annotate("return (::Type)(1 + 2);"); + ASSERT_EQ(Tokens.size(), 12u) << Tokens; + EXPECT_TOKEN(Tokens[4], tok::r_paren, TT_CastRParen); + + Tokens = annotate("return (Namespace::Class)(1 + 2);"); + ASSERT_EQ(Tokens.size(), 13u) << Tokens; + EXPECT_TOKEN(Tokens[5], tok::r_paren, TT_CastRParen); + auto Style = getLLVMStyle(); Style.TypeNames.push_back("Foo"); Tokens = annotate("#define FOO(bar) foo((Foo)&bar)", Style); From fd21b7911fbdddc80db2d3971ff10ee70a49b7e3 Mon Sep 17 00:00:00 2001 From: Ryosuke Niwa Date: Tue, 17 Sep 2024 22:23:27 -0700 Subject: [PATCH 006/321] [webkit.RefCntblBaseVirtualDtor] ThreadSafeRefCounted still generates warnings (#108656) Improve the fix in 203a2ca8cd6af505e11a38aebceeaf864271042c by allowing variable references and more ignoring of parentheses. 
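A sketch of the WebKit-style pattern that previously still warned (type names
assumed; the updated test below carries the real coverage):

    auto deleteThis = [this] {
      delete static_cast<const T *>(this);
    };
    ensureOnMainThread(deleteThis);

The visitor now follows the `deleteThis` variable reference back to its lambda
initializer instead of requiring the lambda to appear directly in the argument
list.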
---
 .../WebKit/RefCntblBaseVirtualDtorChecker.cpp | 21 ++++++++++++-------
 .../ref-cntbl-crtp-base-no-virtual-dtor.cpp   | 18 ++++++++++++++++
 2 files changed, 32 insertions(+), 7 deletions(-)

diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/RefCntblBaseVirtualDtorChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/RefCntblBaseVirtualDtorChecker.cpp
index ecba5f9aa23ee3..e80246f49a3100 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/RefCntblBaseVirtualDtorChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/RefCntblBaseVirtualDtorChecker.cpp
@@ -72,7 +72,7 @@ class DerefFuncDeleteExprVisitor
     if (name == "ensureOnMainThread" || name == "ensureOnMainRunLoop") {
       for (unsigned i = 0; i < CE->getNumArgs(); ++i) {
         auto *Arg = CE->getArg(i);
-        if (VisitLabmdaArgument(Arg))
+        if (VisitLambdaArgument(Arg))
           return true;
       }
     }
@@ -80,17 +80,24 @@ class DerefFuncDeleteExprVisitor
     return false;
   }

-  bool VisitLabmdaArgument(const Expr *E) {
+  bool VisitLambdaArgument(const Expr *E) {
     E = E->IgnoreParenCasts();
     if (auto *TempE = dyn_cast<MaterializeTemporaryExpr>(E))
       E = TempE->getSubExpr();
+    E = E->IgnoreParenCasts();
+    if (auto *Ref = dyn_cast<DeclRefExpr>(E)) {
+      if (auto *VD = dyn_cast_or_null<VarDecl>(Ref->getDecl()))
+        return VisitLambdaArgument(VD->getInit());
+      return false;
+    }
+    if (auto *Lambda = dyn_cast<LambdaExpr>(E)) {
+      if (VisitBody(Lambda->getBody()))
+        return true;
+    }
     if (auto *ConstructE = dyn_cast<CXXConstructExpr>(E)) {
       for (unsigned i = 0; i < ConstructE->getNumArgs(); ++i) {
-        auto *Arg = ConstructE->getArg(i);
-        if (auto *Lambda = dyn_cast<LambdaExpr>(Arg)) {
-          if (VisitBody(Lambda->getBody()))
-            return true;
-        }
+        if (VisitLambdaArgument(ConstructE->getArg(i)))
+          return true;
       }
     }
     return false;
diff --git a/clang/test/Analysis/Checkers/WebKit/ref-cntbl-crtp-base-no-virtual-dtor.cpp b/clang/test/Analysis/Checkers/WebKit/ref-cntbl-crtp-base-no-virtual-dtor.cpp
index 01527addb52992..33c60ea8ca64d1 100644
--- a/clang/test/Analysis/Checkers/WebKit/ref-cntbl-crtp-base-no-virtual-dtor.cpp
+++ b/clang/test/Analysis/Checkers/WebKit/ref-cntbl-crtp-base-no-virtual-dtor.cpp
@@ -119,6 +119,11 @@ template <typename T, DestructionThread destructionThread>
       ensureOnMainThread([this] {
         delete static_cast<const T *>(this);
       });
+    } else if constexpr (destructionThread == DestructionThread::MainRunLoop) {
+      auto deleteThis = [this] {
+        delete static_cast<const T *>(this);
+      };
+      ensureOnMainThread(deleteThis);
     }
   }

@@ -230,3 +235,16 @@ class FancyRefCountedClass4 final : public BadNestedThreadSafeRefCounted<FancyRefCountedClass4> {
+
+class FancyRefCountedClass5 final : public ThreadSafeRefCounted<FancyRefCountedClass5, DestructionThread::MainRunLoop> {
+public:
+  static Ref<FancyRefCountedClass5> create()
+  {
+    return adoptRef(*new FancyRefCountedClass5());
+  }
+
+  virtual ~FancyRefCountedClass5();
+
+private:
+  FancyRefCountedClass5();
+};

From 125635eb68a5582b840e900b91ee2db5e7fd65e6 Mon Sep 17 00:00:00 2001
From: Fangrui Song
Date: Tue, 17 Sep 2024 22:55:53 -0700
Subject: [PATCH 007/321] [CMake] Remove unused HAVE_SYS_PARAM_H/HAVE_SYS_TYPES_H

---
 llvm/cmake/config-ix.cmake                                  | 2 --
 llvm/include/llvm/Config/config.h.cmake                     | 6 ------
 llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn   | 3 ---
 .../llvm-project-overlay/llvm/include/llvm/Config/config.h  | 6 ------
 utils/bazel/llvm_configs/config.h.cmake                     | 6 ------
 5 files changed, 23 deletions(-)

diff --git a/llvm/cmake/config-ix.cmake b/llvm/cmake/config-ix.cmake
index 3707ca824f6e9c..86f2bac7d23e84 100644
--- a/llvm/cmake/config-ix.cmake
+++ b/llvm/cmake/config-ix.cmake
@@ -51,11 +51,9 @@ endif()
 check_include_file(signal.h HAVE_SIGNAL_H)
 check_include_file(sys/ioctl.h HAVE_SYS_IOCTL_H)
 check_include_file(sys/mman.h HAVE_SYS_MMAN_H)
-check_include_file(sys/param.h HAVE_SYS_PARAM_H)
 check_include_file(sys/resource.h HAVE_SYS_RESOURCE_H)
check_include_file(sys/stat.h HAVE_SYS_STAT_H) check_include_file(sys/time.h HAVE_SYS_TIME_H) -check_include_file(sys/types.h HAVE_SYS_TYPES_H) check_include_file(sysexits.h HAVE_SYSEXITS_H) check_include_file(termios.h HAVE_TERMIOS_H) check_include_file(unistd.h HAVE_UNISTD_H) diff --git a/llvm/include/llvm/Config/config.h.cmake b/llvm/include/llvm/Config/config.h.cmake index d71ff40144c097..4c9404d95daf8d 100644 --- a/llvm/include/llvm/Config/config.h.cmake +++ b/llvm/include/llvm/Config/config.h.cmake @@ -191,9 +191,6 @@ /* Define to 1 if you have the header file. */ #cmakedefine HAVE_SYS_MMAN_H ${HAVE_SYS_MMAN_H} -/* Define to 1 if you have the header file. */ -#cmakedefine HAVE_SYS_PARAM_H ${HAVE_SYS_PARAM_H} - /* Define to 1 if you have the header file. */ #cmakedefine HAVE_SYS_RESOURCE_H ${HAVE_SYS_RESOURCE_H} @@ -209,9 +206,6 @@ /* Define to 1 if stat struct has st_mtim member. */ #cmakedefine HAVE_STRUCT_STAT_ST_MTIM_TV_NSEC ${HAVE_STRUCT_STAT_ST_MTIM_TV_NSEC} -/* Define to 1 if you have the header file. */ -#cmakedefine HAVE_SYS_TYPES_H ${HAVE_SYS_TYPES_H} - /* Define to 1 if you have the header file. */ #cmakedefine HAVE_TERMIOS_H ${HAVE_TERMIOS_H} diff --git a/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn b/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn index 02532f63dd67b0..8a7bb4a27923f0 100644 --- a/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn @@ -101,7 +101,6 @@ write_cmake_config("config") { "HAVE_PTHREAD_SET_NAME_NP=", "HAVE_SIGNAL_H=1", "HAVE_SYS_STAT_H=1", - "HAVE_SYS_TYPES_H=1", "HAVE_VALGRIND_VALGRIND_H=", "HAVE__ALLOCA=", "HAVE___ALLOCA=", @@ -228,7 +227,6 @@ write_cmake_config("config") { "HAVE_SYSCONF=", "HAVE_SYS_IOCTL_H=", "HAVE_SYS_MMAN_H=", - "HAVE_SYS_PARAM_H=", "HAVE_SYS_RESOURCE_H=", "HAVE_SYS_TIME_H=", "HAVE_TERMIOS_H=", @@ -264,7 +262,6 @@ write_cmake_config("config") { "HAVE_SYSCONF=1", "HAVE_SYS_IOCTL_H=1", "HAVE_SYS_MMAN_H=1", - "HAVE_SYS_PARAM_H=1", "HAVE_SYS_RESOURCE_H=1", "HAVE_SYS_TIME_H=1", "HAVE_TERMIOS_H=1", diff --git a/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/config.h b/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/config.h index 15696c346bff17..74b4eca0889a7a 100644 --- a/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/config.h +++ b/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/config.h @@ -198,9 +198,6 @@ /* Define to 1 if you have the header file. */ #define HAVE_SYS_MMAN_H 1 -/* Define to 1 if you have the header file. */ -#define HAVE_SYS_PARAM_H 1 - /* Define to 1 if you have the header file. */ #define HAVE_SYS_RESOURCE_H 1 @@ -216,9 +213,6 @@ /* Define to 1 if stat struct has st_mtim member. */ /* HAVE_STRUCT_STAT_ST_MTIM_TV_NSEC defined in Bazel */ -/* Define to 1 if you have the header file. */ -#define HAVE_SYS_TYPES_H 1 - /* Define to 1 if you have the header file. */ #define HAVE_TERMIOS_H 1 diff --git a/utils/bazel/llvm_configs/config.h.cmake b/utils/bazel/llvm_configs/config.h.cmake index d71ff40144c097..4c9404d95daf8d 100644 --- a/utils/bazel/llvm_configs/config.h.cmake +++ b/utils/bazel/llvm_configs/config.h.cmake @@ -191,9 +191,6 @@ /* Define to 1 if you have the header file. */ #cmakedefine HAVE_SYS_MMAN_H ${HAVE_SYS_MMAN_H} -/* Define to 1 if you have the header file. */ -#cmakedefine HAVE_SYS_PARAM_H ${HAVE_SYS_PARAM_H} - /* Define to 1 if you have the header file. 
*/ #cmakedefine HAVE_SYS_RESOURCE_H ${HAVE_SYS_RESOURCE_H} @@ -209,9 +206,6 @@ /* Define to 1 if stat struct has st_mtim member. */ #cmakedefine HAVE_STRUCT_STAT_ST_MTIM_TV_NSEC ${HAVE_STRUCT_STAT_ST_MTIM_TV_NSEC} -/* Define to 1 if you have the header file. */ -#cmakedefine HAVE_SYS_TYPES_H ${HAVE_SYS_TYPES_H} - /* Define to 1 if you have the header file. */ #cmakedefine HAVE_TERMIOS_H ${HAVE_TERMIOS_H} From e1b40dc06373de1bb8535d543a3887646367dd8d Mon Sep 17 00:00:00 2001 From: Yuxuan Chen Date: Tue, 17 Sep 2024 22:58:21 -0700 Subject: [PATCH 008/321] [Clang] Propagate elide safe context through [[clang::coro_await_elidable_argument]] (#108474) --- clang/docs/ReleaseNotes.rst | 5 +- clang/include/clang/Basic/Attr.td | 8 ++ clang/include/clang/Basic/AttrDocs.td | 83 ++++++++++++++++--- clang/lib/Sema/SemaCoroutine.cpp | 40 ++++++--- .../CodeGenCoroutines/coro-await-elidable.cpp | 40 +++++++++ ...a-attribute-supported-attributes-list.test | 1 + 6 files changed, 153 insertions(+), 24 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index d92b59334f8f32..af3ab14d70d871 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -252,7 +252,10 @@ Attribute Changes in Clang (#GH106864) - Introduced a new attribute ``[[clang::coro_await_elidable]]`` on coroutine return types - to express elideability at call sites where the coroutine is co_awaited as a prvalue. + to express elideability at call sites where the coroutine is invoked under a safe elide context. + +- Introduced a new attribute ``[[clang::coro_await_elidable_argument]]`` on function parameters + to propagate safe elide context to arguments if such function is also under a safe elide context. Improvements to Clang's diagnostics ----------------------------------- diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index 35b9716e13ff21..ce86116680d7a3 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -1258,6 +1258,14 @@ def CoroAwaitElidable : InheritableAttr { let SimpleHandler = 1; } +def CoroAwaitElidableArgument : InheritableAttr { + let Spellings = [Clang<"coro_await_elidable_argument">]; + let Subjects = SubjectList<[ParmVar]>; + let LangOpts = [CPlusPlus]; + let Documentation = [CoroAwaitElidableArgumentDoc]; + let SimpleHandler = 1; +} + // OSObject-based attributes. def OSConsumed : InheritableParamAttr { let Spellings = [Clang<"os_consumed">]; diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td index cc9bc499c9cc24..8ef151b3f2fddb 100644 --- a/clang/include/clang/Basic/AttrDocs.td +++ b/clang/include/clang/Basic/AttrDocs.td @@ -8258,15 +8258,23 @@ but do not pass them to the underlying coroutine or pass them by value. def CoroAwaitElidableDoc : Documentation { let Category = DocCatDecl; let Content = [{ -The ``[[clang::coro_await_elidable]]`` is a class attribute which can be applied -to a coroutine return type. +The ``[[clang::coro_await_elidable]]`` is a class attribute which can be +applied to a coroutine return type. It provides a hint to the compiler to apply +Heap Allocation Elision more aggressively. -When a coroutine function that returns such a type calls another coroutine function, -the compiler performs heap allocation elision when the call to the coroutine function -is immediately co_awaited as a prvalue. In this case, the coroutine frame for the -callee will be a local variable within the enclosing braces in the caller's stack -frame. 
And the local variable, like other variables in coroutines, may be collected -into the coroutine frame, which may be allocated on the heap. +When a coroutine function returns such a type, a direct call expression therein +that returns a prvalue of a type attributed ``[[clang::coro_await_elidable]]`` +is said to be under a safe elide context if one of the following is true: +- it is the immediate right-hand side operand to a co_await expression. +- it is an argument to a ``[[clang::coro_await_elidable_argument]]`` parameter +or parameter pack of another direct call expression under a safe elide context. + +Do note that the safe elide context applies only to the call expression itself, +and the context does not transitively include any of its subexpressions unless +exceptional rules of ``[[clang::coro_await_elidable_argument]]`` apply. + +The compiler performs heap allocation elision on call expressions under a safe +elide context, if the callee is a coroutine. Example: @@ -8281,8 +8289,63 @@ Example: co_await t; } -The behavior is undefined if the caller coroutine is destroyed earlier than the -callee coroutine. +Such elision replaces the heap allocated activation frame of the callee coroutine +with a local variable within the enclosing braces in the caller's stack frame. +The local variable, like other variables in coroutines, may be collected into the +coroutine frame, which may be allocated on the heap. The behavior is undefined +if the caller coroutine is destroyed earlier than the callee coroutine. + +}]; +} + +def CoroAwaitElidableArgumentDoc : Documentation { + let Category = DocCatDecl; + let Content = [{ + +The ``[[clang::coro_await_elidable_argument]]`` is a function parameter attribute. +It works in conjunction with ``[[clang::coro_await_elidable]]`` to propagate a +safe elide context to a parameter or parameter pack if the function is called +under a safe elide context. + +This is sometimes necessary on utility functions used to compose or modify the +behavior of a callee coroutine. + +Example: + +.. code-block:: c++ + + template + class [[clang::coro_await_elidable]] Task { ... }; + + template + class [[clang::coro_await_elidable]] WhenAll { ... }; + + // `when_all` is a utility function that composes coroutines. It does not + // need to be a coroutine to propagate. + template + WhenAll when_all([[clang::coro_await_elidable_argument]] Task tasks...); + + Task foo(); + Task bar(); + Task example1() { + // `when_all``, `foo``, and `bar` are all elide safe because `when_all` is + // under a safe elide context and, thanks to the [[clang::coro_await_elidable_argument]] + // attribute, such context is propagated to foo and bar. + co_await when_all(foo(), bar()); + } + + Task example2() { + // `when_all` and `bar` are elide safe. `foo` is not elide safe. + auto f = foo(); + co_await when_all(f, bar()); + } + + + Task example3() { + // None of the calls are elide safe. 
+    auto t = when_all(foo(), bar());
+    co_await t;
+  }
+}];
+}
diff --git a/clang/lib/Sema/SemaCoroutine.cpp b/clang/lib/Sema/SemaCoroutine.cpp
index a574d56646f3a2..89a0beadc61f3d 100644
--- a/clang/lib/Sema/SemaCoroutine.cpp
+++ b/clang/lib/Sema/SemaCoroutine.cpp
@@ -849,12 +849,28 @@ static bool isAttributedCoroAwaitElidable(const QualType &QT) {
   return Record && Record->hasAttr<CoroAwaitElidableAttr>();
 }

-static bool isCoroAwaitElidableCall(Expr *Operand) {
-  if (!Operand->isPRValue()) {
-    return false;
-  }
+static void applySafeElideContext(Expr *Operand) {
+  auto *Call = dyn_cast<CallExpr>(Operand->IgnoreImplicit());
+  if (!Call || !Call->isPRValue())
+    return;
+
+  if (!isAttributedCoroAwaitElidable(Call->getType()))
+    return;
+
+  Call->setCoroElideSafe();

-  return isAttributedCoroAwaitElidable(Operand->getType());
+  // Check parameter
+  auto *Fn = llvm::dyn_cast_if_present<FunctionDecl>(Call->getCalleeDecl());
+  if (!Fn)
+    return;
+
+  size_t ParmIdx = 0;
+  for (ParmVarDecl *PD : Fn->parameters()) {
+    if (PD->hasAttr<CoroAwaitElidableArgumentAttr>())
+      applySafeElideContext(Call->getArg(ParmIdx));
+
+    ParmIdx++;
+  }
 }

 // Attempts to resolve and build a CoawaitExpr from "raw" inputs, bailing out to
@@ -880,14 +896,12 @@ ExprResult Sema::BuildUnresolvedCoawaitExpr(SourceLocation Loc, Expr *Operand,
   }

   auto *RD = Promise->getType()->getAsCXXRecordDecl();
-  bool AwaitElidable =
-      isCoroAwaitElidableCall(Operand) &&
-      isAttributedCoroAwaitElidable(
-          getCurFunctionDecl(/*AllowLambda=*/true)->getReturnType());
-
-  if (AwaitElidable)
-    if (auto *Call = dyn_cast<CallExpr>(Operand->IgnoreImplicit()))
-      Call->setCoroElideSafe();
+
+  bool CurFnAwaitElidable = isAttributedCoroAwaitElidable(
+      getCurFunctionDecl(/*AllowLambda=*/true)->getReturnType());
+
+  if (CurFnAwaitElidable)
+    applySafeElideContext(Operand);

   Expr *Transformed = Operand;
   if (lookupMember(*this, "await_transform", RD, Loc)) {
diff --git a/clang/test/CodeGenCoroutines/coro-await-elidable.cpp b/clang/test/CodeGenCoroutines/coro-await-elidable.cpp
index 8512995dfad45a..deb19b4a500437 100644
--- a/clang/test/CodeGenCoroutines/coro-await-elidable.cpp
+++ b/clang/test/CodeGenCoroutines/coro-await-elidable.cpp
@@ -84,4 +84,44 @@ Task<int> nonelidable() {
   co_return 1;
 }

+// CHECK-LABEL: define{{.*}} @_Z8addTasksO4TaskIiES1_{{.*}} {
+Task<int> addTasks([[clang::coro_await_elidable_argument]] Task<int> &&t1, Task<int> &&t2) {
+  int i1 = co_await t1;
+  int i2 = co_await t2;
+  co_return i1 + i2;
+}
+
+// CHECK-LABEL: define{{.*}} @_Z10returnSamei{{.*}} {
+Task<int> returnSame(int i) {
+  co_return i;
+}
+
+// CHECK-LABEL: define{{.*}} @_Z21elidableWithMustAwaitv{{.*}} {
+Task<int> elidableWithMustAwait() {
+  // CHECK: call void @_Z10returnSamei(ptr {{.*}}, i32 noundef 2) #[[ELIDE_SAFE]]
+  // CHECK: call void @_Z10returnSamei(ptr {{.*}}, i32 noundef 3){{$}}
+  co_return co_await addTasks(returnSame(2), returnSame(3));
+}
+
+template <typename... Args>
+Task<int> sumAll([[clang::coro_await_elidable_argument]] Args &&...
tasks); + +// CHECK-LABEL: define{{.*}} @_Z16elidableWithPackv{{.*}} { +Task elidableWithPack() { + // CHECK: call void @_Z10returnSamei(ptr {{.*}}, i32 noundef 1){{$}} + // CHECK: call void @_Z10returnSamei(ptr {{.*}}, i32 noundef 2) #[[ELIDE_SAFE]] + // CHECK: call void @_Z10returnSamei(ptr {{.*}}, i32 noundef 3) #[[ELIDE_SAFE]] + auto t = returnSame(1); + co_return co_await sumAll(t, returnSame(2), returnSame(3)); +} + + +// CHECK-LABEL: define{{.*}} @_Z25elidableWithPackRecursivev{{.*}} { +Task elidableWithPackRecursive() { + // CHECK: call void @_Z10returnSamei(ptr {{.*}}, i32 noundef 1) #[[ELIDE_SAFE]] + // CHECK: call void @_Z10returnSamei(ptr {{.*}}, i32 noundef 2){{$}} + // CHECK: call void @_Z10returnSamei(ptr {{.*}}, i32 noundef 3) #[[ELIDE_SAFE]] + co_return co_await sumAll(addTasks(returnSame(1), returnSame(2)), returnSame(3)); +} + // CHECK: attributes #[[ELIDE_SAFE]] = { coro_elide_safe } diff --git a/clang/test/Misc/pragma-attribute-supported-attributes-list.test b/clang/test/Misc/pragma-attribute-supported-attributes-list.test index baa1816358b156..914f94c08a9fd9 100644 --- a/clang/test/Misc/pragma-attribute-supported-attributes-list.test +++ b/clang/test/Misc/pragma-attribute-supported-attributes-list.test @@ -60,6 +60,7 @@ // CHECK-NEXT: ConsumableSetOnRead (SubjectMatchRule_record) // CHECK-NEXT: Convergent (SubjectMatchRule_function) // CHECK-NEXT: CoroAwaitElidable (SubjectMatchRule_record) +// CHECK-NEXT: CoroAwaitElidableArgument (SubjectMatchRule_variable_is_parameter) // CHECK-NEXT: CoroDisableLifetimeBound (SubjectMatchRule_function) // CHECK-NEXT: CoroLifetimeBound (SubjectMatchRule_record) // CHECK-NEXT: CoroOnlyDestroyWhenComplete (SubjectMatchRule_record) From 30eb19321349827056facd54afab9b856b9f9d0a Mon Sep 17 00:00:00 2001 From: Serban Date: Wed, 18 Sep 2024 09:05:35 +0300 Subject: [PATCH 009/321] [LLDB][lldb-dap][vscode-lldb] Add Environment configuration for the lldb-dap process (#108948) Frequently, environment variables such as `LLDB_USE_NATIVE_PDB_READER` are needed to be able to use lldb-dap in vscode This PR adds a way to set the environment for the lldb-dap process using configuration. 
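For example, a user could now add something like this to their VS Code
settings (a hypothetical settings.json snippet; only the
`lldb-dap.environment` key is introduced by this patch):

    {
      "lldb-dap.environment": {
        "LLDB_USE_NATIVE_PDB_READER": "1"
      }
    }

These values are merged into the adapter process environment, with the
extension-computed variables such as `LLDBDAP_LOG` spread last so they take
precedence.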
--- lldb/tools/lldb-dap/package-lock.json | 4 ++-- lldb/tools/lldb-dap/package.json | 11 ++++++++++- lldb/tools/lldb-dap/src-ts/extension.ts | 11 +++++++++-- 3 files changed, 21 insertions(+), 5 deletions(-) diff --git a/lldb/tools/lldb-dap/package-lock.json b/lldb/tools/lldb-dap/package-lock.json index 96570e42dbfdc4..8663659715590a 100644 --- a/lldb/tools/lldb-dap/package-lock.json +++ b/lldb/tools/lldb-dap/package-lock.json @@ -1,12 +1,12 @@ { "name": "lldb-dap", - "version": "0.2.4", + "version": "0.2.6", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "lldb-dap", - "version": "0.2.4", + "version": "0.2.6", "license": "Apache 2.0 License with LLVM exceptions", "devDependencies": { "@types/node": "^18.11.18", diff --git a/lldb/tools/lldb-dap/package.json b/lldb/tools/lldb-dap/package.json index d35accfb6ec4e8..33b09d56ab17e2 100644 --- a/lldb/tools/lldb-dap/package.json +++ b/lldb/tools/lldb-dap/package.json @@ -1,7 +1,7 @@ { "name": "lldb-dap", "displayName": "LLDB DAP", - "version": "0.2.5", + "version": "0.2.6", "publisher": "llvm-vs-code-extensions", "homepage": "https://lldb.llvm.org", "description": "LLDB debugging from VSCode", @@ -78,6 +78,15 @@ "scope": "resource", "type": "string", "description": "The log path for lldb-dap (if any)" + }, + "lldb-dap.environment": { + "scope": "resource", + "type": "object", + "default": {}, + "description": "The environment of the lldb-dap process.", + "additionalProperties": { + "type": "string" + } } } }, diff --git a/lldb/tools/lldb-dap/src-ts/extension.ts b/lldb/tools/lldb-dap/src-ts/extension.ts index fdc4f47b238b5a..36d3dfba18c142 100644 --- a/lldb/tools/lldb-dap/src-ts/extension.ts +++ b/lldb/tools/lldb-dap/src-ts/extension.ts @@ -25,9 +25,15 @@ function createDefaultLLDBDapOptions(): LLDBDapOptions { if (log_path) { env["LLDBDAP_LOG"] = log_path; } - + const configEnvironment = config.get<{ [key: string]: string }>("environment") || {}; if (path) { - return new vscode.DebugAdapterExecutable(path, [], { env }); + const dbgOptions = { + env: { + ...configEnvironment, + ...env, + } + }; + return new vscode.DebugAdapterExecutable(path, [], dbgOptions); } else if (packageJSONExecutable) { return new vscode.DebugAdapterExecutable( packageJSONExecutable.command, @@ -36,6 +42,7 @@ function createDefaultLLDBDapOptions(): LLDBDapOptions { ...packageJSONExecutable.options, env: { ...packageJSONExecutable.options?.env, + ...configEnvironment, ...env, }, }, From fe012bd52dd7638cfa9abeae786c28a75cde939b Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 17 Sep 2024 23:26:56 -0700 Subject: [PATCH 010/321] [SelectionDAG] Use Register around RegisterSDNode related functions. NFC RegisterSDNode itself already stored a Register. 
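This is NFC because, at this revision, Register is a thin wrapper around the
unsigned register id with implicit conversions in both directions (see
llvm/include/llvm/CodeGen/Register.h), so existing call sites keep compiling.
A minimal sketch of why the swap is safe (illustrative only, not from the
patch):

    #include "llvm/CodeGen/Register.h"

    // Register converts implicitly from and to unsigned here, while adding
    // type-safe helpers such as isVirtual() and id().
    static bool demo(unsigned Raw) {
      llvm::Register R = Raw; // implicit from unsigned
      unsigned Back = R;      // implicit back to unsigned
      return R.isVirtual() && Back == Raw;
    }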
--- llvm/include/llvm/CodeGen/SelectionDAG.h | 10 +++++----- llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 8 ++++---- .../CodeGen/SelectionDAG/SelectionDAGBuilder.cpp | 14 +++++++------- llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp | 4 ++-- 4 files changed, 18 insertions(+), 18 deletions(-) diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h index 7ee8ca18c2c1de..d6c2c36a0d482a 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAG.h +++ b/llvm/include/llvm/CodeGen/SelectionDAG.h @@ -772,7 +772,7 @@ class SelectionDAG { SDValue getMCSymbol(MCSymbol *Sym, EVT VT); SDValue getValueType(EVT); - SDValue getRegister(unsigned Reg, EVT VT); + SDValue getRegister(Register Reg, EVT VT); SDValue getRegisterMask(const uint32_t *RegMask); SDValue getEHLabel(const SDLoc &dl, SDValue Root, MCSymbol *Label); SDValue getLabelNode(unsigned Opcode, const SDLoc &dl, SDValue Root, @@ -784,7 +784,7 @@ class SelectionDAG { return getBlockAddress(BA, VT, Offset, true, TargetFlags); } - SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, unsigned Reg, + SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N) { return getNode(ISD::CopyToReg, dl, MVT::Other, Chain, getRegister(Reg, N.getValueType()), N); @@ -793,7 +793,7 @@ class SelectionDAG { // This version of the getCopyToReg method takes an extra operand, which // indicates that there is potentially an incoming glue value (if Glue is not // null) and that there should be a glue result. - SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, unsigned Reg, SDValue N, + SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N, SDValue Glue) { SDVTList VTs = getVTList(MVT::Other, MVT::Glue); SDValue Ops[] = { Chain, getRegister(Reg, N.getValueType()), N, Glue }; @@ -810,7 +810,7 @@ class SelectionDAG { ArrayRef(Ops, Glue.getNode() ? 4 : 3)); } - SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, unsigned Reg, EVT VT) { + SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT) { SDVTList VTs = getVTList(VT, MVT::Other); SDValue Ops[] = { Chain, getRegister(Reg, VT) }; return getNode(ISD::CopyFromReg, dl, VTs, Ops); @@ -819,7 +819,7 @@ class SelectionDAG { // This version of the getCopyFromReg method takes an extra operand, which // indicates that there is potentially an incoming glue value (if Glue is not // null) and that there should be a glue result. 
- SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, unsigned Reg, EVT VT, + SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT, SDValue Glue) { SDVTList VTs = getVTList(VT, MVT::Other, MVT::Glue); SDValue Ops[] = { Chain, getRegister(Reg, VT), Glue }; diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 44ec6f7cab145a..3918da3ef031b6 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -760,7 +760,7 @@ static void AddNodeIDCustom(FoldingSetNodeID &ID, const SDNode *N) { ID.AddPointer(cast(N)->getBasicBlock()); break; case ISD::Register: - ID.AddInteger(cast(N)->getReg()); + ID.AddInteger(cast(N)->getReg().id()); break; case ISD::RegisterMask: ID.AddPointer(cast(N)->getRegMask()); @@ -2292,16 +2292,16 @@ SDValue SelectionDAG::getCommutedVectorShuffle(const ShuffleVectorSDNode &SV) { return getVectorShuffle(VT, SDLoc(&SV), Op1, Op0, MaskVec); } -SDValue SelectionDAG::getRegister(unsigned RegNo, EVT VT) { +SDValue SelectionDAG::getRegister(Register Reg, EVT VT) { SDVTList VTs = getVTList(VT); FoldingSetNodeID ID; AddNodeIDNode(ID, ISD::Register, VTs, std::nullopt); - ID.AddInteger(RegNo); + ID.AddInteger(Reg.id()); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, IP)) return SDValue(E, 0); - auto *N = newSDNode(RegNo, VTs); + auto *N = newSDNode(Reg, VTs); N->SDNodeBits.IsDivergent = TLI->isSDNodeSourceOfDivergence(N, FLI, UA); CSEMap.InsertNode(N, IP); InsertNode(N); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 1dbcf8fd765101..c7a340f3f82b6f 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -2183,7 +2183,7 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) { } if (!FuncInfo.CanLowerReturn) { - unsigned DemoteReg = FuncInfo.DemoteRegister; + Register DemoteReg = FuncInfo.DemoteRegister; const Function *F = I.getParent()->getParent(); // Emit a store of the return value through the virtual register. @@ -3013,7 +3013,7 @@ void SelectionDAGBuilder::visitJumpTableHeader(SwitchCG::JumpTable &JT, SwitchOp = DAG.getZExtOrTrunc(Sub, dl, TLI.getJumpTableRegTy(DAG.getDataLayout())); - unsigned JumpTableReg = + Register JumpTableReg = FuncInfo.CreateReg(TLI.getJumpTableRegTy(DAG.getDataLayout())); SDValue CopyTo = DAG.getCopyToReg(getControlRoot(), dl, JumpTableReg, SwitchOp); @@ -7748,7 +7748,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, const auto *CPI = cast(I.getArgOperand(0)); MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout()); const TargetRegisterClass *PtrRC = TLI.getRegClassFor(PtrVT); - unsigned VReg = FuncInfo.getCatchPadExceptionPointerVReg(CPI, PtrRC); + Register VReg = FuncInfo.getCatchPadExceptionPointerVReg(CPI, PtrRC); SDValue N = DAG.getCopyFromReg(DAG.getEntryNode(), sdl, VReg, PtrVT); if (Intrinsic == Intrinsic::eh_exceptioncode) N = DAG.getZExtOrTrunc(N, sdl, MVT::i32); @@ -11817,8 +11817,8 @@ void SelectionDAGISel::LowerArguments(const Function &F) { // Update the SwiftErrorVRegDefMap. 
      if (Res.getOpcode() == ISD::CopyFromReg && isSwiftErrorArg) {
-        unsigned Reg = cast<RegisterSDNode>(Res.getOperand(1))->getReg();
-        if (Register::isVirtualRegister(Reg))
+        Register Reg = cast<RegisterSDNode>(Res.getOperand(1))->getReg();
+        if (Reg.isVirtual())
           SwiftError->setCurrentVReg(FuncInfo->MBB, SwiftError->getFunctionArg(),
                                      Reg);
       }
@@ -11829,8 +11829,8 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
       // If we can, though, try to skip creating an unnecessary vreg.
       // FIXME: This isn't very clean... it would be nice to make this more
       // general.
-      unsigned Reg = cast<RegisterSDNode>(Res.getOperand(1))->getReg();
-      if (Register::isVirtualRegister(Reg)) {
+      Register Reg = cast<RegisterSDNode>(Res.getOperand(1))->getReg();
+      if (Reg.isVirtual()) {
         FuncInfo->ValueMap[&Arg] = Reg;
         continue;
       }
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index 263a213bd4f641..2a97580942df36 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -895,8 +895,8 @@ void SelectionDAGISel::ComputeLiveOutVRegInfo() {
     if (N->getOpcode() != ISD::CopyToReg)
       continue;

-    unsigned DestReg = cast<RegisterSDNode>(N->getOperand(1))->getReg();
-    if (!Register::isVirtualRegister(DestReg))
+    Register DestReg = cast<RegisterSDNode>(N->getOperand(1))->getReg();
+    if (!DestReg.isVirtual())
       continue;

     // Ignore non-integer values.

From 9d3ab1c36e03d5ad23e209938c32973fbee18a57 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Tue, 17 Sep 2024 23:45:58 -0700
Subject: [PATCH 011/321] [SelectionDAGBuilder] Use Register in more places.
 NFC

---
 .../SelectionDAG/SelectionDAGBuilder.cpp      | 22 +++++++++----------
 .../SelectionDAG/SelectionDAGBuilder.h        |  8 +++----
 2 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index c7a340f3f82b6f..a719ff859e778e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -845,13 +845,13 @@ static void getCopyToPartsVector(SelectionDAG &DAG, const SDLoc &DL,
   }
 }

-RegsForValue::RegsForValue(const SmallVector<unsigned, 4> &regs, MVT regvt,
+RegsForValue::RegsForValue(const SmallVector<Register, 4> &regs, MVT regvt,
                            EVT valuevt, std::optional<CallingConv::ID> CC)
     : ValueVTs(1, valuevt), RegVTs(1, regvt), Regs(regs),
       RegCount(1, regs.size()), CallConv(CC) {}

 RegsForValue::RegsForValue(LLVMContext &Context, const TargetLowering &TLI,
-                           const DataLayout &DL, unsigned Reg, Type *Ty,
+                           const DataLayout &DL, Register Reg, Type *Ty,
                            std::optional<CallingConv::ID> CC) {
   ComputeValueVTs(TLI, DL, Ty, ValueVTs);

@@ -870,7 +870,7 @@ RegsForValue::RegsForValue(LLVMContext &Context, const TargetLowering &TLI,
       Regs.push_back(Reg + i);
     RegVTs.push_back(RegisterVT);
     RegCount.push_back(NumRegs);
-    Reg += NumRegs;
+    Reg = Reg.id() + NumRegs;
   }
 }

@@ -1070,9 +1070,9 @@ void RegsForValue::AddInlineAsmOperands(InlineAsm::Kind Code, bool HasMatching,
   }
 }

-SmallVector<std::pair<unsigned, TypeSize>, 4> RegsForValue::getRegsAndSizes() const {
-  SmallVector<std::pair<unsigned, TypeSize>, 4> OutVec;
+SmallVector<std::pair<Register, TypeSize>, 4>
+RegsForValue::getRegsAndSizes() const {
+  SmallVector<std::pair<Register, TypeSize>, 4> OutVec;
   unsigned I = 0;
   for (auto CountAndVT : zip_first(RegCount, RegVTs)) {
     unsigned RegCount = std::get<0>(CountAndVT);
@@ -5956,7 +5956,7 @@ static SDValue expandDivFix(unsigned Opcode, const SDLoc &DL,

 // getUnderlyingArgRegs - Find underlying registers used for a truncated,
 // bitcasted, or split argument. Returns a list of <Register, size in bits>
 static void
-getUnderlyingArgRegs(SmallVectorImpl<std::pair<unsigned, TypeSize>> &Regs,
+getUnderlyingArgRegs(SmallVectorImpl<std::pair<Register, TypeSize>> &Regs,
                      const SDValue &N) {
   switch (N.getOpcode()) {
   case ISD::CopyFromReg: {
@@ -6101,7 +6101,7 @@ bool SelectionDAGBuilder::EmitFuncArgumentDbgValue(
     if (FI != std::numeric_limits<int>::max())
       Op = MachineOperand::CreateFI(FI);

-  SmallVector<std::pair<unsigned, TypeSize>, 8> ArgRegsAndSizes;
+  SmallVector<std::pair<Register, TypeSize>, 8> ArgRegsAndSizes;
   if (!Op && N.getNode()) {
     getUnderlyingArgRegs(ArgRegsAndSizes, N);
     Register Reg;
@@ -6131,7 +6131,7 @@ bool SelectionDAGBuilder::EmitFuncArgumentDbgValue(
   if (!Op) {
     // Create a DBG_VALUE for each decomposed value in ArgRegs to cover Reg
-    auto splitMultiRegDbgValue = [&](ArrayRef<std::pair<unsigned, TypeSize>>
+    auto splitMultiRegDbgValue = [&](ArrayRef<std::pair<Register, TypeSize>>
                                          SplitRegs) {
       unsigned Offset = 0;
       for (const auto &RegAndSize : SplitRegs) {
@@ -9653,7 +9653,7 @@ getRegistersForValue(SelectionDAG &DAG, const SDLoc &DL,
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   MachineFunction &MF = DAG.getMachineFunction();

-  SmallVector<unsigned, 4> Regs;
+  SmallVector<Register, 4> Regs;
   const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();

   // No work to do for memory/address operands.
@@ -10078,7 +10078,7 @@ void SelectionDAGBuilder::visitInlineAsm(const CallBase &Call,
         return;
       }

-      SmallVector<unsigned, 4> Regs;
+      SmallVector<Register, 4> Regs;
       MachineFunction &MF = DAG.getMachineFunction();
       MachineRegisterInfo &MRI = MF.getRegInfo();
       const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
@@ -12654,7 +12654,7 @@ void SelectionDAGBuilder::visitCallBrLandingPad(const CallInst &I) {
     // getRegistersForValue may produce 1 to many registers based on whether
     // the OpInfo.ConstraintVT is legal on the target or not.
-    for (unsigned &Reg : OpInfo.AssignedRegs.Regs) {
+    for (Register &Reg : OpInfo.AssignedRegs.Regs) {
       Register OriginalDef = FollowCopyChain(MRI, InitialDef++);
       if (Register::isPhysicalRegister(OriginalDef))
         FuncInfo.MBB->addLiveIn(OriginalDef);
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
index b13a2df7b48eb8..9544f02b9a4808 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
@@ -740,7 +740,7 @@ struct RegsForValue {
   /// This list holds the registers assigned to the values.
   /// Each legal or promoted value requires one register, and each
   /// expanded value requires multiple registers.
-  SmallVector<unsigned, 4> Regs;
+  SmallVector<Register, 4> Regs;

   /// This list holds the number of registers for each value.
   SmallVector<unsigned, 4> RegCount;
@@ -750,10 +750,10 @@ struct RegsForValue {
   std::optional<CallingConv::ID> CallConv;

   RegsForValue() = default;
-  RegsForValue(const SmallVector<unsigned, 4> &regs, MVT regvt, EVT valuevt,
+  RegsForValue(const SmallVector<Register, 4> &regs, MVT regvt, EVT valuevt,
               std::optional<CallingConv::ID> CC = std::nullopt);
   RegsForValue(LLVMContext &Context, const TargetLowering &TLI,
-               const DataLayout &DL, unsigned Reg, Type *Ty,
+               const DataLayout &DL, Register Reg, Type *Ty,
               std::optional<CallingConv::ID> CC);

   bool isABIMangled() const { return CallConv.has_value(); }
@@ -796,7 +796,7 @@ struct RegsForValue {
   }

   /// Return a list of registers and their sizes.
-  SmallVector<std::pair<unsigned, TypeSize>, 4> getRegsAndSizes() const;
+  SmallVector<std::pair<Register, TypeSize>, 4> getRegsAndSizes() const;
 };

 } // end namespace llvm

From 3d2925b9de0d60694a9f28edd2419f8eed34f1a1 Mon Sep 17 00:00:00 2001
From: Hans
Date: Wed, 18 Sep 2024 08:58:14 +0200
Subject: [PATCH 012/321] [win/asan] AllocateMemoryForTrampoline within 2 GB of
 the module's base address (#108822)

Since we may copy code (see CopyInstructions) to the trampoline which could
reference data inside the original module, we really want the trampoline to
be within 2 GB of not just the original function, but within 2 GB of anything
that function may have rip-relative accesses to, i.e. within 2 GB of that
function's whole module.

This fixes interception failures like the following scenario:

1. Intercept `CreateProcess` in kernel32.dll, allocating a trampoline region
   right after.
2. Start intercepting `memcpy` in the main executable, which is loaded at a
   lower address than kernel32.dll, but still within 2 GB of the trampoline
   region so we keep using it.
3. Try to copy instructions from `memcpy` to the trampoline. Turns out one
   instruction references data that is more than 2GB away from the
   trampoline, so it can't be relocated.
4. The process exits due to a CHECK failure

(Full story at https://crbug.com/341936875#comment45 and following.)
---
 compiler-rt/lib/interception/interception_win.cpp | 26 ++++++++++++++++++-
 1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/compiler-rt/lib/interception/interception_win.cpp b/compiler-rt/lib/interception/interception_win.cpp
index a638e66eccee58..a0ff124a89c9ed 100644
--- a/compiler-rt/lib/interception/interception_win.cpp
+++ b/compiler-rt/lib/interception/interception_win.cpp
@@ -130,6 +130,7 @@
 #include "sanitizer_common/sanitizer_platform.h"
 #define WIN32_LEAN_AND_MEAN
 #include <windows.h>
+#include <psapi.h>

 namespace __interception {

@@ -385,7 +386,30 @@ void TestOnlyReleaseTrampolineRegions() {
   }
 }

-static uptr AllocateMemoryForTrampoline(uptr image_address, size_t size) {
+static uptr AllocateMemoryForTrampoline(uptr func_address, size_t size) {
+  uptr image_address = func_address;
+
+#if SANITIZER_WINDOWS64
+  // Allocate memory after the module (DLL or EXE file), but within 2GB
+  // of the start of the module so that any address within the module can be
+  // referenced with PC-relative operands.
+  // This allows us to not just jump to the trampoline with a PC-relative
+  // offset, but to relocate any instructions that we copy to the trampoline
+  // which have references to the original module. If we can't find the base
+  // address of the module (e.g. if func_address is in mmap'ed memory), just
+  // use func_address as is.
+  HMODULE module;
+  if (::GetModuleHandleExW(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS |
+                               GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT,
+                           (LPCWSTR)func_address, &module)) {
+    MODULEINFO module_info;
+    if (::GetModuleInformation(::GetCurrentProcess(), module,
+                               &module_info, sizeof(module_info))) {
+      image_address = (uptr)module_info.lpBaseOfDll;
+    }
+  }
+#endif
+
   // Find a region within 2G with enough space to allocate |size| bytes.
TrampolineMemoryRegion *region = nullptr; for (size_t bucket = 0; bucket < kMaxTrampolineRegion; ++bucket) { From a2994b299986305f17917d61a99fc18185e209f0 Mon Sep 17 00:00:00 2001 From: LiqinWeng Date: Wed, 18 Sep 2024 15:03:37 +0800 Subject: [PATCH 013/321] [LV][NFC] Unify printing for WidenEVLReicpe with other EVL recipes (#108177) --- llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 4 ++-- .../Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 9068ccf519c55c..ecdf0b526f608d 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -1353,9 +1353,9 @@ void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent, void VPWidenEVLRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { - O << Indent << "WIDEN-VP "; + O << Indent << "WIDEN "; printAsOperand(O, SlotTracker); - O << " = " << Instruction::getOpcodeName(getOpcode()); + O << " = vp." << Instruction::getOpcodeName(getOpcode()); printFlags(O); printOperands(O, SlotTracker); } diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll index 04b3ba52cbefc6..6dfe5b608199b3 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll @@ -31,7 +31,7 @@ define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) { ; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%c>, vp<[[ST]]> ; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]> ; IF-EVL-NEXT: WIDEN ir<[[LD2:%.+]]> = vp.load vp<[[PTR2]]>, vp<[[EVL]]> -; IF-EVL-NEXT: WIDEN-VP ir<[[ADD:%.+]]> = add nsw ir<[[LD2]]>, ir<[[LD1]]>, vp<[[EVL]]> +; IF-EVL-NEXT: WIDEN ir<[[ADD:%.+]]> = vp.add nsw ir<[[LD2]]>, ir<[[LD1]]>, vp<[[EVL]]> ; IF-EVL-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> ; IF-EVL-NEXT: vp<[[PTR3:%[0-9]+]]> = vector-pointer ir<[[GEP3]]> ; IF-EVL-NEXT: WIDEN vp.store vp<[[PTR3]]>, ir<[[ADD]]>, vp<[[EVL]]> From e0a16371c6cce47e2b0626225a727b458ebe7666 Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Wed, 18 Sep 2024 00:08:23 -0700 Subject: [PATCH 014/321] [AMDGPU] Omit isReg() check for all_uses() in SIInsertWaitcnts. NFC. 
(#109041) --- llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index fd9fe1196b7853..a5668272601384 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -820,7 +820,7 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, Inst.getOpcode() != AMDGPU::DS_CONSUME && Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) { for (const MachineOperand &Op : Inst.all_uses()) { - if (Op.isReg() && TRI->isVectorRegister(*MRI, Op.getReg())) + if (TRI->isVectorRegister(*MRI, Op.getReg())) setExpScore(&Inst, TRI, MRI, Op, CurrScore); } } @@ -872,7 +872,7 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, } } for (const MachineOperand &Op : Inst.all_uses()) { - if (Op.isReg() && TRI->isVectorRegister(*MRI, Op.getReg())) + if (TRI->isVectorRegister(*MRI, Op.getReg())) setExpScore(&Inst, TRI, MRI, Op, CurrScore); } } @@ -2327,7 +2327,7 @@ bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML, HasVMemStore = true; } for (const MachineOperand &Op : MI.all_uses()) { - if (!Op.isReg() || !TRI->isVectorRegister(*MRI, Op.getReg())) + if (!TRI->isVectorRegister(*MRI, Op.getReg())) continue; RegInterval Interval = Brackets.getRegInterval(&MI, MRI, TRI, Op); // Vgpr use From 08f5f6dc8b09c702125e57a5e87ba56203de6263 Mon Sep 17 00:00:00 2001 From: Vyacheslav Levytskyy Date: Wed, 18 Sep 2024 09:22:30 +0200 Subject: [PATCH 015/321] [SPIR-V] Fix incorrect emission of G_SPLAT_VECTOR for fixed vectors (#108534) This PR replaces MIRBuilder.buildSplatVector() by MIRBuilder.buildSplatBuildVector(), so that we emit G_BUILD_VECTOR instead of G_SPLAT_VECTOR: the latter is incorrect for fixed vectors and is limited to scalable vectors only. --- llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp index 553d86efa3df34..ca3e47a4b78f23 100644 --- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp @@ -496,7 +496,7 @@ Register SPIRVGlobalRegistry::getOrCreateIntCompositeOrNull( assignSPIRVTypeToVReg(SpvType, SpvVecConst, *CurMF); DT.add(CA, CurMF, SpvVecConst); if (EmitIR) { - MIRBuilder.buildSplatVector(SpvVecConst, SpvScalConst); + MIRBuilder.buildSplatBuildVector(SpvVecConst, SpvScalConst); } else { if (Val) { auto MIB = MIRBuilder.buildInstr(SPIRV::OpConstantComposite) From 5a8d2dd1f97017253be5d2262b25a9cf9d002546 Mon Sep 17 00:00:00 2001 From: Aditi Medhane Date: Wed, 18 Sep 2024 13:14:49 +0530 Subject: [PATCH 016/321] [AMDGPU] Handle subregisters properly in generic operand legalizer (#108496) Fix for the issue found during COPY introduction during legalization of PHI operands for sgpr to vgpr copy when subreg is involved. 
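In MIR terms (condensed from the updated test below; value names shortened),
the legalizer now copies the whole 64-bit register and keeps the subregister
index on the PHI operand:

    %c:vreg_64 = COPY %sum, implicit $exec
    %p:vgpr_32 = PHI %m.sub0, %bb.3, %c.sub0, %bb.1

Previously it emitted `%c:vreg_64 = COPY %sum.sub0`, copying a 32-bit
subregister into a 64-bit register class and then clearing the subregister
index on the use, which produced a malformed copy.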
---
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp              | 5 ++---
 llvm/test/CodeGen/AMDGPU/phi-vgpr-input-moveimm.mir | 4 ++--
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index e4a679f6a3ef8f..30aa36be99c95f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -6231,10 +6231,9 @@ void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
     return;
 
   Register DstReg = MRI.createVirtualRegister(DstRC);
-  auto Copy = BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).add(Op);
-
+  auto Copy =
+      BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).addReg(OpReg);
   Op.setReg(DstReg);
-  Op.setSubReg(0);
 
   MachineInstr *Def = MRI.getVRegDef(OpReg);
   if (!Def)
diff --git a/llvm/test/CodeGen/AMDGPU/phi-vgpr-input-moveimm.mir b/llvm/test/CodeGen/AMDGPU/phi-vgpr-input-moveimm.mir
index dab4c9d401407b..d21dbd290accea 100644
--- a/llvm/test/CodeGen/AMDGPU/phi-vgpr-input-moveimm.mir
+++ b/llvm/test/CodeGen/AMDGPU/phi-vgpr-input-moveimm.mir
@@ -73,13 +73,13 @@ body: |
   ; GCN-NEXT:   successors: %bb.2(0x80000000)
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT:   [[S_ADD_U:%[0-9]+]]:sreg_64 = S_ADD_U64_PSEUDO [[COPY]], [[COPY1]], implicit-def $scc
-  ; GCN-NEXT:   [[COPY2:%[0-9]+]]:vreg_64 = COPY [[S_ADD_U]].sub0, implicit $exec
+  ; GCN-NEXT:   [[COPY2:%[0-9]+]]:vreg_64 = COPY [[S_ADD_U]], implicit $exec
   ; GCN-NEXT:   S_BRANCH %bb.2
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT: bb.2:
   ; GCN-NEXT:   successors: %bb.3(0x80000000)
   ; GCN-NEXT: {{  $}}
-  ; GCN-NEXT:   [[PHI:%[0-9]+]]:vgpr_32 = PHI [[V_MOV_B64_e32_]].sub0, %bb.3, [[COPY2]], %bb.1
+  ; GCN-NEXT:   [[PHI:%[0-9]+]]:vgpr_32 = PHI [[V_MOV_B64_e32_]].sub0, %bb.3, [[COPY2]].sub0, %bb.1
   ; GCN-NEXT:   S_BRANCH %bb.3
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT: bb.3:

From 94a98cf5dc9317ba8b01c31443ca563808323d11 Mon Sep 17 00:00:00 2001
From: Chengjun
Date: Wed, 18 Sep 2024 01:04:49 -0700
Subject: [PATCH 017/321] [InstCombine] Remove dead phi web (#108876)

The current visitPHINode function in InstCombine can remove a dead phi
cycle (every phi has exactly one use, which is another phi). However, it
cannot handle the case where the phis form a web (each phi has one or more
uses, and all of those uses are phis). This change extends the algorithm so
that it also removes such a dead phi web; a minimal IR sketch of a web is
included after the header diff below.

---
 .../InstCombine/InstCombineInternal.h         |  5 ++
 .../Transforms/InstCombine/InstCombinePHI.cpp | 64 ++++++++---------
 llvm/test/Transforms/InstCombine/phi.ll       | 51 +++++++++++++++
 3 files changed, 89 insertions(+), 31 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index a051a568bfd62e..da6f991ad4cd15 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -634,6 +634,11 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final
   Instruction *foldPHIArgZextsIntoPHI(PHINode &PN);
   Instruction *foldPHIArgIntToPtrToPHI(PHINode &PN);
 
+  /// If the phi is within a phi web, which is formed by the def-use chain
+  /// of phis and all the phis in the web are only used in the other phis.
+  /// In this case, these phis are dead and we will remove all of them.
+  bool foldDeadPhiWeb(PHINode &PN);
+
   /// If an integer typed PHI has only one use which is an IntToPtr operation,
   /// replace the PHI with an existing pointer typed PHI if it exists. Otherwise
   /// insert a new pointer typed PHI and replace the original one.
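
A minimal sketch of the web shape the new fold handles (illustrative IR, not
part of the patch; the committed @test_dead_phi_web test below exercises a
larger variant): in a cycle every phi has exactly one use, another phi, while
in a web a phi may have several uses; the whole set is still dead as long as
every transitive user is itself a phi in the set.

  define void @dead_phi_web(i1 %cond) {
  entry:
    br label %bb1
  bb1:
    ; %a has two users, %b and %c, so the old single-use cycle walk gave up,
    ; yet nothing outside the web ever reads any of these values.
    %a = phi i32 [ 0, %entry ], [ %b, %bb2 ], [ %c, %bb3 ]
    br i1 %cond, label %bb2, label %bb3
  bb2:
    %b = phi i32 [ %a, %bb1 ]
    br label %bb1
  bb3:
    %c = phi i32 [ %a, %bb1 ]
    br label %bb1
  }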
diff --git a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp
index bcff9a72b65724..cb5c4473051262 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp
@@ -53,6 +53,34 @@ void InstCombinerImpl::PHIArgMergedDebugLoc(Instruction *Inst, PHINode &PN) {
   }
 }
 
+/// If the phi is within a phi web, which is formed by the def-use chain
+/// of phis and all the phis in the web are only used in the other phis.
+/// In this case, these phis are dead and we will remove all of them.
+bool InstCombinerImpl::foldDeadPhiWeb(PHINode &PN) {
+  SmallVector<PHINode *> Stack;
+  SmallPtrSet<PHINode *, 16> Visited;
+  Stack.push_back(&PN);
+  while (!Stack.empty()) {
+    PHINode *Phi = Stack.pop_back_val();
+    if (!Visited.insert(Phi).second)
+      continue;
+    // Early stop if the set of PHIs is large
+    if (Visited.size() == 16)
+      return false;
+    for (User *Use : Phi->users()) {
+      if (PHINode *PhiUse = dyn_cast<PHINode>(Use))
+        Stack.push_back(PhiUse);
+      else
+        return false;
+    }
+  }
+  for (PHINode *Phi : Visited)
+    replaceInstUsesWith(*Phi, PoisonValue::get(Phi->getType()));
+  for (PHINode *Phi : Visited)
+    eraseInstFromFunction(*Phi);
+  return true;
+}
+
 // Replace Integer typed PHI PN if the PHI's value is used as a pointer value.
 // If there is an existing pointer typed PHI that produces the same value as PN,
 // replace PN and the IntToPtr operation with it. Otherwise, synthesize a new
@@ -976,26 +1004,6 @@ Instruction *InstCombinerImpl::foldPHIArgOpIntoPHI(PHINode &PN) {
   return NewCI;
 }
 
-/// Return true if this PHI node is only used by a PHI node cycle that is dead.
-static bool isDeadPHICycle(PHINode *PN,
-                           SmallPtrSetImpl<PHINode *> &PotentiallyDeadPHIs) {
-  if (PN->use_empty()) return true;
-  if (!PN->hasOneUse()) return false;
-
-  // Remember this node, and if we find the cycle, return.
-  if (!PotentiallyDeadPHIs.insert(PN).second)
-    return true;
-
-  // Don't scan crazily complex things.
-  if (PotentiallyDeadPHIs.size() == 16)
-    return false;
-
-  if (PHINode *PU = dyn_cast<PHINode>(PN->user_back()))
-    return isDeadPHICycle(PU, PotentiallyDeadPHIs);
-
-  return false;
-}
-
 /// Return true if this phi node is always equal to NonPhiInVal.
 /// This happens with mutually cyclic phi nodes like:
 ///   z = some value; x = phi (y, z); y = phi (x, z)
@@ -1474,27 +1482,21 @@ Instruction *InstCombinerImpl::visitPHINode(PHINode &PN) {
     }
   }
 
-  // If this is a trivial cycle in the PHI node graph, remove it. Basically, if
-  // this PHI only has a single use (a PHI), and if that PHI only has one use (a
-  // PHI)... break the cycle.
+  if (foldDeadPhiWeb(PN))
+    return nullptr;
+
+  // Optimization when the phi only has one use
   if (PN.hasOneUse()) {
     if (foldIntegerTypedPHI(PN))
       return nullptr;
 
-    Instruction *PHIUser = cast<Instruction>(PN.user_back());
-    if (PHINode *PU = dyn_cast<PHINode>(PHIUser)) {
-      SmallPtrSet<PHINode *, 16> PotentiallyDeadPHIs;
-      PotentiallyDeadPHIs.insert(&PN);
-      if (isDeadPHICycle(PU, PotentiallyDeadPHIs))
-        return replaceInstUsesWith(PN, PoisonValue::get(PN.getType()));
-    }
-
     // If this phi has a single use, and if that use just computes a value for
     // the next iteration of a loop, delete the phi.  This occurs with unused
     // induction variables, e.g. "for (int j = 0; ; ++j);".  Detecting this
     // common case here is good because the only other things that catch this
     // are induction variable analysis (sometimes) and ADCE, which is only run
     // late.
+    Instruction *PHIUser = cast<Instruction>(PN.user_back());
     if (PHIUser->hasOneUse() &&
         (isa(PHIUser) || isa(PHIUser) ||
          isa(PHIUser)) &&
diff --git a/llvm/test/Transforms/InstCombine/phi.ll b/llvm/test/Transforms/InstCombine/phi.ll
index 3b1fa3a97d9cd7..b33ad9a7d339f2 100644
--- a/llvm/test/Transforms/InstCombine/phi.ll
+++ b/llvm/test/Transforms/InstCombine/phi.ll
@@ -2742,3 +2742,54 @@ loop.latch:
   call void @use(i32 %and)
   br label %loop
 }
+
+define void @test_dead_phi_web(i64 %index, i1 %cond) {
+; CHECK-LABEL: @test_dead_phi_web(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[BB0:%.*]]
+; CHECK:       BB0:
+; CHECK-NEXT:    switch i64 [[INDEX:%.*]], label [[BB4:%.*]] [
+; CHECK-NEXT:    i64 0, label [[BB1:%.*]]
+; CHECK-NEXT:    i64 1, label [[BB2:%.*]]
+; CHECK-NEXT:    i64 2, label [[BB3:%.*]]
+; CHECK-NEXT:    ]
+; CHECK:       BB1:
+; CHECK-NEXT:    br i1 [[COND:%.*]], label [[BB2]], label [[BB4]]
+; CHECK:       BB2:
+; CHECK-NEXT:    br i1 [[COND]], label [[BB3]], label [[BB4]]
+; CHECK:       BB3:
+; CHECK-NEXT:    br label [[BB4]]
+; CHECK:       BB4:
+; CHECK-NEXT:    br i1 [[COND]], label [[BB0]], label [[BB5:%.*]]
+; CHECK:       BB5:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %BB0
+
+BB0:                                              ; preds = %BB4, %entry
+  %a = phi float [ 0.0, %entry ], [ %x, %BB4 ]
+  switch i64 %index, label %BB4 [
+    i64 0, label %BB1
+    i64 1, label %BB2
+    i64 2, label %BB3
+  ]
+
+BB1:                                              ; preds = %BB0
+  br i1 %cond, label %BB2, label %BB4
+
+BB2:                                              ; preds = %BB1, %BB0
+  %b = phi float [ 2.0, %BB0 ], [ %a, %BB1 ]
+  br i1 %cond, label %BB3, label %BB4
+
+BB3:                                              ; preds = %BB2, %BB0
+  %c = phi float [ 3.0, %BB0 ], [ %b, %BB2 ]
+  br label %BB4
+
+BB4:                                              ; preds = %BB3, %BB2, %BB1, %BB0
+  %x = phi float [ %a, %BB0 ], [ %a, %BB1 ], [ %b, %BB2 ], [ %c, %BB3 ]
+  br i1 %cond, label %BB0, label %BB5
+
+BB5:                                              ; preds = %BB4
+  ret void
+}

From 0dd56858fe188419182a57d0e03c8cd0aa693867 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alejandro=20=C3=81lvarez=20Ayll=C3=B3n?=
Date: Wed, 18 Sep 2024 10:11:03 +0200
Subject: [PATCH 018/321] [clang][Sema] Fix assertion in
 `tryDiagnoseOverloadedCast` (#108021)

Fixed an assertion failure in debug mode, and potential crashes in release
mode, when diagnosing a failed cast caused indirectly by a failed implicit
conversion to the type of the constructor parameter.

For instance

```
template <typename T>
struct StringTrait {};

template< int N >
struct StringTrait< const char[ N ] > {
  typedef char CharType;
  static const MissingIntT length = N - 1;
};

class String {
public:
  template <typename T>
  String(T& str, typename StringTrait<T>::CharType = 0);
};

class Exception {
public:
  Exception(String const&);
};

void foo() {
  throw Exception("some error");
}
```

`Exception(String const&)` is a matching constructor for `Exception` from a
`const char*`, via an implicit conversion to `String`. However, the
instantiation of the `String` constructor will fail because of the missing
type `MissingIntT` inside the specialization of `StringTrait`.

When trying to emit a diagnostic, `tryDiagnoseOverloadedCast` expects not to
have a matching constructor, but there is one; it just could not be
instantiated.
---
 clang/docs/ReleaseNotes.rst                       |  2 ++
 clang/lib/Sema/SemaCast.cpp                       |  7 ++++-
 .../cxx-bad-cast-diagnose-broken-template.cpp     | 26 +++++++++++++++
 3 files changed, 34 insertions(+), 1 deletion(-)
 create mode 100644 clang/test/Parser/cxx-bad-cast-diagnose-broken-template.cpp

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index af3ab14d70d871..3ed9a2984a38fe 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -397,6 +397,8 @@ Bug Fixes to C++ Support
 - Fixed a crash when clang tries to subtitute parameter pack while retaining the parameter
   pack. #GH63819, #GH107560
 - Fix a crash when a static assert declaration has an invalid close location. (#GH108687)
+- Fixed an assertion failure in debug mode, and potential crashes in release mode, when
+  diagnosing a failed cast caused indirectly by a failed implicit conversion to the type of the constructor parameter.
 
 Bug Fixes to AST Handling
 ^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/clang/lib/Sema/SemaCast.cpp b/clang/lib/Sema/SemaCast.cpp
index f01b22a72915c8..6ac6201843476b 100644
--- a/clang/lib/Sema/SemaCast.cpp
+++ b/clang/lib/Sema/SemaCast.cpp
@@ -446,7 +446,12 @@ static bool tryDiagnoseOverloadedCast(Sema &S, CastType CT,
                         : InitializationKind::CreateCast(/*type range?*/ range);
   InitializationSequence sequence(S, entity, initKind, src);
 
-  assert(sequence.Failed() && "initialization succeeded on second try?");
+  // It could happen that a constructor failed to be used because
+  // it requires a temporary of a broken type. Still, it will be found when
+  // looking for a match.
+  if (!sequence.Failed())
+    return false;
+
   switch (sequence.getFailureKind()) {
   default: return false;
 
diff --git a/clang/test/Parser/cxx-bad-cast-diagnose-broken-template.cpp b/clang/test/Parser/cxx-bad-cast-diagnose-broken-template.cpp
new file mode 100644
index 00000000000000..3500975d936953
--- /dev/null
+++ b/clang/test/Parser/cxx-bad-cast-diagnose-broken-template.cpp
@@ -0,0 +1,26 @@
+// RUN: %clang_cc1 -fcxx-exceptions -fexceptions -fsyntax-only -verify %s
+
+template <typename T>
+struct StringTrait {};
+
+template< int N >
+struct StringTrait< const char[ N ] > {
+  typedef char CharType;
+  static const MissingIntT length = N - 1; // expected-error {{unknown type name 'MissingIntT'}}
+};
+
+class String {
+public:
+  template <typename T>
+  String(T& str, typename StringTrait<T>::CharType = 0);
+};
+
+
+class Exception {
+public:
+  Exception(String const&);
+};
+
+void foo() {
+  throw Exception("some error"); // expected-error {{functional-style cast from 'const char[11]' to 'Exception' is not allowed}}
+}

From 24748339f517ede038b45202680d281b38809ceb Mon Sep 17 00:00:00 2001
From: Jay Foad
Date: Wed, 18 Sep 2024 09:32:37 +0100
Subject: [PATCH 019/321] [Clang][AMDGPU] Handle builtin types more
 generically. NFC. (#109004)

Tweak encodeTypeForFunctionPointerAuth to handle all AMDGPU builtin types
generically instead of just __amdgpu_buffer_rsrc_t, which happens to be the
only one defined so far.
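
For context on the mechanism: the .def header is an X-macro list, so a
consumer defines AMDGPU_TYPE to stamp out one case label per entry and picks
up any AMDGPU types added later without further edits here. A sketch of the
expansion (only the AMDGPUBufferRsrc entry is named in the patch; the shape
of the .def rows is paraphrased):

  // clang/Basic/AMDGPUTypes.def holds rows of the form
  //   AMDGPU_TYPE(Name, Id, SingletonId)
  #define AMDGPU_TYPE(Name, Id, SingletonId) case BuiltinType::Id:
  #include "clang/Basic/AMDGPUTypes.def"
  // ...which expands to:
  //   case BuiltinType::AMDGPUBufferRsrc:
  //   (plus one case for every future entry)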
--- clang/lib/AST/ASTContext.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index 67841a30a571f3..ebd4a41ee6367a 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -3377,7 +3377,8 @@ static void encodeTypeForFunctionPointerAuth(const ASTContext &Ctx, #include "clang/Basic/HLSLIntangibleTypes.def" case BuiltinType::Dependent: llvm_unreachable("should never get here"); - case BuiltinType::AMDGPUBufferRsrc: +#define AMDGPU_TYPE(Name, Id, SingletonId) case BuiltinType::Id: +#include "clang/Basic/AMDGPUTypes.def" case BuiltinType::WasmExternRef: #define RVV_TYPE(Name, Id, SingletonId) case BuiltinType::Id: #include "clang/Basic/RISCVVTypes.def" From c2c425fccfaaf7e7e94dd057905cd6e91858443f Mon Sep 17 00:00:00 2001 From: Franklin Date: Wed, 18 Sep 2024 16:33:39 +0800 Subject: [PATCH 020/321] [AArch64] Add missing tests for Arm cpus (#106749) --- llvm/unittests/TargetParser/Host.cpp | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/llvm/unittests/TargetParser/Host.cpp b/llvm/unittests/TargetParser/Host.cpp index f8dd1d3a60a005..5e2edcef09bf8c 100644 --- a/llvm/unittests/TargetParser/Host.cpp +++ b/llvm/unittests/TargetParser/Host.cpp @@ -82,9 +82,21 @@ TEST(getLinuxHostCPUName, AArch64) { EXPECT_EQ(sys::detail::getHostCPUNameForARM("CPU implementer : 0x41\n" "CPU part : 0xd40"), "neoverse-v1"); + EXPECT_EQ(sys::detail::getHostCPUNameForARM("CPU implementer : 0x41\n" + "CPU part : 0xd4f"), + "neoverse-v2"); + EXPECT_EQ(sys::detail::getHostCPUNameForARM("CPU implementer : 0x41\n" + "CPU part : 0xd84"), + "neoverse-v3"); EXPECT_EQ(sys::detail::getHostCPUNameForARM("CPU implementer : 0x41\n" "CPU part : 0xd0c"), "neoverse-n1"); + EXPECT_EQ(sys::detail::getHostCPUNameForARM("CPU implementer : 0x41\n" + "CPU part : 0xd49"), + "neoverse-n2"); + EXPECT_EQ(sys::detail::getHostCPUNameForARM("CPU implementer : 0x41\n" + "CPU part : 0xd8e"), + "neoverse-n3"); // Verify that both CPU implementer and CPU part are checked: EXPECT_EQ(sys::detail::getHostCPUNameForARM("CPU implementer : 0x40\n" "CPU part : 0xd03"), From ef34cba1c38870197e2fe2ecd4c8326fc4b98340 Mon Sep 17 00:00:00 2001 From: Franklin Date: Wed, 18 Sep 2024 16:33:57 +0800 Subject: [PATCH 021/321] [AArch64] Fix sched model of Neoverse N2 (#106376) * fix write order of "Load vector reg, immed post-index" * fix a typo --- .../Target/AArch64/AArch64SchedNeoverseN2.td | 9 ++-- .../llvm-mca/AArch64/Neoverse/N2-writeback.s | 50 +++++++++---------- 2 files changed, 30 insertions(+), 29 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td index 8a7d2af3449814..737fc7390455d8 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td +++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td @@ -836,9 +836,11 @@ def : InstRW<[N2Write_3c_1V], (instregex "^FCVT[AMNPZ][SU][SU][WX][HSD]r$")>; def : SchedAlias; // FP move, immed -// FP move, register def : SchedAlias; +// FP move, register +def : InstRW<[N2Write_2c_1V], (instrs FMOVHr, FMOVSr, FMOVDr)>; + // FP transfer, from gen to low half of vec reg def : InstRW<[N2Write_3c_1M0], (instrs FMOVWHr, FMOVXHr, FMOVWSr, FMOVXDr, FMOVHWr, FMOVHXr, FMOVSWr, FMOVDXr)>; @@ -858,9 +860,8 @@ def : InstRW<[N2Write_6c_1L], (instregex "^LDR[SDQ]l$", "^LDUR[BHSDQ]i$")>; // Load vector reg, immed post-index -def : InstRW<[N2Write_6c_1I_1L, WriteI], (instregex "^LDR[BHSDQ]post$")>; // Load vector reg, immed pre-index -def : 
InstRW<[WriteAdr, N2Write_6c_1I_1L], (instregex "^LDR[BHSDQ]pre$")>; +def : InstRW<[WriteAdr, N2Write_6c_1I_1L], (instregex "^LDR[BHSDQ](post|pre)$")>; // Load vector reg, unsigned immed def : InstRW<[N2Write_6c_1L], (instregex "^LDR[BHSDQ]ui$")>; @@ -1119,7 +1120,7 @@ def : InstRW<[N2Write_5c_1V], (instregex "^FMLALv", "^FMLSLv")>; // ASIMD FP round, D-form F32 and Q-form F64 def : InstRW<[N2Write_3c_1V0], (instregex "^FRINT[AIMNPXZ]v2f(32|64)$", - "^FRINT[32|64)[XZ]v2f(32|64)$")>; + "^FRINT(32|64)[XZ]v2f(32|64)$")>; // ASIMD FP round, D-form F16 and Q-form F32 def : InstRW<[N2Write_4c_2V0], diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-writeback.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-writeback.s index 0c6ccc1face972..5ffaf9138d4823 100644 --- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-writeback.s +++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-writeback.s @@ -3298,28 +3298,28 @@ add x0, x27, 1 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 3004 +# CHECK-NEXT: Total Cycles: 508 # CHECK-NEXT: Total uOps: 2000 # CHECK: Dispatch Width: 10 -# CHECK-NEXT: uOps Per Cycle: 0.67 -# CHECK-NEXT: IPC: 0.33 +# CHECK-NEXT: uOps Per Cycle: 3.94 +# CHECK-NEXT: IPC: 1.97 # CHECK-NEXT: Block RThroughput: 3.8 # CHECK: Timeline view: -# CHECK-NEXT: 0123456789 0123 -# CHECK-NEXT: Index 0123456789 0123456789 +# CHECK-NEXT: 012 +# CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeeeER . . . . . . ldr b1, [x27], #254 -# CHECK-NEXT: [0,1] D======eER. . . . . . add x0, x27, #1 -# CHECK-NEXT: [0,2] D======eeeeeeER. . . . . ldr h1, [x27], #254 -# CHECK-NEXT: [0,3] D============eER . . . . add x0, x27, #1 -# CHECK-NEXT: [0,4] .D===========eeeeeeER . . . ldr s1, [x27], #254 -# CHECK-NEXT: [0,5] .D=================eER . . . add x0, x27, #1 -# CHECK-NEXT: [0,6] .D=================eeeeeeER . . ldr d1, [x27], #254 -# CHECK-NEXT: [0,7] .D=======================eER . . add x0, x27, #1 -# CHECK-NEXT: [0,8] . D======================eeeeeeER. ldr q1, [x27], #254 -# CHECK-NEXT: [0,9] . D============================eER add x0, x27, #1 +# CHECK: [0,0] DeeeeeeER . . ldr b1, [x27], #254 +# CHECK-NEXT: [0,1] D=eE----R . . add x0, x27, #1 +# CHECK-NEXT: [0,2] D=eeeeeeER. . ldr h1, [x27], #254 +# CHECK-NEXT: [0,3] D==eE----R. . add x0, x27, #1 +# CHECK-NEXT: [0,4] .D=eeeeeeER . ldr s1, [x27], #254 +# CHECK-NEXT: [0,5] .D==eE----R . add x0, x27, #1 +# CHECK-NEXT: [0,6] .D==eeeeeeER. ldr d1, [x27], #254 +# CHECK-NEXT: [0,7] .D===eE----R. add x0, x27, #1 +# CHECK-NEXT: [0,8] . D==eeeeeeER ldr q1, [x27], #254 +# CHECK-NEXT: [0,9] . D===eE----R add x0, x27, #1 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -3329,16 +3329,16 @@ add x0, x27, 1 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 ldr b1, [x27], #254 -# CHECK-NEXT: 1. 1 7.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 2. 1 7.0 0.0 0.0 ldr h1, [x27], #254 -# CHECK-NEXT: 3. 1 13.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 4. 1 12.0 0.0 0.0 ldr s1, [x27], #254 -# CHECK-NEXT: 5. 1 18.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 6. 1 18.0 0.0 0.0 ldr d1, [x27], #254 -# CHECK-NEXT: 7. 1 24.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 8. 1 23.0 0.0 0.0 ldr q1, [x27], #254 -# CHECK-NEXT: 9. 1 29.0 0.0 0.0 add x0, x27, #1 -# CHECK-NEXT: 1 15.2 0.1 0.0 +# CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ldr h1, [x27], #254 +# CHECK-NEXT: 3. 1 3.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ldr s1, [x27], #254 +# CHECK-NEXT: 5. 
1 3.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ldr d1, [x27], #254 +# CHECK-NEXT: 7. 1 4.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ldr q1, [x27], #254 +# CHECK-NEXT: 9. 1 4.0 0.0 4.0 add x0, x27, #1 +# CHECK-NEXT: 1 2.7 0.1 2.0 # CHECK: [47] Code Region - G48 From dd222ff25129f4d67473a9af598a78d0adfcfd29 Mon Sep 17 00:00:00 2001 From: Younan Zhang Date: Wed, 18 Sep 2024 16:34:55 +0800 Subject: [PATCH 022/321] [Clang] Avoid transforming lambdas when rebuilding immediate expressions (#108693) When rebuilding immediate invocations inside `RemoveNestedImmediateInvocation()`, we employed a `TreeTransform` to exercise the traversal. The transformation has a side effect that, for template specialization types, their default template arguments are substituted separately, and if any lambdas are present, they will be transformed into distinct types than those used to instantiate the templates right before the `consteval` handling. This resulted in `B::func()` getting redundantly instantiated for the case in question. Since we're also in an immediate evaluation context, the body of `foo()` would also get instantiated, so we end up with a spurious friend redefinition error. Like what we have done in `ComplexRemove`, this patch also avoids the lambda's transformation in TemplateInstantiator if we know we're rebuilding immediate calls. In addition, this patch also consolidates the default argument substitution logic in `CheckTemplateArgumentList()`. Fixes #107175 --- clang/docs/ReleaseNotes.rst | 3 +- clang/lib/Sema/SemaExpr.cpp | 4 +- clang/lib/Sema/SemaTemplate.cpp | 55 ++++++------------- clang/lib/Sema/SemaTemplateInstantiate.cpp | 4 ++ .../lib/Sema/SemaTemplateInstantiateDecl.cpp | 10 +++- clang/test/SemaCXX/cxx2a-consteval.cpp | 24 ++++++++ 6 files changed, 57 insertions(+), 43 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 3ed9a2984a38fe..dd004228b679e4 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -395,8 +395,9 @@ Bug Fixes to C++ Support - A follow-up fix was added for (#GH61460), as the previous fix was not entirely correct. (#GH86361) - Fixed a crash in the typo correction of an invalid CTAD guide. (#GH107887) - Fixed a crash when clang tries to subtitute parameter pack while retaining the parameter - pack. #GH63819, #GH107560 + pack. (#GH63819), (#GH107560) - Fix a crash when a static assert declaration has an invalid close location. (#GH108687) +- Avoided a redundant friend declaration instantiation under a certain ``consteval`` context. (#GH107175) - Fixed an assertion failure in debug mode, and potential crashes in release mode, when diagnosing a failed cast caused indirectly by a failed implicit conversion to the type of the constructor parameter. diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index 80c252c79e4d7a..2f7e9c754ce095 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -17557,7 +17557,7 @@ static void RemoveNestedImmediateInvocation( else break; } - /// ConstantExpr are the first layer of implicit node to be removed so if + /// ConstantExprs are the first layer of implicit node to be removed so if /// Init isn't a ConstantExpr, no ConstantExpr will be skipped. if (auto *CE = dyn_cast(Init); CE && CE->isImmediateInvocation()) @@ -17570,7 +17570,7 @@ static void RemoveNestedImmediateInvocation( } ExprResult TransformLambdaExpr(LambdaExpr *E) { // Do not rebuild lambdas to avoid creating a new type. 
- // Lambdas have already been processed inside their eval context. + // Lambdas have already been processed inside their eval contexts. return E; } bool AlwaysRebuild() { return false; } diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp index e5ea02a919f4eb..b052afede2cd67 100644 --- a/clang/lib/Sema/SemaTemplate.cpp +++ b/clang/lib/Sema/SemaTemplate.cpp @@ -5508,50 +5508,31 @@ bool Sema::CheckTemplateArgumentList( } // Check whether we have a default argument. - TemplateArgumentLoc Arg; + bool HasDefaultArg; // Retrieve the default template argument from the template // parameter. For each kind of template parameter, we substitute the // template arguments provided thus far and any "outer" template arguments // (when the template parameter was part of a nested template) into // the default argument. - if (TemplateTypeParmDecl *TTP = dyn_cast(*Param)) { - if (!hasReachableDefaultArgument(TTP)) - return diagnoseMissingArgument(*this, TemplateLoc, Template, TTP, + TemplateArgumentLoc Arg = SubstDefaultTemplateArgumentIfAvailable( + Template, TemplateLoc, RAngleLoc, *Param, SugaredConverted, + CanonicalConverted, HasDefaultArg); + + if (Arg.getArgument().isNull()) { + if (!HasDefaultArg) { + if (TemplateTypeParmDecl *TTP = dyn_cast(*Param)) + return diagnoseMissingArgument(*this, TemplateLoc, Template, TTP, + NewArgs); + if (NonTypeTemplateParmDecl *NTTP = + dyn_cast(*Param)) + return diagnoseMissingArgument(*this, TemplateLoc, Template, NTTP, + NewArgs); + return diagnoseMissingArgument(*this, TemplateLoc, Template, + cast(*Param), NewArgs); - - if (SubstDefaultTemplateArgument(*this, Template, TemplateLoc, RAngleLoc, - TTP, SugaredConverted, - CanonicalConverted, Arg)) - return true; - } else if (NonTypeTemplateParmDecl *NTTP - = dyn_cast(*Param)) { - if (!hasReachableDefaultArgument(NTTP)) - return diagnoseMissingArgument(*this, TemplateLoc, Template, NTTP, - NewArgs); - - if (SubstDefaultTemplateArgument(*this, Template, TemplateLoc, RAngleLoc, - NTTP, SugaredConverted, - CanonicalConverted, Arg)) - return true; - } else { - TemplateTemplateParmDecl *TempParm - = cast(*Param); - - if (!hasReachableDefaultArgument(TempParm)) - return diagnoseMissingArgument(*this, TemplateLoc, Template, TempParm, - NewArgs); - - NestedNameSpecifierLoc QualifierLoc; - TemplateName Name = SubstDefaultTemplateArgument( - *this, Template, TemplateLoc, RAngleLoc, TempParm, SugaredConverted, - CanonicalConverted, QualifierLoc); - if (Name.isNull()) - return true; - - Arg = TemplateArgumentLoc( - Context, TemplateArgument(Name), QualifierLoc, - TempParm->getDefaultArgument().getTemplateNameLoc()); + } + return true; } // Introduce an instantiation record that describes where we are using diff --git a/clang/lib/Sema/SemaTemplateInstantiate.cpp b/clang/lib/Sema/SemaTemplateInstantiate.cpp index c42cc250bb904a..55f38743e2768e 100644 --- a/clang/lib/Sema/SemaTemplateInstantiate.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiate.cpp @@ -1673,6 +1673,10 @@ namespace { } ExprResult TransformLambdaExpr(LambdaExpr *E) { + // Do not rebuild lambdas to avoid creating a new type. + // Lambdas have already been processed inside their eval contexts. 
+ if (SemaRef.RebuildingImmediateInvocation) + return E; LocalInstantiationScope Scope(SemaRef, /*CombineWithOuterScope=*/true); Sema::ConstraintEvalRAII RAII(*this); diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp index e97a7d768b931b..e055c87e783813 100644 --- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp @@ -6294,9 +6294,13 @@ NamedDecl *Sema::FindInstantiatedDecl(SourceLocation Loc, NamedDecl *D, if (!SubstRecord) { // T can be a dependent TemplateSpecializationType when performing a - // substitution for building a deduction guide. - assert(CodeSynthesisContexts.back().Kind == - CodeSynthesisContext::BuildingDeductionGuides); + // substitution for building a deduction guide or for template + // argument deduction in the process of rebuilding immediate + // expressions. (Because the default argument that involves a lambda + // is untransformed and thus could be dependent at this point.) + assert(SemaRef.RebuildingImmediateInvocation || + CodeSynthesisContexts.back().Kind == + CodeSynthesisContext::BuildingDeductionGuides); // Return a nullptr as a sentinel value, we handle it properly in // the TemplateInstantiator::TransformInjectedClassNameType // override, which we transform it to a TemplateSpecializationType. diff --git a/clang/test/SemaCXX/cxx2a-consteval.cpp b/clang/test/SemaCXX/cxx2a-consteval.cpp index 81923617f637e8..ae331055c52b2e 100644 --- a/clang/test/SemaCXX/cxx2a-consteval.cpp +++ b/clang/test/SemaCXX/cxx2a-consteval.cpp @@ -1248,3 +1248,27 @@ void test() { } } + +// Test that we don't redundantly instantiate the friend declaration in +// RemoveNestedImmediateInvocation(). Otherwise, we would end up with spurious +// redefinition errors. +namespace GH107175 { + +consteval void consteval_func() {} + +template struct define_f { + friend void foo() {} +}; + +template struct A {}; + +struct B { + template consteval void func() { (void)define_f{}; } +}; + +int main() { + B{}.func(); + consteval_func(); +} + +} // namespace GH107175 From 8d7d4c25cbdec49d6363132be004e51c15606452 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Wed, 18 Sep 2024 16:36:13 +0800 Subject: [PATCH 023/321] [RISCV] Split fp rounding ops with zvfhmin nxv32f16 (#108765) This adds zvfhmin test coverage for fceil, ffloor, fnearbyint, frint, fround and froundeven and splits them at nxv32f16 to avoid crashing, similarly to what we do for other nodes that we promote. This also sets ftrunc to promote which was previously missing. We already promote the VP version of it, vp_froundtozero. Marking it as promoted affects some of the cost model tests since they're no longer expanded. 
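
For orientation before the diffs (an illustrative sketch in IR terms, not
output of the patch; the ZVFHMIN check lines in the tests show the real
codegen): zvfhmin provides no f16 arithmetic, so these nodes are promoted to
f32, but <vscale x 32 x half> cannot be promoted in one step because the
result type <vscale x 32 x float> is not legal on RVV (it would exceed
LMUL=8). The node is therefore split into two <vscale x 16 x half> halves
first, and each half is promoted:

  ; conceptually, ceil on <vscale x 32 x half> becomes, per half:
  %lo.ext  = fpext <vscale x 16 x half> %lo to <vscale x 16 x float>
  %lo.ceil = call <vscale x 16 x float> @llvm.ceil.nxv16f32(<vscale x 16 x float> %lo.ext)
  %lo.res  = fptrunc <vscale x 16 x float> %lo.ceil to <vscale x 16 x half>
  ; ...and likewise for the high half.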
--- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 17 +- llvm/test/Analysis/CostModel/RISCV/fround.ll | 16 +- llvm/test/CodeGen/RISCV/rvv/fceil-sdnode.ll | 304 ++++-- llvm/test/CodeGen/RISCV/rvv/ffloor-sdnode.ll | 304 ++++-- .../CodeGen/RISCV/rvv/fixed-vectors-fp.ll | 943 +----------------- .../CodeGen/RISCV/rvv/fnearbyint-sdnode.ll | 319 ++++-- llvm/test/CodeGen/RISCV/rvv/frint-sdnode.ll | 268 +++-- llvm/test/CodeGen/RISCV/rvv/fround-sdnode.ll | 304 ++++-- .../CodeGen/RISCV/rvv/froundeven-sdnode.ll | 304 ++++-- llvm/test/CodeGen/RISCV/rvv/ftrunc-sdnode.ll | 268 +++-- 10 files changed, 1549 insertions(+), 1498 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index d5b3cccda02d3b..42b14c669d0c80 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -942,12 +942,13 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, // TODO: support more ops. static const unsigned ZvfhminPromoteOps[] = { - ISD::FMINNUM, ISD::FMAXNUM, ISD::FADD, ISD::FSUB, - ISD::FMUL, ISD::FMA, ISD::FDIV, ISD::FSQRT, - ISD::FCEIL, ISD::FFLOOR, ISD::FROUND, ISD::FROUNDEVEN, - ISD::FRINT, ISD::FNEARBYINT, ISD::IS_FPCLASS, ISD::SETCC, - ISD::FMAXIMUM, ISD::FMINIMUM, ISD::STRICT_FADD, ISD::STRICT_FSUB, - ISD::STRICT_FMUL, ISD::STRICT_FDIV, ISD::STRICT_FSQRT, ISD::STRICT_FMA}; + ISD::FMINNUM, ISD::FMAXNUM, ISD::FADD, ISD::FSUB, + ISD::FMUL, ISD::FMA, ISD::FDIV, ISD::FSQRT, + ISD::FCEIL, ISD::FTRUNC, ISD::FFLOOR, ISD::FROUND, + ISD::FROUNDEVEN, ISD::FRINT, ISD::FNEARBYINT, ISD::IS_FPCLASS, + ISD::SETCC, ISD::FMAXIMUM, ISD::FMINIMUM, ISD::STRICT_FADD, + ISD::STRICT_FSUB, ISD::STRICT_FMUL, ISD::STRICT_FDIV, ISD::STRICT_FSQRT, + ISD::STRICT_FMA}; // TODO: support more vp ops. 
static const unsigned ZvfhminPromoteVPOps[] = {ISD::VP_FADD, @@ -6941,6 +6942,10 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, case ISD::FRINT: case ISD::FROUND: case ISD::FROUNDEVEN: + if (Op.getValueType() == MVT::nxv32f16 && + (Subtarget.hasVInstructionsF16Minimal() && + !Subtarget.hasVInstructionsF16())) + return SplitVectorOp(Op, DAG); return lowerFTRUNC_FCEIL_FFLOOR_FROUND(Op, DAG, Subtarget); case ISD::LRINT: case ISD::LLRINT: diff --git a/llvm/test/Analysis/CostModel/RISCV/fround.ll b/llvm/test/Analysis/CostModel/RISCV/fround.ll index dc501b82417d3d..b4740f223eca3a 100644 --- a/llvm/test/Analysis/CostModel/RISCV/fround.ll +++ b/llvm/test/Analysis/CostModel/RISCV/fround.ll @@ -233,10 +233,10 @@ define void @trunc_fp16() { ; ; ZVFHMIN-LABEL: 'trunc_fp16' ; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %1 = call half @llvm.trunc.f16(half undef) -; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %2 = call <2 x half> @llvm.trunc.v2f16(<2 x half> undef) -; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %3 = call <4 x half> @llvm.trunc.v4f16(<4 x half> undef) -; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %4 = call <8 x half> @llvm.trunc.v8f16(<8 x half> undef) -; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 159 for instruction: %5 = call <16 x half> @llvm.trunc.v16f16(<16 x half> undef) +; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = call <2 x half> @llvm.trunc.v2f16(<2 x half> undef) +; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = call <4 x half> @llvm.trunc.v4f16(<4 x half> undef) +; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = call <8 x half> @llvm.trunc.v8f16(<8 x half> undef) +; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %5 = call <16 x half> @llvm.trunc.v16f16(<16 x half> undef) ; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call @llvm.trunc.nxv1f16( undef) ; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call @llvm.trunc.nxv2f16( undef) ; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = call @llvm.trunc.nxv4f16( undef) @@ -1108,10 +1108,10 @@ define void @vp_roundtozero_f16() { ; ZVFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; ZVFHMIN-LABEL: 'vp_roundtozero_f16' -; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %1 = call <2 x half> @llvm.vp.roundtozero.v2f16(<2 x half> undef, <2 x i1> undef, i32 undef) -; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %2 = call <4 x half> @llvm.vp.roundtozero.v4f16(<4 x half> undef, <4 x i1> undef, i32 undef) -; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %3 = call <8 x half> @llvm.vp.roundtozero.v8f16(<8 x half> undef, <8 x i1> undef, i32 undef) -; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 190 for instruction: %4 = call <16 x half> @llvm.vp.roundtozero.v16f16(<16 x half> undef, <16 x i1> undef, i32 undef) +; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call <2 x half> @llvm.vp.roundtozero.v2f16(<2 x half> undef, <2 x i1> undef, i32 undef) +; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = call <4 x half> @llvm.vp.roundtozero.v4f16(<4 x half> undef, <4 x i1> undef, i32 undef) +; ZVFHMIN-NEXT: Cost Model: Found an estimated cost 
of 1 for instruction: %3 = call <8 x half> @llvm.vp.roundtozero.v8f16(<8 x half> undef, <8 x i1> undef, i32 undef) +; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = call <16 x half> @llvm.vp.roundtozero.v16f16(<16 x half> undef, <16 x i1> undef, i32 undef) ; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %5 = call @llvm.vp.roundtozero.nxv1f16( undef, undef, i32 undef) ; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call @llvm.vp.roundtozero.nxv2f16( undef, undef, i32 undef) ; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call @llvm.vp.roundtozero.nxv4f16( undef, undef, i32 undef) diff --git a/llvm/test/CodeGen/RISCV/rvv/fceil-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fceil-sdnode.ll index 9efc3183f15a52..111d1d8e07d3bf 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fceil-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fceil-sdnode.ll @@ -1,124 +1,256 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH ; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \ +; RUN: -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFHMIN +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=lp64d \ +; RUN: -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFHMIN define @ceil_nxv1f16( %x) { -; CHECK-LABEL: ceil_nxv1f16: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI0_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI0_0)(a0) -; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma -; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 3 -; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: ret +; ZVFH-LABEL: ceil_nxv1f16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a0, %hi(.LCPI0_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI0_0)(a0) +; ZVFH-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; ZVFH-NEXT: vfabs.v v9, v8 +; ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; ZVFH-NEXT: fsrmi a0, 3 +; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; ZVFH-NEXT: fsrm a0 +; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, mu +; ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: ceil_nxv1f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; ZVFHMIN-NEXT: vfabs.v v8, v9 +; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 +; ZVFHMIN-NEXT: fsrmi a0, 3 +; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t +; ZVFHMIN-NEXT: fsrm a0 +; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, mu +; ZVFHMIN-NEXT: vfsgnj.vv v9, v8, v9, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: ret %a = call @llvm.ceil.nxv1f16( %x) ret %a } declare @llvm.ceil.nxv1f16() define @ceil_nxv2f16( 
%x) { -; CHECK-LABEL: ceil_nxv2f16: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI1_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI1_0)(a0) -; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma -; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 3 -; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: ret +; ZVFH-LABEL: ceil_nxv2f16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a0, %hi(.LCPI1_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI1_0)(a0) +; ZVFH-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; ZVFH-NEXT: vfabs.v v9, v8 +; ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; ZVFH-NEXT: fsrmi a0, 3 +; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; ZVFH-NEXT: fsrm a0 +; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: ceil_nxv2f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFHMIN-NEXT: vfabs.v v8, v9 +; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 +; ZVFHMIN-NEXT: fsrmi a0, 3 +; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t +; ZVFHMIN-NEXT: fsrm a0 +; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; ZVFHMIN-NEXT: vfsgnj.vv v9, v8, v9, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: ret %a = call @llvm.ceil.nxv2f16( %x) ret %a } declare @llvm.ceil.nxv2f16() define @ceil_nxv4f16( %x) { -; CHECK-LABEL: ceil_nxv4f16: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI2_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI2_0)(a0) -; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 3 -; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: ret +; ZVFH-LABEL: ceil_nxv4f16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a0, %hi(.LCPI2_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI2_0)(a0) +; ZVFH-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; ZVFH-NEXT: vfabs.v v9, v8 +; ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; ZVFH-NEXT: fsrmi a0, 3 +; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; ZVFH-NEXT: fsrm a0 +; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: ceil_nxv4f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; ZVFHMIN-NEXT: vfabs.v v8, v10 +; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 +; ZVFHMIN-NEXT: fsrmi a0, 3 +; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v10, v0.t +; ZVFHMIN-NEXT: fsrm a0 +; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; ZVFHMIN-NEXT: vfsgnj.vv v10, v8, v10, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 +; ZVFHMIN-NEXT: ret %a = call @llvm.ceil.nxv4f16( %x) ret %a } declare @llvm.ceil.nxv4f16() define @ceil_nxv8f16( %x) { -; CHECK-LABEL: ceil_nxv8f16: -; CHECK: # %bb.0: -; CHECK-NEXT: lui 
a0, %hi(.LCPI3_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI3_0)(a0) -; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma -; CHECK-NEXT: vfabs.v v10, v8 -; CHECK-NEXT: vmflt.vf v0, v10, fa5 -; CHECK-NEXT: fsrmi a0, 3 -; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t -; CHECK-NEXT: ret +; ZVFH-LABEL: ceil_nxv8f16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a0, %hi(.LCPI3_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI3_0)(a0) +; ZVFH-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; ZVFH-NEXT: vfabs.v v10, v8 +; ZVFH-NEXT: vmflt.vf v0, v10, fa5 +; ZVFH-NEXT: fsrmi a0, 3 +; ZVFH-NEXT: vfcvt.x.f.v v10, v8, v0.t +; ZVFH-NEXT: fsrm a0 +; ZVFH-NEXT: vfcvt.f.x.v v10, v10, v0.t +; ZVFH-NEXT: vsetvli zero, zero, e16, m2, ta, mu +; ZVFH-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: ceil_nxv8f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; ZVFHMIN-NEXT: vfabs.v v8, v12 +; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 +; ZVFHMIN-NEXT: fsrmi a0, 3 +; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v12, v0.t +; ZVFHMIN-NEXT: fsrm a0 +; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, mu +; ZVFHMIN-NEXT: vfsgnj.vv v12, v8, v12, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 +; ZVFHMIN-NEXT: ret %a = call @llvm.ceil.nxv8f16( %x) ret %a } declare @llvm.ceil.nxv8f16() define @ceil_nxv16f16( %x) { -; CHECK-LABEL: ceil_nxv16f16: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI4_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI4_0)(a0) -; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; CHECK-NEXT: vfabs.v v12, v8 -; CHECK-NEXT: vmflt.vf v0, v12, fa5 -; CHECK-NEXT: fsrmi a0, 3 -; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t -; CHECK-NEXT: ret +; ZVFH-LABEL: ceil_nxv16f16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a0, %hi(.LCPI4_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI4_0)(a0) +; ZVFH-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFH-NEXT: vfabs.v v12, v8 +; ZVFH-NEXT: vmflt.vf v0, v12, fa5 +; ZVFH-NEXT: fsrmi a0, 3 +; ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t +; ZVFH-NEXT: fsrm a0 +; ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t +; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, mu +; ZVFH-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: ceil_nxv16f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfabs.v v8, v16 +; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 +; ZVFHMIN-NEXT: fsrmi a0, 3 +; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v16, v0.t +; ZVFHMIN-NEXT: fsrm a0 +; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; ZVFHMIN-NEXT: vfsgnj.vv v16, v8, v16, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: ret %a = call @llvm.ceil.nxv16f16( %x) ret %a } declare @llvm.ceil.nxv16f16() define @ceil_nxv32f16( %x) { -; CHECK-LABEL: ceil_nxv32f16: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI5_0) -; CHECK-NEXT: flh fa5, 
%lo(.LCPI5_0)(a0) -; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, ma -; CHECK-NEXT: vfabs.v v16, v8 -; CHECK-NEXT: vmflt.vf v0, v16, fa5 -; CHECK-NEXT: fsrmi a0, 3 -; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: ret +; ZVFH-LABEL: ceil_nxv32f16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a0, %hi(.LCPI5_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI5_0)(a0) +; ZVFH-NEXT: vsetvli a0, zero, e16, m8, ta, ma +; ZVFH-NEXT: vfabs.v v16, v8 +; ZVFH-NEXT: vmflt.vf v0, v16, fa5 +; ZVFH-NEXT: fsrmi a0, 3 +; ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; ZVFH-NEXT: fsrm a0 +; ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; ZVFH-NEXT: vsetvli zero, zero, e16, m8, ta, mu +; ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: ceil_nxv32f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfabs.v v24, v16 +; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5 +; ZVFHMIN-NEXT: fsrmi a0, 3 +; ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t +; ZVFHMIN-NEXT: fsrm a0 +; ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfabs.v v24, v16 +; ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5 +; ZVFHMIN-NEXT: fsrmi a0, 3 +; ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t +; ZVFHMIN-NEXT: fsrm a0 +; ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 +; ZVFHMIN-NEXT: ret %a = call @llvm.ceil.nxv32f16( %x) ret %a } diff --git a/llvm/test/CodeGen/RISCV/rvv/ffloor-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/ffloor-sdnode.ll index ec60b3ed3e0c88..97d84e91744038 100644 --- a/llvm/test/CodeGen/RISCV/rvv/ffloor-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/ffloor-sdnode.ll @@ -1,124 +1,256 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH ; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \ +; RUN: -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFHMIN +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=lp64d \ +; RUN: -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFHMIN define @floor_nxv1f16( %x) { -; CHECK-LABEL: floor_nxv1f16: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI0_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI0_0)(a0) -; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma -; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 2 -; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t -; CHECK-NEXT: fsrm a0 -; 
CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: ret +; ZVFH-LABEL: floor_nxv1f16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a0, %hi(.LCPI0_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI0_0)(a0) +; ZVFH-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; ZVFH-NEXT: vfabs.v v9, v8 +; ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; ZVFH-NEXT: fsrmi a0, 2 +; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; ZVFH-NEXT: fsrm a0 +; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, mu +; ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: floor_nxv1f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; ZVFHMIN-NEXT: vfabs.v v8, v9 +; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 +; ZVFHMIN-NEXT: fsrmi a0, 2 +; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t +; ZVFHMIN-NEXT: fsrm a0 +; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, mu +; ZVFHMIN-NEXT: vfsgnj.vv v9, v8, v9, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: ret %a = call @llvm.floor.nxv1f16( %x) ret %a } declare @llvm.floor.nxv1f16() define @floor_nxv2f16( %x) { -; CHECK-LABEL: floor_nxv2f16: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI1_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI1_0)(a0) -; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma -; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 2 -; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: ret +; ZVFH-LABEL: floor_nxv2f16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a0, %hi(.LCPI1_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI1_0)(a0) +; ZVFH-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; ZVFH-NEXT: vfabs.v v9, v8 +; ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; ZVFH-NEXT: fsrmi a0, 2 +; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; ZVFH-NEXT: fsrm a0 +; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: floor_nxv2f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFHMIN-NEXT: vfabs.v v8, v9 +; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 +; ZVFHMIN-NEXT: fsrmi a0, 2 +; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t +; ZVFHMIN-NEXT: fsrm a0 +; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; ZVFHMIN-NEXT: vfsgnj.vv v9, v8, v9, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: ret %a = call @llvm.floor.nxv2f16( %x) ret %a } declare @llvm.floor.nxv2f16() define @floor_nxv4f16( %x) { -; CHECK-LABEL: floor_nxv4f16: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI2_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI2_0)(a0) -; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 2 -; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli 
zero, zero, e16, m1, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: ret +; ZVFH-LABEL: floor_nxv4f16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a0, %hi(.LCPI2_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI2_0)(a0) +; ZVFH-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; ZVFH-NEXT: vfabs.v v9, v8 +; ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; ZVFH-NEXT: fsrmi a0, 2 +; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; ZVFH-NEXT: fsrm a0 +; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: floor_nxv4f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; ZVFHMIN-NEXT: vfabs.v v8, v10 +; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 +; ZVFHMIN-NEXT: fsrmi a0, 2 +; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v10, v0.t +; ZVFHMIN-NEXT: fsrm a0 +; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; ZVFHMIN-NEXT: vfsgnj.vv v10, v8, v10, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 +; ZVFHMIN-NEXT: ret %a = call @llvm.floor.nxv4f16( %x) ret %a } declare @llvm.floor.nxv4f16() define @floor_nxv8f16( %x) { -; CHECK-LABEL: floor_nxv8f16: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI3_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI3_0)(a0) -; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma -; CHECK-NEXT: vfabs.v v10, v8 -; CHECK-NEXT: vmflt.vf v0, v10, fa5 -; CHECK-NEXT: fsrmi a0, 2 -; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t -; CHECK-NEXT: ret +; ZVFH-LABEL: floor_nxv8f16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a0, %hi(.LCPI3_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI3_0)(a0) +; ZVFH-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; ZVFH-NEXT: vfabs.v v10, v8 +; ZVFH-NEXT: vmflt.vf v0, v10, fa5 +; ZVFH-NEXT: fsrmi a0, 2 +; ZVFH-NEXT: vfcvt.x.f.v v10, v8, v0.t +; ZVFH-NEXT: fsrm a0 +; ZVFH-NEXT: vfcvt.f.x.v v10, v10, v0.t +; ZVFH-NEXT: vsetvli zero, zero, e16, m2, ta, mu +; ZVFH-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: floor_nxv8f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; ZVFHMIN-NEXT: vfabs.v v8, v12 +; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 +; ZVFHMIN-NEXT: fsrmi a0, 2 +; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v12, v0.t +; ZVFHMIN-NEXT: fsrm a0 +; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, mu +; ZVFHMIN-NEXT: vfsgnj.vv v12, v8, v12, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 +; ZVFHMIN-NEXT: ret %a = call @llvm.floor.nxv8f16( %x) ret %a } declare @llvm.floor.nxv8f16() define @floor_nxv16f16( %x) { -; CHECK-LABEL: floor_nxv16f16: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI4_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI4_0)(a0) -; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; CHECK-NEXT: vfabs.v v12, v8 -; CHECK-NEXT: vmflt.vf v0, v12, fa5 -; CHECK-NEXT: fsrmi a0, 2 -; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, mu -; CHECK-NEXT: 
vfsgnj.vv v8, v12, v8, v0.t -; CHECK-NEXT: ret +; ZVFH-LABEL: floor_nxv16f16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a0, %hi(.LCPI4_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI4_0)(a0) +; ZVFH-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFH-NEXT: vfabs.v v12, v8 +; ZVFH-NEXT: vmflt.vf v0, v12, fa5 +; ZVFH-NEXT: fsrmi a0, 2 +; ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t +; ZVFH-NEXT: fsrm a0 +; ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t +; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, mu +; ZVFH-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: floor_nxv16f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfabs.v v8, v16 +; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 +; ZVFHMIN-NEXT: fsrmi a0, 2 +; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v16, v0.t +; ZVFHMIN-NEXT: fsrm a0 +; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; ZVFHMIN-NEXT: vfsgnj.vv v16, v8, v16, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: ret %a = call @llvm.floor.nxv16f16( %x) ret %a } declare @llvm.floor.nxv16f16() define @floor_nxv32f16( %x) { -; CHECK-LABEL: floor_nxv32f16: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI5_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI5_0)(a0) -; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, ma -; CHECK-NEXT: vfabs.v v16, v8 -; CHECK-NEXT: vmflt.vf v0, v16, fa5 -; CHECK-NEXT: fsrmi a0, 2 -; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: ret +; ZVFH-LABEL: floor_nxv32f16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a0, %hi(.LCPI5_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI5_0)(a0) +; ZVFH-NEXT: vsetvli a0, zero, e16, m8, ta, ma +; ZVFH-NEXT: vfabs.v v16, v8 +; ZVFH-NEXT: vmflt.vf v0, v16, fa5 +; ZVFH-NEXT: fsrmi a0, 2 +; ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; ZVFH-NEXT: fsrm a0 +; ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; ZVFH-NEXT: vsetvli zero, zero, e16, m8, ta, mu +; ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: floor_nxv32f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfabs.v v24, v16 +; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5 +; ZVFHMIN-NEXT: fsrmi a0, 2 +; ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t +; ZVFHMIN-NEXT: fsrm a0 +; ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfabs.v v24, v16 +; ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5 +; ZVFHMIN-NEXT: fsrmi a0, 2 +; ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t +; ZVFHMIN-NEXT: fsrm a0 +; ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 +; ZVFHMIN-NEXT: ret %a = call @llvm.floor.nxv32f16( %x) ret %a } diff --git 
a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll index d996a9c05aca4d..b5c40fbfaac6c9 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll @@ -5545,457 +5545,24 @@ define void @trunc_v8f16(ptr %x) { ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; -; ZVFHMIN-ZFH-RV32-LABEL: trunc_v8f16: -; ZVFHMIN-ZFH-RV32: # %bb.0: -; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, -16 -; ZVFHMIN-ZFH-RV32-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-ZFH-RV32-NEXT: vle16.v v8, (a0) -; ZVFHMIN-ZFH-RV32-NEXT: mv a1, sp -; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v8, (a1) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 2(sp) -; ZVFHMIN-ZFH-RV32-NEXT: lui a1, %hi(.LCPI115_0) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, %lo(.LCPI115_0)(a1) -; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa3, fa4 -; ZVFHMIN-ZFH-RV32-NEXT: flt.h a1, fa3, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: beqz a1, .LBB115_2 -; ZVFHMIN-ZFH-RV32-NEXT: # %bb.1: -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a1, fa4, rtz -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa3, a1, rtz -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa4, fa3, fa4 -; ZVFHMIN-ZFH-RV32-NEXT: .LBB115_2: -; ZVFHMIN-ZFH-RV32-NEXT: flh fa1, 0(sp) -; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa3, fa1 -; ZVFHMIN-ZFH-RV32-NEXT: flt.h a1, fa3, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: beqz a1, .LBB115_4 -; ZVFHMIN-ZFH-RV32-NEXT: # %bb.3: -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a1, fa1, rtz -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa3, a1, rtz -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa1, fa3, fa1 -; ZVFHMIN-ZFH-RV32-NEXT: .LBB115_4: -; ZVFHMIN-ZFH-RV32-NEXT: flh fa2, 4(sp) -; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa3, fa2 -; ZVFHMIN-ZFH-RV32-NEXT: flt.h a1, fa3, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: beqz a1, .LBB115_6 -; ZVFHMIN-ZFH-RV32-NEXT: # %bb.5: -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a1, fa2, rtz -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa3, a1, rtz -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa2, fa3, fa2 -; ZVFHMIN-ZFH-RV32-NEXT: .LBB115_6: -; ZVFHMIN-ZFH-RV32-NEXT: flh fa3, 6(sp) -; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa0, fa3 -; ZVFHMIN-ZFH-RV32-NEXT: flt.h a1, fa0, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a2, fa1 -; ZVFHMIN-ZFH-RV32-NEXT: beqz a1, .LBB115_8 -; ZVFHMIN-ZFH-RV32-NEXT: # %bb.7: -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a1, fa3, rtz -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa1, a1, rtz -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa3, fa1, fa3 -; ZVFHMIN-ZFH-RV32-NEXT: .LBB115_8: -; ZVFHMIN-ZFH-RV32-NEXT: flh fa1, 10(sp) -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa4 -; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa4, fa1 -; ZVFHMIN-ZFH-RV32-NEXT: flt.h a3, fa4, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v8, a2 -; ZVFHMIN-ZFH-RV32-NEXT: beqz a3, .LBB115_10 -; ZVFHMIN-ZFH-RV32-NEXT: # %bb.9: -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a2, fa1, rtz -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa4, a2, rtz -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa1, fa4, fa1 -; ZVFHMIN-ZFH-RV32-NEXT: .LBB115_10: -; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 8(sp) -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a2, fa2 -; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa2, fa4 -; ZVFHMIN-ZFH-RV32-NEXT: flt.h a3, fa2, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa1 -; ZVFHMIN-ZFH-RV32-NEXT: beqz a3, .LBB115_12 -; ZVFHMIN-ZFH-RV32-NEXT: # %bb.11: -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a3, fa4, rtz -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa2, a3, rtz -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa4, fa2, fa4 -; ZVFHMIN-ZFH-RV32-NEXT: .LBB115_12: -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a2 -; ZVFHMIN-ZFH-RV32-NEXT: flh fa2, 12(sp) -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a2, fa3 -; 
ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a3, fa4 -; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v9, a3 -; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa4, fa2 -; ZVFHMIN-ZFH-RV32-NEXT: flt.h a3, fa4, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a1 -; ZVFHMIN-ZFH-RV32-NEXT: beqz a3, .LBB115_14 -; ZVFHMIN-ZFH-RV32-NEXT: # %bb.13: -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a1, fa2, rtz -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa4, a1, rtz -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa2, fa4, fa2 -; ZVFHMIN-ZFH-RV32-NEXT: .LBB115_14: -; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 14(sp) -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a2 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa2 -; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa3, fa4 -; ZVFHMIN-ZFH-RV32-NEXT: flt.h a2, fa3, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a1 -; ZVFHMIN-ZFH-RV32-NEXT: beqz a2, .LBB115_16 -; ZVFHMIN-ZFH-RV32-NEXT: # %bb.15: -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a1, fa4, rtz -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa5, a1, rtz -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa4, fa5, fa4 -; ZVFHMIN-ZFH-RV32-NEXT: .LBB115_16: -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa4 -; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.i v0, 15 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a1 -; ZVFHMIN-ZFH-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, mu -; ZVFHMIN-ZFH-RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t -; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v9, (a0) -; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, 16 -; ZVFHMIN-ZFH-RV32-NEXT: ret -; -; ZVFHMIN-ZFH-RV64-LABEL: trunc_v8f16: -; ZVFHMIN-ZFH-RV64: # %bb.0: -; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, -16 -; ZVFHMIN-ZFH-RV64-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-ZFH-RV64-NEXT: vle16.v v8, (a0) -; ZVFHMIN-ZFH-RV64-NEXT: mv a1, sp -; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v8, (a1) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 2(sp) -; ZVFHMIN-ZFH-RV64-NEXT: lui a1, %hi(.LCPI115_0) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, %lo(.LCPI115_0)(a1) -; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa3, fa4 -; ZVFHMIN-ZFH-RV64-NEXT: flt.h a1, fa3, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: beqz a1, .LBB115_2 -; ZVFHMIN-ZFH-RV64-NEXT: # %bb.1: -; ZVFHMIN-ZFH-RV64-NEXT: fcvt.w.h a1, fa4, rtz -; ZVFHMIN-ZFH-RV64-NEXT: fcvt.h.w fa3, a1, rtz -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa4, fa3, fa4 -; ZVFHMIN-ZFH-RV64-NEXT: .LBB115_2: -; ZVFHMIN-ZFH-RV64-NEXT: flh fa1, 0(sp) -; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa3, fa1 -; ZVFHMIN-ZFH-RV64-NEXT: flt.h a1, fa3, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: beqz a1, .LBB115_4 -; ZVFHMIN-ZFH-RV64-NEXT: # %bb.3: -; ZVFHMIN-ZFH-RV64-NEXT: fcvt.w.h a1, fa1, rtz -; ZVFHMIN-ZFH-RV64-NEXT: fcvt.h.w fa3, a1, rtz -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa1, fa3, fa1 -; ZVFHMIN-ZFH-RV64-NEXT: .LBB115_4: -; ZVFHMIN-ZFH-RV64-NEXT: flh fa2, 4(sp) -; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa3, fa2 -; ZVFHMIN-ZFH-RV64-NEXT: flt.h a1, fa3, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: beqz a1, .LBB115_6 -; ZVFHMIN-ZFH-RV64-NEXT: # %bb.5: -; ZVFHMIN-ZFH-RV64-NEXT: fcvt.w.h a1, fa2, rtz -; ZVFHMIN-ZFH-RV64-NEXT: fcvt.h.w fa3, a1, rtz -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa2, fa3, fa2 -; ZVFHMIN-ZFH-RV64-NEXT: .LBB115_6: -; ZVFHMIN-ZFH-RV64-NEXT: flh fa3, 6(sp) -; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa0, fa3 -; ZVFHMIN-ZFH-RV64-NEXT: flt.h a1, fa0, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a2, fa1 -; ZVFHMIN-ZFH-RV64-NEXT: beqz a1, .LBB115_8 -; ZVFHMIN-ZFH-RV64-NEXT: # %bb.7: -; ZVFHMIN-ZFH-RV64-NEXT: fcvt.w.h a1, fa3, rtz -; ZVFHMIN-ZFH-RV64-NEXT: fcvt.h.w fa1, a1, rtz -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa3, fa1, fa3 -; ZVFHMIN-ZFH-RV64-NEXT: .LBB115_8: -; ZVFHMIN-ZFH-RV64-NEXT: flh fa1, 10(sp) -; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa4 -; ZVFHMIN-ZFH-RV64-NEXT: 
fabs.h fa4, fa1 -; ZVFHMIN-ZFH-RV64-NEXT: flt.h a3, fa4, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.x v8, a2 -; ZVFHMIN-ZFH-RV64-NEXT: beqz a3, .LBB115_10 -; ZVFHMIN-ZFH-RV64-NEXT: # %bb.9: -; ZVFHMIN-ZFH-RV64-NEXT: fcvt.w.h a2, fa1, rtz -; ZVFHMIN-ZFH-RV64-NEXT: fcvt.h.w fa4, a2, rtz -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa1, fa4, fa1 -; ZVFHMIN-ZFH-RV64-NEXT: .LBB115_10: -; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 8(sp) -; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a2, fa2 -; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa2, fa4 -; ZVFHMIN-ZFH-RV64-NEXT: flt.h a3, fa2, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa1 -; ZVFHMIN-ZFH-RV64-NEXT: beqz a3, .LBB115_12 -; ZVFHMIN-ZFH-RV64-NEXT: # %bb.11: -; ZVFHMIN-ZFH-RV64-NEXT: fcvt.w.h a3, fa4, rtz -; ZVFHMIN-ZFH-RV64-NEXT: fcvt.h.w fa2, a3, rtz -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa4, fa2, fa4 -; ZVFHMIN-ZFH-RV64-NEXT: .LBB115_12: -; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a2 -; ZVFHMIN-ZFH-RV64-NEXT: flh fa2, 12(sp) -; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a2, fa3 -; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a3, fa4 -; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.x v9, a3 -; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa4, fa2 -; ZVFHMIN-ZFH-RV64-NEXT: flt.h a3, fa4, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a1 -; ZVFHMIN-ZFH-RV64-NEXT: beqz a3, .LBB115_14 -; ZVFHMIN-ZFH-RV64-NEXT: # %bb.13: -; ZVFHMIN-ZFH-RV64-NEXT: fcvt.w.h a1, fa2, rtz -; ZVFHMIN-ZFH-RV64-NEXT: fcvt.h.w fa4, a1, rtz -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa2, fa4, fa2 -; ZVFHMIN-ZFH-RV64-NEXT: .LBB115_14: -; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 14(sp) -; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a2 -; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa2 -; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa3, fa4 -; ZVFHMIN-ZFH-RV64-NEXT: flt.h a2, fa3, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a1 -; ZVFHMIN-ZFH-RV64-NEXT: beqz a2, .LBB115_16 -; ZVFHMIN-ZFH-RV64-NEXT: # %bb.15: -; ZVFHMIN-ZFH-RV64-NEXT: fcvt.w.h a1, fa4, rtz -; ZVFHMIN-ZFH-RV64-NEXT: fcvt.h.w fa5, a1, rtz -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa4, fa5, fa4 -; ZVFHMIN-ZFH-RV64-NEXT: .LBB115_16: -; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa4 -; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.i v0, 15 -; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a1 -; ZVFHMIN-ZFH-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, mu -; ZVFHMIN-ZFH-RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t -; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v9, (a0) -; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, 16 -; ZVFHMIN-ZFH-RV64-NEXT: ret -; -; ZVFHMIN-ZFHIN-RV32-LABEL: trunc_v8f16: -; ZVFHMIN-ZFHIN-RV32: # %bb.0: -; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, -16 -; ZVFHMIN-ZFHIN-RV32-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-ZFHIN-RV32-NEXT: vle16.v v8, (a0) -; ZVFHMIN-ZFHIN-RV32-NEXT: mv a1, sp -; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a1) -; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 2(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa4, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: lui a1, 307200 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.w.x fa5, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa3, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a1, fa3, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a1, .LBB115_2 -; ZVFHMIN-ZFHIN-RV32-NEXT: # %bb.1: -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a1, fa4, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa3, a1, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa4, fa3, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: .LBB115_2: -; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa3, 0(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa2, fa3 -; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa3, fa2 -; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a1, fa3, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a1, .LBB115_4 
-; ZVFHMIN-ZFHIN-RV32-NEXT: # %bb.3: -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a1, fa2, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa3, a1, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa2, fa3, fa2 -; ZVFHMIN-ZFHIN-RV32-NEXT: .LBB115_4: -; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa3, 4(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa3, fa3 -; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa1, fa3 -; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a1, fa1, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa1, fa2 -; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a1, .LBB115_6 -; ZVFHMIN-ZFHIN-RV32-NEXT: # %bb.5: -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a1, fa3, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa2, a1, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa3, fa2, fa3 -; ZVFHMIN-ZFHIN-RV32-NEXT: .LBB115_6: -; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa0, 6(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa2, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa4, fa0 -; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa0, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a1, fa0, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a2, fa1 -; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a1, .LBB115_8 -; ZVFHMIN-ZFHIN-RV32-NEXT: # %bb.7: -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a1, fa4, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa1, a1, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa4, fa1, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: .LBB115_8: -; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa1, 10(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa2 -; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v8, a2 -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa2, fa1 -; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa1, fa2 -; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a2, fa1, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa3, fa3 -; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a2, .LBB115_10 -; ZVFHMIN-ZFHIN-RV32-NEXT: # %bb.9: -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a2, fa2, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa1, a2, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa2, fa1, fa2 -; ZVFHMIN-ZFHIN-RV32-NEXT: .LBB115_10: -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa1, 8(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa3 -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa3, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa2, fa2 -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa4, fa1 -; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa1, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a3, fa1, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a2, fa2 -; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a3, .LBB115_12 -; ZVFHMIN-ZFHIN-RV32-NEXT: # %bb.11: -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a3, fa4, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa2, a3, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa4, fa2, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: .LBB115_12: -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa3 -; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa3, 12(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa4, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a3, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v9, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa4, fa3 -; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa3, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a3, fa3, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a2 -; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a3, .LBB115_14 -; ZVFHMIN-ZFHIN-RV32-NEXT: # %bb.13: -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a2, fa4, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa3, a2, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa4, fa3, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: .LBB115_14: -; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa3, 14(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa4, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa4, fa3 -; ZVFHMIN-ZFHIN-RV32-NEXT: 
fabs.s fa3, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a2, fa3, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a2, .LBB115_16 -; ZVFHMIN-ZFHIN-RV32-NEXT: # %bb.15: -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a1, fa4, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa5, a1, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa4, fa5, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: .LBB115_16: -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa5, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.i v0, 15 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, mu -; ZVFHMIN-ZFHIN-RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t -; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v9, (a0) -; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, 16 -; ZVFHMIN-ZFHIN-RV32-NEXT: ret -; -; ZVFHMIN-ZFHIN-RV64-LABEL: trunc_v8f16: -; ZVFHMIN-ZFHIN-RV64: # %bb.0: -; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, -16 -; ZVFHMIN-ZFHIN-RV64-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-ZFHIN-RV64-NEXT: vle16.v v8, (a0) -; ZVFHMIN-ZFHIN-RV64-NEXT: mv a1, sp -; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v8, (a1) -; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 2(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa4, fa5 -; ZVFHMIN-ZFHIN-RV64-NEXT: lui a1, 307200 -; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.w.x fa5, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: fabs.s fa3, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: flt.s a1, fa3, fa5 -; ZVFHMIN-ZFHIN-RV64-NEXT: beqz a1, .LBB115_2 -; ZVFHMIN-ZFHIN-RV64-NEXT: # %bb.1: -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.w.s a1, fa4, rtz -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.w fa3, a1, rtz -; ZVFHMIN-ZFHIN-RV64-NEXT: fsgnj.s fa4, fa3, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: .LBB115_2: -; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa3, 0(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa2, fa3 -; ZVFHMIN-ZFHIN-RV64-NEXT: fabs.s fa3, fa2 -; ZVFHMIN-ZFHIN-RV64-NEXT: flt.s a1, fa3, fa5 -; ZVFHMIN-ZFHIN-RV64-NEXT: beqz a1, .LBB115_4 -; ZVFHMIN-ZFHIN-RV64-NEXT: # %bb.3: -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.w.s a1, fa2, rtz -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.w fa3, a1, rtz -; ZVFHMIN-ZFHIN-RV64-NEXT: fsgnj.s fa2, fa3, fa2 -; ZVFHMIN-ZFHIN-RV64-NEXT: .LBB115_4: -; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa3, 4(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa3, fa3 -; ZVFHMIN-ZFHIN-RV64-NEXT: fabs.s fa1, fa3 -; ZVFHMIN-ZFHIN-RV64-NEXT: flt.s a1, fa1, fa5 -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.h.s fa1, fa2 -; ZVFHMIN-ZFHIN-RV64-NEXT: beqz a1, .LBB115_6 -; ZVFHMIN-ZFHIN-RV64-NEXT: # %bb.5: -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.w.s a1, fa3, rtz -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.w fa2, a1, rtz -; ZVFHMIN-ZFHIN-RV64-NEXT: fsgnj.s fa3, fa2, fa3 -; ZVFHMIN-ZFHIN-RV64-NEXT: .LBB115_6: -; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa0, 6(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.h.s fa2, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa4, fa0 -; ZVFHMIN-ZFHIN-RV64-NEXT: fabs.s fa0, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: flt.s a1, fa0, fa5 -; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a2, fa1 -; ZVFHMIN-ZFHIN-RV64-NEXT: beqz a1, .LBB115_8 -; ZVFHMIN-ZFHIN-RV64-NEXT: # %bb.7: -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.w.s a1, fa4, rtz -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.w fa1, a1, rtz -; ZVFHMIN-ZFHIN-RV64-NEXT: fsgnj.s fa4, fa1, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: .LBB115_8: -; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa1, 10(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a1, fa2 -; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.x v8, a2 -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa2, fa1 -; ZVFHMIN-ZFHIN-RV64-NEXT: fabs.s fa1, fa2 -; ZVFHMIN-ZFHIN-RV64-NEXT: flt.s a2, fa1, fa5 -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.h.s fa3, fa3 -; 
ZVFHMIN-ZFHIN-RV64-NEXT: beqz a2, .LBB115_10 -; ZVFHMIN-ZFHIN-RV64-NEXT: # %bb.9: -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.w.s a2, fa2, rtz -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.w fa1, a2, rtz -; ZVFHMIN-ZFHIN-RV64-NEXT: fsgnj.s fa2, fa1, fa2 -; ZVFHMIN-ZFHIN-RV64-NEXT: .LBB115_10: -; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa1, 8(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a1, fa3 -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.h.s fa3, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.h.s fa2, fa2 -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa4, fa1 -; ZVFHMIN-ZFHIN-RV64-NEXT: fabs.s fa1, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: flt.s a3, fa1, fa5 -; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a2, fa2 -; ZVFHMIN-ZFHIN-RV64-NEXT: beqz a3, .LBB115_12 -; ZVFHMIN-ZFHIN-RV64-NEXT: # %bb.11: -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.w.s a3, fa4, rtz -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.w fa2, a3, rtz -; ZVFHMIN-ZFHIN-RV64-NEXT: fsgnj.s fa4, fa2, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: .LBB115_12: -; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a1, fa3 -; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa3, 12(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.h.s fa4, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a3, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.x v9, a3 -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa4, fa3 -; ZVFHMIN-ZFHIN-RV64-NEXT: fabs.s fa3, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: flt.s a3, fa3, fa5 -; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a2 -; ZVFHMIN-ZFHIN-RV64-NEXT: beqz a3, .LBB115_14 -; ZVFHMIN-ZFHIN-RV64-NEXT: # %bb.13: -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.w.s a2, fa4, rtz -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.w fa3, a2, rtz -; ZVFHMIN-ZFHIN-RV64-NEXT: fsgnj.s fa4, fa3, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: .LBB115_14: -; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa3, 14(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.h.s fa4, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a1, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa4, fa3 -; ZVFHMIN-ZFHIN-RV64-NEXT: fabs.s fa3, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: flt.s a2, fa3, fa5 -; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: beqz a2, .LBB115_16 -; ZVFHMIN-ZFHIN-RV64-NEXT: # %bb.15: -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.w.s a1, fa4, rtz -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.w fa5, a1, rtz -; ZVFHMIN-ZFHIN-RV64-NEXT: fsgnj.s fa4, fa5, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: .LBB115_16: -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.h.s fa5, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.i v0, 15 -; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, mu -; ZVFHMIN-ZFHIN-RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t -; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v9, (a0) -; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, 16 -; ZVFHMIN-ZFHIN-RV64-NEXT: ret +; ZVFHMIN-LABEL: trunc_v8f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFHMIN-NEXT: vfabs.v v8, v9 +; ZVFHMIN-NEXT: lui a1, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 +; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 +; ZVFHMIN-NEXT: vfcvt.rtz.x.f.v v8, v9, v0.t +; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; ZVFHMIN-NEXT: vfsgnj.vv v9, v8, v9, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: vse16.v v8, (a0) +; ZVFHMIN-NEXT: ret %a = load <8 x half>, ptr %x %b = call <8 x half> @llvm.trunc.v8f16(<8 
x half> %a) store <8 x half> %b, ptr %x @@ -6020,461 +5587,25 @@ define void @trunc_v6f16(ptr %x) { ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; -; ZVFHMIN-ZFH-RV32-LABEL: trunc_v6f16: -; ZVFHMIN-ZFH-RV32: # %bb.0: -; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, -16 -; ZVFHMIN-ZFH-RV32-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 6, e16, mf2, ta, ma -; ZVFHMIN-ZFH-RV32-NEXT: vle16.v v8, (a0) -; ZVFHMIN-ZFH-RV32-NEXT: mv a1, sp -; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v8, (a1) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 2(sp) -; ZVFHMIN-ZFH-RV32-NEXT: lui a1, %hi(.LCPI116_0) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, %lo(.LCPI116_0)(a1) -; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa3, fa4 -; ZVFHMIN-ZFH-RV32-NEXT: flt.h a1, fa3, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: beqz a1, .LBB116_2 -; ZVFHMIN-ZFH-RV32-NEXT: # %bb.1: -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a1, fa4, rtz -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa3, a1, rtz -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa4, fa3, fa4 -; ZVFHMIN-ZFH-RV32-NEXT: .LBB116_2: -; ZVFHMIN-ZFH-RV32-NEXT: flh fa1, 0(sp) -; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa3, fa1 -; ZVFHMIN-ZFH-RV32-NEXT: flt.h a1, fa3, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: beqz a1, .LBB116_4 -; ZVFHMIN-ZFH-RV32-NEXT: # %bb.3: -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a1, fa1, rtz -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa3, a1, rtz -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa1, fa3, fa1 -; ZVFHMIN-ZFH-RV32-NEXT: .LBB116_4: -; ZVFHMIN-ZFH-RV32-NEXT: flh fa2, 4(sp) -; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa3, fa2 -; ZVFHMIN-ZFH-RV32-NEXT: flt.h a1, fa3, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: beqz a1, .LBB116_6 -; ZVFHMIN-ZFH-RV32-NEXT: # %bb.5: -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a1, fa2, rtz -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa3, a1, rtz -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa2, fa3, fa2 -; ZVFHMIN-ZFH-RV32-NEXT: .LBB116_6: -; ZVFHMIN-ZFH-RV32-NEXT: flh fa3, 6(sp) -; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa0, fa3 -; ZVFHMIN-ZFH-RV32-NEXT: flt.h a1, fa0, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a2, fa1 -; ZVFHMIN-ZFH-RV32-NEXT: beqz a1, .LBB116_8 -; ZVFHMIN-ZFH-RV32-NEXT: # %bb.7: -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a1, fa3, rtz -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa1, a1, rtz -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa3, fa1, fa3 -; ZVFHMIN-ZFH-RV32-NEXT: .LBB116_8: -; ZVFHMIN-ZFH-RV32-NEXT: flh fa1, 10(sp) -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa4 -; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa4, fa1 -; ZVFHMIN-ZFH-RV32-NEXT: flt.h a3, fa4, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v8, a2 -; ZVFHMIN-ZFH-RV32-NEXT: beqz a3, .LBB116_10 -; ZVFHMIN-ZFH-RV32-NEXT: # %bb.9: -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a2, fa1, rtz -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa4, a2, rtz -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa1, fa4, fa1 -; ZVFHMIN-ZFH-RV32-NEXT: .LBB116_10: -; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 8(sp) -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a2, fa2 -; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa2, fa4 -; ZVFHMIN-ZFH-RV32-NEXT: flt.h a3, fa2, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa1 -; ZVFHMIN-ZFH-RV32-NEXT: beqz a3, .LBB116_12 -; ZVFHMIN-ZFH-RV32-NEXT: # %bb.11: -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a3, fa4, rtz -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa2, a3, rtz -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa4, fa2, fa4 -; ZVFHMIN-ZFH-RV32-NEXT: .LBB116_12: -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a2 -; ZVFHMIN-ZFH-RV32-NEXT: flh fa2, 12(sp) -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a2, fa3 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a3, fa4 -; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v9, a3 -; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa4, fa2 -; ZVFHMIN-ZFH-RV32-NEXT: flt.h 
a3, fa4, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a1 -; ZVFHMIN-ZFH-RV32-NEXT: beqz a3, .LBB116_14 -; ZVFHMIN-ZFH-RV32-NEXT: # %bb.13: -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a1, fa2, rtz -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa4, a1, rtz -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa2, fa4, fa2 -; ZVFHMIN-ZFH-RV32-NEXT: .LBB116_14: -; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 14(sp) -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a2 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa2 -; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa3, fa4 -; ZVFHMIN-ZFH-RV32-NEXT: flt.h a2, fa3, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a1 -; ZVFHMIN-ZFH-RV32-NEXT: beqz a2, .LBB116_16 -; ZVFHMIN-ZFH-RV32-NEXT: # %bb.15: -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a1, fa4, rtz -; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa5, a1, rtz -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa4, fa5, fa4 -; ZVFHMIN-ZFH-RV32-NEXT: .LBB116_16: -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa4 -; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.i v0, 15 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a1 -; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 6, e16, mf2, ta, mu -; ZVFHMIN-ZFH-RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t -; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v9, (a0) -; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, 16 -; ZVFHMIN-ZFH-RV32-NEXT: ret -; -; ZVFHMIN-ZFH-RV64-LABEL: trunc_v6f16: -; ZVFHMIN-ZFH-RV64: # %bb.0: -; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, -16 -; ZVFHMIN-ZFH-RV64-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 6, e16, mf2, ta, ma -; ZVFHMIN-ZFH-RV64-NEXT: vle16.v v8, (a0) -; ZVFHMIN-ZFH-RV64-NEXT: mv a1, sp -; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v8, (a1) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 2(sp) -; ZVFHMIN-ZFH-RV64-NEXT: lui a1, %hi(.LCPI116_0) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, %lo(.LCPI116_0)(a1) -; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa3, fa4 -; ZVFHMIN-ZFH-RV64-NEXT: flt.h a1, fa3, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: beqz a1, .LBB116_2 -; ZVFHMIN-ZFH-RV64-NEXT: # %bb.1: -; ZVFHMIN-ZFH-RV64-NEXT: fcvt.w.h a1, fa4, rtz -; ZVFHMIN-ZFH-RV64-NEXT: fcvt.h.w fa3, a1, rtz -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa4, fa3, fa4 -; ZVFHMIN-ZFH-RV64-NEXT: .LBB116_2: -; ZVFHMIN-ZFH-RV64-NEXT: flh fa1, 0(sp) -; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa3, fa1 -; ZVFHMIN-ZFH-RV64-NEXT: flt.h a1, fa3, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: beqz a1, .LBB116_4 -; ZVFHMIN-ZFH-RV64-NEXT: # %bb.3: -; ZVFHMIN-ZFH-RV64-NEXT: fcvt.w.h a1, fa1, rtz -; ZVFHMIN-ZFH-RV64-NEXT: fcvt.h.w fa3, a1, rtz -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa1, fa3, fa1 -; ZVFHMIN-ZFH-RV64-NEXT: .LBB116_4: -; ZVFHMIN-ZFH-RV64-NEXT: flh fa2, 4(sp) -; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa3, fa2 -; ZVFHMIN-ZFH-RV64-NEXT: flt.h a1, fa3, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: beqz a1, .LBB116_6 -; ZVFHMIN-ZFH-RV64-NEXT: # %bb.5: -; ZVFHMIN-ZFH-RV64-NEXT: fcvt.w.h a1, fa2, rtz -; ZVFHMIN-ZFH-RV64-NEXT: fcvt.h.w fa3, a1, rtz -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa2, fa3, fa2 -; ZVFHMIN-ZFH-RV64-NEXT: .LBB116_6: -; ZVFHMIN-ZFH-RV64-NEXT: flh fa3, 6(sp) -; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa0, fa3 -; ZVFHMIN-ZFH-RV64-NEXT: flt.h a1, fa0, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a2, fa1 -; ZVFHMIN-ZFH-RV64-NEXT: beqz a1, .LBB116_8 -; ZVFHMIN-ZFH-RV64-NEXT: # %bb.7: -; ZVFHMIN-ZFH-RV64-NEXT: fcvt.w.h a1, fa3, rtz -; ZVFHMIN-ZFH-RV64-NEXT: fcvt.h.w fa1, a1, rtz -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa3, fa1, fa3 -; ZVFHMIN-ZFH-RV64-NEXT: .LBB116_8: -; ZVFHMIN-ZFH-RV64-NEXT: flh fa1, 10(sp) -; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa4 -; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa4, fa1 -; ZVFHMIN-ZFH-RV64-NEXT: flt.h a3, fa4, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.x 
v8, a2 -; ZVFHMIN-ZFH-RV64-NEXT: beqz a3, .LBB116_10 -; ZVFHMIN-ZFH-RV64-NEXT: # %bb.9: -; ZVFHMIN-ZFH-RV64-NEXT: fcvt.w.h a2, fa1, rtz -; ZVFHMIN-ZFH-RV64-NEXT: fcvt.h.w fa4, a2, rtz -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa1, fa4, fa1 -; ZVFHMIN-ZFH-RV64-NEXT: .LBB116_10: -; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 8(sp) -; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a2, fa2 -; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa2, fa4 -; ZVFHMIN-ZFH-RV64-NEXT: flt.h a3, fa2, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa1 -; ZVFHMIN-ZFH-RV64-NEXT: beqz a3, .LBB116_12 -; ZVFHMIN-ZFH-RV64-NEXT: # %bb.11: -; ZVFHMIN-ZFH-RV64-NEXT: fcvt.w.h a3, fa4, rtz -; ZVFHMIN-ZFH-RV64-NEXT: fcvt.h.w fa2, a3, rtz -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa4, fa2, fa4 -; ZVFHMIN-ZFH-RV64-NEXT: .LBB116_12: -; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a2 -; ZVFHMIN-ZFH-RV64-NEXT: flh fa2, 12(sp) -; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a2, fa3 -; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a3, fa4 -; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.x v9, a3 -; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa4, fa2 -; ZVFHMIN-ZFH-RV64-NEXT: flt.h a3, fa4, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a1 -; ZVFHMIN-ZFH-RV64-NEXT: beqz a3, .LBB116_14 -; ZVFHMIN-ZFH-RV64-NEXT: # %bb.13: -; ZVFHMIN-ZFH-RV64-NEXT: fcvt.w.h a1, fa2, rtz -; ZVFHMIN-ZFH-RV64-NEXT: fcvt.h.w fa4, a1, rtz -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa2, fa4, fa2 -; ZVFHMIN-ZFH-RV64-NEXT: .LBB116_14: -; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 14(sp) -; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a2 -; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa2 -; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa3, fa4 -; ZVFHMIN-ZFH-RV64-NEXT: flt.h a2, fa3, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a1 -; ZVFHMIN-ZFH-RV64-NEXT: beqz a2, .LBB116_16 -; ZVFHMIN-ZFH-RV64-NEXT: # %bb.15: -; ZVFHMIN-ZFH-RV64-NEXT: fcvt.w.h a1, fa4, rtz -; ZVFHMIN-ZFH-RV64-NEXT: fcvt.h.w fa5, a1, rtz -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa4, fa5, fa4 -; ZVFHMIN-ZFH-RV64-NEXT: .LBB116_16: -; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa4 -; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.i v0, 15 -; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a1 -; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 6, e16, mf2, ta, mu -; ZVFHMIN-ZFH-RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t -; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v9, (a0) -; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, 16 -; ZVFHMIN-ZFH-RV64-NEXT: ret -; -; ZVFHMIN-ZFHIN-RV32-LABEL: trunc_v6f16: -; ZVFHMIN-ZFHIN-RV32: # %bb.0: -; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, -16 -; ZVFHMIN-ZFHIN-RV32-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 6, e16, mf2, ta, ma -; ZVFHMIN-ZFHIN-RV32-NEXT: vle16.v v8, (a0) -; ZVFHMIN-ZFHIN-RV32-NEXT: mv a1, sp -; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a1) -; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 2(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa4, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: lui a1, 307200 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.w.x fa5, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa3, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a1, fa3, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a1, .LBB116_2 -; ZVFHMIN-ZFHIN-RV32-NEXT: # %bb.1: -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a1, fa4, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa3, a1, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa4, fa3, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: .LBB116_2: -; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa3, 0(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa2, fa3 -; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa3, fa2 -; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a1, fa3, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a1, .LBB116_4 -; ZVFHMIN-ZFHIN-RV32-NEXT: # 
%bb.3: -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a1, fa2, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa3, a1, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa2, fa3, fa2 -; ZVFHMIN-ZFHIN-RV32-NEXT: .LBB116_4: -; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa3, 4(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa3, fa3 -; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa1, fa3 -; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a1, fa1, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa1, fa2 -; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a1, .LBB116_6 -; ZVFHMIN-ZFHIN-RV32-NEXT: # %bb.5: -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a1, fa3, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa2, a1, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa3, fa2, fa3 -; ZVFHMIN-ZFHIN-RV32-NEXT: .LBB116_6: -; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa0, 6(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa2, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa4, fa0 -; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa0, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a1, fa0, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a2, fa1 -; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a1, .LBB116_8 -; ZVFHMIN-ZFHIN-RV32-NEXT: # %bb.7: -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a1, fa4, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa1, a1, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa4, fa1, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: .LBB116_8: -; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa1, 10(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa2 -; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v8, a2 -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa2, fa1 -; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa1, fa2 -; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a2, fa1, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa3, fa3 -; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a2, .LBB116_10 -; ZVFHMIN-ZFHIN-RV32-NEXT: # %bb.9: -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a2, fa2, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa1, a2, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa2, fa1, fa2 -; ZVFHMIN-ZFHIN-RV32-NEXT: .LBB116_10: -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa1, 8(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa3 -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa3, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa2, fa2 -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa4, fa1 -; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa1, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a3, fa1, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a2, fa2 -; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a3, .LBB116_12 -; ZVFHMIN-ZFHIN-RV32-NEXT: # %bb.11: -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a3, fa4, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa2, a3, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa4, fa2, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: .LBB116_12: -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa3 -; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa3, 12(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa4, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a3, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v9, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa4, fa3 -; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa3, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a3, fa3, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a2 -; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a3, .LBB116_14 -; ZVFHMIN-ZFHIN-RV32-NEXT: # %bb.13: -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a2, fa4, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa3, a2, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa4, fa3, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: .LBB116_14: -; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa3, 14(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa4, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa4, fa3 -; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa3, fa4 -; 
ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a2, fa3, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a2, .LBB116_16 -; ZVFHMIN-ZFHIN-RV32-NEXT: # %bb.15: -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a1, fa4, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa5, a1, rtz -; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa4, fa5, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: .LBB116_16: -; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa5, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.i v0, 15 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 6, e16, mf2, ta, mu -; ZVFHMIN-ZFHIN-RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t -; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v9, (a0) -; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, 16 -; ZVFHMIN-ZFHIN-RV32-NEXT: ret -; -; ZVFHMIN-ZFHIN-RV64-LABEL: trunc_v6f16: -; ZVFHMIN-ZFHIN-RV64: # %bb.0: -; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, -16 -; ZVFHMIN-ZFHIN-RV64-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 6, e16, mf2, ta, ma -; ZVFHMIN-ZFHIN-RV64-NEXT: vle16.v v8, (a0) -; ZVFHMIN-ZFHIN-RV64-NEXT: mv a1, sp -; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v8, (a1) -; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 2(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa4, fa5 -; ZVFHMIN-ZFHIN-RV64-NEXT: lui a1, 307200 -; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.w.x fa5, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: fabs.s fa3, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: flt.s a1, fa3, fa5 -; ZVFHMIN-ZFHIN-RV64-NEXT: beqz a1, .LBB116_2 -; ZVFHMIN-ZFHIN-RV64-NEXT: # %bb.1: -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.w.s a1, fa4, rtz -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.w fa3, a1, rtz -; ZVFHMIN-ZFHIN-RV64-NEXT: fsgnj.s fa4, fa3, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: .LBB116_2: -; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa3, 0(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa2, fa3 -; ZVFHMIN-ZFHIN-RV64-NEXT: fabs.s fa3, fa2 -; ZVFHMIN-ZFHIN-RV64-NEXT: flt.s a1, fa3, fa5 -; ZVFHMIN-ZFHIN-RV64-NEXT: beqz a1, .LBB116_4 -; ZVFHMIN-ZFHIN-RV64-NEXT: # %bb.3: -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.w.s a1, fa2, rtz -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.w fa3, a1, rtz -; ZVFHMIN-ZFHIN-RV64-NEXT: fsgnj.s fa2, fa3, fa2 -; ZVFHMIN-ZFHIN-RV64-NEXT: .LBB116_4: -; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa3, 4(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa3, fa3 -; ZVFHMIN-ZFHIN-RV64-NEXT: fabs.s fa1, fa3 -; ZVFHMIN-ZFHIN-RV64-NEXT: flt.s a1, fa1, fa5 -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.h.s fa1, fa2 -; ZVFHMIN-ZFHIN-RV64-NEXT: beqz a1, .LBB116_6 -; ZVFHMIN-ZFHIN-RV64-NEXT: # %bb.5: -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.w.s a1, fa3, rtz -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.w fa2, a1, rtz -; ZVFHMIN-ZFHIN-RV64-NEXT: fsgnj.s fa3, fa2, fa3 -; ZVFHMIN-ZFHIN-RV64-NEXT: .LBB116_6: -; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa0, 6(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.h.s fa2, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa4, fa0 -; ZVFHMIN-ZFHIN-RV64-NEXT: fabs.s fa0, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: flt.s a1, fa0, fa5 -; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a2, fa1 -; ZVFHMIN-ZFHIN-RV64-NEXT: beqz a1, .LBB116_8 -; ZVFHMIN-ZFHIN-RV64-NEXT: # %bb.7: -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.w.s a1, fa4, rtz -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.w fa1, a1, rtz -; ZVFHMIN-ZFHIN-RV64-NEXT: fsgnj.s fa4, fa1, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: .LBB116_8: -; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa1, 10(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a1, fa2 -; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.x v8, a2 -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa2, fa1 -; ZVFHMIN-ZFHIN-RV64-NEXT: fabs.s fa1, fa2 -; ZVFHMIN-ZFHIN-RV64-NEXT: flt.s a2, fa1, fa5 -; 
ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.h.s fa3, fa3 -; ZVFHMIN-ZFHIN-RV64-NEXT: beqz a2, .LBB116_10 -; ZVFHMIN-ZFHIN-RV64-NEXT: # %bb.9: -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.w.s a2, fa2, rtz -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.w fa1, a2, rtz -; ZVFHMIN-ZFHIN-RV64-NEXT: fsgnj.s fa2, fa1, fa2 -; ZVFHMIN-ZFHIN-RV64-NEXT: .LBB116_10: -; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa1, 8(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a1, fa3 -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.h.s fa3, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.h.s fa2, fa2 -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa4, fa1 -; ZVFHMIN-ZFHIN-RV64-NEXT: fabs.s fa1, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: flt.s a3, fa1, fa5 -; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a2, fa2 -; ZVFHMIN-ZFHIN-RV64-NEXT: beqz a3, .LBB116_12 -; ZVFHMIN-ZFHIN-RV64-NEXT: # %bb.11: -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.w.s a3, fa4, rtz -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.w fa2, a3, rtz -; ZVFHMIN-ZFHIN-RV64-NEXT: fsgnj.s fa4, fa2, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: .LBB116_12: -; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a1, fa3 -; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa3, 12(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.h.s fa4, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a3, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.x v9, a3 -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa4, fa3 -; ZVFHMIN-ZFHIN-RV64-NEXT: fabs.s fa3, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: flt.s a3, fa3, fa5 -; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a2 -; ZVFHMIN-ZFHIN-RV64-NEXT: beqz a3, .LBB116_14 -; ZVFHMIN-ZFHIN-RV64-NEXT: # %bb.13: -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.w.s a2, fa4, rtz -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.w fa3, a2, rtz -; ZVFHMIN-ZFHIN-RV64-NEXT: fsgnj.s fa4, fa3, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: .LBB116_14: -; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa3, 14(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.h.s fa4, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a1, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa4, fa3 -; ZVFHMIN-ZFHIN-RV64-NEXT: fabs.s fa3, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: flt.s a2, fa3, fa5 -; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: beqz a2, .LBB116_16 -; ZVFHMIN-ZFHIN-RV64-NEXT: # %bb.15: -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.w.s a1, fa4, rtz -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.w fa5, a1, rtz -; ZVFHMIN-ZFHIN-RV64-NEXT: fsgnj.s fa4, fa5, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: .LBB116_16: -; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.h.s fa5, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.i v0, 15 -; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 6, e16, mf2, ta, mu -; ZVFHMIN-ZFHIN-RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t -; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v9, (a0) -; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, 16 -; ZVFHMIN-ZFHIN-RV64-NEXT: ret +; ZVFHMIN-LABEL: trunc_v6f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFHMIN-NEXT: vfabs.v v8, v9 +; ZVFHMIN-NEXT: lui a1, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 +; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 +; ZVFHMIN-NEXT: vfcvt.rtz.x.f.v v8, v9, v0.t +; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; ZVFHMIN-NEXT: vfsgnj.vv v9, v8, v9, v0.t +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: vse16.v v8, 
(a0) +; ZVFHMIN-NEXT: ret %a = load <6 x half>, ptr %x %b = call <6 x half> @llvm.trunc.v6f16(<6 x half> %a) store <6 x half> %b, ptr %x diff --git a/llvm/test/CodeGen/RISCV/rvv/fnearbyint-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fnearbyint-sdnode.ll index 9e14852305caa1..0655b9d099cbb7 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fnearbyint-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fnearbyint-sdnode.ll @@ -1,124 +1,271 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH ; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \ +; RUN: -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFHMIN +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=lp64d \ +; RUN: -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFHMIN define @nearbyint_nxv1f16( %x) { -; CHECK-LABEL: nearbyint_nxv1f16: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI0_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI0_0)(a0) -; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma -; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: frflags a0 -; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: fsflags a0 -; CHECK-NEXT: ret +; ZVFH-LABEL: nearbyint_nxv1f16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a0, %hi(.LCPI0_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI0_0)(a0) +; ZVFH-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; ZVFH-NEXT: vfabs.v v9, v8 +; ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; ZVFH-NEXT: frflags a0 +; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, mu +; ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; ZVFH-NEXT: fsflags a0 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: nearbyint_nxv1f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; ZVFHMIN-NEXT: vfabs.v v8, v9 +; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 +; ZVFHMIN-NEXT: frflags a0 +; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t +; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, mu +; ZVFHMIN-NEXT: vfsgnj.vv v9, v8, v9, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: fsflags a0 +; ZVFHMIN-NEXT: ret %a = call @llvm.nearbyint.nxv1f16( %x) ret %a } declare @llvm.nearbyint.nxv1f16() define @nearbyint_nxv2f16( %x) { -; CHECK-LABEL: nearbyint_nxv2f16: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI1_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI1_0)(a0) -; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma -; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: frflags a0 -; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: fsflags a0 -; CHECK-NEXT: ret +; 
ZVFH-LABEL: nearbyint_nxv2f16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a0, %hi(.LCPI1_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI1_0)(a0) +; ZVFH-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; ZVFH-NEXT: vfabs.v v9, v8 +; ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; ZVFH-NEXT: frflags a0 +; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; ZVFH-NEXT: fsflags a0 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: nearbyint_nxv2f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFHMIN-NEXT: vfabs.v v8, v9 +; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 +; ZVFHMIN-NEXT: frflags a0 +; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t +; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; ZVFHMIN-NEXT: vfsgnj.vv v9, v8, v9, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: fsflags a0 +; ZVFHMIN-NEXT: ret %a = call @llvm.nearbyint.nxv2f16( %x) ret %a } declare @llvm.nearbyint.nxv2f16() define @nearbyint_nxv4f16( %x) { -; CHECK-LABEL: nearbyint_nxv4f16: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI2_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI2_0)(a0) -; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: frflags a0 -; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: fsflags a0 -; CHECK-NEXT: ret +; ZVFH-LABEL: nearbyint_nxv4f16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a0, %hi(.LCPI2_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI2_0)(a0) +; ZVFH-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; ZVFH-NEXT: vfabs.v v9, v8 +; ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; ZVFH-NEXT: frflags a0 +; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; ZVFH-NEXT: fsflags a0 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: nearbyint_nxv4f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; ZVFHMIN-NEXT: vfabs.v v8, v10 +; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 +; ZVFHMIN-NEXT: frflags a0 +; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v10, v0.t +; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; ZVFHMIN-NEXT: vfsgnj.vv v10, v8, v10, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 +; ZVFHMIN-NEXT: fsflags a0 +; ZVFHMIN-NEXT: ret %a = call @llvm.nearbyint.nxv4f16( %x) ret %a } declare @llvm.nearbyint.nxv4f16() define @nearbyint_nxv8f16( %x) { -; CHECK-LABEL: nearbyint_nxv8f16: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI3_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI3_0)(a0) -; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma -; CHECK-NEXT: vfabs.v v10, v8 -; CHECK-NEXT: vmflt.vf v0, v10, fa5 -; CHECK-NEXT: frflags a0 -; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t -; CHECK-NEXT: fsflags a0 -; CHECK-NEXT: ret 
+; ZVFH-LABEL: nearbyint_nxv8f16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a0, %hi(.LCPI3_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI3_0)(a0) +; ZVFH-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; ZVFH-NEXT: vfabs.v v10, v8 +; ZVFH-NEXT: vmflt.vf v0, v10, fa5 +; ZVFH-NEXT: frflags a0 +; ZVFH-NEXT: vfcvt.x.f.v v10, v8, v0.t +; ZVFH-NEXT: vfcvt.f.x.v v10, v10, v0.t +; ZVFH-NEXT: vsetvli zero, zero, e16, m2, ta, mu +; ZVFH-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; ZVFH-NEXT: fsflags a0 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: nearbyint_nxv8f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; ZVFHMIN-NEXT: vfabs.v v8, v12 +; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 +; ZVFHMIN-NEXT: frflags a0 +; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v12, v0.t +; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, mu +; ZVFHMIN-NEXT: vfsgnj.vv v12, v8, v12, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 +; ZVFHMIN-NEXT: fsflags a0 +; ZVFHMIN-NEXT: ret %a = call @llvm.nearbyint.nxv8f16( %x) ret %a } declare @llvm.nearbyint.nxv8f16() define @nearbyint_nxv16f16( %x) { -; CHECK-LABEL: nearbyint_nxv16f16: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI4_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI4_0)(a0) -; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; CHECK-NEXT: vfabs.v v12, v8 -; CHECK-NEXT: vmflt.vf v0, v12, fa5 -; CHECK-NEXT: frflags a0 -; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t -; CHECK-NEXT: fsflags a0 -; CHECK-NEXT: ret +; ZVFH-LABEL: nearbyint_nxv16f16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a0, %hi(.LCPI4_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI4_0)(a0) +; ZVFH-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFH-NEXT: vfabs.v v12, v8 +; ZVFH-NEXT: vmflt.vf v0, v12, fa5 +; ZVFH-NEXT: frflags a0 +; ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t +; ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t +; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, mu +; ZVFH-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; ZVFH-NEXT: fsflags a0 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: nearbyint_nxv16f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfabs.v v8, v16 +; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 +; ZVFHMIN-NEXT: frflags a0 +; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v16, v0.t +; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; ZVFHMIN-NEXT: vfsgnj.vv v16, v8, v16, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: fsflags a0 +; ZVFHMIN-NEXT: ret %a = call @llvm.nearbyint.nxv16f16( %x) ret %a } declare @llvm.nearbyint.nxv16f16() define @nearbyint_nxv32f16( %x) { -; CHECK-LABEL: nearbyint_nxv32f16: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI5_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI5_0)(a0) -; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, ma -; CHECK-NEXT: vfabs.v v16, v8 -; CHECK-NEXT: vmflt.vf v0, v16, fa5 -; CHECK-NEXT: frflags a0 -; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: 
fsflags a0 -; CHECK-NEXT: ret +; ZVFH-LABEL: nearbyint_nxv32f16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a0, %hi(.LCPI5_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI5_0)(a0) +; ZVFH-NEXT: vsetvli a0, zero, e16, m8, ta, ma +; ZVFH-NEXT: vfabs.v v16, v8 +; ZVFH-NEXT: vmflt.vf v0, v16, fa5 +; ZVFH-NEXT: frflags a0 +; ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; ZVFH-NEXT: vsetvli zero, zero, e16, m8, ta, mu +; ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; ZVFH-NEXT: fsflags a0 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: nearbyint_nxv32f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: addi sp, sp, -16 +; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: sub sp, sp, a0 +; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfabs.v v24, v16 +; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5 +; ZVFHMIN-NEXT: frflags a0 +; ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t +; ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; ZVFHMIN-NEXT: fsflags a0 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfabs.v v8, v24 +; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 +; ZVFHMIN-NEXT: frflags a0 +; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v24, v0.t +; ZVFHMIN-NEXT: addi a1, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; ZVFHMIN-NEXT: vfsgnj.vv v24, v16, v24, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24 +; ZVFHMIN-NEXT: fsflags a0 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: add sp, sp, a0 +; ZVFHMIN-NEXT: addi sp, sp, 16 +; ZVFHMIN-NEXT: ret %a = call @llvm.nearbyint.nxv32f16( %x) ret %a } diff --git a/llvm/test/CodeGen/RISCV/rvv/frint-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/frint-sdnode.ll index fb77b746549400..ca1f72ee4d524b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/frint-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/frint-sdnode.ll @@ -1,112 +1,232 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH ; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \ +; RUN: -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFHMIN +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=lp64d \ +; RUN: -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFHMIN 
define @rint_nxv1f16( %x) { -; CHECK-LABEL: rint_nxv1f16: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI0_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI0_0)(a0) -; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma -; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: ret +; ZVFH-LABEL: rint_nxv1f16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a0, %hi(.LCPI0_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI0_0)(a0) +; ZVFH-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; ZVFH-NEXT: vfabs.v v9, v8 +; ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, mu +; ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: rint_nxv1f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; ZVFHMIN-NEXT: vfabs.v v8, v9 +; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 +; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t +; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, mu +; ZVFHMIN-NEXT: vfsgnj.vv v9, v8, v9, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: ret %a = call @llvm.rint.nxv1f16( %x) ret %a } declare @llvm.rint.nxv1f16() define @rint_nxv2f16( %x) { -; CHECK-LABEL: rint_nxv2f16: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI1_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI1_0)(a0) -; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma -; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: ret +; ZVFH-LABEL: rint_nxv2f16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a0, %hi(.LCPI1_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI1_0)(a0) +; ZVFH-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; ZVFH-NEXT: vfabs.v v9, v8 +; ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: rint_nxv2f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFHMIN-NEXT: vfabs.v v8, v9 +; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 +; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t +; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; ZVFHMIN-NEXT: vfsgnj.vv v9, v8, v9, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: ret %a = call @llvm.rint.nxv2f16( %x) ret %a } declare @llvm.rint.nxv2f16() define @rint_nxv4f16( %x) { -; CHECK-LABEL: rint_nxv4f16: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI2_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI2_0)(a0) -; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli 
zero, zero, e16, m1, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: ret +; ZVFH-LABEL: rint_nxv4f16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a0, %hi(.LCPI2_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI2_0)(a0) +; ZVFH-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; ZVFH-NEXT: vfabs.v v9, v8 +; ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: rint_nxv4f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; ZVFHMIN-NEXT: vfabs.v v8, v10 +; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 +; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v10, v0.t +; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; ZVFHMIN-NEXT: vfsgnj.vv v10, v8, v10, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 +; ZVFHMIN-NEXT: ret %a = call @llvm.rint.nxv4f16( %x) ret %a } declare @llvm.rint.nxv4f16() define @rint_nxv8f16( %x) { -; CHECK-LABEL: rint_nxv8f16: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI3_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI3_0)(a0) -; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma -; CHECK-NEXT: vfabs.v v10, v8 -; CHECK-NEXT: vmflt.vf v0, v10, fa5 -; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t -; CHECK-NEXT: ret +; ZVFH-LABEL: rint_nxv8f16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a0, %hi(.LCPI3_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI3_0)(a0) +; ZVFH-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; ZVFH-NEXT: vfabs.v v10, v8 +; ZVFH-NEXT: vmflt.vf v0, v10, fa5 +; ZVFH-NEXT: vfcvt.x.f.v v10, v8, v0.t +; ZVFH-NEXT: vfcvt.f.x.v v10, v10, v0.t +; ZVFH-NEXT: vsetvli zero, zero, e16, m2, ta, mu +; ZVFH-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: rint_nxv8f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; ZVFHMIN-NEXT: vfabs.v v8, v12 +; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 +; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v12, v0.t +; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, mu +; ZVFHMIN-NEXT: vfsgnj.vv v12, v8, v12, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 +; ZVFHMIN-NEXT: ret %a = call @llvm.rint.nxv8f16( %x) ret %a } declare @llvm.rint.nxv8f16() define @rint_nxv16f16( %x) { -; CHECK-LABEL: rint_nxv16f16: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI4_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI4_0)(a0) -; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; CHECK-NEXT: vfabs.v v12, v8 -; CHECK-NEXT: vmflt.vf v0, v12, fa5 -; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t -; CHECK-NEXT: ret +; ZVFH-LABEL: rint_nxv16f16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a0, %hi(.LCPI4_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI4_0)(a0) +; ZVFH-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFH-NEXT: vfabs.v v12, v8 +; ZVFH-NEXT: vmflt.vf v0, v12, fa5 +; ZVFH-NEXT: vfcvt.x.f.v v12, 
v8, v0.t +; ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t +; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, mu +; ZVFH-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: rint_nxv16f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfabs.v v8, v16 +; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 +; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v16, v0.t +; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; ZVFHMIN-NEXT: vfsgnj.vv v16, v8, v16, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: ret %a = call @llvm.rint.nxv16f16( %x) ret %a } declare @llvm.rint.nxv16f16() define @rint_nxv32f16( %x) { -; CHECK-LABEL: rint_nxv32f16: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI5_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI5_0)(a0) -; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, ma -; CHECK-NEXT: vfabs.v v16, v8 -; CHECK-NEXT: vmflt.vf v0, v16, fa5 -; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: ret +; ZVFH-LABEL: rint_nxv32f16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a0, %hi(.LCPI5_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI5_0)(a0) +; ZVFH-NEXT: vsetvli a0, zero, e16, m8, ta, ma +; ZVFH-NEXT: vfabs.v v16, v8 +; ZVFH-NEXT: vmflt.vf v0, v16, fa5 +; ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; ZVFH-NEXT: vsetvli zero, zero, e16, m8, ta, mu +; ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: rint_nxv32f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfabs.v v24, v16 +; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5 +; ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t +; ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfabs.v v8, v24 +; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfcvt.x.f.v v16, v24, v0.t +; ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; ZVFHMIN-NEXT: vfsgnj.vv v24, v16, v24, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24 +; ZVFHMIN-NEXT: ret %a = call @llvm.rint.nxv32f16( %x) ret %a } diff --git a/llvm/test/CodeGen/RISCV/rvv/fround-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fround-sdnode.ll index bb6724eeb32006..a39abcc6ed0e27 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fround-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fround-sdnode.ll @@ -1,126 +1,258 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH ; RUN: 
llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \ +; RUN: -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFHMIN +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=lp64d \ +; RUN: -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFHMIN ; This file tests the code generation for `llvm.round.*` on scalable vector type. define @round_nxv1f16( %x) { -; CHECK-LABEL: round_nxv1f16: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI0_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI0_0)(a0) -; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma -; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 4 -; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: ret +; ZVFH-LABEL: round_nxv1f16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a0, %hi(.LCPI0_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI0_0)(a0) +; ZVFH-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; ZVFH-NEXT: vfabs.v v9, v8 +; ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; ZVFH-NEXT: fsrmi a0, 4 +; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; ZVFH-NEXT: fsrm a0 +; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, mu +; ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: round_nxv1f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; ZVFHMIN-NEXT: vfabs.v v8, v9 +; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 +; ZVFHMIN-NEXT: fsrmi a0, 4 +; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t +; ZVFHMIN-NEXT: fsrm a0 +; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, mu +; ZVFHMIN-NEXT: vfsgnj.vv v9, v8, v9, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: ret %a = call @llvm.round.nxv1f16( %x) ret %a } declare @llvm.round.nxv1f16() define @round_nxv2f16( %x) { -; CHECK-LABEL: round_nxv2f16: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI1_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI1_0)(a0) -; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma -; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 4 -; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: ret +; ZVFH-LABEL: round_nxv2f16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a0, %hi(.LCPI1_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI1_0)(a0) +; ZVFH-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; ZVFH-NEXT: vfabs.v v9, v8 +; ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; ZVFH-NEXT: fsrmi a0, 4 +; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; ZVFH-NEXT: fsrm a0 +; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: round_nxv2f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma 
+; ZVFHMIN-NEXT: vfabs.v v8, v9 +; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 +; ZVFHMIN-NEXT: fsrmi a0, 4 +; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t +; ZVFHMIN-NEXT: fsrm a0 +; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; ZVFHMIN-NEXT: vfsgnj.vv v9, v8, v9, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: ret %a = call @llvm.round.nxv2f16( %x) ret %a } declare @llvm.round.nxv2f16() define @round_nxv4f16( %x) { -; CHECK-LABEL: round_nxv4f16: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI2_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI2_0)(a0) -; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 4 -; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: ret +; ZVFH-LABEL: round_nxv4f16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a0, %hi(.LCPI2_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI2_0)(a0) +; ZVFH-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; ZVFH-NEXT: vfabs.v v9, v8 +; ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; ZVFH-NEXT: fsrmi a0, 4 +; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; ZVFH-NEXT: fsrm a0 +; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: round_nxv4f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; ZVFHMIN-NEXT: vfabs.v v8, v10 +; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 +; ZVFHMIN-NEXT: fsrmi a0, 4 +; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v10, v0.t +; ZVFHMIN-NEXT: fsrm a0 +; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; ZVFHMIN-NEXT: vfsgnj.vv v10, v8, v10, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 +; ZVFHMIN-NEXT: ret %a = call @llvm.round.nxv4f16( %x) ret %a } declare @llvm.round.nxv4f16() define @round_nxv8f16( %x) { -; CHECK-LABEL: round_nxv8f16: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI3_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI3_0)(a0) -; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma -; CHECK-NEXT: vfabs.v v10, v8 -; CHECK-NEXT: vmflt.vf v0, v10, fa5 -; CHECK-NEXT: fsrmi a0, 4 -; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t -; CHECK-NEXT: ret +; ZVFH-LABEL: round_nxv8f16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a0, %hi(.LCPI3_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI3_0)(a0) +; ZVFH-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; ZVFH-NEXT: vfabs.v v10, v8 +; ZVFH-NEXT: vmflt.vf v0, v10, fa5 +; ZVFH-NEXT: fsrmi a0, 4 +; ZVFH-NEXT: vfcvt.x.f.v v10, v8, v0.t +; ZVFH-NEXT: fsrm a0 +; ZVFH-NEXT: vfcvt.f.x.v v10, v10, v0.t +; ZVFH-NEXT: vsetvli zero, zero, e16, m2, ta, mu +; ZVFH-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: round_nxv8f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; ZVFHMIN-NEXT: vfabs.v v8, v12 +; ZVFHMIN-NEXT: lui 
a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 +; ZVFHMIN-NEXT: fsrmi a0, 4 +; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v12, v0.t +; ZVFHMIN-NEXT: fsrm a0 +; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, mu +; ZVFHMIN-NEXT: vfsgnj.vv v12, v8, v12, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 +; ZVFHMIN-NEXT: ret %a = call @llvm.round.nxv8f16( %x) ret %a } declare @llvm.round.nxv8f16() define @round_nxv16f16( %x) { -; CHECK-LABEL: round_nxv16f16: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI4_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI4_0)(a0) -; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; CHECK-NEXT: vfabs.v v12, v8 -; CHECK-NEXT: vmflt.vf v0, v12, fa5 -; CHECK-NEXT: fsrmi a0, 4 -; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t -; CHECK-NEXT: ret +; ZVFH-LABEL: round_nxv16f16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a0, %hi(.LCPI4_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI4_0)(a0) +; ZVFH-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFH-NEXT: vfabs.v v12, v8 +; ZVFH-NEXT: vmflt.vf v0, v12, fa5 +; ZVFH-NEXT: fsrmi a0, 4 +; ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t +; ZVFH-NEXT: fsrm a0 +; ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t +; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, mu +; ZVFH-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: round_nxv16f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfabs.v v8, v16 +; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 +; ZVFHMIN-NEXT: fsrmi a0, 4 +; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v16, v0.t +; ZVFHMIN-NEXT: fsrm a0 +; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; ZVFHMIN-NEXT: vfsgnj.vv v16, v8, v16, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: ret %a = call @llvm.round.nxv16f16( %x) ret %a } declare @llvm.round.nxv16f16() define @round_nxv32f16( %x) { -; CHECK-LABEL: round_nxv32f16: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI5_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI5_0)(a0) -; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, ma -; CHECK-NEXT: vfabs.v v16, v8 -; CHECK-NEXT: vmflt.vf v0, v16, fa5 -; CHECK-NEXT: fsrmi a0, 4 -; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: ret +; ZVFH-LABEL: round_nxv32f16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a0, %hi(.LCPI5_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI5_0)(a0) +; ZVFH-NEXT: vsetvli a0, zero, e16, m8, ta, ma +; ZVFH-NEXT: vfabs.v v16, v8 +; ZVFH-NEXT: vmflt.vf v0, v16, fa5 +; ZVFH-NEXT: fsrmi a0, 4 +; ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; ZVFH-NEXT: fsrm a0 +; ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; ZVFH-NEXT: vsetvli zero, zero, e16, m8, ta, mu +; ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: round_nxv32f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfabs.v v24, v16 +; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: 
fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5 +; ZVFHMIN-NEXT: fsrmi a0, 4 +; ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t +; ZVFHMIN-NEXT: fsrm a0 +; ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfabs.v v24, v16 +; ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5 +; ZVFHMIN-NEXT: fsrmi a0, 4 +; ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t +; ZVFHMIN-NEXT: fsrm a0 +; ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 +; ZVFHMIN-NEXT: ret %a = call @llvm.round.nxv32f16( %x) ret %a } diff --git a/llvm/test/CodeGen/RISCV/rvv/froundeven-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/froundeven-sdnode.ll index 6f5207a25518f5..52ad443bfdebda 100644 --- a/llvm/test/CodeGen/RISCV/rvv/froundeven-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/froundeven-sdnode.ll @@ -1,126 +1,258 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH ; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \ +; RUN: -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFHMIN +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=lp64d \ +; RUN: -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFHMIN ; This file tests the code generation for `llvm.roundeven.*` on scalable vector type. 
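+; Both lowerings implement round-to-even by saving the dynamic rounding
+; mode, switching it to RNE (fsrmi a0, 0) around the masked convert, and
+; restoring it afterwards (fsrm a0).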
define @roundeven_nxv1f16( %x) { -; CHECK-LABEL: roundeven_nxv1f16: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI0_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI0_0)(a0) -; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma -; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 0 -; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: ret +; ZVFH-LABEL: roundeven_nxv1f16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a0, %hi(.LCPI0_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI0_0)(a0) +; ZVFH-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; ZVFH-NEXT: vfabs.v v9, v8 +; ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; ZVFH-NEXT: fsrmi a0, 0 +; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; ZVFH-NEXT: fsrm a0 +; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, mu +; ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: roundeven_nxv1f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; ZVFHMIN-NEXT: vfabs.v v8, v9 +; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 +; ZVFHMIN-NEXT: fsrmi a0, 0 +; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t +; ZVFHMIN-NEXT: fsrm a0 +; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, mu +; ZVFHMIN-NEXT: vfsgnj.vv v9, v8, v9, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: ret %a = call @llvm.roundeven.nxv1f16( %x) ret %a } declare @llvm.roundeven.nxv1f16() define @roundeven_nxv2f16( %x) { -; CHECK-LABEL: roundeven_nxv2f16: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI1_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI1_0)(a0) -; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma -; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 0 -; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: ret +; ZVFH-LABEL: roundeven_nxv2f16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a0, %hi(.LCPI1_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI1_0)(a0) +; ZVFH-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; ZVFH-NEXT: vfabs.v v9, v8 +; ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; ZVFH-NEXT: fsrmi a0, 0 +; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; ZVFH-NEXT: fsrm a0 +; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: roundeven_nxv2f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFHMIN-NEXT: vfabs.v v8, v9 +; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 +; ZVFHMIN-NEXT: fsrmi a0, 0 +; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t +; ZVFHMIN-NEXT: fsrm a0 +; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; ZVFHMIN-NEXT: vfsgnj.vv v9, v8, v9, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: ret %a = call @llvm.roundeven.nxv2f16( %x) ret %a } declare @llvm.roundeven.nxv2f16() define 
@roundeven_nxv4f16( %x) { -; CHECK-LABEL: roundeven_nxv4f16: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI2_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI2_0)(a0) -; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 0 -; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: ret +; ZVFH-LABEL: roundeven_nxv4f16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a0, %hi(.LCPI2_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI2_0)(a0) +; ZVFH-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; ZVFH-NEXT: vfabs.v v9, v8 +; ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; ZVFH-NEXT: fsrmi a0, 0 +; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; ZVFH-NEXT: fsrm a0 +; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: roundeven_nxv4f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; ZVFHMIN-NEXT: vfabs.v v8, v10 +; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 +; ZVFHMIN-NEXT: fsrmi a0, 0 +; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v10, v0.t +; ZVFHMIN-NEXT: fsrm a0 +; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; ZVFHMIN-NEXT: vfsgnj.vv v10, v8, v10, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 +; ZVFHMIN-NEXT: ret %a = call @llvm.roundeven.nxv4f16( %x) ret %a } declare @llvm.roundeven.nxv4f16() define @roundeven_nxv8f16( %x) { -; CHECK-LABEL: roundeven_nxv8f16: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI3_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI3_0)(a0) -; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma -; CHECK-NEXT: vfabs.v v10, v8 -; CHECK-NEXT: vmflt.vf v0, v10, fa5 -; CHECK-NEXT: fsrmi a0, 0 -; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t -; CHECK-NEXT: ret +; ZVFH-LABEL: roundeven_nxv8f16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a0, %hi(.LCPI3_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI3_0)(a0) +; ZVFH-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; ZVFH-NEXT: vfabs.v v10, v8 +; ZVFH-NEXT: vmflt.vf v0, v10, fa5 +; ZVFH-NEXT: fsrmi a0, 0 +; ZVFH-NEXT: vfcvt.x.f.v v10, v8, v0.t +; ZVFH-NEXT: fsrm a0 +; ZVFH-NEXT: vfcvt.f.x.v v10, v10, v0.t +; ZVFH-NEXT: vsetvli zero, zero, e16, m2, ta, mu +; ZVFH-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: roundeven_nxv8f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; ZVFHMIN-NEXT: vfabs.v v8, v12 +; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 +; ZVFHMIN-NEXT: fsrmi a0, 0 +; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v12, v0.t +; ZVFHMIN-NEXT: fsrm a0 +; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, mu +; ZVFHMIN-NEXT: vfsgnj.vv v12, v8, v12, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 +; ZVFHMIN-NEXT: ret %a = call @llvm.roundeven.nxv8f16( %x) ret %a } declare @llvm.roundeven.nxv8f16() define 
@roundeven_nxv16f16( %x) { -; CHECK-LABEL: roundeven_nxv16f16: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI4_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI4_0)(a0) -; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; CHECK-NEXT: vfabs.v v12, v8 -; CHECK-NEXT: vmflt.vf v0, v12, fa5 -; CHECK-NEXT: fsrmi a0, 0 -; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t -; CHECK-NEXT: ret +; ZVFH-LABEL: roundeven_nxv16f16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a0, %hi(.LCPI4_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI4_0)(a0) +; ZVFH-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFH-NEXT: vfabs.v v12, v8 +; ZVFH-NEXT: vmflt.vf v0, v12, fa5 +; ZVFH-NEXT: fsrmi a0, 0 +; ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t +; ZVFH-NEXT: fsrm a0 +; ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t +; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, mu +; ZVFH-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: roundeven_nxv16f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfabs.v v8, v16 +; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 +; ZVFHMIN-NEXT: fsrmi a0, 0 +; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v16, v0.t +; ZVFHMIN-NEXT: fsrm a0 +; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; ZVFHMIN-NEXT: vfsgnj.vv v16, v8, v16, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: ret %a = call @llvm.roundeven.nxv16f16( %x) ret %a } declare @llvm.roundeven.nxv16f16() define @roundeven_nxv32f16( %x) { -; CHECK-LABEL: roundeven_nxv32f16: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI5_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI5_0)(a0) -; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, ma -; CHECK-NEXT: vfabs.v v16, v8 -; CHECK-NEXT: vmflt.vf v0, v16, fa5 -; CHECK-NEXT: fsrmi a0, 0 -; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: ret +; ZVFH-LABEL: roundeven_nxv32f16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a0, %hi(.LCPI5_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI5_0)(a0) +; ZVFH-NEXT: vsetvli a0, zero, e16, m8, ta, ma +; ZVFH-NEXT: vfabs.v v16, v8 +; ZVFH-NEXT: vmflt.vf v0, v16, fa5 +; ZVFH-NEXT: fsrmi a0, 0 +; ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; ZVFH-NEXT: fsrm a0 +; ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; ZVFH-NEXT: vsetvli zero, zero, e16, m8, ta, mu +; ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: roundeven_nxv32f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfabs.v v24, v16 +; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5 +; ZVFHMIN-NEXT: fsrmi a0, 0 +; ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t +; ZVFHMIN-NEXT: fsrm a0 +; ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, 
ta, ma +; ZVFHMIN-NEXT: vfabs.v v24, v16 +; ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5 +; ZVFHMIN-NEXT: fsrmi a0, 0 +; ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t +; ZVFHMIN-NEXT: fsrm a0 +; ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 +; ZVFHMIN-NEXT: ret %a = call @llvm.roundeven.nxv32f16( %x) ret %a } diff --git a/llvm/test/CodeGen/RISCV/rvv/ftrunc-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/ftrunc-sdnode.ll index 8841232e7f76df..971424e8cea09e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/ftrunc-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/ftrunc-sdnode.ll @@ -1,112 +1,232 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH ; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \ +; RUN: -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFHMIN +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=lp64d \ +; RUN: -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFHMIN define @trunc_nxv1f16( %x) { -; CHECK-LABEL: trunc_nxv1f16: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI0_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI0_0)(a0) -; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma -; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: ret +; ZVFH-LABEL: trunc_nxv1f16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a0, %hi(.LCPI0_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI0_0)(a0) +; ZVFH-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; ZVFH-NEXT: vfabs.v v9, v8 +; ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; ZVFH-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t +; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, mu +; ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: trunc_nxv1f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; ZVFHMIN-NEXT: vfabs.v v8, v9 +; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 +; ZVFHMIN-NEXT: vfcvt.rtz.x.f.v v8, v9, v0.t +; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, mu +; ZVFHMIN-NEXT: vfsgnj.vv v9, v8, v9, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: ret %a = call @llvm.trunc.nxv1f16( %x) ret %a } declare @llvm.trunc.nxv1f16() define @trunc_nxv2f16( %x) { -; CHECK-LABEL: trunc_nxv2f16: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI1_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI1_0)(a0) -; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma -; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; 
CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: ret +; ZVFH-LABEL: trunc_nxv2f16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a0, %hi(.LCPI1_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI1_0)(a0) +; ZVFH-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; ZVFH-NEXT: vfabs.v v9, v8 +; ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; ZVFH-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t +; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: trunc_nxv2f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFHMIN-NEXT: vfabs.v v8, v9 +; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 +; ZVFHMIN-NEXT: vfcvt.rtz.x.f.v v8, v9, v0.t +; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; ZVFHMIN-NEXT: vfsgnj.vv v9, v8, v9, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: ret %a = call @llvm.trunc.nxv2f16( %x) ret %a } declare @llvm.trunc.nxv2f16() define @trunc_nxv4f16( %x) { -; CHECK-LABEL: trunc_nxv4f16: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI2_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI2_0)(a0) -; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: ret +; ZVFH-LABEL: trunc_nxv4f16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a0, %hi(.LCPI2_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI2_0)(a0) +; ZVFH-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; ZVFH-NEXT: vfabs.v v9, v8 +; ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; ZVFH-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t +; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: trunc_nxv4f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; ZVFHMIN-NEXT: vfabs.v v8, v10 +; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 +; ZVFHMIN-NEXT: vfcvt.rtz.x.f.v v8, v10, v0.t +; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; ZVFHMIN-NEXT: vfsgnj.vv v10, v8, v10, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 +; ZVFHMIN-NEXT: ret %a = call @llvm.trunc.nxv4f16( %x) ret %a } declare @llvm.trunc.nxv4f16() define @trunc_nxv8f16( %x) { -; CHECK-LABEL: trunc_nxv8f16: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI3_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI3_0)(a0) -; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma -; CHECK-NEXT: vfabs.v v10, v8 -; CHECK-NEXT: vmflt.vf v0, v10, fa5 -; CHECK-NEXT: vfcvt.rtz.x.f.v v10, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t -; CHECK-NEXT: ret +; ZVFH-LABEL: trunc_nxv8f16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a0, %hi(.LCPI3_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI3_0)(a0) +; ZVFH-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; ZVFH-NEXT: vfabs.v v10, v8 +; ZVFH-NEXT: vmflt.vf 
v0, v10, fa5 +; ZVFH-NEXT: vfcvt.rtz.x.f.v v10, v8, v0.t +; ZVFH-NEXT: vfcvt.f.x.v v10, v10, v0.t +; ZVFH-NEXT: vsetvli zero, zero, e16, m2, ta, mu +; ZVFH-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: trunc_nxv8f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; ZVFHMIN-NEXT: vfabs.v v8, v12 +; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 +; ZVFHMIN-NEXT: vfcvt.rtz.x.f.v v8, v12, v0.t +; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, mu +; ZVFHMIN-NEXT: vfsgnj.vv v12, v8, v12, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 +; ZVFHMIN-NEXT: ret %a = call @llvm.trunc.nxv8f16( %x) ret %a } declare @llvm.trunc.nxv8f16() define @trunc_nxv16f16( %x) { -; CHECK-LABEL: trunc_nxv16f16: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI4_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI4_0)(a0) -; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; CHECK-NEXT: vfabs.v v12, v8 -; CHECK-NEXT: vmflt.vf v0, v12, fa5 -; CHECK-NEXT: vfcvt.rtz.x.f.v v12, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t -; CHECK-NEXT: ret +; ZVFH-LABEL: trunc_nxv16f16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a0, %hi(.LCPI4_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI4_0)(a0) +; ZVFH-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFH-NEXT: vfabs.v v12, v8 +; ZVFH-NEXT: vmflt.vf v0, v12, fa5 +; ZVFH-NEXT: vfcvt.rtz.x.f.v v12, v8, v0.t +; ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t +; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, mu +; ZVFH-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: trunc_nxv16f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfabs.v v8, v16 +; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 +; ZVFHMIN-NEXT: vfcvt.rtz.x.f.v v8, v16, v0.t +; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; ZVFHMIN-NEXT: vfsgnj.vv v16, v8, v16, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: ret %a = call @llvm.trunc.nxv16f16( %x) ret %a } declare @llvm.trunc.nxv16f16() define @trunc_nxv32f16( %x) { -; CHECK-LABEL: trunc_nxv32f16: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI5_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI5_0)(a0) -; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, ma -; CHECK-NEXT: vfabs.v v16, v8 -; CHECK-NEXT: vmflt.vf v0, v16, fa5 -; CHECK-NEXT: vfcvt.rtz.x.f.v v16, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: ret +; ZVFH-LABEL: trunc_nxv32f16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a0, %hi(.LCPI5_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI5_0)(a0) +; ZVFH-NEXT: vsetvli a0, zero, e16, m8, ta, ma +; ZVFH-NEXT: vfabs.v v16, v8 +; ZVFH-NEXT: vmflt.vf v0, v16, fa5 +; ZVFH-NEXT: vfcvt.rtz.x.f.v v16, v8, v0.t +; ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; ZVFH-NEXT: vsetvli zero, zero, e16, m8, ta, mu +; ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: trunc_nxv32f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, 
ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfabs.v v24, v16 +; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5 +; ZVFHMIN-NEXT: vfcvt.rtz.x.f.v v24, v16, v0.t +; ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfabs.v v8, v24 +; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfcvt.rtz.x.f.v v16, v24, v0.t +; ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; ZVFHMIN-NEXT: vfsgnj.vv v24, v16, v24, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24 +; ZVFHMIN-NEXT: ret %a = call @llvm.trunc.nxv32f16( %x) ret %a } From d2d947b7e24679e0d1710a4f31dc0c8c9ee7c0b7 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Wed, 18 Sep 2024 09:37:04 +0100 Subject: [PATCH 024/321] [AMDGPU] Fold llvm.amdgcn.cvt.pkrtz when either operand is fpext (#108237) This also generalizes the Undef handling and adds Poison handling. --- .../AMDGPU/AMDGPUInstCombineIntrinsic.cpp | 47 ++++++----- .../InstCombine/AMDGPU/amdgcn-intrinsics.ll | 79 +++++++++++++++++++ 2 files changed, 108 insertions(+), 18 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp index 9f8926432d00ae..e8674c4c775950 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp @@ -640,27 +640,38 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { break; } case Intrinsic::amdgcn_cvt_pkrtz: { - Value *Src0 = II.getArgOperand(0); - Value *Src1 = II.getArgOperand(1); - if (const ConstantFP *C0 = dyn_cast(Src0)) { - if (const ConstantFP *C1 = dyn_cast(Src1)) { - const fltSemantics &HalfSem = - II.getType()->getScalarType()->getFltSemantics(); + auto foldFPTruncToF16RTZ = [](Value *Arg) -> Value * { + Type *HalfTy = Type::getHalfTy(Arg->getContext()); + + if (isa(Arg)) + return PoisonValue::get(HalfTy); + if (isa(Arg)) + return UndefValue::get(HalfTy); + + ConstantFP *CFP = nullptr; + if (match(Arg, m_ConstantFP(CFP))) { bool LosesInfo; - APFloat Val0 = C0->getValueAPF(); - APFloat Val1 = C1->getValueAPF(); - Val0.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo); - Val1.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo); - - Constant *Folded = - ConstantVector::get({ConstantFP::get(II.getContext(), Val0), - ConstantFP::get(II.getContext(), Val1)}); - return IC.replaceInstUsesWith(II, Folded); + APFloat Val(CFP->getValueAPF()); + Val.convert(APFloat::IEEEhalf(), APFloat::rmTowardZero, &LosesInfo); + return ConstantFP::get(HalfTy, Val); } - } - if (isa(Src0) && isa(Src1)) { - return IC.replaceInstUsesWith(II, UndefValue::get(II.getType())); + Value *Src = nullptr; + if (match(Arg, m_FPExt(m_Value(Src)))) { + if (Src->getType()->isHalfTy()) + return Src; + } + + return nullptr; + }; + + if (Value *Src0 = foldFPTruncToF16RTZ(II.getArgOperand(0))) { + if (Value *Src1 = foldFPTruncToF16RTZ(II.getArgOperand(1))) { + Value *V = PoisonValue::get(II.getType()); + V = 
IC.Builder.CreateInsertElement(V, Src0, (uint64_t)0); + V = IC.Builder.CreateInsertElement(V, Src1, (uint64_t)1); + return IC.replaceInstUsesWith(II, V); + } } break; diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll index f3a3b8c1dc5d8a..fabf8ab51764b9 100644 --- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll +++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll @@ -1161,6 +1161,85 @@ define <2 x half> @constant_rtz_pkrtz() { ret <2 x half> %cvt } +define <2 x half> @fpext_const_cvt_pkrtz(half %x) { +; CHECK-LABEL: @fpext_const_cvt_pkrtz( +; CHECK-NEXT: [[CVT:%.*]] = insertelement <2 x half> , half [[X:%.*]], i64 0 +; CHECK-NEXT: ret <2 x half> [[CVT]] +; + %ext = fpext half %x to float + %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %ext, float 3.0) + ret <2 x half> %cvt +} + +define <2 x half> @const_fpext_cvt_pkrtz(half %y) { +; CHECK-LABEL: @const_fpext_cvt_pkrtz( +; CHECK-NEXT: [[CVT:%.*]] = insertelement <2 x half> , half [[Y:%.*]], i64 1 +; CHECK-NEXT: ret <2 x half> [[CVT]] +; + %ext = fpext half %y to float + %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 5.0, float %ext) + ret <2 x half> %cvt +} + +define <2 x half> @const_fpext_multi_cvt_pkrtz(half %y) { +; CHECK-LABEL: @const_fpext_multi_cvt_pkrtz( +; CHECK-NEXT: [[CVT1:%.*]] = insertelement <2 x half> , half [[Y:%.*]], i64 1 +; CHECK-NEXT: [[CVT2:%.*]] = insertelement <2 x half> , half [[Y]], i64 1 +; CHECK-NEXT: [[ADD:%.*]] = fadd <2 x half> [[CVT1]], [[CVT2]] +; CHECK-NEXT: ret <2 x half> [[ADD]] +; + %ext = fpext half %y to float + %cvt1 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 5.0, float %ext) + %cvt2 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 3.0, float %ext) + %add = fadd <2 x half> %cvt1, %cvt2 + ret <2 x half> %add +} + +define <2 x half> @fpext_fpext_cvt_pkrtz(half %x, half %y) { +; CHECK-LABEL: @fpext_fpext_cvt_pkrtz( +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x half> poison, half [[X:%.*]], i64 0 +; CHECK-NEXT: [[CVT:%.*]] = insertelement <2 x half> [[TMP1]], half [[Y:%.*]], i64 1 +; CHECK-NEXT: ret <2 x half> [[CVT]] +; + %extx = fpext half %x to float + %exty = fpext half %y to float + %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %extx, float %exty) + ret <2 x half> %cvt +} + +define <2 x half> @fpext_fpext_bf16_cvt_pkrtz(bfloat %x, bfloat %y) { +; CHECK-LABEL: @fpext_fpext_bf16_cvt_pkrtz( +; CHECK-NEXT: [[EXTX:%.*]] = fpext bfloat [[X:%.*]] to float +; CHECK-NEXT: [[EXTY:%.*]] = fpext bfloat [[Y:%.*]] to float +; CHECK-NEXT: [[CVT:%.*]] = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float [[EXTX]], float [[EXTY]]) +; CHECK-NEXT: ret <2 x half> [[CVT]] +; + %extx = fpext bfloat %x to float + %exty = fpext bfloat %y to float + %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %extx, float %exty) + ret <2 x half> %cvt +} + +define <2 x half> @poison_fpext_cvt_pkrtz(half %y) { +; CHECK-LABEL: @poison_fpext_cvt_pkrtz( +; CHECK-NEXT: [[CVT:%.*]] = insertelement <2 x half> poison, half [[Y:%.*]], i64 1 +; CHECK-NEXT: ret <2 x half> [[CVT]] +; + %ext = fpext half %y to float + %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float poison, float %ext) + ret <2 x half> %cvt +} + +define <2 x half> @fpext_poison_cvt_pkrtz(half %x) { +; CHECK-LABEL: @fpext_poison_cvt_pkrtz( +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x half> poison, half [[X:%.*]], i64 0 +; CHECK-NEXT: ret <2 x half> [[TMP1]] +; + %ext = fpext half %x to float + %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 
%ext, float poison) + ret <2 x half> %cvt +} + ; -------------------------------------------------------------------- ; llvm.amdgcn.cvt.pknorm.i16 ; -------------------------------------------------------------------- From 112aac4e8961b9626bb84f36deeaa5a674f03f5a Mon Sep 17 00:00:00 2001 From: David Green Date: Wed, 18 Sep 2024 09:38:28 +0100 Subject: [PATCH 025/321] [InstCombine] Fold fmod to frem if we know it does not set errno. (#107912) fmod will be folded to frem in clang under -fno-math-errno and can be constant folded in llvm if the operands are known. It can be relatively common to have fp code that handles special values before doing some calculation: ``` if (isnan(f)) return handlenan; if (isinf(f)) return handleinf; .. fmod(f, 2.0) ``` This patch enables the folding of fmod to frem in instcombine if the first parameter is not inf and the second is not zero. Other combinations do not set errno. The same transform is performed for fmod with the nnan flag, which implies the input is known to not be inf/zero. --- .../llvm/Transforms/Utils/SimplifyLibCalls.h | 1 + .../lib/Transforms/Utils/SimplifyLibCalls.cpp | 33 +++++++++++++++++++ llvm/test/Transforms/InstCombine/fmod.ll | 30 ++++++++++++----- 3 files changed, 55 insertions(+), 9 deletions(-) diff --git a/llvm/include/llvm/Transforms/Utils/SimplifyLibCalls.h b/llvm/include/llvm/Transforms/Utils/SimplifyLibCalls.h index 2e7a0ec29ed999..2d3d2ada6183a7 100644 --- a/llvm/include/llvm/Transforms/Utils/SimplifyLibCalls.h +++ b/llvm/include/llvm/Transforms/Utils/SimplifyLibCalls.h @@ -206,6 +206,7 @@ class LibCallSimplifier { Value *optimizeFMinFMax(CallInst *CI, IRBuilderBase &B); Value *optimizeLog(CallInst *CI, IRBuilderBase &B); Value *optimizeSqrt(CallInst *CI, IRBuilderBase &B); + Value *optimizeFMod(CallInst *CI, IRBuilderBase &B); Value *mergeSqrtToExp(CallInst *CI, IRBuilderBase &B); Value *optimizeSinCosPi(CallInst *CI, bool IsSin, IRBuilderBase &B); Value *optimizeTrigInversionPairs(CallInst *CI, IRBuilderBase &B); diff --git a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp index 917f81863cf673..4933b5bf60eea8 100644 --- a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -2796,6 +2796,35 @@ Value *LibCallSimplifier::optimizeSqrt(CallInst *CI, IRBuilderBase &B) { return copyFlags(*CI, FabsCall); } +Value *LibCallSimplifier::optimizeFMod(CallInst *CI, IRBuilderBase &B) { + SimplifyQuery SQ(DL, TLI, DT, AC, CI, true, true, DC); + + // fmod(x,y) can set errno if y == 0 or x == +/-inf, and returns Nan in those + // case. If we know those do not happen, then we can convert the fmod into + // frem. 
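+  // For example, when %x is known not to be an infinity and %y is known
+  // never to be a logical zero (say, because both arguments carry suitable
+  // nofpclass attributes), the call
+  //   %r = call float @fmodf(float %x, float %y)
+  // can be rewritten as
+  //   %r = frem nnan float %x, %y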
+ bool IsNoNan = CI->hasNoNaNs(); + if (!IsNoNan) { + KnownFPClass Known0 = computeKnownFPClass(CI->getOperand(0), fcInf, + /*Depth=*/0, SQ); + if (Known0.isKnownNeverInfinity()) { + KnownFPClass Known1 = + computeKnownFPClass(CI->getOperand(1), fcZero | fcSubnormal, + /*Depth=*/0, SQ); + Function *F = CI->getParent()->getParent(); + if (Known1.isKnownNeverLogicalZero(*F, CI->getType())) + IsNoNan = true; + } + } + + if (IsNoNan) { + Value *FRem = B.CreateFRemFMF(CI->getOperand(0), CI->getOperand(1), CI); + if (auto *FRemI = dyn_cast(FRem)) + FRemI->setHasNoNaNs(true); + substituteInParent(CI, FRem); + } + return nullptr; +} + Value *LibCallSimplifier::optimizeTrigInversionPairs(CallInst *CI, IRBuilderBase &B) { Module *M = CI->getModule(); @@ -3945,6 +3974,10 @@ Value *LibCallSimplifier::optimizeFloatingPointLibCall(CallInst *CI, case LibFunc_sqrt: case LibFunc_sqrtl: return optimizeSqrt(CI, Builder); + case LibFunc_fmod: + case LibFunc_fmodf: + case LibFunc_fmodl: + return optimizeFMod(CI, Builder); case LibFunc_logf: case LibFunc_log: case LibFunc_logl: diff --git a/llvm/test/Transforms/InstCombine/fmod.ll b/llvm/test/Transforms/InstCombine/fmod.ll index c021d27e95fa5e..10cff189b8dfca 100644 --- a/llvm/test/Transforms/InstCombine/fmod.ll +++ b/llvm/test/Transforms/InstCombine/fmod.ll @@ -9,7 +9,7 @@ define float @test_inf_const(float %f) { ; CHECK-NEXT: [[ISINF:%.*]] = fcmp oeq float [[ABS]], 0x7FF0000000000000 ; CHECK-NEXT: br i1 [[ISINF]], label [[RETURN:%.*]], label [[IF_END:%.*]] ; CHECK: if.end: -; CHECK-NEXT: [[CALL:%.*]] = tail call float @fmodf(float [[F]], float 2.000000e+00) +; CHECK-NEXT: [[CALL:%.*]] = frem nnan float [[F]], 2.000000e+00 ; CHECK-NEXT: ret float [[CALL]] ; CHECK: return: ; CHECK-NEXT: ret float 0.000000e+00 @@ -34,7 +34,7 @@ define float @test_const_zero(float %f) { ; CHECK-NEXT: [[ISZERO:%.*]] = fcmp oeq float [[F]], 0.000000e+00 ; CHECK-NEXT: br i1 [[ISZERO]], label [[RETURN:%.*]], label [[IF_END:%.*]] ; CHECK: if.end: -; CHECK-NEXT: [[CALL:%.*]] = tail call float @fmodf(float 2.000000e+00, float [[F]]) +; CHECK-NEXT: [[CALL:%.*]] = frem nnan float 2.000000e+00, [[F]] ; CHECK-NEXT: ret float [[CALL]] ; CHECK: return: ; CHECK-NEXT: ret float 0.000000e+00 @@ -67,11 +67,11 @@ define float @test_noinf_nozero(float nofpclass(inf) %f, float nofpclass(zero) % ; CHECK-LABEL: define float @test_noinf_nozero( ; CHECK-SAME: float nofpclass(inf) [[F:%.*]], float nofpclass(zero) [[G:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[CALL:%.*]] = tail call nnan float @fmodf(float [[F]], float [[G]]) +; CHECK-NEXT: [[CALL:%.*]] = frem nnan float [[F]], [[G]] ; CHECK-NEXT: ret float [[CALL]] ; entry: - %call = tail call nnan float @fmodf(float %f, float %g) + %call = tail call float @fmodf(float %f, float %g) ret float %call } @@ -79,7 +79,7 @@ define double @test_double(double nofpclass(inf) %f, double nofpclass(zero) %g) ; CHECK-LABEL: define double @test_double( ; CHECK-SAME: double nofpclass(inf) [[F:%.*]], double nofpclass(zero) [[G:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[CALL:%.*]] = tail call double @fmod(double [[F]], double [[G]]) +; CHECK-NEXT: [[CALL:%.*]] = frem nnan double [[F]], [[G]] ; CHECK-NEXT: ret double [[CALL]] ; entry: @@ -91,7 +91,7 @@ define fp128 @test_fp128(fp128 nofpclass(inf) %f, fp128 nofpclass(zero) %g) { ; CHECK-LABEL: define fp128 @test_fp128( ; CHECK-SAME: fp128 nofpclass(inf) [[F:%.*]], fp128 nofpclass(zero) [[G:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[CALL:%.*]] = tail call fp128 @fmodl(fp128 [[F]], fp128 [[G]]) +; CHECK-NEXT: 
[[CALL:%.*]] = frem nnan fp128 [[F]], [[G]] ; CHECK-NEXT: ret fp128 [[CALL]] ; entry: @@ -103,11 +103,11 @@ define float @test_noinf_nozero_dazpreservesign(float nofpclass(inf) %f, float n ; CHECK-LABEL: define float @test_noinf_nozero_dazpreservesign( ; CHECK-SAME: float nofpclass(inf) [[F:%.*]], float nofpclass(zero) [[G:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[CALL:%.*]] = tail call nnan float @fmodf(float [[F]], float [[G]]) +; CHECK-NEXT: [[CALL:%.*]] = tail call float @fmodf(float [[F]], float [[G]]) ; CHECK-NEXT: ret float [[CALL]] ; entry: - %call = tail call nnan float @fmodf(float %f, float %g) + %call = tail call float @fmodf(float %f, float %g) ret float %call } @@ -115,7 +115,19 @@ define float @test_noinf_nozero_dazdynamic(float nofpclass(inf) %f, float nofpcl ; CHECK-LABEL: define float @test_noinf_nozero_dazdynamic( ; CHECK-SAME: float nofpclass(inf) [[F:%.*]], float nofpclass(zero) [[G:%.*]]) #[[ATTR1:[0-9]+]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[CALL:%.*]] = tail call nnan float @fmodf(float [[F]], float [[G]]) +; CHECK-NEXT: [[CALL:%.*]] = tail call float @fmodf(float [[F]], float [[G]]) +; CHECK-NEXT: ret float [[CALL]] +; +entry: + %call = tail call float @fmodf(float %f, float %g) + ret float %call +} + +define float @test_nnan(float %f, float %g) { +; CHECK-LABEL: define float @test_nnan( +; CHECK-SAME: float [[F:%.*]], float [[G:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CALL:%.*]] = frem nnan float [[F]], [[G]] ; CHECK-NEXT: ret float [[CALL]] ; entry: From 43c9203d4946b7911d2ba69369717979900d7bc2 Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Wed, 18 Sep 2024 09:40:29 +0100 Subject: [PATCH 026/321] [TLI] Support inferring function attributes for sincos[f|l] (#108554) --- .../llvm/Analysis/TargetLibraryInfo.def | 15 ++++++++++++++ llvm/lib/Transforms/Utils/BuildLibCalls.cpp | 12 +++++++++++ .../Transforms/InferFunctionAttrs/annotate.ll | 9 +++++++++ .../tools/llvm-tli-checker/ps4-tli-check.yaml | 20 +++++++++++++++---- .../Analysis/TargetLibraryInfoTest.cpp | 3 +++ 5 files changed, 55 insertions(+), 4 deletions(-) diff --git a/llvm/include/llvm/Analysis/TargetLibraryInfo.def b/llvm/include/llvm/Analysis/TargetLibraryInfo.def index 5914324b286c05..ebc917ea53eb8d 100644 --- a/llvm/include/llvm/Analysis/TargetLibraryInfo.def +++ b/llvm/include/llvm/Analysis/TargetLibraryInfo.def @@ -2183,6 +2183,21 @@ TLI_DEFINE_ENUM_INTERNAL(sinl) TLI_DEFINE_STRING_INTERNAL("sinl") TLI_DEFINE_SIG_INTERNAL(LDbl, LDbl) +/// void sincos(double x, double *sin_out, double *cos_out); +TLI_DEFINE_ENUM_INTERNAL(sincos) +TLI_DEFINE_STRING_INTERNAL("sincos") +TLI_DEFINE_SIG_INTERNAL(Void, Dbl, Ptr, Ptr) + +/// void sincosf(float x, float *sin_out, float *cos_out); +TLI_DEFINE_ENUM_INTERNAL(sincosf) +TLI_DEFINE_STRING_INTERNAL("sincosf") +TLI_DEFINE_SIG_INTERNAL(Void, Flt, Ptr, Ptr) + +/// void sincosl(long double x, long double *sin_out, long double *cos_out); +TLI_DEFINE_ENUM_INTERNAL(sincosl) +TLI_DEFINE_STRING_INTERNAL("sincosl") +TLI_DEFINE_SIG_INTERNAL(Void, LDbl, Ptr, Ptr) + /// int siprintf(char *str, const char *format, ...); TLI_DEFINE_ENUM_INTERNAL(siprintf) TLI_DEFINE_STRING_INTERNAL("siprintf") diff --git a/llvm/lib/Transforms/Utils/BuildLibCalls.cpp b/llvm/lib/Transforms/Utils/BuildLibCalls.cpp index b0da19813f0a4b..f6448883287587 100644 --- a/llvm/lib/Transforms/Utils/BuildLibCalls.cpp +++ b/llvm/lib/Transforms/Utils/BuildLibCalls.cpp @@ -1270,6 +1270,18 @@ bool llvm::inferNonMandatoryLibFuncAttrs(Function &F, Changed |= 
setOnlyWritesMemory(F); Changed |= setWillReturn(F); break; + case LibFunc_sincos: + case LibFunc_sincosf: + case LibFunc_sincosl: + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotFreeMemory(F); + Changed |= setOnlyWritesMemory(F); + Changed |= setOnlyWritesMemory(F, 1); + Changed |= setOnlyWritesMemory(F, 2); + Changed |= setDoesNotCapture(F, 1); + Changed |= setDoesNotCapture(F, 2); + Changed |= setWillReturn(F); + break; default: // FIXME: It'd be really nice to cover all the library functions we're // aware of here. diff --git a/llvm/test/Transforms/InferFunctionAttrs/annotate.ll b/llvm/test/Transforms/InferFunctionAttrs/annotate.ll index bc0d7a509e1f5d..40c512c81f0c9d 100644 --- a/llvm/test/Transforms/InferFunctionAttrs/annotate.ll +++ b/llvm/test/Transforms/InferFunctionAttrs/annotate.ll @@ -1106,6 +1106,15 @@ declare void @__cxa_throw(ptr, ptr, ptr) ; CHECK: declare void @_ZSt9terminatev() [[NOFREE_COLD_NORETURN:#[0-9]+]] declare void @_ZSt9terminatev() +; CHECK: declare void @sincos(double, ptr nocapture writeonly, ptr nocapture writeonly) [[NOFREE_NOUNWIND_WILLRETURN_WRITEONLY]] +declare void @sincos(double, ptr, ptr) + +; CHECK: declare void @sincosf(float, ptr nocapture writeonly, ptr nocapture writeonly) [[NOFREE_NOUNWIND_WILLRETURN_WRITEONLY]] +declare void @sincosf(float, ptr, ptr) + +; CHECK: declare void @sincosl(x86_fp80, ptr nocapture writeonly, ptr nocapture writeonly) [[NOFREE_NOUNWIND_WILLRETURN_WRITEONLY]] +declare void @sincosl(x86_fp80, ptr, ptr) + ; memset_pattern{4,8,16} aren't available everywhere. ; CHECK-DARWIN: declare void @memset_pattern4(ptr nocapture writeonly, ptr nocapture readonly, i64) [[ARGMEMONLY_NOFREE_NOUNWIND_WILLRETURN]] declare void @memset_pattern4(ptr, ptr, i64) diff --git a/llvm/test/tools/llvm-tli-checker/ps4-tli-check.yaml b/llvm/test/tools/llvm-tli-checker/ps4-tli-check.yaml index 47aeb0ad8fdef9..26efb2bc97cd14 100644 --- a/llvm/test/tools/llvm-tli-checker/ps4-tli-check.yaml +++ b/llvm/test/tools/llvm-tli-checker/ps4-tli-check.yaml @@ -34,7 +34,7 @@ # # CHECK: << Total TLI yes SDK no: 18 # CHECK: >> Total TLI no SDK yes: 0 -# CHECK: == Total TLI yes SDK yes: 250 +# CHECK: == Total TLI yes SDK yes: 253 # # WRONG_DETAIL: << TLI yes SDK no : '_ZdaPv' aka operator delete[](void*) # WRONG_DETAIL: >> TLI no SDK yes: '_ZdaPvj' aka operator delete[](void*, unsigned int) @@ -48,14 +48,14 @@ # WRONG_DETAIL: << TLI yes SDK no : 'fminimum_numl' # WRONG_SUMMARY: << Total TLI yes SDK no: 19{{$}} # WRONG_SUMMARY: >> Total TLI no SDK yes: 1{{$}} -# WRONG_SUMMARY: == Total TLI yes SDK yes: 249 +# WRONG_SUMMARY: == Total TLI yes SDK yes: 252 # ## The -COUNT suffix doesn't care if there are too many matches, so check ## the exact count first; the two directives should add up to that. ## Yes, this means additions to TLI will fail this test, but the argument ## to -COUNT can't be an expression. 
-# AVAIL: TLI knows 501 symbols, 268 available -# AVAIL-COUNT-268: {{^}} available +# AVAIL: TLI knows 504 symbols, 271 available +# AVAIL-COUNT-271: {{^}} available # AVAIL-NOT: {{^}} available # UNAVAIL-COUNT-233: not available # UNAVAIL-NOT: not available @@ -862,6 +862,18 @@ DynamicSymbols: Type: STT_FUNC Section: .text Binding: STB_GLOBAL + - Name: sincos + Type: STT_FUNC + Section: .text + Binding: STB_GLOBAL + - Name: sincosf + Type: STT_FUNC + Section: .text + Binding: STB_GLOBAL + - Name: sincosl + Type: STT_FUNC + Section: .text + Binding: STB_GLOBAL - Name: sinh Type: STT_FUNC Section: .text diff --git a/llvm/unittests/Analysis/TargetLibraryInfoTest.cpp b/llvm/unittests/Analysis/TargetLibraryInfoTest.cpp index c081c44ed35d00..ac8ccc03399e14 100644 --- a/llvm/unittests/Analysis/TargetLibraryInfoTest.cpp +++ b/llvm/unittests/Analysis/TargetLibraryInfoTest.cpp @@ -339,6 +339,9 @@ TEST_F(TargetLibraryInfoTest, ValidProto) { "declare float @sinhf(float)\n" "declare x86_fp80 @sinhl(x86_fp80)\n" "declare x86_fp80 @sinl(x86_fp80)\n" + "declare void @sincos(double, ptr, ptr)\n" + "declare void @sincosf(float, ptr, ptr)\n" + "declare void @sincosl(x86_fp80, ptr, ptr)\n" "declare i32 @snprintf(i8*, i64, i8*, ...)\n" "declare i32 @sprintf(i8*, i8*, ...)\n" "declare double @sqrt(double)\n" From c59ac1a2f67d41b7fb2988bf013d37046d655c2c Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 18 Sep 2024 10:34:35 +0100 Subject: [PATCH 027/321] [X86] Cleanup AVX512 VBROADCAST subvector instruction names. (#108888) This patch makes the `VBROADCAST***X**` subvector broadcast instructions consistent - the `***X**` section represents the original subvector type/size, but we were not correctly using the AVX512 Z/Z256/Z128 suffix to consistently represent the destination width (or we missed it entirely). 
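As a concrete reading of the scheme (an illustrative gloss on names taken
from the diff below, not new instruction definitions):

  VBROADCASTF32X4Z256rm   f32x4 (128-bit) subvector -> 256-bit destination
  VBROADCASTF32X4Zrm      f32x4 (128-bit) subvector -> 512-bit destination
                          (previously named VBROADCASTF32X4rm)
  VBROADCASTF64X2Z256rm   f64x2 (128-bit) subvector -> 256-bit destination
                          (previously named VBROADCASTF64X2Z128rm)

Here the trailing "rm" marks the register-destination, memory-source form.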
--- .../X86/MCTargetDesc/X86InstComments.cpp | 20 +-- .../Target/X86/X86FixupVectorConstants.cpp | 8 +- llvm/lib/Target/X86/X86InstrAVX512.td | 124 +++++++++--------- llvm/lib/Target/X86/X86InstrInfo.cpp | 8 +- llvm/lib/Target/X86/X86MCInstLower.cpp | 20 +-- llvm/lib/Target/X86/X86SchedIceLake.td | 20 +-- llvm/lib/Target/X86/X86SchedSapphireRapids.td | 14 +- llvm/lib/Target/X86/X86SchedSkylakeServer.td | 20 +-- 8 files changed, 117 insertions(+), 117 deletions(-) diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp index 95038ccf63b8b8..a4b72515252a08 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp @@ -1249,18 +1249,18 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, case X86::VBROADCASTF128rm: case X86::VBROADCASTI128rm: - CASE_AVX512_INS_COMMON(BROADCASTF64X2, Z128, rm) - CASE_AVX512_INS_COMMON(BROADCASTI64X2, Z128, rm) + CASE_AVX512_INS_COMMON(BROADCASTF64X2, Z256, rm) + CASE_AVX512_INS_COMMON(BROADCASTI64X2, Z256, rm) DecodeSubVectorBroadcast(4, 2, ShuffleMask); DestName = getRegName(MI->getOperand(0).getReg()); break; - CASE_AVX512_INS_COMMON(BROADCASTF64X2, , rm) - CASE_AVX512_INS_COMMON(BROADCASTI64X2, , rm) + CASE_AVX512_INS_COMMON(BROADCASTF64X2, Z, rm) + CASE_AVX512_INS_COMMON(BROADCASTI64X2, Z, rm) DecodeSubVectorBroadcast(8, 2, ShuffleMask); DestName = getRegName(MI->getOperand(0).getReg()); break; - CASE_AVX512_INS_COMMON(BROADCASTF64X4, , rm) - CASE_AVX512_INS_COMMON(BROADCASTI64X4, , rm) + CASE_AVX512_INS_COMMON(BROADCASTF64X4, Z, rm) + CASE_AVX512_INS_COMMON(BROADCASTI64X4, Z, rm) DecodeSubVectorBroadcast(8, 4, ShuffleMask); DestName = getRegName(MI->getOperand(0).getReg()); break; @@ -1269,13 +1269,13 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, DecodeSubVectorBroadcast(8, 4, ShuffleMask); DestName = getRegName(MI->getOperand(0).getReg()); break; - CASE_AVX512_INS_COMMON(BROADCASTF32X4, , rm) - CASE_AVX512_INS_COMMON(BROADCASTI32X4, , rm) + CASE_AVX512_INS_COMMON(BROADCASTF32X4, Z, rm) + CASE_AVX512_INS_COMMON(BROADCASTI32X4, Z, rm) DecodeSubVectorBroadcast(16, 4, ShuffleMask); DestName = getRegName(MI->getOperand(0).getReg()); break; - CASE_AVX512_INS_COMMON(BROADCASTF32X8, , rm) - CASE_AVX512_INS_COMMON(BROADCASTI32X8, , rm) + CASE_AVX512_INS_COMMON(BROADCASTF32X8, Z, rm) + CASE_AVX512_INS_COMMON(BROADCASTI32X8, Z, rm) DecodeSubVectorBroadcast(16, 8, ShuffleMask); DestName = getRegName(MI->getOperand(0).getReg()); break; diff --git a/llvm/lib/Target/X86/X86FixupVectorConstants.cpp b/llvm/lib/Target/X86/X86FixupVectorConstants.cpp index c9f79e1645f58b..68a4a0be3a1db7 100644 --- a/llvm/lib/Target/X86/X86FixupVectorConstants.cpp +++ b/llvm/lib/Target/X86/X86FixupVectorConstants.cpp @@ -439,8 +439,8 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF, case X86::VMOVUPSZrm: return FixupConstant({{X86::VBROADCASTSSZrm, 1, 32, rebuildSplatCst}, {X86::VBROADCASTSDZrm, 1, 64, rebuildSplatCst}, - {X86::VBROADCASTF32X4rm, 1, 128, rebuildSplatCst}, - {X86::VBROADCASTF64X4rm, 1, 256, rebuildSplatCst}}, + {X86::VBROADCASTF32X4Zrm, 1, 128, rebuildSplatCst}, + {X86::VBROADCASTF64X4Zrm, 1, 256, rebuildSplatCst}}, 512, 1); /* Integer Loads */ case X86::MOVDQArm: @@ -572,12 +572,12 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF, {X86::VPBROADCASTQZrm, 1, 64, rebuildSplatCst}, {X86::VPMOVSXBQZrm, 8, 8, rebuildSExtCst}, {X86::VPMOVZXBQZrm, 8, 8, 
rebuildZExtCst}, - {X86::VBROADCASTI32X4rm, 1, 128, rebuildSplatCst}, + {X86::VBROADCASTI32X4Zrm, 1, 128, rebuildSplatCst}, {X86::VPMOVSXBDZrm, 16, 8, rebuildSExtCst}, {X86::VPMOVZXBDZrm, 16, 8, rebuildZExtCst}, {X86::VPMOVSXWQZrm, 8, 16, rebuildSExtCst}, {X86::VPMOVZXWQZrm, 8, 16, rebuildZExtCst}, - {X86::VBROADCASTI64X4rm, 1, 256, rebuildSplatCst}, + {X86::VBROADCASTI64X4Zrm, 1, 256, rebuildSplatCst}, {HasBWI ? X86::VPMOVSXBWZrm : 0, 32, 8, rebuildSExtCst}, {HasBWI ? X86::VPMOVZXBWZrm : 0, 32, 8, rebuildZExtCst}, {X86::VPMOVSXWDZrm, 16, 16, rebuildSExtCst}, diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index 9ed59803c1f9d9..928abac46da866 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -1337,84 +1337,84 @@ let Predicates = [HasVLX, HasBWI] in { // AVX-512 BROADCAST SUBVECTORS // -defm VBROADCASTI32X4 : avx512_subvec_broadcast_rm<0x5a, "vbroadcasti32x4", - X86SubVBroadcastld128, v16i32_info, v4i32x_info>, - EVEX_V512, EVEX_CD8<32, CD8VT4>; -defm VBROADCASTF32X4 : avx512_subvec_broadcast_rm<0x1a, "vbroadcastf32x4", - X86SubVBroadcastld128, v16f32_info, v4f32x_info>, - EVEX_V512, EVEX_CD8<32, CD8VT4>; -defm VBROADCASTI64X4 : avx512_subvec_broadcast_rm<0x5b, "vbroadcasti64x4", - X86SubVBroadcastld256, v8i64_info, v4i64x_info>, REX_W, - EVEX_V512, EVEX_CD8<64, CD8VT4>; -defm VBROADCASTF64X4 : avx512_subvec_broadcast_rm<0x1b, "vbroadcastf64x4", - X86SubVBroadcastld256, v8f64_info, v4f64x_info>, REX_W, - EVEX_V512, EVEX_CD8<64, CD8VT4>; +defm VBROADCASTI32X4Z : avx512_subvec_broadcast_rm<0x5a, "vbroadcasti32x4", + X86SubVBroadcastld128, v16i32_info, v4i32x_info>, + EVEX_V512, EVEX_CD8<32, CD8VT4>; +defm VBROADCASTF32X4Z : avx512_subvec_broadcast_rm<0x1a, "vbroadcastf32x4", + X86SubVBroadcastld128, v16f32_info, v4f32x_info>, + EVEX_V512, EVEX_CD8<32, CD8VT4>; +defm VBROADCASTI64X4Z : avx512_subvec_broadcast_rm<0x5b, "vbroadcasti64x4", + X86SubVBroadcastld256, v8i64_info, v4i64x_info>, REX_W, + EVEX_V512, EVEX_CD8<64, CD8VT4>; +defm VBROADCASTF64X4Z : avx512_subvec_broadcast_rm<0x1b, "vbroadcastf64x4", + X86SubVBroadcastld256, v8f64_info, v4f64x_info>, REX_W, + EVEX_V512, EVEX_CD8<64, CD8VT4>; let Predicates = [HasAVX512] in { def : Pat<(v8f64 (X86SubVBroadcastld256 addr:$src)), - (VBROADCASTF64X4rm addr:$src)>; + (VBROADCASTF64X4Zrm addr:$src)>; def : Pat<(v16f32 (X86SubVBroadcastld256 addr:$src)), - (VBROADCASTF64X4rm addr:$src)>; + (VBROADCASTF64X4Zrm addr:$src)>; def : Pat<(v32f16 (X86SubVBroadcastld256 addr:$src)), - (VBROADCASTF64X4rm addr:$src)>; + (VBROADCASTF64X4Zrm addr:$src)>; def : Pat<(v8i64 (X86SubVBroadcastld256 addr:$src)), - (VBROADCASTI64X4rm addr:$src)>; + (VBROADCASTI64X4Zrm addr:$src)>; def : Pat<(v16i32 (X86SubVBroadcastld256 addr:$src)), - (VBROADCASTI64X4rm addr:$src)>; + (VBROADCASTI64X4Zrm addr:$src)>; def : Pat<(v32i16 (X86SubVBroadcastld256 addr:$src)), - (VBROADCASTI64X4rm addr:$src)>; + (VBROADCASTI64X4Zrm addr:$src)>; def : Pat<(v64i8 (X86SubVBroadcastld256 addr:$src)), - (VBROADCASTI64X4rm addr:$src)>; + (VBROADCASTI64X4Zrm addr:$src)>; def : Pat<(v8f64 (X86SubVBroadcastld128 addr:$src)), - (VBROADCASTF32X4rm addr:$src)>; + (VBROADCASTF32X4Zrm addr:$src)>; def : Pat<(v16f32 (X86SubVBroadcastld128 addr:$src)), - (VBROADCASTF32X4rm addr:$src)>; + (VBROADCASTF32X4Zrm addr:$src)>; def : Pat<(v32f16 (X86SubVBroadcastld128 addr:$src)), - (VBROADCASTF32X4rm addr:$src)>; + (VBROADCASTF32X4Zrm addr:$src)>; def : Pat<(v8i64 (X86SubVBroadcastld128 addr:$src)), - (VBROADCASTI32X4rm 
addr:$src)>; + (VBROADCASTI32X4Zrm addr:$src)>; def : Pat<(v16i32 (X86SubVBroadcastld128 addr:$src)), - (VBROADCASTI32X4rm addr:$src)>; + (VBROADCASTI32X4Zrm addr:$src)>; def : Pat<(v32i16 (X86SubVBroadcastld128 addr:$src)), - (VBROADCASTI32X4rm addr:$src)>; + (VBROADCASTI32X4Zrm addr:$src)>; def : Pat<(v64i8 (X86SubVBroadcastld128 addr:$src)), - (VBROADCASTI32X4rm addr:$src)>; + (VBROADCASTI32X4Zrm addr:$src)>; // Patterns for selects of bitcasted operations. def : Pat<(vselect_mask VK16WM:$mask, (bc_v16f32 (v8f64 (X86SubVBroadcastld128 addr:$src))), (v16f32 immAllZerosV)), - (VBROADCASTF32X4rmkz VK16WM:$mask, addr:$src)>; + (VBROADCASTF32X4Zrmkz VK16WM:$mask, addr:$src)>; def : Pat<(vselect_mask VK16WM:$mask, (bc_v16f32 (v8f64 (X86SubVBroadcastld128 addr:$src))), VR512:$src0), - (VBROADCASTF32X4rmk VR512:$src0, VK16WM:$mask, addr:$src)>; + (VBROADCASTF32X4Zrmk VR512:$src0, VK16WM:$mask, addr:$src)>; def : Pat<(vselect_mask VK16WM:$mask, (bc_v16i32 (v8i64 (X86SubVBroadcastld128 addr:$src))), (v16i32 immAllZerosV)), - (VBROADCASTI32X4rmkz VK16WM:$mask, addr:$src)>; + (VBROADCASTI32X4Zrmkz VK16WM:$mask, addr:$src)>; def : Pat<(vselect_mask VK16WM:$mask, (bc_v16i32 (v8i64 (X86SubVBroadcastld128 addr:$src))), VR512:$src0), - (VBROADCASTI32X4rmk VR512:$src0, VK16WM:$mask, addr:$src)>; + (VBROADCASTI32X4Zrmk VR512:$src0, VK16WM:$mask, addr:$src)>; def : Pat<(vselect_mask VK8WM:$mask, (bc_v8f64 (v16f32 (X86SubVBroadcastld256 addr:$src))), (v8f64 immAllZerosV)), - (VBROADCASTF64X4rmkz VK8WM:$mask, addr:$src)>; + (VBROADCASTF64X4Zrmkz VK8WM:$mask, addr:$src)>; def : Pat<(vselect_mask VK8WM:$mask, (bc_v8f64 (v16f32 (X86SubVBroadcastld256 addr:$src))), VR512:$src0), - (VBROADCASTF64X4rmk VR512:$src0, VK8WM:$mask, addr:$src)>; + (VBROADCASTF64X4Zrmk VR512:$src0, VK8WM:$mask, addr:$src)>; def : Pat<(vselect_mask VK8WM:$mask, (bc_v8i64 (v16i32 (X86SubVBroadcastld256 addr:$src))), (v8i64 immAllZerosV)), - (VBROADCASTI64X4rmkz VK8WM:$mask, addr:$src)>; + (VBROADCASTI64X4Zrmkz VK8WM:$mask, addr:$src)>; def : Pat<(vselect_mask VK8WM:$mask, (bc_v8i64 (v16i32 (X86SubVBroadcastld256 addr:$src))), VR512:$src0), - (VBROADCASTI64X4rmk VR512:$src0, VK8WM:$mask, addr:$src)>; + (VBROADCASTI64X4Zrmk VR512:$src0, VK8WM:$mask, addr:$src)>; } let Predicates = [HasVLX] in { @@ -1461,9 +1461,9 @@ def : Pat<(vselect_mask VK8WM:$mask, let Predicates = [HasBF16] in { def : Pat<(v32bf16 (X86SubVBroadcastld256 addr:$src)), - (VBROADCASTF64X4rm addr:$src)>; + (VBROADCASTF64X4Zrm addr:$src)>; def : Pat<(v32bf16 (X86SubVBroadcastld128 addr:$src)), - (VBROADCASTF32X4rm addr:$src)>; + (VBROADCASTF32X4Zrm addr:$src)>; } let Predicates = [HasBF16, HasVLX] in @@ -1471,10 +1471,10 @@ let Predicates = [HasBF16, HasVLX] in (VBROADCASTF32X4Z256rm addr:$src)>; let Predicates = [HasVLX, HasDQI] in { -defm VBROADCASTI64X2Z128 : avx512_subvec_broadcast_rm_dq<0x5a, "vbroadcasti64x2", +defm VBROADCASTI64X2Z256 : avx512_subvec_broadcast_rm_dq<0x5a, "vbroadcasti64x2", X86SubVBroadcastld128, v4i64x_info, v2i64x_info>, EVEX_V256, EVEX_CD8<64, CD8VT2>, REX_W; -defm VBROADCASTF64X2Z128 : avx512_subvec_broadcast_rm_dq<0x1a, "vbroadcastf64x2", +defm VBROADCASTF64X2Z256 : avx512_subvec_broadcast_rm_dq<0x1a, "vbroadcastf64x2", X86SubVBroadcastld128, v4f64x_info, v2f64x_info>, EVEX_V256, EVEX_CD8<64, CD8VT2>, REX_W; @@ -1482,69 +1482,69 @@ defm VBROADCASTF64X2Z128 : avx512_subvec_broadcast_rm_dq<0x1a, "vbroadcastf64x2" def : Pat<(vselect_mask VK4WM:$mask, (bc_v4f64 (v8f32 (X86SubVBroadcastld128 addr:$src))), (v4f64 immAllZerosV)), - 
(VBROADCASTF64X2Z128rmkz VK4WM:$mask, addr:$src)>; + (VBROADCASTF64X2Z256rmkz VK4WM:$mask, addr:$src)>; def : Pat<(vselect_mask VK4WM:$mask, (bc_v4f64 (v8f32 (X86SubVBroadcastld128 addr:$src))), VR256X:$src0), - (VBROADCASTF64X2Z128rmk VR256X:$src0, VK4WM:$mask, addr:$src)>; + (VBROADCASTF64X2Z256rmk VR256X:$src0, VK4WM:$mask, addr:$src)>; def : Pat<(vselect_mask VK4WM:$mask, (bc_v4i64 (v8i32 (X86SubVBroadcastld128 addr:$src))), (v4i64 immAllZerosV)), - (VBROADCASTI64X2Z128rmkz VK4WM:$mask, addr:$src)>; + (VBROADCASTI64X2Z256rmkz VK4WM:$mask, addr:$src)>; def : Pat<(vselect_mask VK4WM:$mask, (bc_v4i64 (v8i32 (X86SubVBroadcastld128 addr:$src))), VR256X:$src0), - (VBROADCASTI64X2Z128rmk VR256X:$src0, VK4WM:$mask, addr:$src)>; + (VBROADCASTI64X2Z256rmk VR256X:$src0, VK4WM:$mask, addr:$src)>; } let Predicates = [HasDQI] in { -defm VBROADCASTI64X2 : avx512_subvec_broadcast_rm_dq<0x5a, "vbroadcasti64x2", - X86SubVBroadcastld128, v8i64_info, v2i64x_info>, REX_W, - EVEX_V512, EVEX_CD8<64, CD8VT2>; -defm VBROADCASTI32X8 : avx512_subvec_broadcast_rm_dq<0x5b, "vbroadcasti32x8", - X86SubVBroadcastld256, v16i32_info, v8i32x_info>, - EVEX_V512, EVEX_CD8<32, CD8VT8>; -defm VBROADCASTF64X2 : avx512_subvec_broadcast_rm_dq<0x1a, "vbroadcastf64x2", - X86SubVBroadcastld128, v8f64_info, v2f64x_info>, REX_W, - EVEX_V512, EVEX_CD8<64, CD8VT2>; -defm VBROADCASTF32X8 : avx512_subvec_broadcast_rm_dq<0x1b, "vbroadcastf32x8", - X86SubVBroadcastld256, v16f32_info, v8f32x_info>, - EVEX_V512, EVEX_CD8<32, CD8VT8>; +defm VBROADCASTI64X2Z : avx512_subvec_broadcast_rm_dq<0x5a, "vbroadcasti64x2", + X86SubVBroadcastld128, v8i64_info, v2i64x_info>, REX_W, + EVEX_V512, EVEX_CD8<64, CD8VT2>; +defm VBROADCASTI32X8Z : avx512_subvec_broadcast_rm_dq<0x5b, "vbroadcasti32x8", + X86SubVBroadcastld256, v16i32_info, v8i32x_info>, + EVEX_V512, EVEX_CD8<32, CD8VT8>; +defm VBROADCASTF64X2Z : avx512_subvec_broadcast_rm_dq<0x1a, "vbroadcastf64x2", + X86SubVBroadcastld128, v8f64_info, v2f64x_info>, REX_W, + EVEX_V512, EVEX_CD8<64, CD8VT2>; +defm VBROADCASTF32X8Z : avx512_subvec_broadcast_rm_dq<0x1b, "vbroadcastf32x8", + X86SubVBroadcastld256, v16f32_info, v8f32x_info>, + EVEX_V512, EVEX_CD8<32, CD8VT8>; // Patterns for selects of bitcasted operations. 
def : Pat<(vselect_mask VK16WM:$mask, (bc_v16f32 (v8f64 (X86SubVBroadcastld256 addr:$src))), (v16f32 immAllZerosV)), - (VBROADCASTF32X8rmkz VK16WM:$mask, addr:$src)>; + (VBROADCASTF32X8Zrmkz VK16WM:$mask, addr:$src)>; def : Pat<(vselect_mask VK16WM:$mask, (bc_v16f32 (v8f64 (X86SubVBroadcastld256 addr:$src))), VR512:$src0), - (VBROADCASTF32X8rmk VR512:$src0, VK16WM:$mask, addr:$src)>; + (VBROADCASTF32X8Zrmk VR512:$src0, VK16WM:$mask, addr:$src)>; def : Pat<(vselect_mask VK16WM:$mask, (bc_v16i32 (v8i64 (X86SubVBroadcastld256 addr:$src))), (v16i32 immAllZerosV)), - (VBROADCASTI32X8rmkz VK16WM:$mask, addr:$src)>; + (VBROADCASTI32X8Zrmkz VK16WM:$mask, addr:$src)>; def : Pat<(vselect_mask VK16WM:$mask, (bc_v16i32 (v8i64 (X86SubVBroadcastld256 addr:$src))), VR512:$src0), - (VBROADCASTI32X8rmk VR512:$src0, VK16WM:$mask, addr:$src)>; + (VBROADCASTI32X8Zrmk VR512:$src0, VK16WM:$mask, addr:$src)>; def : Pat<(vselect_mask VK8WM:$mask, (bc_v8f64 (v16f32 (X86SubVBroadcastld128 addr:$src))), (v8f64 immAllZerosV)), - (VBROADCASTF64X2rmkz VK8WM:$mask, addr:$src)>; + (VBROADCASTF64X2Zrmkz VK8WM:$mask, addr:$src)>; def : Pat<(vselect_mask VK8WM:$mask, (bc_v8f64 (v16f32 (X86SubVBroadcastld128 addr:$src))), VR512:$src0), - (VBROADCASTF64X2rmk VR512:$src0, VK8WM:$mask, addr:$src)>; + (VBROADCASTF64X2Zrmk VR512:$src0, VK8WM:$mask, addr:$src)>; def : Pat<(vselect_mask VK8WM:$mask, (bc_v8i64 (v16i32 (X86SubVBroadcastld128 addr:$src))), (v8i64 immAllZerosV)), - (VBROADCASTI64X2rmkz VK8WM:$mask, addr:$src)>; + (VBROADCASTI64X2Zrmkz VK8WM:$mask, addr:$src)>; def : Pat<(vselect_mask VK8WM:$mask, (bc_v8i64 (v16i32 (X86SubVBroadcastld128 addr:$src))), VR512:$src0), - (VBROADCASTI64X2rmk VR512:$src0, VK8WM:$mask, addr:$src)>; + (VBROADCASTI64X2Zrmk VR512:$src0, VK8WM:$mask, addr:$src)>; } multiclass avx512_common_broadcast_32x2 opc, string OpcodeStr, diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index 57a894b09e0445..38ea1f35be2b9a 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -6246,16 +6246,16 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { } case X86::VMOVAPSZ128rm_NOVLX: return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVAPSrm), - get(X86::VBROADCASTF32X4rm), X86::sub_xmm); + get(X86::VBROADCASTF32X4Zrm), X86::sub_xmm); case X86::VMOVUPSZ128rm_NOVLX: return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVUPSrm), - get(X86::VBROADCASTF32X4rm), X86::sub_xmm); + get(X86::VBROADCASTF32X4Zrm), X86::sub_xmm); case X86::VMOVAPSZ256rm_NOVLX: return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVAPSYrm), - get(X86::VBROADCASTF64X4rm), X86::sub_ymm); + get(X86::VBROADCASTF64X4Zrm), X86::sub_ymm); case X86::VMOVUPSZ256rm_NOVLX: return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVUPSYrm), - get(X86::VBROADCASTF64X4rm), X86::sub_ymm); + get(X86::VBROADCASTF64X4Zrm), X86::sub_ymm); case X86::VMOVAPSZ128mr_NOVLX: return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVAPSmr), get(X86::VEXTRACTF32x4Zmri), X86::sub_xmm); diff --git a/llvm/lib/Target/X86/X86MCInstLower.cpp b/llvm/lib/Target/X86/X86MCInstLower.cpp index 77ddd2366e629e..55c237e2df2d2e 100644 --- a/llvm/lib/Target/X86/X86MCInstLower.cpp +++ b/llvm/lib/Target/X86/X86MCInstLower.cpp @@ -2051,21 +2051,21 @@ static void addConstantComments(const MachineInstr *MI, case X86::VBROADCASTF128rm: case X86::VBROADCASTI128rm: MASK_AVX512_CASE(X86::VBROADCASTF32X4Z256rm) - MASK_AVX512_CASE(X86::VBROADCASTF64X2Z128rm) + 
MASK_AVX512_CASE(X86::VBROADCASTF64X2Z256rm) MASK_AVX512_CASE(X86::VBROADCASTI32X4Z256rm) - MASK_AVX512_CASE(X86::VBROADCASTI64X2Z128rm) + MASK_AVX512_CASE(X86::VBROADCASTI64X2Z256rm) printBroadcast(MI, OutStreamer, 2, 128); break; - MASK_AVX512_CASE(X86::VBROADCASTF32X4rm) - MASK_AVX512_CASE(X86::VBROADCASTF64X2rm) - MASK_AVX512_CASE(X86::VBROADCASTI32X4rm) - MASK_AVX512_CASE(X86::VBROADCASTI64X2rm) + MASK_AVX512_CASE(X86::VBROADCASTF32X4Zrm) + MASK_AVX512_CASE(X86::VBROADCASTF64X2Zrm) + MASK_AVX512_CASE(X86::VBROADCASTI32X4Zrm) + MASK_AVX512_CASE(X86::VBROADCASTI64X2Zrm) printBroadcast(MI, OutStreamer, 4, 128); break; - MASK_AVX512_CASE(X86::VBROADCASTF32X8rm) - MASK_AVX512_CASE(X86::VBROADCASTF64X4rm) - MASK_AVX512_CASE(X86::VBROADCASTI32X8rm) - MASK_AVX512_CASE(X86::VBROADCASTI64X4rm) + MASK_AVX512_CASE(X86::VBROADCASTF32X8Zrm) + MASK_AVX512_CASE(X86::VBROADCASTF64X4Zrm) + MASK_AVX512_CASE(X86::VBROADCASTI32X8Zrm) + MASK_AVX512_CASE(X86::VBROADCASTI64X4Zrm) printBroadcast(MI, OutStreamer, 2, 256); break; diff --git a/llvm/lib/Target/X86/X86SchedIceLake.td b/llvm/lib/Target/X86/X86SchedIceLake.td index 72fbcc5598108f..a5051d932d4e21 100644 --- a/llvm/lib/Target/X86/X86SchedIceLake.td +++ b/llvm/lib/Target/X86/X86SchedIceLake.td @@ -1576,19 +1576,19 @@ def: InstRW<[ICXWriteResGroup121, ReadAfterVecYLd], "VBROADCASTF32X2Z256rm(b?)", "VBROADCASTF32X2Zrm(b?)", "VBROADCASTF32X4Z256rm(b?)", - "VBROADCASTF32X4rm(b?)", - "VBROADCASTF32X8rm(b?)", - "VBROADCASTF64X2Z128rm(b?)", - "VBROADCASTF64X2rm(b?)", - "VBROADCASTF64X4rm(b?)", + "VBROADCASTF32X4Zrm(b?)", + "VBROADCASTF32X8Zrm(b?)", + "VBROADCASTF64X2Z256rm(b?)", + "VBROADCASTF64X2Zrm(b?)", + "VBROADCASTF64X4Zrm(b?)", "VBROADCASTI32X2Z256rm(b?)", "VBROADCASTI32X2Zrm(b?)", "VBROADCASTI32X4Z256rm(b?)", - "VBROADCASTI32X4rm(b?)", - "VBROADCASTI32X8rm(b?)", - "VBROADCASTI64X2Z128rm(b?)", - "VBROADCASTI64X2rm(b?)", - "VBROADCASTI64X4rm(b?)", + "VBROADCASTI32X4Zrm(b?)", + "VBROADCASTI32X8Zrm(b?)", + "VBROADCASTI64X2Z256rm(b?)", + "VBROADCASTI64X2Zrm(b?)", + "VBROADCASTI64X4Zrm(b?)", "VBROADCASTSD(Z|Z256)rm(b?)", "VBROADCASTSS(Z|Z256)rm(b?)", "VINSERTF32x4(Z|Z256)rm(b?)", diff --git a/llvm/lib/Target/X86/X86SchedSapphireRapids.td b/llvm/lib/Target/X86/X86SchedSapphireRapids.td index 9818f4c01ea678..6e292da4e293db 100644 --- a/llvm/lib/Target/X86/X86SchedSapphireRapids.td +++ b/llvm/lib/Target/X86/X86SchedSapphireRapids.td @@ -1601,9 +1601,9 @@ def SPRWriteResGroup126 : SchedWriteRes<[SPRPort02_03_11]> { def : InstRW<[SPRWriteResGroup126], (instregex "^MMX_MOV(D|Q)64rm$", "^VBROADCAST(F|I)128rm$", "^VBROADCAST(F|I)32X(2|4)Z256rm$", - "^VBROADCAST(F|I)32X(8|2Z)rm$", - "^VBROADCAST(F|I)(32|64)X4rm$", - "^VBROADCAST(F|I)64X2((Z128)?)rm$", + "^VBROADCAST(F|I)32X(8|2)Zrm$", + "^VBROADCAST(F|I)(32|64)X4Zrm$", + "^VBROADCAST(F|I)64X2(Z|Z256)rm$", "^VBROADCASTS(DY|SZ)rm$", "^VBROADCASTS(D|S)Z256rm$", "^VBROADCASTS(DZ|SY)rm$", @@ -1652,9 +1652,9 @@ def SPRWriteResGroup131 : SchedWriteRes<[SPRPort00_05, SPRPort02_03_11]> { let Latency = 9; let NumMicroOps = 2; } -def : InstRW<[SPRWriteResGroup131], (instregex "^VBROADCAST(F|I)32X(8|2Z)rmk(z?)$", - "^VBROADCAST(F|I)(32|64)X4rmk(z?)$", - "^VBROADCAST(F|I)64X2rmk(z?)$", +def : InstRW<[SPRWriteResGroup131], (instregex "^VBROADCAST(F|I)32X(8|2)Zrmk(z?)$", + "^VBROADCAST(F|I)(32|64)X4Zrmk(z?)$", + "^VBROADCAST(F|I)64X2Zrmk(z?)$", "^VBROADCASTS(D|S)Zrmk(z?)$", "^VMOV(A|U)P(D|S)Zrmk(z?)$", "^VMOV(D|SH|SL)DUPZrmk(z?)$", @@ -2698,7 +2698,7 @@ def SPRWriteResGroup262 : SchedWriteRes<[SPRPort00_01_05, SPRPort02_03_11]> { 
 let NumMicroOps = 2;
 }
 def : InstRW<[SPRWriteResGroup262], (instregex "^VBROADCAST(F|I)32X(2|4)Z256rmk(z?)$",
-                                               "^VBROADCAST(F|I)64X2Z128rmk(z?)$",
+                                               "^VBROADCAST(F|I)64X2Z256rmk(z?)$",
                                                "^VBROADCASTS(D|S)Z256rmk(z?)$",
                                                "^VMOV(A|U)P(D|S)Z256rmk(z?)$",
                                                "^VMOV(D|SH|SL)DUPZ256rmk(z?)$",
diff --git a/llvm/lib/Target/X86/X86SchedSkylakeServer.td b/llvm/lib/Target/X86/X86SchedSkylakeServer.td
index 26e290a2250c9f..e733d9ac74dd84 100644
--- a/llvm/lib/Target/X86/X86SchedSkylakeServer.td
+++ b/llvm/lib/Target/X86/X86SchedSkylakeServer.td
@@ -1547,19 +1547,19 @@ def: InstRW<[SKXWriteResGroup121, ReadAfterVecYLd],
                       "VBROADCASTF32X2Z256rm(b?)",
                       "VBROADCASTF32X2Zrm(b?)",
                       "VBROADCASTF32X4Z256rm(b?)",
-                      "VBROADCASTF32X4rm(b?)",
-                      "VBROADCASTF32X8rm(b?)",
-                      "VBROADCASTF64X2Z128rm(b?)",
-                      "VBROADCASTF64X2rm(b?)",
-                      "VBROADCASTF64X4rm(b?)",
+                      "VBROADCASTF32X4Zrm(b?)",
+                      "VBROADCASTF32X8Zrm(b?)",
+                      "VBROADCASTF64X2Z256rm(b?)",
+                      "VBROADCASTF64X2Zrm(b?)",
+                      "VBROADCASTF64X4Zrm(b?)",
                       "VBROADCASTI32X2Z256rm(b?)",
                       "VBROADCASTI32X2Zrm(b?)",
                       "VBROADCASTI32X4Z256rm(b?)",
-                      "VBROADCASTI32X4rm(b?)",
-                      "VBROADCASTI32X8rm(b?)",
-                      "VBROADCASTI64X2Z128rm(b?)",
-                      "VBROADCASTI64X2rm(b?)",
-                      "VBROADCASTI64X4rm(b?)",
+                      "VBROADCASTI32X4Zrm(b?)",
+                      "VBROADCASTI32X8Zrm(b?)",
+                      "VBROADCASTI64X2Z256rm(b?)",
+                      "VBROADCASTI64X2Zrm(b?)",
+                      "VBROADCASTI64X4Zrm(b?)",
                       "VBROADCASTSD(Z|Z256)rm(b?)",
                       "VBROADCASTSS(Z|Z256)rm(b?)",
                       "VINSERTF32x4(Z|Z256)rm(b?)",

From e32a62c0d31cdfd622461eb4758d34adca509a62 Mon Sep 17 00:00:00 2001
From: Balazs Benics
Date: Wed, 18 Sep 2024 11:37:27 +0200
Subject: [PATCH 028/321] [clang][NFC] Add regression tests for GH63782
 (#109104)

Patch by Alejandro Alvarez Ayllon!

CPP-5380
---
 .../SemaTemplate/concepts-out-of-line-def.cpp | 44 +++++++++++++++++++
 1 file changed, 44 insertions(+)

diff --git a/clang/test/SemaTemplate/concepts-out-of-line-def.cpp b/clang/test/SemaTemplate/concepts-out-of-line-def.cpp
index 333187b0d74ad6..5450d105a6f54a 100644
--- a/clang/test/SemaTemplate/concepts-out-of-line-def.cpp
+++ b/clang/test/SemaTemplate/concepts-out-of-line-def.cpp
@@ -622,3 +622,47 @@ void A::method(Ts&... ts)
   requires requires (T1 t1) {
     {t1};
   } {}
 }
+
+namespace GH63782 {
+// GH63782 was also fixed by PR #80594, so let's add a test for it.
+
+template
+constexpr bool All = (Vals && ...);
+
+template
+class Class {
+  template
+  requires All
+  void Foo();
+};
+
+template
+template
+requires All
+void Class::Foo() {
+};
+
+} // namespace GH63782
+
+namespace eve {
+// Reduced from the "eve" project
+
+template
+struct tuple {
+  template requires(I0 <= sizeof...(Ts))
+  constexpr auto split();
+};
+
+template
+template
+requires(I0 <= sizeof...(Ts))
+constexpr auto tuple::split(){
+  return 0;
+}
+
+int foo() {
+  tuple x;
+  return x.split<0>();
+}
+
+} // namespace eve

From edac1b2d63b27f83bef99f9d51f1230ea2f3f0fa Mon Sep 17 00:00:00 2001
From: Luke Lau
Date: Wed, 18 Sep 2024 17:39:40 +0800
Subject: [PATCH 029/321] [RISCV] Promote bf16 ops to f32 with zvfbfmin
 (#108937)

For f16 with zvfhmin, we promote most ops and VP ops to f32. This does the
same for bf16 with zvfbfmin, so the two fp types should now be in sync.

There are a few places in the custom lowering where we need to check for an
LMUL 8 f16/bf16 vector that can't be promoted and must be split; this patch
extracts that check out into isPromotedOpNeedingSplit.

In a follow-up NFC we can deduplicate the code that sets up the promotions.
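As a sketch of what the promotion amounts to at the IR level (hand-written
for illustration; the value names are made up and this exact snippet does
not appear in the tests below), an fadd on bf16 vectors is widened to f32,
computed, and narrowed back:

  %x = fpext <vscale x 4 x bfloat> %a to <vscale x 4 x float>
  %y = fpext <vscale x 4 x bfloat> %b to <vscale x 4 x float>
  %s = fadd <vscale x 4 x float> %x, %y
  %r = fptrunc <vscale x 4 x float> %s to <vscale x 4 x bfloat>

This widen/compute/narrow shape is what the vfwcvtbf16.f.f.v /
vfncvtbf16.f.f.w pairs bracketing the f32 arithmetic correspond to in the
updated CHECK lines.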
--- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 145 +- llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll | 536 +++- llvm/test/CodeGen/RISCV/rvv/fceil-sdnode.ll | 208 +- llvm/test/CodeGen/RISCV/rvv/ffloor-sdnode.ll | 214 +- llvm/test/CodeGen/RISCV/rvv/floor-vp.ll | 536 +++- .../test/CodeGen/RISCV/rvv/fmaximum-sdnode.ll | 195 +- llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll | 562 +++- .../test/CodeGen/RISCV/rvv/fminimum-sdnode.ll | 195 +- llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll | 562 +++- .../CodeGen/RISCV/rvv/fnearbyint-sdnode.ll | 223 +- llvm/test/CodeGen/RISCV/rvv/frint-sdnode.ll | 196 +- llvm/test/CodeGen/RISCV/rvv/fround-sdnode.ll | 208 +- .../CodeGen/RISCV/rvv/froundeven-sdnode.ll | 207 +- llvm/test/CodeGen/RISCV/rvv/ftrunc-sdnode.ll | 196 +- llvm/test/CodeGen/RISCV/rvv/nearbyint-vp.ll | 530 +++- llvm/test/CodeGen/RISCV/rvv/rint-vp.ll | 507 +++- llvm/test/CodeGen/RISCV/rvv/round-vp.ll | 538 +++- llvm/test/CodeGen/RISCV/rvv/roundeven-vp.ll | 538 +++- llvm/test/CodeGen/RISCV/rvv/roundtozero-vp.ll | 538 +++- llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll | 1913 +++++++++++-- llvm/test/CodeGen/RISCV/rvv/setcc-fp.ll | 1162 +++++++- .../RISCV/rvv/vfadd-constrained-sdnode.ll | 243 +- llvm/test/CodeGen/RISCV/rvv/vfadd-sdnode.ll | 256 +- llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll | 681 ++++- llvm/test/CodeGen/RISCV/rvv/vfclass-sdnode.ll | 55 +- .../RISCV/rvv/vfdiv-constrained-sdnode.ll | 262 +- llvm/test/CodeGen/RISCV/rvv/vfdiv-sdnode.ll | 244 +- llvm/test/CodeGen/RISCV/rvv/vfdiv-vp.ll | 643 ++++- llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll | 2449 ++++++++++++++--- .../RISCV/rvv/vfmadd-constrained-sdnode.ll | 402 ++- llvm/test/CodeGen/RISCV/rvv/vfmadd-sdnode.ll | 565 +++- llvm/test/CodeGen/RISCV/rvv/vfmax-sdnode.ll | 247 +- llvm/test/CodeGen/RISCV/rvv/vfmax-vp.ll | 291 +- llvm/test/CodeGen/RISCV/rvv/vfmin-sdnode.ll | 247 +- llvm/test/CodeGen/RISCV/rvv/vfmin-vp.ll | 291 +- .../RISCV/rvv/vfmul-constrained-sdnode.ll | 243 +- llvm/test/CodeGen/RISCV/rvv/vfmul-sdnode.ll | 256 +- .../RISCV/rvv/vfsqrt-constrained-sdnode.ll | 114 +- llvm/test/CodeGen/RISCV/rvv/vfsqrt-sdnode.ll | 109 +- llvm/test/CodeGen/RISCV/rvv/vfsqrt-vp.ll | 257 +- .../RISCV/rvv/vfsub-constrained-sdnode.ll | 262 +- llvm/test/CodeGen/RISCV/rvv/vfsub-sdnode.ll | 256 +- llvm/test/CodeGen/RISCV/rvv/vfsub-vp.ll | 643 ++++- .../CodeGen/RISCV/rvv/vreductions-fp-vp.ll | 340 ++- 44 files changed, 17683 insertions(+), 1582 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 42b14c669d0c80..3b7e24414c490c 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -941,7 +941,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, }; // TODO: support more ops. - static const unsigned ZvfhminPromoteOps[] = { + static const unsigned ZvfhminZvfbfminPromoteOps[] = { ISD::FMINNUM, ISD::FMAXNUM, ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FMA, ISD::FDIV, ISD::FSQRT, ISD::FCEIL, ISD::FTRUNC, ISD::FFLOOR, ISD::FROUND, @@ -951,30 +951,31 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, ISD::STRICT_FMA}; // TODO: support more vp ops. 
- static const unsigned ZvfhminPromoteVPOps[] = {ISD::VP_FADD, - ISD::VP_FSUB, - ISD::VP_FMUL, - ISD::VP_FDIV, - ISD::VP_FMA, - ISD::VP_REDUCE_FADD, - ISD::VP_REDUCE_SEQ_FADD, - ISD::VP_REDUCE_FMIN, - ISD::VP_REDUCE_FMAX, - ISD::VP_SQRT, - ISD::VP_FMINNUM, - ISD::VP_FMAXNUM, - ISD::VP_FCEIL, - ISD::VP_FFLOOR, - ISD::VP_FROUND, - ISD::VP_FROUNDEVEN, - ISD::VP_FROUNDTOZERO, - ISD::VP_FRINT, - ISD::VP_FNEARBYINT, - ISD::VP_SETCC, - ISD::VP_FMINIMUM, - ISD::VP_FMAXIMUM, - ISD::VP_REDUCE_FMINIMUM, - ISD::VP_REDUCE_FMAXIMUM}; + static const unsigned ZvfhminZvfbfminPromoteVPOps[] = { + ISD::VP_FADD, + ISD::VP_FSUB, + ISD::VP_FMUL, + ISD::VP_FDIV, + ISD::VP_FMA, + ISD::VP_REDUCE_FADD, + ISD::VP_REDUCE_SEQ_FADD, + ISD::VP_REDUCE_FMIN, + ISD::VP_REDUCE_FMAX, + ISD::VP_SQRT, + ISD::VP_FMINNUM, + ISD::VP_FMAXNUM, + ISD::VP_FCEIL, + ISD::VP_FFLOOR, + ISD::VP_FROUND, + ISD::VP_FROUNDEVEN, + ISD::VP_FROUNDTOZERO, + ISD::VP_FRINT, + ISD::VP_FNEARBYINT, + ISD::VP_SETCC, + ISD::VP_FMINIMUM, + ISD::VP_FMAXIMUM, + ISD::VP_REDUCE_FMINIMUM, + ISD::VP_REDUCE_FMAXIMUM}; // Sets common operation actions on RVV floating-point vector types. const auto SetCommonVFPActions = [&](MVT VT) { @@ -1097,20 +1098,20 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FABS, VT, Expand); setOperationAction(ISD::FCOPYSIGN, VT, Expand); - // Custom split nxv32f16 since nxv32f32 if not legal. + // Custom split nxv32f16 since nxv32f32 is not legal. if (VT == MVT::nxv32f16) { - setOperationAction(ZvfhminPromoteOps, VT, Custom); - setOperationAction(ZvfhminPromoteVPOps, VT, Custom); + setOperationAction(ZvfhminZvfbfminPromoteOps, VT, Custom); + setOperationAction(ZvfhminZvfbfminPromoteVPOps, VT, Custom); continue; } // Add more promote ops. MVT F32VecVT = MVT::getVectorVT(MVT::f32, VT.getVectorElementCount()); - setOperationPromotedToType(ZvfhminPromoteOps, VT, F32VecVT); - setOperationPromotedToType(ZvfhminPromoteVPOps, VT, F32VecVT); + setOperationPromotedToType(ZvfhminZvfbfminPromoteOps, VT, F32VecVT); + setOperationPromotedToType(ZvfhminZvfbfminPromoteVPOps, VT, F32VecVT); } } - // TODO: Could we merge some code with zvfhmin? + // TODO: merge with zvfhmin if (Subtarget.hasVInstructionsBF16Minimal()) { for (MVT VT : BF16VecVTs) { if (!isTypeLegal(VT)) @@ -1139,7 +1140,16 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FABS, VT, Expand); setOperationAction(ISD::FCOPYSIGN, VT, Expand); - // TODO: Promote to fp32. + // Custom split nxv32f16 since nxv32f32 is not legal. + if (VT == MVT::nxv32bf16) { + setOperationAction(ZvfhminZvfbfminPromoteOps, VT, Custom); + setOperationAction(ZvfhminZvfbfminPromoteVPOps, VT, Custom); + continue; + } + // Add more promote ops. + MVT F32VecVT = MVT::getVectorVT(MVT::f32, VT.getVectorElementCount()); + setOperationPromotedToType(ZvfhminZvfbfminPromoteOps, VT, F32VecVT); + setOperationPromotedToType(ZvfhminZvfbfminPromoteVPOps, VT, F32VecVT); } } @@ -1375,8 +1385,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, // TODO: could split the f16 vector into two vectors and do promotion. 
if (!isTypeLegal(F32VecVT)) continue; - setOperationPromotedToType(ZvfhminPromoteOps, VT, F32VecVT); - setOperationPromotedToType(ZvfhminPromoteVPOps, VT, F32VecVT); + setOperationPromotedToType(ZvfhminZvfbfminPromoteOps, VT, F32VecVT); + setOperationPromotedToType(ZvfhminZvfbfminPromoteVPOps, VT, F32VecVT); continue; } @@ -6333,6 +6343,17 @@ static bool hasMaskOp(unsigned Opcode) { return false; } +static bool isPromotedOpNeedingSplit(SDValue Op, + const RISCVSubtarget &Subtarget) { + if (Op.getValueType() == MVT::nxv32f16 && + (Subtarget.hasVInstructionsF16Minimal() && + !Subtarget.hasVInstructionsF16())) + return true; + if (Op.getValueType() == MVT::nxv32bf16) + return true; + return false; +} + static SDValue SplitVectorOp(SDValue Op, SelectionDAG &DAG) { auto [LoVT, HiVT] = DAG.GetSplitDestVTs(Op.getValueType()); SDLoc DL(Op); @@ -6670,9 +6691,7 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, } case ISD::FMAXIMUM: case ISD::FMINIMUM: - if (Op.getValueType() == MVT::nxv32f16 && - (Subtarget.hasVInstructionsF16Minimal() && - !Subtarget.hasVInstructionsF16())) + if (isPromotedOpNeedingSplit(Op, Subtarget)) return SplitVectorOp(Op, DAG); return lowerFMAXIMUM_FMINIMUM(Op, DAG, Subtarget); case ISD::FP_EXTEND: @@ -6688,8 +6707,7 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, (Subtarget.hasVInstructionsF16Minimal() && !Subtarget.hasVInstructionsF16())) || Op.getValueType().getScalarType() == MVT::bf16)) { - if (Op.getValueType() == MVT::nxv32f16 || - Op.getValueType() == MVT::nxv32bf16) + if (isPromotedOpNeedingSplit(Op, Subtarget)) return SplitVectorOp(Op, DAG); // int -> f32 SDLoc DL(Op); @@ -6709,8 +6727,7 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, (Subtarget.hasVInstructionsF16Minimal() && !Subtarget.hasVInstructionsF16())) || Op1.getValueType().getScalarType() == MVT::bf16)) { - if (Op1.getValueType() == MVT::nxv32f16 || - Op1.getValueType() == MVT::nxv32bf16) + if (isPromotedOpNeedingSplit(Op1, Subtarget)) return SplitVectorOp(Op, DAG); // [b]f16 -> f32 SDLoc DL(Op); @@ -6942,9 +6959,7 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, case ISD::FRINT: case ISD::FROUND: case ISD::FROUNDEVEN: - if (Op.getValueType() == MVT::nxv32f16 && - (Subtarget.hasVInstructionsF16Minimal() && - !Subtarget.hasVInstructionsF16())) + if (isPromotedOpNeedingSplit(Op, Subtarget)) return SplitVectorOp(Op, DAG); return lowerFTRUNC_FCEIL_FFLOOR_FROUND(Op, DAG, Subtarget); case ISD::LRINT: @@ -7002,9 +7017,7 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, case ISD::VP_REDUCE_FMAX: case ISD::VP_REDUCE_FMINIMUM: case ISD::VP_REDUCE_FMAXIMUM: - if (Op.getOperand(1).getValueType() == MVT::nxv32f16 && - (Subtarget.hasVInstructionsF16Minimal() && - !Subtarget.hasVInstructionsF16())) + if (isPromotedOpNeedingSplit(Op.getOperand(1), Subtarget)) return SplitVectorReductionOp(Op, DAG); return lowerVPREDUCE(Op, DAG); case ISD::VP_REDUCE_AND: @@ -7251,9 +7264,7 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, return DAG.getSetCC(DL, VT, RHS, LHS, CCVal); } - if (Op.getOperand(0).getSimpleValueType() == MVT::nxv32f16 && - (Subtarget.hasVInstructionsF16Minimal() && - !Subtarget.hasVInstructionsF16())) + if (isPromotedOpNeedingSplit(Op.getOperand(0), Subtarget)) return SplitVectorOp(Op, DAG); return lowerFixedLengthVectorSetccToRVV(Op, DAG); @@ -7295,9 +7306,7 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, case ISD::FMA: case ISD::FMINNUM: case ISD::FMAXNUM: - if (Op.getValueType() == MVT::nxv32f16 && - (Subtarget.hasVInstructionsF16Minimal() 
&& - !Subtarget.hasVInstructionsF16())) + if (isPromotedOpNeedingSplit(Op, Subtarget)) return SplitVectorOp(Op, DAG); [[fallthrough]]; case ISD::AVGFLOORS: @@ -7345,9 +7354,7 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, case ISD::FCOPYSIGN: if (Op.getValueType() == MVT::f16 || Op.getValueType() == MVT::bf16) return lowerFCOPYSIGN(Op, DAG, Subtarget); - if (Op.getValueType() == MVT::nxv32f16 && - (Subtarget.hasVInstructionsF16Minimal() && - !Subtarget.hasVInstructionsF16())) + if (isPromotedOpNeedingSplit(Op, Subtarget)) return SplitVectorOp(Op, DAG); return lowerFixedLengthVectorFCOPYSIGNToRVV(Op, DAG); case ISD::STRICT_FADD: @@ -7356,9 +7363,7 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, case ISD::STRICT_FDIV: case ISD::STRICT_FSQRT: case ISD::STRICT_FMA: - if (Op.getValueType() == MVT::nxv32f16 && - (Subtarget.hasVInstructionsF16Minimal() && - !Subtarget.hasVInstructionsF16())) + if (isPromotedOpNeedingSplit(Op, Subtarget)) return SplitStrictFPVectorOp(Op, DAG); return lowerToScalableOp(Op, DAG); case ISD::STRICT_FSETCC: @@ -7415,9 +7420,7 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, case ISD::VP_FMINNUM: case ISD::VP_FMAXNUM: case ISD::VP_FCOPYSIGN: - if (Op.getValueType() == MVT::nxv32f16 && - (Subtarget.hasVInstructionsF16Minimal() && - !Subtarget.hasVInstructionsF16())) + if (isPromotedOpNeedingSplit(Op, Subtarget)) return SplitVPOp(Op, DAG); [[fallthrough]]; case ISD::VP_SRA: @@ -7443,8 +7446,7 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, (Subtarget.hasVInstructionsF16Minimal() && !Subtarget.hasVInstructionsF16())) || Op.getValueType().getScalarType() == MVT::bf16)) { - if (Op.getValueType() == MVT::nxv32f16 || - Op.getValueType() == MVT::nxv32bf16) + if (isPromotedOpNeedingSplit(Op, Subtarget)) return SplitVectorOp(Op, DAG); // int -> f32 SDLoc DL(Op); @@ -7464,8 +7466,7 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, (Subtarget.hasVInstructionsF16Minimal() && !Subtarget.hasVInstructionsF16())) || Op1.getValueType().getScalarType() == MVT::bf16)) { - if (Op1.getValueType() == MVT::nxv32f16 || - Op1.getValueType() == MVT::nxv32bf16) + if (isPromotedOpNeedingSplit(Op1, Subtarget)) return SplitVectorOp(Op, DAG); // [b]f16 -> f32 SDLoc DL(Op); @@ -7478,9 +7479,7 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, } return lowerVPFPIntConvOp(Op, DAG); case ISD::VP_SETCC: - if (Op.getOperand(0).getSimpleValueType() == MVT::nxv32f16 && - (Subtarget.hasVInstructionsF16Minimal() && - !Subtarget.hasVInstructionsF16())) + if (isPromotedOpNeedingSplit(Op.getOperand(0), Subtarget)) return SplitVPOp(Op, DAG); if (Op.getOperand(0).getSimpleValueType().getVectorElementType() == MVT::i1) return lowerVPSetCCMaskOp(Op, DAG); @@ -7515,16 +7514,12 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, case ISD::VP_FROUND: case ISD::VP_FROUNDEVEN: case ISD::VP_FROUNDTOZERO: - if (Op.getValueType() == MVT::nxv32f16 && - (Subtarget.hasVInstructionsF16Minimal() && - !Subtarget.hasVInstructionsF16())) + if (isPromotedOpNeedingSplit(Op, Subtarget)) return SplitVPOp(Op, DAG); return lowerVectorFTRUNC_FCEIL_FFLOOR_FROUND(Op, DAG, Subtarget); case ISD::VP_FMAXIMUM: case ISD::VP_FMINIMUM: - if (Op.getValueType() == MVT::nxv32f16 && - (Subtarget.hasVInstructionsF16Minimal() && - !Subtarget.hasVInstructionsF16())) + if (isPromotedOpNeedingSplit(Op, Subtarget)) return SplitVPOp(Op, DAG); return lowerFMAXIMUM_FMINIMUM(Op, DAG, Subtarget); case ISD::EXPERIMENTAL_VP_SPLICE: diff --git a/llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll 
b/llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll index d613e4ee0bc256..15cff650765efa 100644 --- a/llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll @@ -1,22 +1,428 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s \ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFHMIN -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFHMIN +declare @llvm.vp.ceil.nxv1bf16(, , i32) + +define @vp_ceil_vv_nxv1bf16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_ceil_vv_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vfabs.v v8, v9, v0.t +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu +; CHECK-NEXT: vmflt.vf v0, v8, fa5, v0.t +; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t +; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu +; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %v = call @llvm.vp.ceil.nxv1bf16( %va, %m, i32 %evl) + ret %v +} + +define @vp_ceil_vv_nxv1bf16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_ceil_vv_nxv1bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vfabs.v v8, v9 +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t +; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu +; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %v = call @llvm.vp.ceil.nxv1bf16( %va, splat (i1 true), i32 %evl) + ret %v +} + +declare @llvm.vp.ceil.nxv2bf16(, , i32) + +define @vp_ceil_vv_nxv2bf16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_ceil_vv_nxv2bf16: 
+; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfabs.v v8, v9, v0.t +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; CHECK-NEXT: vmflt.vf v0, v8, fa5, v0.t +; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t +; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %v = call @llvm.vp.ceil.nxv2bf16( %va, %m, i32 %evl) + ret %v +} + +define @vp_ceil_vv_nxv2bf16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_ceil_vv_nxv2bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfabs.v v8, v9 +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t +; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %v = call @llvm.vp.ceil.nxv2bf16( %va, splat (i1 true), i32 %evl) + ret %v +} + +declare @llvm.vp.ceil.nxv4bf16(, , i32) + +define @vp_ceil_vv_nxv4bf16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_ceil_vv_nxv4bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v9, v0 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vfabs.v v12, v10, v0.t +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; CHECK-NEXT: vmflt.vf v9, v12, fa5, v0.t +; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfcvt.x.f.v v12, v10, v0.t +; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; CHECK-NEXT: vfsgnj.vv v10, v12, v10, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: ret + %v = call @llvm.vp.ceil.nxv4bf16( %va, %m, i32 %evl) + ret %v +} + +define @vp_ceil_vv_nxv4bf16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_ceil_vv_nxv4bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vfabs.v v8, v10 +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vfcvt.x.f.v v8, v10, v0.t +; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; CHECK-NEXT: vfsgnj.vv v10, v8, v10, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: ret + %v = call @llvm.vp.ceil.nxv4bf16( %va, splat (i1 true), i32 %evl) + ret %v +} + +declare @llvm.vp.ceil.nxv8bf16(, , i32) + +define @vp_ceil_vv_nxv8bf16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: 
vp_ceil_vv_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v10, v0 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vfabs.v v16, v12, v0.t +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu +; CHECK-NEXT: vmflt.vf v10, v16, fa5, v0.t +; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vfcvt.x.f.v v16, v12, v0.t +; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu +; CHECK-NEXT: vfsgnj.vv v12, v16, v12, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: ret + %v = call @llvm.vp.ceil.nxv8bf16( %va, %m, i32 %evl) + ret %v +} + +define @vp_ceil_vv_nxv8bf16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_ceil_vv_nxv8bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vfabs.v v8, v12 +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vfcvt.x.f.v v8, v12, v0.t +; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu +; CHECK-NEXT: vfsgnj.vv v12, v8, v12, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: ret + %v = call @llvm.vp.ceil.nxv8bf16( %va, splat (i1 true), i32 %evl) + ret %v +} + +declare @llvm.vp.ceil.nxv16bf16(, , i32) + +define @vp_ceil_vv_nxv16bf16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_ceil_vv_nxv16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v12, v0 +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vfabs.v v24, v16, v0.t +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vmflt.vf v12, v24, fa5, v0.t +; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vmv1r.v v0, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t +; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: ret + %v = call @llvm.vp.ceil.nxv16bf16( %va, %m, i32 %evl) + ret %v +} + +define @vp_ceil_vv_nxv16bf16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_ceil_vv_nxv16bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vfabs.v v8, v16 +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vfcvt.x.f.v v8, v16, v0.t +; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vfsgnj.vv v16, v8, v16, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: ret + %v = call @llvm.vp.ceil.nxv16bf16( %va, splat (i1 true), i32 %evl) + ret %v +} + +declare 
@llvm.vp.ceil.nxv32bf16(, , i32) + +define @vp_ceil_vv_nxv32bf16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_ceil_vv_nxv32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a1, a2, 1 +; CHECK-NEXT: sub a3, a0, a1 +; CHECK-NEXT: sltu a4, a0, a3 +; CHECK-NEXT: addi a4, a4, -1 +; CHECK-NEXT: and a3, a4, a3 +; CHECK-NEXT: srli a2, a2, 2 +; CHECK-NEXT: vmv1r.v v16, v0 +; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v17, v0, a2 +; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: vmv1r.v v0, v17 +; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; CHECK-NEXT: vfabs.v v8, v24, v0.t +; CHECK-NEXT: lui a2, 307200 +; CHECK-NEXT: fmv.w.x fa5, a2 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vmflt.vf v17, v8, fa5, v0.t +; CHECK-NEXT: fsrmi a2, 3 +; CHECK-NEXT: vmv1r.v v0, v17 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfcvt.x.f.v v8, v24, v0.t +; CHECK-NEXT: fsrm a2 +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vfsgnj.vv v24, v8, v24, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24 +; CHECK-NEXT: bltu a0, a1, .LBB10_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a0, a1 +; CHECK-NEXT: .LBB10_2: +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v0 +; CHECK-NEXT: vmv1r.v v8, v16 +; CHECK-NEXT: vmv1r.v v0, v16 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vfabs.v v16, v24, v0.t +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vmflt.vf v8, v16, fa5, v0.t +; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfcvt.x.f.v v16, v24, v0.t +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vfsgnj.vv v24, v16, v24, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %v = call @llvm.vp.ceil.nxv32bf16( %va, %m, i32 %evl) + ret %v +} + +define @vp_ceil_vv_nxv32bf16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_ceil_vv_nxv32bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a1, a2, 1 +; 
CHECK-NEXT: sub a3, a0, a1 +; CHECK-NEXT: sltu a4, a0, a3 +; CHECK-NEXT: addi a4, a4, -1 +; CHECK-NEXT: and a3, a4, a3 +; CHECK-NEXT: srli a2, a2, 2 +; CHECK-NEXT: vsetvli a4, zero, e8, m4, ta, ma +; CHECK-NEXT: vmset.m v16 +; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v16, v16, a2 +; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: vmv1r.v v0, v16 +; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; CHECK-NEXT: vfabs.v v8, v24, v0.t +; CHECK-NEXT: lui a2, 307200 +; CHECK-NEXT: fmv.w.x fa5, a2 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vmflt.vf v16, v8, fa5, v0.t +; CHECK-NEXT: fsrmi a2, 3 +; CHECK-NEXT: vmv1r.v v0, v16 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfcvt.x.f.v v8, v24, v0.t +; CHECK-NEXT: fsrm a2 +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vfsgnj.vv v24, v8, v24, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24 +; CHECK-NEXT: bltu a0, a1, .LBB11_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a0, a1 +; CHECK-NEXT: .LBB11_2: +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vfabs.v v24, v16 +; CHECK-NEXT: vmflt.vf v0, v24, fa5 +; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t +; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %v = call @llvm.vp.ceil.nxv32bf16( %va, splat (i1 true), i32 %evl) + ret %v +} declare @llvm.vp.ceil.nxv1f16(, , i32) define @vp_ceil_vv_nxv1f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_ceil_vv_nxv1f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI0_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI0_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI12_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI12_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t ; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, mu @@ -57,8 +463,8 @@ define @vp_ceil_vv_nxv1f16( %va, @vp_ceil_vv_nxv1f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_ceil_vv_nxv1f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI1_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI1_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI13_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI13_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 @@ -97,8 +503,8 @@ declare @llvm.vp.ceil.nxv2f16(, @vp_ceil_vv_nxv2f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_ceil_vv_nxv2f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI2_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI2_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI14_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI14_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t ; ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, mu @@ -139,8 +545,8 @@ define @vp_ceil_vv_nxv2f16( %va, @vp_ceil_vv_nxv2f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_ceil_vv_nxv2f16_unmasked: ; ZVFH: # %bb.0: -; 
ZVFH-NEXT: lui a1, %hi(.LCPI3_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI3_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI15_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI15_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 @@ -179,8 +585,8 @@ declare @llvm.vp.ceil.nxv4f16(, @vp_ceil_vv_nxv4f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_ceil_vv_nxv4f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI4_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI4_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI16_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI16_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t ; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, mu @@ -223,8 +629,8 @@ define @vp_ceil_vv_nxv4f16( %va, @vp_ceil_vv_nxv4f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_ceil_vv_nxv4f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI5_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI5_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI17_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI17_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 @@ -263,8 +669,8 @@ declare @llvm.vp.ceil.nxv8f16(, @vp_ceil_vv_nxv8f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_ceil_vv_nxv8f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI6_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI6_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI18_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI18_0)(a1) ; ZVFH-NEXT: vmv1r.v v10, v0 ; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFH-NEXT: vfabs.v v12, v8, v0.t @@ -309,8 +715,8 @@ define @vp_ceil_vv_nxv8f16( %va, @vp_ceil_vv_nxv8f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_ceil_vv_nxv8f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI7_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI7_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI19_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI19_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFH-NEXT: vfabs.v v10, v8 ; ZVFH-NEXT: vmflt.vf v0, v10, fa5 @@ -349,8 +755,8 @@ declare @llvm.vp.ceil.nxv16f16(, @vp_ceil_vv_nxv16f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_ceil_vv_nxv16f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI8_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI8_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI20_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI20_0)(a1) ; ZVFH-NEXT: vmv1r.v v12, v0 ; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZVFH-NEXT: vfabs.v v16, v8, v0.t @@ -395,8 +801,8 @@ define @vp_ceil_vv_nxv16f16( %va, @vp_ceil_vv_nxv16f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_ceil_vv_nxv16f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI9_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI9_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI21_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI21_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZVFH-NEXT: vfabs.v v12, v8 ; ZVFH-NEXT: vmflt.vf v0, v12, fa5 @@ -435,8 +841,8 @@ declare @llvm.vp.ceil.nxv32f16(, @vp_ceil_vv_nxv32f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_ceil_vv_nxv32f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI10_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI10_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI22_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI22_0)(a1) ; ZVFH-NEXT: vmv1r.v v16, v0 ; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma ; ZVFH-NEXT: vfabs.v v24, v8, v0.t @@ -491,10 +897,10 @@ define @vp_ceil_vv_nxv32f16( %va, @vp_ceil_vv_nxv32f16( %va, @vp_ceil_vv_nxv32f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_ceil_vv_nxv32f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI11_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI11_0)(a1) 
+; ZVFH-NEXT: lui a1, %hi(.LCPI23_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI23_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma ; ZVFH-NEXT: vfabs.v v16, v8 ; ZVFH-NEXT: vmflt.vf v0, v16, fa5 @@ -586,10 +992,10 @@ define @vp_ceil_vv_nxv32f16_unmasked( % ; ZVFHMIN-NEXT: vfsgnj.vv v24, v8, v24, v0.t ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24 -; ZVFHMIN-NEXT: bltu a0, a1, .LBB11_2 +; ZVFHMIN-NEXT: bltu a0, a1, .LBB23_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a0, a1 -; ZVFHMIN-NEXT: .LBB11_2: +; ZVFHMIN-NEXT: .LBB23_2: ; ZVFHMIN-NEXT: addi a1, sp, 16 ; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24 @@ -834,8 +1240,8 @@ declare @llvm.vp.ceil.nxv1f64(, @vp_ceil_vv_nxv1f64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_ceil_vv_nxv1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI22_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI22_0)(a1) +; CHECK-NEXT: lui a1, %hi(.LCPI34_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI34_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; CHECK-NEXT: vfabs.v v9, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu @@ -855,8 +1261,8 @@ define @vp_ceil_vv_nxv1f64( %va, @vp_ceil_vv_nxv1f64_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_ceil_vv_nxv1f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI23_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI23_0)(a1) +; CHECK-NEXT: lui a1, %hi(.LCPI35_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI35_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 @@ -876,8 +1282,8 @@ declare @llvm.vp.ceil.nxv2f64(, @vp_ceil_vv_nxv2f64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_ceil_vv_nxv2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI24_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI24_0)(a1) +; CHECK-NEXT: lui a1, %hi(.LCPI36_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI36_0)(a1) ; CHECK-NEXT: vmv1r.v v10, v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; CHECK-NEXT: vfabs.v v12, v8, v0.t @@ -899,8 +1305,8 @@ define @vp_ceil_vv_nxv2f64( %va, @vp_ceil_vv_nxv2f64_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_ceil_vv_nxv2f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI25_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI25_0)(a1) +; CHECK-NEXT: lui a1, %hi(.LCPI37_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI37_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 @@ -920,8 +1326,8 @@ declare @llvm.vp.ceil.nxv4f64(, @vp_ceil_vv_nxv4f64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_ceil_vv_nxv4f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI26_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI26_0)(a1) +; CHECK-NEXT: lui a1, %hi(.LCPI38_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI38_0)(a1) ; CHECK-NEXT: vmv1r.v v12, v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; CHECK-NEXT: vfabs.v v16, v8, v0.t @@ -943,8 +1349,8 @@ define @vp_ceil_vv_nxv4f64( %va, @vp_ceil_vv_nxv4f64_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_ceil_vv_nxv4f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI27_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI27_0)(a1) +; CHECK-NEXT: lui a1, %hi(.LCPI39_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI39_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 @@ -964,8 +1370,8 @@ declare @llvm.vp.ceil.nxv7f64(, @vp_ceil_vv_nxv7f64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_ceil_vv_nxv7f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI28_0) -; 
CHECK-NEXT: fld fa5, %lo(.LCPI28_0)(a1) +; CHECK-NEXT: lui a1, %hi(.LCPI40_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI40_0)(a1) ; CHECK-NEXT: vmv1r.v v16, v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v8, v0.t @@ -987,8 +1393,8 @@ define @vp_ceil_vv_nxv7f64( %va, @vp_ceil_vv_nxv7f64_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_ceil_vv_nxv7f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI29_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI29_0)(a1) +; CHECK-NEXT: lui a1, %hi(.LCPI41_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI41_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v16, v8 ; CHECK-NEXT: vmflt.vf v0, v16, fa5 @@ -1008,8 +1414,8 @@ declare @llvm.vp.ceil.nxv8f64(, @vp_ceil_vv_nxv8f64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_ceil_vv_nxv8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI30_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI30_0)(a1) +; CHECK-NEXT: lui a1, %hi(.LCPI42_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI42_0)(a1) ; CHECK-NEXT: vmv1r.v v16, v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v8, v0.t @@ -1031,8 +1437,8 @@ define @vp_ceil_vv_nxv8f64( %va, @vp_ceil_vv_nxv8f64_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_ceil_vv_nxv8f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI31_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI31_0)(a1) +; CHECK-NEXT: lui a1, %hi(.LCPI43_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI43_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v16, v8 ; CHECK-NEXT: vmflt.vf v0, v16, fa5 @@ -1065,8 +1471,8 @@ define @vp_ceil_vv_nxv16f64( %va, < ; CHECK-NEXT: vsetvli a3, zero, e8, mf4, ta, ma ; CHECK-NEXT: vslidedown.vx v6, v0, a2 ; CHECK-NEXT: sub a2, a0, a1 -; CHECK-NEXT: lui a3, %hi(.LCPI32_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI32_0)(a3) +; CHECK-NEXT: lui a3, %hi(.LCPI44_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI44_0)(a3) ; CHECK-NEXT: sltu a3, a0, a2 ; CHECK-NEXT: addi a3, a3, -1 ; CHECK-NEXT: and a2, a3, a2 @@ -1087,10 +1493,10 @@ define @vp_ceil_vv_nxv16f64( %va, < ; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t -; CHECK-NEXT: bltu a0, a1, .LBB32_2 +; CHECK-NEXT: bltu a0, a1, .LBB44_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a0, a1 -; CHECK-NEXT: .LBB32_2: +; CHECK-NEXT: .LBB44_2: ; CHECK-NEXT: vmv1r.v v0, v7 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v8, v0.t @@ -1118,8 +1524,8 @@ define @vp_ceil_vv_nxv16f64_unmasked( @vp_ceil_vv_nxv16f64_unmasked( @ceil_nxv1bf16( %x) { +; CHECK-LABEL: ceil_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfabs.v v8, v9 +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t +; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu +; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %a = call @llvm.ceil.nxv1bf16( %x) + ret %a +} + +define @ceil_nxv2bf16( %x) { +; CHECK-LABEL: ceil_nxv2bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vfabs.v v8, v9 +; CHECK-NEXT: lui 
a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vmflt.vf v0, v8, fa5
+; CHECK-NEXT: fsrmi a0, 3
+; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu
+; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %a = call <vscale x 2 x bfloat> @llvm.ceil.nxv2bf16(<vscale x 2 x bfloat> %x)
+ ret <vscale x 2 x bfloat> %a
+}
+
+define <vscale x 4 x bfloat> @ceil_nxv4bf16(<vscale x 4 x bfloat> %x) {
+; CHECK-LABEL: ceil_nxv4bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vfabs.v v8, v10
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vmflt.vf v0, v8, fa5
+; CHECK-NEXT: fsrmi a0, 3
+; CHECK-NEXT: vfcvt.x.f.v v8, v10, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu
+; CHECK-NEXT: vfsgnj.vv v10, v8, v10, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
+; CHECK-NEXT: ret
+ %a = call <vscale x 4 x bfloat> @llvm.ceil.nxv4bf16(<vscale x 4 x bfloat> %x)
+ ret <vscale x 4 x bfloat> %a
+}
+
+define <vscale x 8 x bfloat> @ceil_nxv8bf16(<vscale x 8 x bfloat> %x) {
+; CHECK-LABEL: ceil_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vfabs.v v8, v12
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vmflt.vf v0, v8, fa5
+; CHECK-NEXT: fsrmi a0, 3
+; CHECK-NEXT: vfcvt.x.f.v v8, v12, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu
+; CHECK-NEXT: vfsgnj.vv v12, v8, v12, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: ret
+ %a = call <vscale x 8 x bfloat> @llvm.ceil.nxv8bf16(<vscale x 8 x bfloat> %x)
+ ret <vscale x 8 x bfloat> %a
+}
+
+define <vscale x 16 x bfloat> @ceil_nxv16bf16(<vscale x 16 x bfloat> %x) {
+; CHECK-LABEL: ceil_nxv16bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfabs.v v8, v16
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vmflt.vf v0, v8, fa5
+; CHECK-NEXT: fsrmi a0, 3
+; CHECK-NEXT: vfcvt.x.f.v v8, v16, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vfsgnj.vv v16, v8, v16, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: ret
+ %a = call <vscale x 16 x bfloat> @llvm.ceil.nxv16bf16(<vscale x 16 x bfloat> %x)
+ ret <vscale x 16 x bfloat> %a
+}
+
+define <vscale x 32 x bfloat> @ceil_nxv32bf16(<vscale x 32 x bfloat> %x) {
+; CHECK-LABEL: ceil_nxv32bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfabs.v v24, v16
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vmflt.vf v0, v24, fa5
+; CHECK-NEXT: fsrmi a0, 3
+; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfabs.v v24, v16
+; CHECK-NEXT: vmflt.vf v0, v24, fa5
+; CHECK-NEXT: fsrmi a0, 3
+; CHECK-NEXT:
vfcvt.x.f.v v24, v16, v0.t +; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: ret + %a = call @llvm.ceil.nxv32bf16( %x) + ret %a +} + define @ceil_nxv1f16( %x) { ; ZVFH-LABEL: ceil_nxv1f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI0_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI0_0)(a0) +; ZVFH-NEXT: lui a0, %hi(.LCPI6_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI6_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 @@ -52,8 +204,8 @@ declare @llvm.ceil.nxv1f16() define @ceil_nxv2f16( %x) { ; ZVFH-LABEL: ceil_nxv2f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI1_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI1_0)(a0) +; ZVFH-NEXT: lui a0, %hi(.LCPI7_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI7_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 @@ -91,8 +243,8 @@ declare @llvm.ceil.nxv2f16() define @ceil_nxv4f16( %x) { ; ZVFH-LABEL: ceil_nxv4f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI2_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI2_0)(a0) +; ZVFH-NEXT: lui a0, %hi(.LCPI8_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI8_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 @@ -130,8 +282,8 @@ declare @llvm.ceil.nxv4f16() define @ceil_nxv8f16( %x) { ; ZVFH-LABEL: ceil_nxv8f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI3_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI3_0)(a0) +; ZVFH-NEXT: lui a0, %hi(.LCPI9_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI9_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; ZVFH-NEXT: vfabs.v v10, v8 ; ZVFH-NEXT: vmflt.vf v0, v10, fa5 @@ -169,8 +321,8 @@ declare @llvm.ceil.nxv8f16() define @ceil_nxv16f16( %x) { ; ZVFH-LABEL: ceil_nxv16f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI4_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI4_0)(a0) +; ZVFH-NEXT: lui a0, %hi(.LCPI10_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI10_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZVFH-NEXT: vfabs.v v12, v8 ; ZVFH-NEXT: vmflt.vf v0, v12, fa5 @@ -208,8 +360,8 @@ declare @llvm.ceil.nxv16f16() define @ceil_nxv32f16( %x) { ; ZVFH-LABEL: ceil_nxv32f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI5_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI5_0)(a0) +; ZVFH-NEXT: lui a0, %hi(.LCPI11_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI11_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, m8, ta, ma ; ZVFH-NEXT: vfabs.v v16, v8 ; ZVFH-NEXT: vmflt.vf v0, v16, fa5 @@ -359,8 +511,8 @@ declare @llvm.ceil.nxv16f32() define @ceil_nxv1f64( %x) { ; CHECK-LABEL: ceil_nxv1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI11_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI11_0)(a0) +; CHECK-NEXT: lui a0, %hi(.LCPI17_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI17_0)(a0) ; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 @@ -379,8 +531,8 @@ declare @llvm.ceil.nxv1f64() define @ceil_nxv2f64( %x) { ; CHECK-LABEL: ceil_nxv2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI12_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI12_0)(a0) +; CHECK-NEXT: lui a0, %hi(.LCPI18_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI18_0)(a0) ; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 @@ -399,8 +551,8 @@ declare @llvm.ceil.nxv2f64() define @ceil_nxv4f64( %x) { ; CHECK-LABEL: ceil_nxv4f64: ; CHECK: # %bb.0: -; 
CHECK-NEXT: lui a0, %hi(.LCPI13_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI13_0)(a0) +; CHECK-NEXT: lui a0, %hi(.LCPI19_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI19_0)(a0) ; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 @@ -419,8 +571,8 @@ declare @llvm.ceil.nxv4f64() define @ceil_nxv8f64( %x) { ; CHECK-LABEL: ceil_nxv8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI14_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI14_0)(a0) +; CHECK-NEXT: lui a0, %hi(.LCPI20_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI20_0)(a0) ; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v16, v8 ; CHECK-NEXT: vmflt.vf v0, v16, fa5 diff --git a/llvm/test/CodeGen/RISCV/rvv/ffloor-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/ffloor-sdnode.ll index 97d84e91744038..00e21ce8992b0c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/ffloor-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/ffloor-sdnode.ll @@ -1,20 +1,178 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFHMIN -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFHMIN +define @floor_nxv1bf16( %x) { +; CHECK-LABEL: floor_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfabs.v v8, v9 +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t +; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu +; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %a = call @llvm.floor.nxv1bf16( %x) + ret %a +} +declare @llvm.floor.nxv1bf16() + +define @floor_nxv2bf16( %x) { +; CHECK-LABEL: floor_nxv2bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vfabs.v v8, v9 +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t +; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: 
vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %a = call @llvm.floor.nxv2bf16( %x) + ret %a +} +declare @llvm.floor.nxv2bf16() + +define @floor_nxv4bf16( %x) { +; CHECK-LABEL: floor_nxv4bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfabs.v v8, v10 +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vfcvt.x.f.v v8, v10, v0.t +; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; CHECK-NEXT: vfsgnj.vv v10, v8, v10, v0.t +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: ret + %a = call @llvm.floor.nxv4bf16( %x) + ret %a +} +declare @llvm.floor.nxv4bf16() + +define @floor_nxv8bf16( %x) { +; CHECK-LABEL: floor_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vfabs.v v8, v12 +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vfcvt.x.f.v v8, v12, v0.t +; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu +; CHECK-NEXT: vfsgnj.vv v12, v8, v12, v0.t +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: ret + %a = call @llvm.floor.nxv8bf16( %x) + ret %a +} +declare @llvm.floor.nxv8bf16() + +define @floor_nxv16bf16( %x) { +; CHECK-LABEL: floor_nxv16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfabs.v v8, v16 +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vfcvt.x.f.v v8, v16, v0.t +; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vfsgnj.vv v16, v8, v16, v0.t +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: ret + %a = call @llvm.floor.nxv16bf16( %x) + ret %a +} +declare @llvm.floor.nxv16bf16() + +define @floor_nxv32bf16( %x) { +; CHECK-LABEL: floor_nxv32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfabs.v v24, v16 +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vmflt.vf v0, v24, fa5 +; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t +; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfabs.v v24, v16 +; CHECK-NEXT: vmflt.vf v0, v24, fa5 +; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t +; CHECK-NEXT: fsrm a0 
+; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: ret + %a = call @llvm.floor.nxv32bf16( %x) + ret %a +} +declare @llvm.floor.nxv32bf16() + define @floor_nxv1f16( %x) { ; ZVFH-LABEL: floor_nxv1f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI0_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI0_0)(a0) +; ZVFH-NEXT: lui a0, %hi(.LCPI6_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI6_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 @@ -52,8 +210,8 @@ declare @llvm.floor.nxv1f16() define @floor_nxv2f16( %x) { ; ZVFH-LABEL: floor_nxv2f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI1_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI1_0)(a0) +; ZVFH-NEXT: lui a0, %hi(.LCPI7_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI7_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 @@ -91,8 +249,8 @@ declare @llvm.floor.nxv2f16() define @floor_nxv4f16( %x) { ; ZVFH-LABEL: floor_nxv4f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI2_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI2_0)(a0) +; ZVFH-NEXT: lui a0, %hi(.LCPI8_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI8_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 @@ -130,8 +288,8 @@ declare @llvm.floor.nxv4f16() define @floor_nxv8f16( %x) { ; ZVFH-LABEL: floor_nxv8f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI3_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI3_0)(a0) +; ZVFH-NEXT: lui a0, %hi(.LCPI9_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI9_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; ZVFH-NEXT: vfabs.v v10, v8 ; ZVFH-NEXT: vmflt.vf v0, v10, fa5 @@ -169,8 +327,8 @@ declare @llvm.floor.nxv8f16() define @floor_nxv16f16( %x) { ; ZVFH-LABEL: floor_nxv16f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI4_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI4_0)(a0) +; ZVFH-NEXT: lui a0, %hi(.LCPI10_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI10_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZVFH-NEXT: vfabs.v v12, v8 ; ZVFH-NEXT: vmflt.vf v0, v12, fa5 @@ -208,8 +366,8 @@ declare @llvm.floor.nxv16f16() define @floor_nxv32f16( %x) { ; ZVFH-LABEL: floor_nxv32f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI5_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI5_0)(a0) +; ZVFH-NEXT: lui a0, %hi(.LCPI11_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI11_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, m8, ta, ma ; ZVFH-NEXT: vfabs.v v16, v8 ; ZVFH-NEXT: vmflt.vf v0, v16, fa5 @@ -359,8 +517,8 @@ declare @llvm.floor.nxv16f32() define @floor_nxv1f64( %x) { ; CHECK-LABEL: floor_nxv1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI11_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI11_0)(a0) +; CHECK-NEXT: lui a0, %hi(.LCPI17_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI17_0)(a0) ; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 @@ -379,8 +537,8 @@ declare @llvm.floor.nxv1f64() define @floor_nxv2f64( %x) { ; CHECK-LABEL: floor_nxv2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI12_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI12_0)(a0) +; CHECK-NEXT: lui a0, %hi(.LCPI18_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI18_0)(a0) ; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 @@ -399,8 +557,8 @@ declare @llvm.floor.nxv2f64() define @floor_nxv4f64( %x) { ; CHECK-LABEL: floor_nxv4f64: ; CHECK: # 
%bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI13_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI13_0)(a0) +; CHECK-NEXT: lui a0, %hi(.LCPI19_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI19_0)(a0) ; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 @@ -419,8 +577,8 @@ declare @llvm.floor.nxv4f64() define @floor_nxv8f64( %x) { ; CHECK-LABEL: floor_nxv8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI14_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI14_0)(a0) +; CHECK-NEXT: lui a0, %hi(.LCPI20_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI20_0)(a0) ; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v16, v8 ; CHECK-NEXT: vmflt.vf v0, v16, fa5 diff --git a/llvm/test/CodeGen/RISCV/rvv/floor-vp.ll b/llvm/test/CodeGen/RISCV/rvv/floor-vp.ll index 45334ea8648f74..03d1fb6c8d297f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/floor-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/floor-vp.ll @@ -1,22 +1,428 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s \ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFHMIN -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFHMIN +declare @llvm.vp.floor.nxv1bf16(, , i32) + +define @vp_floor_nxv1bf16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_floor_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vfabs.v v8, v9, v0.t +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu +; CHECK-NEXT: vmflt.vf v0, v8, fa5, v0.t +; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t +; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu +; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %v = call @llvm.vp.floor.nxv1bf16( %va, %m, i32 %evl) + ret %v +} + +define @vp_floor_nxv1bf16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_floor_nxv1bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v 
v9, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vfabs.v v8, v9 +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t +; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu +; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %v = call @llvm.vp.floor.nxv1bf16( %va, splat (i1 true), i32 %evl) + ret %v +} + +declare @llvm.vp.floor.nxv2bf16(, , i32) + +define @vp_floor_nxv2bf16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_floor_nxv2bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfabs.v v8, v9, v0.t +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; CHECK-NEXT: vmflt.vf v0, v8, fa5, v0.t +; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t +; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %v = call @llvm.vp.floor.nxv2bf16( %va, %m, i32 %evl) + ret %v +} + +define @vp_floor_nxv2bf16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_floor_nxv2bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfabs.v v8, v9 +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t +; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %v = call @llvm.vp.floor.nxv2bf16( %va, splat (i1 true), i32 %evl) + ret %v +} + +declare @llvm.vp.floor.nxv4bf16(, , i32) + +define @vp_floor_nxv4bf16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_floor_nxv4bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v9, v0 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vfabs.v v12, v10, v0.t +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; CHECK-NEXT: vmflt.vf v9, v12, fa5, v0.t +; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfcvt.x.f.v v12, v10, v0.t +; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; CHECK-NEXT: vfsgnj.vv v10, v12, v10, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: ret + %v = call @llvm.vp.floor.nxv4bf16( %va, %m, i32 %evl) + ret %v +} + +define @vp_floor_nxv4bf16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_floor_nxv4bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, 
v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vfabs.v v8, v10 +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vfcvt.x.f.v v8, v10, v0.t +; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; CHECK-NEXT: vfsgnj.vv v10, v8, v10, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: ret + %v = call @llvm.vp.floor.nxv4bf16( %va, splat (i1 true), i32 %evl) + ret %v +} + +declare @llvm.vp.floor.nxv8bf16(, , i32) + +define @vp_floor_nxv8bf16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_floor_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v10, v0 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vfabs.v v16, v12, v0.t +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu +; CHECK-NEXT: vmflt.vf v10, v16, fa5, v0.t +; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vfcvt.x.f.v v16, v12, v0.t +; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu +; CHECK-NEXT: vfsgnj.vv v12, v16, v12, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: ret + %v = call @llvm.vp.floor.nxv8bf16( %va, %m, i32 %evl) + ret %v +} + +define @vp_floor_nxv8bf16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_floor_nxv8bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vfabs.v v8, v12 +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vfcvt.x.f.v v8, v12, v0.t +; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu +; CHECK-NEXT: vfsgnj.vv v12, v8, v12, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: ret + %v = call @llvm.vp.floor.nxv8bf16( %va, splat (i1 true), i32 %evl) + ret %v +} + +declare @llvm.vp.floor.nxv16bf16(, , i32) + +define @vp_floor_nxv16bf16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_floor_nxv16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v12, v0 +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vfabs.v v24, v16, v0.t +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vmflt.vf v12, v24, fa5, v0.t +; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vmv1r.v v0, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t +; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: ret + %v = call @llvm.vp.floor.nxv16bf16( %va, %m, i32 %evl) + ret %v +} + +define @vp_floor_nxv16bf16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_floor_nxv16bf16_unmasked: +; CHECK: # %bb.0: +; 
CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vfabs.v v8, v16 +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vfcvt.x.f.v v8, v16, v0.t +; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vfsgnj.vv v16, v8, v16, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: ret + %v = call @llvm.vp.floor.nxv16bf16( %va, splat (i1 true), i32 %evl) + ret %v +} + +declare @llvm.vp.floor.nxv32bf16(, , i32) + +define @vp_floor_nxv32bf16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_floor_nxv32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a1, a2, 1 +; CHECK-NEXT: sub a3, a0, a1 +; CHECK-NEXT: sltu a4, a0, a3 +; CHECK-NEXT: addi a4, a4, -1 +; CHECK-NEXT: and a3, a4, a3 +; CHECK-NEXT: srli a2, a2, 2 +; CHECK-NEXT: vmv1r.v v16, v0 +; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v17, v0, a2 +; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: vmv1r.v v0, v17 +; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; CHECK-NEXT: vfabs.v v8, v24, v0.t +; CHECK-NEXT: lui a2, 307200 +; CHECK-NEXT: fmv.w.x fa5, a2 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vmflt.vf v17, v8, fa5, v0.t +; CHECK-NEXT: fsrmi a2, 2 +; CHECK-NEXT: vmv1r.v v0, v17 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfcvt.x.f.v v8, v24, v0.t +; CHECK-NEXT: fsrm a2 +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vfsgnj.vv v24, v8, v24, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24 +; CHECK-NEXT: bltu a0, a1, .LBB10_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a0, a1 +; CHECK-NEXT: .LBB10_2: +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v0 +; CHECK-NEXT: vmv1r.v v8, v16 +; CHECK-NEXT: vmv1r.v v0, v16 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vfabs.v v16, v24, v0.t +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vmflt.vf v8, v16, fa5, v0.t +; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfcvt.x.f.v v16, v24, v0.t +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vfsgnj.vv v24, v16, v24, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24 +; CHECK-NEXT: 
csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %v = call @llvm.vp.floor.nxv32bf16( %va, %m, i32 %evl) + ret %v +} + +define @vp_floor_nxv32bf16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_floor_nxv32bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a1, a2, 1 +; CHECK-NEXT: sub a3, a0, a1 +; CHECK-NEXT: sltu a4, a0, a3 +; CHECK-NEXT: addi a4, a4, -1 +; CHECK-NEXT: and a3, a4, a3 +; CHECK-NEXT: srli a2, a2, 2 +; CHECK-NEXT: vsetvli a4, zero, e8, m4, ta, ma +; CHECK-NEXT: vmset.m v16 +; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v16, v16, a2 +; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: vmv1r.v v0, v16 +; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; CHECK-NEXT: vfabs.v v8, v24, v0.t +; CHECK-NEXT: lui a2, 307200 +; CHECK-NEXT: fmv.w.x fa5, a2 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vmflt.vf v16, v8, fa5, v0.t +; CHECK-NEXT: fsrmi a2, 2 +; CHECK-NEXT: vmv1r.v v0, v16 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfcvt.x.f.v v8, v24, v0.t +; CHECK-NEXT: fsrm a2 +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vfsgnj.vv v24, v8, v24, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24 +; CHECK-NEXT: bltu a0, a1, .LBB11_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a0, a1 +; CHECK-NEXT: .LBB11_2: +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vfabs.v v24, v16 +; CHECK-NEXT: vmflt.vf v0, v24, fa5 +; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t +; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %v = call @llvm.vp.floor.nxv32bf16( %va, splat (i1 true), i32 %evl) + ret %v +} declare @llvm.vp.floor.nxv1f16(, , i32) define @vp_floor_nxv1f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_floor_nxv1f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI0_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI0_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI12_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI12_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t ; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, mu @@ -57,8 +463,8 @@ define @vp_floor_nxv1f16( %va, @vp_floor_nxv1f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_floor_nxv1f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI1_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI1_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI13_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI13_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; 
ZVFH-NEXT: vfabs.v v9, v8 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 @@ -97,8 +503,8 @@ declare @llvm.vp.floor.nxv2f16(, @vp_floor_nxv2f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_floor_nxv2f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI2_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI2_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI14_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI14_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t ; ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, mu @@ -139,8 +545,8 @@ define @vp_floor_nxv2f16( %va, @vp_floor_nxv2f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_floor_nxv2f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI3_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI3_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI15_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI15_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 @@ -179,8 +585,8 @@ declare @llvm.vp.floor.nxv4f16(, @vp_floor_nxv4f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_floor_nxv4f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI4_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI4_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI16_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI16_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t ; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, mu @@ -223,8 +629,8 @@ define @vp_floor_nxv4f16( %va, @vp_floor_nxv4f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_floor_nxv4f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI5_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI5_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI17_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI17_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 @@ -263,8 +669,8 @@ declare @llvm.vp.floor.nxv8f16(, @vp_floor_nxv8f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_floor_nxv8f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI6_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI6_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI18_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI18_0)(a1) ; ZVFH-NEXT: vmv1r.v v10, v0 ; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFH-NEXT: vfabs.v v12, v8, v0.t @@ -309,8 +715,8 @@ define @vp_floor_nxv8f16( %va, @vp_floor_nxv8f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_floor_nxv8f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI7_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI7_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI19_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI19_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFH-NEXT: vfabs.v v10, v8 ; ZVFH-NEXT: vmflt.vf v0, v10, fa5 @@ -349,8 +755,8 @@ declare @llvm.vp.floor.nxv16f16(, @vp_floor_nxv16f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_floor_nxv16f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI8_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI8_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI20_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI20_0)(a1) ; ZVFH-NEXT: vmv1r.v v12, v0 ; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZVFH-NEXT: vfabs.v v16, v8, v0.t @@ -395,8 +801,8 @@ define @vp_floor_nxv16f16( %va, @vp_floor_nxv16f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_floor_nxv16f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI9_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI9_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI21_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI21_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZVFH-NEXT: vfabs.v v12, v8 ; ZVFH-NEXT: vmflt.vf v0, v12, fa5 @@ -435,8 +841,8 @@ declare @llvm.vp.floor.nxv32f16(, 
@vp_floor_nxv32f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_floor_nxv32f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI10_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI10_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI22_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI22_0)(a1) ; ZVFH-NEXT: vmv1r.v v16, v0 ; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma ; ZVFH-NEXT: vfabs.v v24, v8, v0.t @@ -491,10 +897,10 @@ define @vp_floor_nxv32f16( %va, @vp_floor_nxv32f16( %va, @vp_floor_nxv32f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_floor_nxv32f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI11_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI11_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI23_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI23_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma ; ZVFH-NEXT: vfabs.v v16, v8 ; ZVFH-NEXT: vmflt.vf v0, v16, fa5 @@ -586,10 +992,10 @@ define @vp_floor_nxv32f16_unmasked( %va ; ZVFHMIN-NEXT: vfsgnj.vv v24, v8, v24, v0.t ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24 -; ZVFHMIN-NEXT: bltu a0, a1, .LBB11_2 +; ZVFHMIN-NEXT: bltu a0, a1, .LBB23_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a0, a1 -; ZVFHMIN-NEXT: .LBB11_2: +; ZVFHMIN-NEXT: .LBB23_2: ; ZVFHMIN-NEXT: addi a1, sp, 16 ; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24 @@ -834,8 +1240,8 @@ declare @llvm.vp.floor.nxv1f64(, @vp_floor_nxv1f64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_floor_nxv1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI22_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI22_0)(a1) +; CHECK-NEXT: lui a1, %hi(.LCPI34_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI34_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; CHECK-NEXT: vfabs.v v9, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu @@ -855,8 +1261,8 @@ define @vp_floor_nxv1f64( %va, @vp_floor_nxv1f64_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_floor_nxv1f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI23_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI23_0)(a1) +; CHECK-NEXT: lui a1, %hi(.LCPI35_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI35_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 @@ -876,8 +1282,8 @@ declare @llvm.vp.floor.nxv2f64(, @vp_floor_nxv2f64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_floor_nxv2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI24_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI24_0)(a1) +; CHECK-NEXT: lui a1, %hi(.LCPI36_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI36_0)(a1) ; CHECK-NEXT: vmv1r.v v10, v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; CHECK-NEXT: vfabs.v v12, v8, v0.t @@ -899,8 +1305,8 @@ define @vp_floor_nxv2f64( %va, @vp_floor_nxv2f64_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_floor_nxv2f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI25_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI25_0)(a1) +; CHECK-NEXT: lui a1, %hi(.LCPI37_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI37_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 @@ -920,8 +1326,8 @@ declare @llvm.vp.floor.nxv4f64(, @vp_floor_nxv4f64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_floor_nxv4f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI26_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI26_0)(a1) +; CHECK-NEXT: lui a1, %hi(.LCPI38_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI38_0)(a1) ; CHECK-NEXT: vmv1r.v v12, v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; CHECK-NEXT: vfabs.v v16, v8, v0.t @@ -943,8 +1349,8 @@ define 
@vp_floor_nxv4f64( %va, @vp_floor_nxv4f64_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_floor_nxv4f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI27_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI27_0)(a1) +; CHECK-NEXT: lui a1, %hi(.LCPI39_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI39_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 @@ -964,8 +1370,8 @@ declare @llvm.vp.floor.nxv7f64(, @vp_floor_nxv7f64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_floor_nxv7f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI28_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI28_0)(a1) +; CHECK-NEXT: lui a1, %hi(.LCPI40_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI40_0)(a1) ; CHECK-NEXT: vmv1r.v v16, v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v8, v0.t @@ -987,8 +1393,8 @@ define @vp_floor_nxv7f64( %va, @vp_floor_nxv7f64_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_floor_nxv7f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI29_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI29_0)(a1) +; CHECK-NEXT: lui a1, %hi(.LCPI41_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI41_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v16, v8 ; CHECK-NEXT: vmflt.vf v0, v16, fa5 @@ -1008,8 +1414,8 @@ declare @llvm.vp.floor.nxv8f64(, @vp_floor_nxv8f64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_floor_nxv8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI30_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI30_0)(a1) +; CHECK-NEXT: lui a1, %hi(.LCPI42_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI42_0)(a1) ; CHECK-NEXT: vmv1r.v v16, v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v8, v0.t @@ -1031,8 +1437,8 @@ define @vp_floor_nxv8f64( %va, @vp_floor_nxv8f64_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_floor_nxv8f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI31_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI31_0)(a1) +; CHECK-NEXT: lui a1, %hi(.LCPI43_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI43_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v16, v8 ; CHECK-NEXT: vmflt.vf v0, v16, fa5 @@ -1065,8 +1471,8 @@ define @vp_floor_nxv16f64( %va, @vp_floor_nxv16f64( %va, @vp_floor_nxv16f64_unmasked( ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: sub a2, a0, a1 -; CHECK-NEXT: lui a3, %hi(.LCPI33_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI33_0)(a3) +; CHECK-NEXT: lui a3, %hi(.LCPI45_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI45_0)(a3) ; CHECK-NEXT: sltu a3, a0, a2 ; CHECK-NEXT: addi a3, a3, -1 ; CHECK-NEXT: and a2, a3, a2 @@ -1132,10 +1538,10 @@ define @vp_floor_nxv16f64_unmasked( ; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t -; CHECK-NEXT: bltu a0, a1, .LBB33_2 +; CHECK-NEXT: bltu a0, a1, .LBB45_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a0, a1 -; CHECK-NEXT: .LBB33_2: +; CHECK-NEXT: .LBB45_2: ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v8 ; CHECK-NEXT: vmflt.vf v0, v24, fa5 diff --git a/llvm/test/CodeGen/RISCV/rvv/fmaximum-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fmaximum-sdnode.ll index 05896d8ef6ffdf..d8c3ab27cfad12 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fmaximum-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fmaximum-sdnode.ll @@ -1,21 +1,200 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 
-mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs -early-live-intervals < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs -early-live-intervals < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+v,+m -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+zfbfmin,+zvfbfmin,+v,+m -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+v,+m -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+zfbfmin,+zvfbfmin,+v,+m -target-abi=ilp32d \ ; RUN: -verify-machineinstrs -early-live-intervals < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+v,+m -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+zfbfmin,+zvfbfmin,+v,+m -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+v,+m -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+zfbfmin,+zvfbfmin,+v,+m -target-abi=lp64d \ ; RUN: -verify-machineinstrs -early-live-intervals < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN +declare @llvm.maximum.nxv1bf16(, ) + +define @vfmax_nxv1bf16_vv( %a, %b) { +; CHECK-LABEL: vfmax_nxv1bf16_vv: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v9, v9 +; CHECK-NEXT: vmfeq.vv v8, v10, v10 +; CHECK-NEXT: vmerge.vvm v11, v9, v10, v0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vmerge.vvm v8, v10, v9, v0 +; CHECK-NEXT: vfmax.vv v9, v8, v11 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %v = call @llvm.maximum.nxv1bf16( %a, %b) + ret %v +} + +declare @llvm.maximum.nxv2bf16(, ) + +define @vfmax_nxv2bf16_vv( %a, %b) { +; CHECK-LABEL: vfmax_nxv2bf16_vv: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v9, v9 +; CHECK-NEXT: vmfeq.vv v8, v10, v10 +; CHECK-NEXT: vmerge.vvm v11, v9, v10, v0 +; CHECK-NEXT: vmv.v.v v0, v8 +; CHECK-NEXT: vmerge.vvm v8, v10, v9, v0 +; CHECK-NEXT: vfmax.vv v9, v8, v11 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %v = call @llvm.maximum.nxv2bf16( %a, %b) + ret %v +} + +declare @llvm.maximum.nxv4bf16(, ) + +define @vfmax_nxv4bf16_vv( %a, %b) { +; CHECK-LABEL: vfmax_nxv4bf16_vv: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, 
+; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT:    vmfeq.vv v0, v12, v12
+; CHECK-NEXT:    vmfeq.vv v8, v10, v10
+; CHECK-NEXT:    vmerge.vvm v14, v12, v10, v0
+; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmerge.vvm v8, v10, v12, v0
+; CHECK-NEXT:    vfmax.vv v10, v8, v14
+; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v10
+; CHECK-NEXT:    ret
+  %v = call <vscale x 4 x bfloat> @llvm.maximum.nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b)
+  ret <vscale x 4 x bfloat> %v
+}
+
+declare <vscale x 8 x bfloat> @llvm.maximum.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+
+define <vscale x 8 x bfloat> @vfmax_nxv8bf16_vv(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) {
+; CHECK-LABEL: vfmax_nxv8bf16_vv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT:    vmfeq.vv v0, v16, v16
+; CHECK-NEXT:    vmfeq.vv v8, v12, v12
+; CHECK-NEXT:    vmerge.vvm v20, v16, v12, v0
+; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmerge.vvm v8, v12, v16, v0
+; CHECK-NEXT:    vfmax.vv v12, v8, v20
+; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT:    ret
+  %v = call <vscale x 8 x bfloat> @llvm.maximum.nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b)
+  ret <vscale x 8 x bfloat> %v
+}
+
+declare <vscale x 16 x bfloat> @llvm.maximum.nxv16bf16(<vscale x 16 x bfloat>, <vscale x 16 x bfloat>)
+
+define <vscale x 16 x bfloat> @vfmax_nxv16bf16_vv(<vscale x 16 x bfloat> %a, <vscale x 16 x bfloat> %b) {
+; CHECK-LABEL: vfmax_nxv16bf16_vv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v8
+; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT:    vmfeq.vv v0, v24, v24
+; CHECK-NEXT:    vmfeq.vv v7, v16, v16
+; CHECK-NEXT:    vmerge.vvm v8, v24, v16, v0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
+; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfmax.vv v16, v8, v16
+; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+  %v = call <vscale x 16 x bfloat> @llvm.maximum.nxv16bf16(<vscale x 16 x bfloat> %a, <vscale x 16 x bfloat> %b)
+  ret <vscale x 16 x bfloat> %v
+}
+
+declare <vscale x 32 x bfloat> @llvm.maximum.nxv32bf16(<vscale x 32 x bfloat>, <vscale x 32 x bfloat>)
+
+define <vscale x 32 x bfloat> @vfmax_nxv32bf16_vv(<vscale x 32 x bfloat> %a, <vscale x 32 x bfloat> %b) nounwind {
+; CHECK-LABEL: vfmax_nxv32bf16_vv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vmv8r.v v0, v8
+; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v16
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v0
+; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT:    vmfeq.vv v0, v8, v8
+; CHECK-NEXT:    vmfeq.vv v3, v24, v24
+; CHECK-NEXT:    vmerge.vvm v16, v8, v24, v0
+; CHECK-NEXT:    vmv1r.v v0, v3
+; CHECK-NEXT:    vmerge.vvm v8, v24, v8, v0
+; CHECK-NEXT:    vfmax.vv v8, v8, v16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v20
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v4
+; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT:    vmfeq.vv v0, v16, v16
+; CHECK-NEXT:    vmfeq.vv v7, v8, v8
+; CHECK-NEXT:    vmerge.vvm v24, v16, v8, v0
+; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vmerge.vvm v8, v8, v16, v0
+; CHECK-NEXT:    vfmax.vv v16, v8, v24
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v24
+; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+  %v = call <vscale x 32 x bfloat> @llvm.maximum.nxv32bf16(<vscale x 32 x bfloat> %a, <vscale x 32 x bfloat> %b)
+  ret <vscale x 32 x bfloat> %v
+}
+
 declare <vscale x 1 x half> @llvm.maximum.nxv1f16(<vscale x 1 x half>, <vscale x 1 x half>)
 
 define <vscale x 1 x half> @vfmax_nxv1f16_vv(<vscale x 1 x half> %a, <vscale x 1 x half> %b) {
diff --git a/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll
index ab07fff59b218f..320db35770cb82 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll
@@ -1,13 +1,541 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=ilp32d \
-; RUN:   -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=lp64d \
-; RUN:   -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+v,+m -target-abi=ilp32d \
-; RUN:   -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+v,+m -target-abi=lp64d \
-; RUN:   -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v,+m \
+; RUN:   -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
+; RUN:   --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v,+m \
+; RUN:   -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
+; RUN:   --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv32 \
+; RUN:   -mattr=+d,+zfh,+zvfhmin,+zfbfmin,+zvfbfmin,+v,+m \
+; RUN:   -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
+; RUN:   --check-prefixes=CHECK,ZVFHMIN
+; RUN: llc -mtriple=riscv64 \
+; RUN:   -mattr=+d,+zfh,+zvfhmin,+zfbfmin,+zvfbfmin,+v,+m \
+; RUN:   -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
+; RUN:   --check-prefixes=CHECK,ZVFHMIN
+declare <vscale x 1 x bfloat> @llvm.vp.maximum.nxv1bf16(<vscale x 1 x bfloat>, <vscale x 1 x bfloat>, <vscale x 1 x i1>, i32)
+
+define <vscale x 1 x bfloat> @vfmax_vv_nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfmax_vv_nxv1bf16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmv1r.v v10, v0
+; CHECK-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v11, v8
+; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT:    vmfeq.vv v0, v11, v11, v0.t
+; CHECK-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v9
+; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT:    vmerge.vvm v9, v11, v8, v0
+; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmfeq.vv v0, v8, v8, v0.t
+; CHECK-NEXT:    vmerge.vvm v8, v8, v11, v0
+; CHECK-NEXT:
vmv1r.v v0, v10 +; CHECK-NEXT: vfmax.vv v9, v8, v9, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %v = call @llvm.vp.maximum.nxv1bf16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vfmax_vv_nxv1bf16_unmasked( %va, %vb, i32 zeroext %evl) { +; CHECK-LABEL: vfmax_vv_nxv1bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v10, v10 +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vmfeq.vv v8, v11, v11 +; CHECK-NEXT: vmerge.vvm v9, v10, v11, v0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vmerge.vvm v8, v11, v10, v0 +; CHECK-NEXT: vfmax.vv v9, v8, v9 +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %v = call @llvm.vp.maximum.nxv1bf16( %va, %vb, splat (i1 true), i32 %evl) + ret %v +} + +declare @llvm.vp.maximum.nxv2bf16(, , , i32) + +define @vfmax_vv_nxv2bf16( %va, %vb, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfmax_vv_nxv2bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v10, v0 +; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v11, v11, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vmerge.vvm v9, v11, v8, v0 +; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vmfeq.vv v0, v8, v8, v0.t +; CHECK-NEXT: vmerge.vvm v8, v8, v11, v0 +; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vfmax.vv v9, v8, v9, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %v = call @llvm.vp.maximum.nxv2bf16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vfmax_vv_nxv2bf16_unmasked( %va, %vb, i32 zeroext %evl) { +; CHECK-LABEL: vfmax_vv_nxv2bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v10, v10 +; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vmfeq.vv v8, v11, v11 +; CHECK-NEXT: vmerge.vvm v9, v10, v11, v0 +; CHECK-NEXT: vmv.v.v v0, v8 +; CHECK-NEXT: vmerge.vvm v8, v11, v10, v0 +; CHECK-NEXT: vfmax.vv v9, v8, v9 +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %v = call @llvm.vp.maximum.nxv2bf16( %va, %vb, splat (i1 true), i32 %evl) + ret %v +} + +declare @llvm.vp.maximum.nxv4bf16(, , , i32) + +define @vfmax_vv_nxv4bf16( %va, %vb, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfmax_vv_nxv4bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v10, v0 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vmfeq.vv v8, v12, v12, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v9 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vmerge.vvm v16, v12, v14, v0 +; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vmfeq.vv v8, v14, v14, v0.t +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vmerge.vvm 
v8, v14, v12, v0 +; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vfmax.vv v10, v8, v16, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: ret + %v = call @llvm.vp.maximum.nxv4bf16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vfmax_vv_nxv4bf16_unmasked( %va, %vb, i32 zeroext %evl) { +; CHECK-LABEL: vfmax_vv_nxv4bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v10, v10 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vmfeq.vv v8, v12, v12 +; CHECK-NEXT: vmerge.vvm v14, v10, v12, v0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vmerge.vvm v8, v12, v10, v0 +; CHECK-NEXT: vfmax.vv v10, v8, v14 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: ret + %v = call @llvm.vp.maximum.nxv4bf16( %va, %vb, splat (i1 true), i32 %evl) + ret %v +} + +declare @llvm.vp.maximum.nxv8bf16(, , , i32) + +define @vfmax_vv_nxv8bf16( %va, %vb, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfmax_vv_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v12, v0 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vmfeq.vv v8, v16, v16, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v20, v10 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vmerge.vvm v24, v16, v20, v0 +; CHECK-NEXT: vmv1r.v v0, v12 +; CHECK-NEXT: vmfeq.vv v8, v20, v20, v0.t +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vmerge.vvm v8, v20, v16, v0 +; CHECK-NEXT: vmv1r.v v0, v12 +; CHECK-NEXT: vfmax.vv v12, v8, v24, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: ret + %v = call @llvm.vp.maximum.nxv8bf16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vfmax_vv_nxv8bf16_unmasked( %va, %vb, i32 zeroext %evl) { +; CHECK-LABEL: vfmax_vv_nxv8bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v12, v12 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vmfeq.vv v8, v16, v16 +; CHECK-NEXT: vmerge.vvm v20, v12, v16, v0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vmerge.vvm v8, v16, v12, v0 +; CHECK-NEXT: vfmax.vv v12, v8, v20 +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: ret + %v = call @llvm.vp.maximum.nxv8bf16( %va, %vb, splat (i1 true), i32 %evl) + ret %v +} + +declare @llvm.vp.maximum.nxv16bf16(, , , i32) + +define @vfmax_vv_nxv16bf16( %va, %vb, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfmax_vv_nxv16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: vmv1r.v v7, v0 +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 +; CHECK-NEXT: vsetvli zero, a0, 
e32, m8, ta, ma +; CHECK-NEXT: vmfeq.vv v8, v24, v24, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vmv1r.v v0, v7 +; CHECK-NEXT: vmfeq.vv v8, v16, v16, v0.t +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0 +; CHECK-NEXT: vmv1r.v v0, v7 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfmax.vv v16, v8, v16, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %v = call @llvm.vp.maximum.nxv16bf16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vfmax_vv_nxv16bf16_unmasked( %va, %vb, i32 zeroext %evl) { +; CHECK-LABEL: vfmax_vv_nxv16bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v16, v16 +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vmfeq.vv v7, v24, v24 +; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vmv1r.v v0, v7 +; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfmax.vv v16, v8, v16 +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %v = call @llvm.vp.maximum.nxv16bf16( %va, %vb, splat (i1 true), i32 %evl) + ret %v +} + +declare @llvm.vp.maximum.nxv32bf16(, , , i32) + +define @vfmax_vv_nxv32bf16( %va, %vb, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfmax_vv_nxv32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: li a2, 34 +; CHECK-NEXT: mul a1, a1, a2 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x22, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 34 * vlenb +; CHECK-NEXT: vmv1r.v v24, v0 +; CHECK-NEXT: vmv8r.v v0, v8 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a1, a2, 1 +; CHECK-NEXT: sub a3, a0, a1 +; CHECK-NEXT: sltu a4, a0, a3 +; CHECK-NEXT: addi a4, a4, -1 +; CHECK-NEXT: and a3, a4, a3 +; CHECK-NEXT: srli a2, a2, 2 +; CHECK-NEXT: csrr a4, vlenb +; CHECK-NEXT: slli a4, a4, 5 +; CHECK-NEXT: add a4, sp, a4 +; CHECK-NEXT: addi a4, a4, 16 +; CHECK-NEXT: vs1r.v v24, (a4) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v24, a2 +; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: vs1r.v v8, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 4 +; CHECK-NEXT: 
add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: vs8r.v v0, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v4 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; CHECK-NEXT: vmfeq.vv v12, v24, v24, v0.t +; CHECK-NEXT: vmv8r.v v0, v16 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: li a4, 24 +; CHECK-NEXT: mul a2, a2, a4 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v4 +; CHECK-NEXT: vmv1r.v v0, v12 +; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: vl1r.v v8, (a2) # Unknown-size Folded Reload +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vmfeq.vv v12, v16, v16, v0.t +; CHECK-NEXT: vmv1r.v v0, v12 +; CHECK-NEXT: vmerge.vvm v16, v16, v24, v0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload +; CHECK-NEXT: vfmax.vv v16, v16, v24, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: bltu a0, a1, .LBB10_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a0, a1 +; CHECK-NEXT: .LBB10_2: +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 5 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vmfeq.vv v8, v24, v24, v0.t +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: li a2, 24 +; CHECK-NEXT: mul a1, a1, a2 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vmerge.vvm v24, v24, v16, v0 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 24 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 5 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl1r.v v9, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vmfeq.vv v8, v16, v16, v0.t +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vmerge.vvm v16, v16, v24, v0 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 24 +; CHECK-NEXT: 
mul a0, a0, a1 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfmax.vv v16, v16, v24, v0.t +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 34 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %v = call @llvm.vp.maximum.nxv32bf16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vfmax_vv_nxv32bf16_unmasked( %va, %vb, i32 zeroext %evl) { +; CHECK-LABEL: vfmax_vv_nxv32bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 5 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a1, a2, 1 +; CHECK-NEXT: sub a3, a0, a1 +; CHECK-NEXT: sltu a4, a0, a3 +; CHECK-NEXT: addi a4, a4, -1 +; CHECK-NEXT: and a3, a4, a3 +; CHECK-NEXT: srli a2, a2, 2 +; CHECK-NEXT: vsetvli a4, zero, e8, m4, ta, ma +; CHECK-NEXT: vmset.m v24 +; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v7, v24, a2 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 4 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vmv1r.v v0, v7 +; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; CHECK-NEXT: vmfeq.vv v12, v24, v24, v0.t +; CHECK-NEXT: vmv4r.v v8, v16 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: li a4, 24 +; CHECK-NEXT: mul a2, a2, a4 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20 +; CHECK-NEXT: vmv1r.v v0, v12 +; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vmv1r.v v0, v7 +; CHECK-NEXT: vmfeq.vv v12, v24, v24, v0.t +; CHECK-NEXT: vmv1r.v v0, v12 +; CHECK-NEXT: vmerge.vvm v16, v24, v16, v0 +; CHECK-NEXT: vmv1r.v v0, v7 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload +; CHECK-NEXT: vfmax.vv v16, v16, v8, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v4, v16 +; CHECK-NEXT: bltu a0, a1, .LBB11_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a0, a1 +; CHECK-NEXT: .LBB11_2: +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v16 +; CHECK-NEXT: vsetvli 
zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v8, v8 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: li a2, 24 +; CHECK-NEXT: mul a1, a1, a2 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vmfeq.vv v3, v16, v16 +; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0 +; CHECK-NEXT: vmv1r.v v0, v3 +; CHECK-NEXT: vmerge.vvm v16, v16, v8, v0 +; CHECK-NEXT: vfmax.vv v16, v16, v24 +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v0, v16 +; CHECK-NEXT: vmv8r.v v8, v0 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 5 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %v = call @llvm.vp.maximum.nxv32bf16( %va, %vb, splat (i1 true), i32 %evl) + ret %v +} declare @llvm.vp.maximum.nxv1f16(, , , i32) define @vfmax_vv_nxv1f16( %va, %vb, %m, i32 zeroext %evl) { @@ -509,10 +1037,10 @@ define @vfmax_vv_nxv32f16( %va, @vfmax_vv_nxv32f16_unmasked( %va ; ZVFHMIN-NEXT: vfmax.vv v16, v16, v8, v0.t ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v4, v16 -; ZVFHMIN-NEXT: bltu a0, a1, .LBB11_2 +; ZVFHMIN-NEXT: bltu a0, a1, .LBB23_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a0, a1 -; ZVFHMIN-NEXT: .LBB11_2: +; ZVFHMIN-NEXT: .LBB23_2: ; ZVFHMIN-NEXT: csrr a1, vlenb ; ZVFHMIN-NEXT: slli a1, a1, 4 ; ZVFHMIN-NEXT: add a1, sp, a1 @@ -1093,10 +1621,10 @@ define @vfmax_vv_nxv16f64( %va, @vfmax_vv_nxv16f64_unmasked( ; CHECK-NEXT: vfmax.vv v8, v16, v8 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: bltu a2, a1, .LBB29_2 +; CHECK-NEXT: bltu a2, a1, .LBB41_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a2, a1 -; CHECK-NEXT: .LBB29_2: +; CHECK-NEXT: .LBB41_2: ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add a0, sp, a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fminimum-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fminimum-sdnode.ll index e9425939249878..2371840002f40b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fminimum-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fminimum-sdnode.ll @@ -1,21 +1,200 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs -early-live-intervals < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs -early-live-intervals < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+v,+m -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+zfbfmin,+zvfbfmin,+v,+m 
-target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+v,+m -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+zfbfmin,+zvfbfmin,+v,+m -target-abi=ilp32d \ ; RUN: -verify-machineinstrs -early-live-intervals < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+v,+m -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+zfbfmin,+zvfbfmin,+v,+m -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+v,+m -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+zfbfmin,+zvfbfmin,+v,+m -target-abi=lp64d \ ; RUN: -verify-machineinstrs -early-live-intervals < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN +declare @llvm.minimum.nxv1bf16(, ) + +define @vfmin_nxv1bf16_vv( %a, %b) { +; CHECK-LABEL: vfmin_nxv1bf16_vv: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v9, v9 +; CHECK-NEXT: vmfeq.vv v8, v10, v10 +; CHECK-NEXT: vmerge.vvm v11, v9, v10, v0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vmerge.vvm v8, v10, v9, v0 +; CHECK-NEXT: vfmin.vv v9, v8, v11 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %v = call @llvm.minimum.nxv1bf16( %a, %b) + ret %v +} + +declare @llvm.minimum.nxv2bf16(, ) + +define @vfmin_nxv2bf16_vv( %a, %b) { +; CHECK-LABEL: vfmin_nxv2bf16_vv: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v9, v9 +; CHECK-NEXT: vmfeq.vv v8, v10, v10 +; CHECK-NEXT: vmerge.vvm v11, v9, v10, v0 +; CHECK-NEXT: vmv.v.v v0, v8 +; CHECK-NEXT: vmerge.vvm v8, v10, v9, v0 +; CHECK-NEXT: vfmin.vv v9, v8, v11 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %v = call @llvm.minimum.nxv2bf16( %a, %b) + ret %v +} + +declare @llvm.minimum.nxv4bf16(, ) + +define @vfmin_nxv4bf16_vv( %a, %b) { +; CHECK-LABEL: vfmin_nxv4bf16_vv: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v12, v12 +; CHECK-NEXT: vmfeq.vv v8, v10, v10 +; CHECK-NEXT: vmerge.vvm v14, v12, v10, v0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vmerge.vvm v8, v10, v12, v0 +; CHECK-NEXT: vfmin.vv v10, v8, v14 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: ret + %v = call @llvm.minimum.nxv4bf16( %a, %b) + ret %v +} + +declare @llvm.minimum.nxv8bf16(, ) + +define @vfmin_nxv8bf16_vv( %a, %b) { +; CHECK-LABEL: vfmin_nxv8bf16_vv: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v16, v16 +; CHECK-NEXT: vmfeq.vv v8, v12, v12 +; CHECK-NEXT: vmerge.vvm v20, v16, v12, v0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vmerge.vvm v8, v12, v16, v0 +; CHECK-NEXT: vfmin.vv 
v12, v8, v20 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: ret + %v = call @llvm.minimum.nxv8bf16( %a, %b) + ret %v +} + +declare @llvm.minimum.nxv16bf16(, ) + +define @vfmin_nxv16bf16_vv( %a, %b) { +; CHECK-LABEL: vfmin_nxv16bf16_vv: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v24, v24 +; CHECK-NEXT: vmfeq.vv v7, v16, v16 +; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vmv1r.v v0, v7 +; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfmin.vv v16, v8, v16 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %v = call @llvm.minimum.nxv16bf16( %a, %b) + ret %v +} + +declare @llvm.minimum.nxv32bf16(, ) + +define @vfmin_nxv32bf16_vv( %a, %b) nounwind { +; CHECK-LABEL: vfmin_nxv32bf16_vv: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vmv8r.v v0, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v0 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v8, v8 +; CHECK-NEXT: vmfeq.vv v3, v24, v24 +; CHECK-NEXT: vmerge.vvm v16, v8, v24, v0 +; CHECK-NEXT: vmv1r.v v0, v3 +; CHECK-NEXT: vmerge.vvm v8, v24, v8, v0 +; CHECK-NEXT: vfmin.vv v8, v8, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v20 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v4 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v16, v16 +; CHECK-NEXT: vmfeq.vv v7, v8, v8 +; CHECK-NEXT: vmerge.vvm v24, v16, v8, v0 +; CHECK-NEXT: vmv1r.v v0, v7 +; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0 +; CHECK-NEXT: vfmin.vv v16, v8, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24 +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %v = call @llvm.minimum.nxv32bf16( %a, %b) + ret %v +} + declare @llvm.minimum.nxv1f16(, ) define @vfmin_nxv1f16_vv( %a, %b) { diff --git 
a/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll index fc5b11284dab0c..03e3969f9141e8 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll @@ -1,13 +1,541 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+v,+m -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+v,+m -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v,+m \ +; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v,+m \ +; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv32 \ +; RUN: -mattr=+d,+zfh,+zvfhmin,+zfbfmin,+zvfbfmin,+v,+m \ +; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFHMIN +; RUN: llc -mtriple=riscv64 \ +; RUN: -mattr=+d,+zfh,+zvfhmin,+zfbfmin,+zvfbfmin,+v,+m \ +; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFHMIN +declare @llvm.vp.minimum.nxv1bf16(, , , i32) + +define @vfmin_vv_nxv1bf16( %va, %vb, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfmin_vv_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v10, v0 +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v11, v11, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vmerge.vvm v9, v11, v8, v0 +; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vmfeq.vv v0, v8, v8, v0.t +; CHECK-NEXT: vmerge.vvm v8, v8, v11, v0 +; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vfmin.vv v9, v8, v9, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %v = call @llvm.vp.minimum.nxv1bf16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vfmin_vv_nxv1bf16_unmasked( %va, %vb, i32 zeroext %evl) { +; CHECK-LABEL: vfmin_vv_nxv1bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v10, v10 +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vmfeq.vv v8, v11, v11 +; CHECK-NEXT: vmerge.vvm v9, v10, v11, v0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vmerge.vvm v8, v11, v10, v0 +; CHECK-NEXT: vfmin.vv v9, v8, v9 +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %v = call @llvm.vp.minimum.nxv1bf16( %va, %vb, splat 
(i1 true), i32 %evl) + ret %v +} + +declare @llvm.vp.minimum.nxv2bf16(, , , i32) + +define @vfmin_vv_nxv2bf16( %va, %vb, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfmin_vv_nxv2bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v10, v0 +; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v11, v11, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vmerge.vvm v9, v11, v8, v0 +; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vmfeq.vv v0, v8, v8, v0.t +; CHECK-NEXT: vmerge.vvm v8, v8, v11, v0 +; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vfmin.vv v9, v8, v9, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %v = call @llvm.vp.minimum.nxv2bf16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vfmin_vv_nxv2bf16_unmasked( %va, %vb, i32 zeroext %evl) { +; CHECK-LABEL: vfmin_vv_nxv2bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v10, v10 +; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vmfeq.vv v8, v11, v11 +; CHECK-NEXT: vmerge.vvm v9, v10, v11, v0 +; CHECK-NEXT: vmv.v.v v0, v8 +; CHECK-NEXT: vmerge.vvm v8, v11, v10, v0 +; CHECK-NEXT: vfmin.vv v9, v8, v9 +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %v = call @llvm.vp.minimum.nxv2bf16( %va, %vb, splat (i1 true), i32 %evl) + ret %v +} + +declare @llvm.vp.minimum.nxv4bf16(, , , i32) + +define @vfmin_vv_nxv4bf16( %va, %vb, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfmin_vv_nxv4bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v10, v0 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vmfeq.vv v8, v12, v12, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v9 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vmerge.vvm v16, v12, v14, v0 +; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vmfeq.vv v8, v14, v14, v0.t +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vmerge.vvm v8, v14, v12, v0 +; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vfmin.vv v10, v8, v16, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: ret + %v = call @llvm.vp.minimum.nxv4bf16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vfmin_vv_nxv4bf16_unmasked( %va, %vb, i32 zeroext %evl) { +; CHECK-LABEL: vfmin_vv_nxv4bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v10, v10 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vmfeq.vv v8, v12, v12 +; CHECK-NEXT: vmerge.vvm v14, v10, v12, v0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vmerge.vvm v8, v12, v10, v0 +; CHECK-NEXT: vfmin.vv v10, v8, v14 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: ret + %v = call 
@llvm.vp.minimum.nxv4bf16( %va, %vb, splat (i1 true), i32 %evl) + ret %v +} + +declare @llvm.vp.minimum.nxv8bf16(, , , i32) + +define @vfmin_vv_nxv8bf16( %va, %vb, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfmin_vv_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v12, v0 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vmfeq.vv v8, v16, v16, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v20, v10 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vmerge.vvm v24, v16, v20, v0 +; CHECK-NEXT: vmv1r.v v0, v12 +; CHECK-NEXT: vmfeq.vv v8, v20, v20, v0.t +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vmerge.vvm v8, v20, v16, v0 +; CHECK-NEXT: vmv1r.v v0, v12 +; CHECK-NEXT: vfmin.vv v12, v8, v24, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: ret + %v = call @llvm.vp.minimum.nxv8bf16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vfmin_vv_nxv8bf16_unmasked( %va, %vb, i32 zeroext %evl) { +; CHECK-LABEL: vfmin_vv_nxv8bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v12, v12 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vmfeq.vv v8, v16, v16 +; CHECK-NEXT: vmerge.vvm v20, v12, v16, v0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vmerge.vvm v8, v16, v12, v0 +; CHECK-NEXT: vfmin.vv v12, v8, v20 +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: ret + %v = call @llvm.vp.minimum.nxv8bf16( %va, %vb, splat (i1 true), i32 %evl) + ret %v +} + +declare @llvm.vp.minimum.nxv16bf16(, , , i32) + +define @vfmin_vv_nxv16bf16( %va, %vb, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfmin_vv_nxv16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: vmv1r.v v7, v0 +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vmfeq.vv v8, v24, v24, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vmv1r.v v0, v7 +; CHECK-NEXT: vmfeq.vv v8, v16, v16, v0.t +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0 +; CHECK-NEXT: vmv1r.v v0, v7 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfmin.vv v16, v8, v16, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %v = call @llvm.vp.minimum.nxv16bf16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vfmin_vv_nxv16bf16_unmasked( %va, %vb, i32 zeroext %evl) { +; CHECK-LABEL: 
vfmin_vv_nxv16bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v16, v16 +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vmfeq.vv v7, v24, v24 +; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vmv1r.v v0, v7 +; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfmin.vv v16, v8, v16 +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %v = call @llvm.vp.minimum.nxv16bf16( %va, %vb, splat (i1 true), i32 %evl) + ret %v +} + +declare @llvm.vp.minimum.nxv32bf16(, , , i32) + +define @vfmin_vv_nxv32bf16( %va, %vb, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfmin_vv_nxv32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: li a2, 34 +; CHECK-NEXT: mul a1, a1, a2 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x22, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 34 * vlenb +; CHECK-NEXT: vmv1r.v v24, v0 +; CHECK-NEXT: vmv8r.v v0, v8 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a1, a2, 1 +; CHECK-NEXT: sub a3, a0, a1 +; CHECK-NEXT: sltu a4, a0, a3 +; CHECK-NEXT: addi a4, a4, -1 +; CHECK-NEXT: and a3, a4, a3 +; CHECK-NEXT: srli a2, a2, 2 +; CHECK-NEXT: csrr a4, vlenb +; CHECK-NEXT: slli a4, a4, 5 +; CHECK-NEXT: add a4, sp, a4 +; CHECK-NEXT: addi a4, a4, 16 +; CHECK-NEXT: vs1r.v v24, (a4) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v24, a2 +; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: vs1r.v v8, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 4 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: vs8r.v v0, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v4 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; CHECK-NEXT: vmfeq.vv v12, v24, v24, v0.t +; CHECK-NEXT: vmv8r.v v0, v16 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: li a4, 24 +; CHECK-NEXT: mul a2, a2, a4 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v4 +; CHECK-NEXT: vmv1r.v v0, v12 +; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: vl1r.v v8, (a2) # Unknown-size Folded 
Reload +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vmfeq.vv v12, v16, v16, v0.t +; CHECK-NEXT: vmv1r.v v0, v12 +; CHECK-NEXT: vmerge.vvm v16, v16, v24, v0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload +; CHECK-NEXT: vfmin.vv v16, v16, v24, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: bltu a0, a1, .LBB10_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a0, a1 +; CHECK-NEXT: .LBB10_2: +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 5 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vmfeq.vv v8, v24, v24, v0.t +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: li a2, 24 +; CHECK-NEXT: mul a1, a1, a2 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vmerge.vvm v24, v24, v16, v0 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 24 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 5 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl1r.v v9, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vmfeq.vv v8, v16, v16, v0.t +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vmerge.vvm v16, v16, v24, v0 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 24 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfmin.vv v16, v16, v24, v0.t +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 34 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %v = call @llvm.vp.minimum.nxv32bf16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vfmin_vv_nxv32bf16_unmasked( %va, %vb, i32 zeroext %evl) { +; CHECK-LABEL: vfmin_vv_nxv32bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 5 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 
0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a1, a2, 1 +; CHECK-NEXT: sub a3, a0, a1 +; CHECK-NEXT: sltu a4, a0, a3 +; CHECK-NEXT: addi a4, a4, -1 +; CHECK-NEXT: and a3, a4, a3 +; CHECK-NEXT: srli a2, a2, 2 +; CHECK-NEXT: vsetvli a4, zero, e8, m4, ta, ma +; CHECK-NEXT: vmset.m v24 +; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v7, v24, a2 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 4 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vmv1r.v v0, v7 +; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; CHECK-NEXT: vmfeq.vv v12, v24, v24, v0.t +; CHECK-NEXT: vmv4r.v v8, v16 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: li a4, 24 +; CHECK-NEXT: mul a2, a2, a4 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20 +; CHECK-NEXT: vmv1r.v v0, v12 +; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vmv1r.v v0, v7 +; CHECK-NEXT: vmfeq.vv v12, v24, v24, v0.t +; CHECK-NEXT: vmv1r.v v0, v12 +; CHECK-NEXT: vmerge.vvm v16, v24, v16, v0 +; CHECK-NEXT: vmv1r.v v0, v7 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload +; CHECK-NEXT: vfmin.vv v16, v16, v8, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v4, v16 +; CHECK-NEXT: bltu a0, a1, .LBB11_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a0, a1 +; CHECK-NEXT: .LBB11_2: +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v16 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v8, v8 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: li a2, 24 +; CHECK-NEXT: mul a1, a1, a2 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vmfeq.vv v3, v16, v16 +; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0 +; CHECK-NEXT: vmv1r.v v0, v3 +; CHECK-NEXT: vmerge.vvm v16, v16, v8, v0 +; CHECK-NEXT: vfmin.vv v16, v16, v24 +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v0, v16 +; CHECK-NEXT: vmv8r.v v8, v0 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 5 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %v = call @llvm.vp.minimum.nxv32bf16( %va, %vb, splat (i1 true), i32 %evl) + ret %v +} declare @llvm.vp.minimum.nxv1f16(, , , i32) define @vfmin_vv_nxv1f16( %va, %vb, %m, 
i32 zeroext %evl) { @@ -509,10 +1037,10 @@ define @vfmin_vv_nxv32f16( %va, @vfmin_vv_nxv32f16_unmasked( %va ; ZVFHMIN-NEXT: vfmin.vv v16, v16, v8, v0.t ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v4, v16 -; ZVFHMIN-NEXT: bltu a0, a1, .LBB11_2 +; ZVFHMIN-NEXT: bltu a0, a1, .LBB23_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a0, a1 -; ZVFHMIN-NEXT: .LBB11_2: +; ZVFHMIN-NEXT: .LBB23_2: ; ZVFHMIN-NEXT: csrr a1, vlenb ; ZVFHMIN-NEXT: slli a1, a1, 4 ; ZVFHMIN-NEXT: add a1, sp, a1 @@ -1093,10 +1621,10 @@ define @vfmin_vv_nxv16f64( %va, @vfmin_vv_nxv16f64_unmasked( ; CHECK-NEXT: vfmin.vv v8, v16, v8 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: bltu a2, a1, .LBB29_2 +; CHECK-NEXT: bltu a2, a1, .LBB41_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a2, a1 -; CHECK-NEXT: .LBB29_2: +; CHECK-NEXT: .LBB41_2: ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add a0, sp, a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fnearbyint-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fnearbyint-sdnode.ll index 0655b9d099cbb7..9498c65ba9a176 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fnearbyint-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fnearbyint-sdnode.ll @@ -1,20 +1,187 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFHMIN -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFHMIN +define @nearbyint_nxv1bf16( %x) { +; CHECK-LABEL: nearbyint_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfabs.v v8, v9 +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu +; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: fsflags a0 +; CHECK-NEXT: ret + %a = call @llvm.nearbyint.nxv1bf16( %x) + ret %a +} + +define @nearbyint_nxv2bf16( %x) { +; CHECK-LABEL: nearbyint_nxv2bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: 
vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vfabs.v v8, v9 +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: fsflags a0 +; CHECK-NEXT: ret + %a = call @llvm.nearbyint.nxv2bf16( %x) + ret %a +} + +define @nearbyint_nxv4bf16( %x) { +; CHECK-LABEL: nearbyint_nxv4bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfabs.v v8, v10 +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vfcvt.x.f.v v8, v10, v0.t +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; CHECK-NEXT: vfsgnj.vv v10, v8, v10, v0.t +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: fsflags a0 +; CHECK-NEXT: ret + %a = call @llvm.nearbyint.nxv4bf16( %x) + ret %a +} + +define @nearbyint_nxv8bf16( %x) { +; CHECK-LABEL: nearbyint_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vfabs.v v8, v12 +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vfcvt.x.f.v v8, v12, v0.t +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu +; CHECK-NEXT: vfsgnj.vv v12, v8, v12, v0.t +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: fsflags a0 +; CHECK-NEXT: ret + %a = call @llvm.nearbyint.nxv8bf16( %x) + ret %a +} + +define @nearbyint_nxv16bf16( %x) { +; CHECK-LABEL: nearbyint_nxv16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfabs.v v8, v16 +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vfcvt.x.f.v v8, v16, v0.t +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vfsgnj.vv v16, v8, v16, v0.t +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: fsflags a0 +; CHECK-NEXT: ret + %a = call @llvm.nearbyint.nxv16bf16( %x) + ret %a +} + +define @nearbyint_nxv32bf16( %x) { +; CHECK-LABEL: nearbyint_nxv32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfabs.v v24, v16 +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vmflt.vf v0, v24, fa5 +; CHECK-NEXT: frflags a0 +; CHECK-NEXT: 
vfcvt.x.f.v v24, v16, v0.t +; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: fsflags a0 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfabs.v v8, v24 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vfcvt.x.f.v v8, v24, v0.t +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vfsgnj.vv v24, v16, v24, v0.t +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24 +; CHECK-NEXT: fsflags a0 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %a = call @llvm.nearbyint.nxv32bf16( %x) + ret %a +} + define @nearbyint_nxv1f16( %x) { ; ZVFH-LABEL: nearbyint_nxv1f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI0_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI0_0)(a0) +; ZVFH-NEXT: lui a0, %hi(.LCPI6_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI6_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 @@ -52,8 +219,8 @@ declare @llvm.nearbyint.nxv1f16() define @nearbyint_nxv2f16( %x) { ; ZVFH-LABEL: nearbyint_nxv2f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI1_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI1_0)(a0) +; ZVFH-NEXT: lui a0, %hi(.LCPI7_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI7_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 @@ -91,8 +258,8 @@ declare @llvm.nearbyint.nxv2f16() define @nearbyint_nxv4f16( %x) { ; ZVFH-LABEL: nearbyint_nxv4f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI2_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI2_0)(a0) +; ZVFH-NEXT: lui a0, %hi(.LCPI8_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI8_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 @@ -130,8 +297,8 @@ declare @llvm.nearbyint.nxv4f16() define @nearbyint_nxv8f16( %x) { ; ZVFH-LABEL: nearbyint_nxv8f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI3_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI3_0)(a0) +; ZVFH-NEXT: lui a0, %hi(.LCPI9_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI9_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; ZVFH-NEXT: vfabs.v v10, v8 ; ZVFH-NEXT: vmflt.vf v0, v10, fa5 @@ -169,8 +336,8 @@ declare @llvm.nearbyint.nxv8f16() define @nearbyint_nxv16f16( %x) { ; ZVFH-LABEL: nearbyint_nxv16f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI4_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI4_0)(a0) +; ZVFH-NEXT: lui a0, %hi(.LCPI10_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI10_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZVFH-NEXT: vfabs.v v12, v8 ; ZVFH-NEXT: vmflt.vf v0, v12, fa5 @@ -208,8 +375,8 @@ declare @llvm.nearbyint.nxv16f16() define @nearbyint_nxv32f16( %x) { ; ZVFH-LABEL: nearbyint_nxv32f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI5_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI5_0)(a0) +; ZVFH-NEXT: lui a0, %hi(.LCPI11_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI11_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, m8, ta, ma ; ZVFH-NEXT: vfabs.v v16, v8 ; ZVFH-NEXT: 
vmflt.vf v0, v16, fa5 @@ -374,8 +541,8 @@ declare @llvm.nearbyint.nxv16f32() define @nearbyint_nxv1f64( %x) { ; CHECK-LABEL: nearbyint_nxv1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI11_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI11_0)(a0) +; CHECK-NEXT: lui a0, %hi(.LCPI17_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI17_0)(a0) ; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 @@ -394,8 +561,8 @@ declare @llvm.nearbyint.nxv1f64() define @nearbyint_nxv2f64( %x) { ; CHECK-LABEL: nearbyint_nxv2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI12_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI12_0)(a0) +; CHECK-NEXT: lui a0, %hi(.LCPI18_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI18_0)(a0) ; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 @@ -414,8 +581,8 @@ declare @llvm.nearbyint.nxv2f64() define @nearbyint_nxv4f64( %x) { ; CHECK-LABEL: nearbyint_nxv4f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI13_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI13_0)(a0) +; CHECK-NEXT: lui a0, %hi(.LCPI19_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI19_0)(a0) ; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 @@ -434,8 +601,8 @@ declare @llvm.nearbyint.nxv4f64() define @nearbyint_nxv8f64( %x) { ; CHECK-LABEL: nearbyint_nxv8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI14_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI14_0)(a0) +; CHECK-NEXT: lui a0, %hi(.LCPI20_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI20_0)(a0) ; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v16, v8 ; CHECK-NEXT: vmflt.vf v0, v16, fa5 diff --git a/llvm/test/CodeGen/RISCV/rvv/frint-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/frint-sdnode.ll index ca1f72ee4d524b..7fac8949c5517a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/frint-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/frint-sdnode.ll @@ -1,20 +1,160 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFHMIN -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFHMIN +define @rint_nxv1bf16( %x) { +; CHECK-LABEL: rint_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; 
CHECK-NEXT: vfabs.v v8, v9 +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu +; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %a = call @llvm.rint.nxv1bf16( %x) + ret %a +} + +define @rint_nxv2bf16( %x) { +; CHECK-LABEL: rint_nxv2bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vfabs.v v8, v9 +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %a = call @llvm.rint.nxv2bf16( %x) + ret %a +} + +define @rint_nxv4bf16( %x) { +; CHECK-LABEL: rint_nxv4bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfabs.v v8, v10 +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: vfcvt.x.f.v v8, v10, v0.t +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; CHECK-NEXT: vfsgnj.vv v10, v8, v10, v0.t +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: ret + %a = call @llvm.rint.nxv4bf16( %x) + ret %a +} + +define @rint_nxv8bf16( %x) { +; CHECK-LABEL: rint_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vfabs.v v8, v12 +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: vfcvt.x.f.v v8, v12, v0.t +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu +; CHECK-NEXT: vfsgnj.vv v12, v8, v12, v0.t +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: ret + %a = call @llvm.rint.nxv8bf16( %x) + ret %a +} + +define @rint_nxv16bf16( %x) { +; CHECK-LABEL: rint_nxv16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfabs.v v8, v16 +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: vfcvt.x.f.v v8, v16, v0.t +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vfsgnj.vv v16, v8, v16, v0.t +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: ret + %a = call @llvm.rint.nxv16bf16( %x) + ret %a +} + +define @rint_nxv32bf16( %x) { +; CHECK-LABEL: rint_nxv32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfabs.v v24, v16 +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vmflt.vf v0, v24, fa5 +; 
CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t +; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfabs.v v8, v24 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfcvt.x.f.v v16, v24, v0.t +; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vfsgnj.vv v24, v16, v24, v0.t +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24 +; CHECK-NEXT: ret + %a = call @llvm.rint.nxv32bf16( %x) + ret %a +} + define @rint_nxv1f16( %x) { ; ZVFH-LABEL: rint_nxv1f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI0_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI0_0)(a0) +; ZVFH-NEXT: lui a0, %hi(.LCPI6_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI6_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 @@ -48,8 +188,8 @@ declare @llvm.rint.nxv1f16() define @rint_nxv2f16( %x) { ; ZVFH-LABEL: rint_nxv2f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI1_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI1_0)(a0) +; ZVFH-NEXT: lui a0, %hi(.LCPI7_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI7_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 @@ -83,8 +223,8 @@ declare @llvm.rint.nxv2f16() define @rint_nxv4f16( %x) { ; ZVFH-LABEL: rint_nxv4f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI2_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI2_0)(a0) +; ZVFH-NEXT: lui a0, %hi(.LCPI8_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI8_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 @@ -118,8 +258,8 @@ declare @llvm.rint.nxv4f16() define @rint_nxv8f16( %x) { ; ZVFH-LABEL: rint_nxv8f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI3_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI3_0)(a0) +; ZVFH-NEXT: lui a0, %hi(.LCPI9_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI9_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; ZVFH-NEXT: vfabs.v v10, v8 ; ZVFH-NEXT: vmflt.vf v0, v10, fa5 @@ -153,8 +293,8 @@ declare @llvm.rint.nxv8f16() define @rint_nxv16f16( %x) { ; ZVFH-LABEL: rint_nxv16f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI4_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI4_0)(a0) +; ZVFH-NEXT: lui a0, %hi(.LCPI10_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI10_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZVFH-NEXT: vfabs.v v12, v8 ; ZVFH-NEXT: vmflt.vf v0, v12, fa5 @@ -188,8 +328,8 @@ declare @llvm.rint.nxv16f16() define @rint_nxv32f16( %x) { ; ZVFH-LABEL: rint_nxv32f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI5_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI5_0)(a0) +; ZVFH-NEXT: lui a0, %hi(.LCPI11_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI11_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, m8, ta, ma ; ZVFH-NEXT: vfabs.v v16, v8 ; ZVFH-NEXT: vmflt.vf v0, v16, fa5 @@ -325,8 +465,8 @@ declare @llvm.rint.nxv16f32() define @rint_nxv1f64( %x) { ; CHECK-LABEL: rint_nxv1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI11_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI11_0)(a0) +; CHECK-NEXT: lui a0, %hi(.LCPI17_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI17_0)(a0) ; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 @@ 
-343,8 +483,8 @@ declare @llvm.rint.nxv1f64() define @rint_nxv2f64( %x) { ; CHECK-LABEL: rint_nxv2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI12_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI12_0)(a0) +; CHECK-NEXT: lui a0, %hi(.LCPI18_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI18_0)(a0) ; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 @@ -361,8 +501,8 @@ declare @llvm.rint.nxv2f64() define @rint_nxv4f64( %x) { ; CHECK-LABEL: rint_nxv4f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI13_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI13_0)(a0) +; CHECK-NEXT: lui a0, %hi(.LCPI19_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI19_0)(a0) ; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 @@ -379,8 +519,8 @@ declare @llvm.rint.nxv4f64() define @rint_nxv8f64( %x) { ; CHECK-LABEL: rint_nxv8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI14_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI14_0)(a0) +; CHECK-NEXT: lui a0, %hi(.LCPI20_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI20_0)(a0) ; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v16, v8 ; CHECK-NEXT: vmflt.vf v0, v16, fa5 diff --git a/llvm/test/CodeGen/RISCV/rvv/fround-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fround-sdnode.ll index a39abcc6ed0e27..193773b0c89c9a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fround-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fround-sdnode.ll @@ -1,22 +1,174 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFHMIN -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFHMIN ; This file tests the code generation for `llvm.round.*` on scalable vector types.
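+;
+; Illustrative sketch only (the function name @round_bf16_example is
+; hypothetical and not part of the autogenerated checks): `llvm.round.*`
+; rounds each element to the nearest integer with ties away from zero,
+; which is why the bf16 lowerings below select rounding mode 4 (RMM) via
+; `fsrmi a0, 4` around the masked integer convert, after first widening
+; the bf16 source to f32 with vfwcvtbf16.f.f.v:
+;
+; define <vscale x 1 x bfloat> @round_bf16_example(<vscale x 1 x bfloat> %v) {
+;   %r = call <vscale x 1 x bfloat> @llvm.round.nxv1bf16(<vscale x 1 x bfloat> %v)
+;   ret <vscale x 1 x bfloat> %r
+; }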
+define @round_nxv1bf16( %x) { +; CHECK-LABEL: round_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfabs.v v8, v9 +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t +; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu +; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %a = call @llvm.round.nxv1bf16( %x) + ret %a +} + +define @round_nxv2bf16( %x) { +; CHECK-LABEL: round_nxv2bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vfabs.v v8, v9 +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t +; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %a = call @llvm.round.nxv2bf16( %x) + ret %a +} + +define @round_nxv4bf16( %x) { +; CHECK-LABEL: round_nxv4bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfabs.v v8, v10 +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vfcvt.x.f.v v8, v10, v0.t +; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; CHECK-NEXT: vfsgnj.vv v10, v8, v10, v0.t +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: ret + %a = call @llvm.round.nxv4bf16( %x) + ret %a +} + +define @round_nxv8bf16( %x) { +; CHECK-LABEL: round_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vfabs.v v8, v12 +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vfcvt.x.f.v v8, v12, v0.t +; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu +; CHECK-NEXT: vfsgnj.vv v12, v8, v12, v0.t +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: ret + %a = call @llvm.round.nxv8bf16( %x) + ret %a +} + +define @round_nxv16bf16( %x) { +; CHECK-LABEL: round_nxv16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfabs.v v8, v16 +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vfcvt.x.f.v v8, v16, v0.t +; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vfsgnj.vv v16, v8, v16, v0.t +; CHECK-NEXT: vsetvli zero, 
zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: ret + %a = call @llvm.round.nxv16bf16( %x) + ret %a +} + +define @round_nxv32bf16( %x) { +; CHECK-LABEL: round_nxv32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfabs.v v24, v16 +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vmflt.vf v0, v24, fa5 +; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t +; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfabs.v v24, v16 +; CHECK-NEXT: vmflt.vf v0, v24, fa5 +; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t +; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: ret + %a = call @llvm.round.nxv32bf16( %x) + ret %a +} + define @round_nxv1f16( %x) { ; ZVFH-LABEL: round_nxv1f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI0_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI0_0)(a0) +; ZVFH-NEXT: lui a0, %hi(.LCPI6_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI6_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 @@ -54,8 +206,8 @@ declare @llvm.round.nxv1f16() define @round_nxv2f16( %x) { ; ZVFH-LABEL: round_nxv2f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI1_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI1_0)(a0) +; ZVFH-NEXT: lui a0, %hi(.LCPI7_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI7_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 @@ -93,8 +245,8 @@ declare @llvm.round.nxv2f16() define @round_nxv4f16( %x) { ; ZVFH-LABEL: round_nxv4f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI2_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI2_0)(a0) +; ZVFH-NEXT: lui a0, %hi(.LCPI8_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI8_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 @@ -132,8 +284,8 @@ declare @llvm.round.nxv4f16() define @round_nxv8f16( %x) { ; ZVFH-LABEL: round_nxv8f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI3_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI3_0)(a0) +; ZVFH-NEXT: lui a0, %hi(.LCPI9_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI9_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; ZVFH-NEXT: vfabs.v v10, v8 ; ZVFH-NEXT: vmflt.vf v0, v10, fa5 @@ -171,8 +323,8 @@ declare @llvm.round.nxv8f16() define @round_nxv16f16( %x) { ; ZVFH-LABEL: round_nxv16f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI4_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI4_0)(a0) +; ZVFH-NEXT: lui a0, %hi(.LCPI10_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI10_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZVFH-NEXT: vfabs.v v12, v8 ; ZVFH-NEXT: vmflt.vf v0, v12, fa5 @@ -210,8 +362,8 @@ declare @llvm.round.nxv16f16() define @round_nxv32f16( %x) { ; ZVFH-LABEL: round_nxv32f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI5_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI5_0)(a0) +; ZVFH-NEXT: lui a0, %hi(.LCPI11_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI11_0)(a0) ; 
ZVFH-NEXT: vsetvli a0, zero, e16, m8, ta, ma ; ZVFH-NEXT: vfabs.v v16, v8 ; ZVFH-NEXT: vmflt.vf v0, v16, fa5 @@ -361,8 +513,8 @@ declare @llvm.round.nxv16f32() define @round_nxv1f64( %x) { ; CHECK-LABEL: round_nxv1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI11_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI11_0)(a0) +; CHECK-NEXT: lui a0, %hi(.LCPI17_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI17_0)(a0) ; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 @@ -381,8 +533,8 @@ declare @llvm.round.nxv1f64() define @round_nxv2f64( %x) { ; CHECK-LABEL: round_nxv2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI12_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI12_0)(a0) +; CHECK-NEXT: lui a0, %hi(.LCPI18_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI18_0)(a0) ; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 @@ -401,8 +553,8 @@ declare @llvm.round.nxv2f64() define @round_nxv4f64( %x) { ; CHECK-LABEL: round_nxv4f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI13_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI13_0)(a0) +; CHECK-NEXT: lui a0, %hi(.LCPI19_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI19_0)(a0) ; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 @@ -421,8 +573,8 @@ declare @llvm.round.nxv4f64() define @round_nxv8f64( %x) { ; CHECK-LABEL: round_nxv8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI14_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI14_0)(a0) +; CHECK-NEXT: lui a0, %hi(.LCPI20_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI20_0)(a0) ; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v16, v8 ; CHECK-NEXT: vmflt.vf v0, v16, fa5 diff --git a/llvm/test/CodeGen/RISCV/rvv/froundeven-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/froundeven-sdnode.ll index 52ad443bfdebda..052ee2d3a43cf2 100644 --- a/llvm/test/CodeGen/RISCV/rvv/froundeven-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/froundeven-sdnode.ll @@ -1,22 +1,173 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFHMIN -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFHMIN ; This file tests the code generation for `llvm.roundeven.*` on scalable vector types.
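+;
+; Companion sketch (hypothetical name @roundeven_bf16_example; not part of
+; the generated checks): `llvm.roundeven.*` differs from `llvm.round.*` only
+; in tie handling, rounding halfway cases to the nearest even integer, so
+; the lowerings below select rounding mode 0 (RNE) via `fsrmi a0, 0` but
+; otherwise follow the same widen-to-f32 pattern as in fround-sdnode.ll:
+;
+; define <vscale x 4 x bfloat> @roundeven_bf16_example(<vscale x 4 x bfloat> %v) {
+;   %r = call <vscale x 4 x bfloat> @llvm.roundeven.nxv4bf16(<vscale x 4 x bfloat> %v)
+;   ret <vscale x 4 x bfloat> %r
+; }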
+define @roundeven_nxv1bf16( %x) { +; CHECK-LABEL: roundeven_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfabs.v v8, v9 +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t +; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu +; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %a = call @llvm.roundeven.nxv1bf16( %x) + ret %a +} + +define @roundeven_nxv2bf16( %x) { +; CHECK-LABEL: roundeven_nxv2bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vfabs.v v8, v9 +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t +; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %a = call @llvm.roundeven.nxv2bf16( %x) + ret %a +} + +define @roundeven_nxv4bf16( %x) { +; CHECK-LABEL: roundeven_nxv4bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfabs.v v8, v10 +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vfcvt.x.f.v v8, v10, v0.t +; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; CHECK-NEXT: vfsgnj.vv v10, v8, v10, v0.t +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: ret + %a = call @llvm.roundeven.nxv4bf16( %x) + ret %a +} + +define @roundeven_nxv8bf16( %x) { +; CHECK-LABEL: roundeven_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vfabs.v v8, v12 +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vfcvt.x.f.v v8, v12, v0.t +; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu +; CHECK-NEXT: vfsgnj.vv v12, v8, v12, v0.t +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: ret + %a = call @llvm.roundeven.nxv8bf16( %x) + ret %a +} + +define @roundeven_nxv16bf16( %x) { +; CHECK-LABEL: roundeven_nxv16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfabs.v v8, v16 +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vfcvt.x.f.v v8, v16, v0.t +; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: 
vfsgnj.vv v16, v8, v16, v0.t +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: ret + %a = call @llvm.roundeven.nxv16bf16( %x) + ret %a +} + +define @roundeven_nxv32bf16( %x) { +; CHECK-LABEL: roundeven_nxv32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfabs.v v24, v16 +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vmflt.vf v0, v24, fa5 +; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t +; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfabs.v v24, v16 +; CHECK-NEXT: vmflt.vf v0, v24, fa5 +; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t +; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: ret + %a = call @llvm.roundeven.nxv32bf16( %x) + ret %a +} define @roundeven_nxv1f16( %x) { ; ZVFH-LABEL: roundeven_nxv1f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI0_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI0_0)(a0) +; ZVFH-NEXT: lui a0, %hi(.LCPI6_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI6_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 @@ -54,8 +205,8 @@ declare @llvm.roundeven.nxv1f16() define @roundeven_nxv2f16( %x) { ; ZVFH-LABEL: roundeven_nxv2f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI1_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI1_0)(a0) +; ZVFH-NEXT: lui a0, %hi(.LCPI7_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI7_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 @@ -93,8 +244,8 @@ declare @llvm.roundeven.nxv2f16() define @roundeven_nxv4f16( %x) { ; ZVFH-LABEL: roundeven_nxv4f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI2_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI2_0)(a0) +; ZVFH-NEXT: lui a0, %hi(.LCPI8_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI8_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 @@ -132,8 +283,8 @@ declare @llvm.roundeven.nxv4f16() define @roundeven_nxv8f16( %x) { ; ZVFH-LABEL: roundeven_nxv8f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI3_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI3_0)(a0) +; ZVFH-NEXT: lui a0, %hi(.LCPI9_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI9_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; ZVFH-NEXT: vfabs.v v10, v8 ; ZVFH-NEXT: vmflt.vf v0, v10, fa5 @@ -171,8 +322,8 @@ declare @llvm.roundeven.nxv8f16() define @roundeven_nxv16f16( %x) { ; ZVFH-LABEL: roundeven_nxv16f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI4_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI4_0)(a0) +; ZVFH-NEXT: lui a0, %hi(.LCPI10_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI10_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZVFH-NEXT: vfabs.v v12, v8 ; ZVFH-NEXT: vmflt.vf v0, v12, fa5 @@ -210,8 +361,8 @@ declare @llvm.roundeven.nxv16f16() define @roundeven_nxv32f16( %x) { ; ZVFH-LABEL: roundeven_nxv32f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, 
%hi(.LCPI5_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI5_0)(a0) +; ZVFH-NEXT: lui a0, %hi(.LCPI11_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI11_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, m8, ta, ma ; ZVFH-NEXT: vfabs.v v16, v8 ; ZVFH-NEXT: vmflt.vf v0, v16, fa5 @@ -361,8 +512,8 @@ declare @llvm.roundeven.nxv16f32() define @roundeven_nxv1f64( %x) { ; CHECK-LABEL: roundeven_nxv1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI11_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI11_0)(a0) +; CHECK-NEXT: lui a0, %hi(.LCPI17_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI17_0)(a0) ; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 @@ -381,8 +532,8 @@ declare @llvm.roundeven.nxv1f64() define @roundeven_nxv2f64( %x) { ; CHECK-LABEL: roundeven_nxv2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI12_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI12_0)(a0) +; CHECK-NEXT: lui a0, %hi(.LCPI18_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI18_0)(a0) ; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 @@ -401,8 +552,8 @@ declare @llvm.roundeven.nxv2f64() define @roundeven_nxv4f64( %x) { ; CHECK-LABEL: roundeven_nxv4f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI13_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI13_0)(a0) +; CHECK-NEXT: lui a0, %hi(.LCPI19_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI19_0)(a0) ; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 @@ -421,8 +572,8 @@ declare @llvm.roundeven.nxv4f64() define @roundeven_nxv8f64( %x) { ; CHECK-LABEL: roundeven_nxv8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI14_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI14_0)(a0) +; CHECK-NEXT: lui a0, %hi(.LCPI20_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI20_0)(a0) ; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v16, v8 ; CHECK-NEXT: vmflt.vf v0, v16, fa5 diff --git a/llvm/test/CodeGen/RISCV/rvv/ftrunc-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/ftrunc-sdnode.ll index 971424e8cea09e..b29b24a9ce7b25 100644 --- a/llvm/test/CodeGen/RISCV/rvv/ftrunc-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/ftrunc-sdnode.ll @@ -1,20 +1,160 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFHMIN -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: 
--check-prefixes=CHECK,ZVFHMIN +define @trunc_nxv1bf16( %x) { +; CHECK-LABEL: trunc_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfabs.v v8, v9 +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: vfcvt.rtz.x.f.v v8, v9, v0.t +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu +; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %a = call @llvm.trunc.nxv1bf16( %x) + ret %a +} + +define @trunc_nxv2bf16( %x) { +; CHECK-LABEL: trunc_nxv2bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vfabs.v v8, v9 +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: vfcvt.rtz.x.f.v v8, v9, v0.t +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %a = call @llvm.trunc.nxv2bf16( %x) + ret %a +} + +define @trunc_nxv4bf16( %x) { +; CHECK-LABEL: trunc_nxv4bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfabs.v v8, v10 +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: vfcvt.rtz.x.f.v v8, v10, v0.t +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; CHECK-NEXT: vfsgnj.vv v10, v8, v10, v0.t +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: ret + %a = call @llvm.trunc.nxv4bf16( %x) + ret %a +} + +define @trunc_nxv8bf16( %x) { +; CHECK-LABEL: trunc_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vfabs.v v8, v12 +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: vfcvt.rtz.x.f.v v8, v12, v0.t +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu +; CHECK-NEXT: vfsgnj.vv v12, v8, v12, v0.t +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: ret + %a = call @llvm.trunc.nxv8bf16( %x) + ret %a +} + +define @trunc_nxv16bf16( %x) { +; CHECK-LABEL: trunc_nxv16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfabs.v v8, v16 +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: vfcvt.rtz.x.f.v v8, v16, v0.t +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vfsgnj.vv v16, v8, v16, v0.t +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: ret + %a = call @llvm.trunc.nxv16bf16( %x) + ret %a +} + +define @trunc_nxv32bf16( %x) { +; CHECK-LABEL: trunc_nxv32bf16: 
+; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfabs.v v24, v16 +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vmflt.vf v0, v24, fa5 +; CHECK-NEXT: vfcvt.rtz.x.f.v v24, v16, v0.t +; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfabs.v v8, v24 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfcvt.rtz.x.f.v v16, v24, v0.t +; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vfsgnj.vv v24, v16, v24, v0.t +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24 +; CHECK-NEXT: ret + %a = call @llvm.trunc.nxv32bf16( %x) + ret %a +} + define @trunc_nxv1f16( %x) { ; ZVFH-LABEL: trunc_nxv1f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI0_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI0_0)(a0) +; ZVFH-NEXT: lui a0, %hi(.LCPI6_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI6_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 @@ -48,8 +188,8 @@ declare @llvm.trunc.nxv1f16() define @trunc_nxv2f16( %x) { ; ZVFH-LABEL: trunc_nxv2f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI1_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI1_0)(a0) +; ZVFH-NEXT: lui a0, %hi(.LCPI7_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI7_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 @@ -83,8 +223,8 @@ declare @llvm.trunc.nxv2f16() define @trunc_nxv4f16( %x) { ; ZVFH-LABEL: trunc_nxv4f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI2_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI2_0)(a0) +; ZVFH-NEXT: lui a0, %hi(.LCPI8_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI8_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 @@ -118,8 +258,8 @@ declare @llvm.trunc.nxv4f16() define @trunc_nxv8f16( %x) { ; ZVFH-LABEL: trunc_nxv8f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI3_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI3_0)(a0) +; ZVFH-NEXT: lui a0, %hi(.LCPI9_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI9_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; ZVFH-NEXT: vfabs.v v10, v8 ; ZVFH-NEXT: vmflt.vf v0, v10, fa5 @@ -153,8 +293,8 @@ declare @llvm.trunc.nxv8f16() define @trunc_nxv16f16( %x) { ; ZVFH-LABEL: trunc_nxv16f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI4_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI4_0)(a0) +; ZVFH-NEXT: lui a0, %hi(.LCPI10_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI10_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZVFH-NEXT: vfabs.v v12, v8 ; ZVFH-NEXT: vmflt.vf v0, v12, fa5 @@ -188,8 +328,8 @@ declare @llvm.trunc.nxv16f16() define @trunc_nxv32f16( %x) { ; ZVFH-LABEL: trunc_nxv32f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI5_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI5_0)(a0) +; ZVFH-NEXT: lui a0, %hi(.LCPI11_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI11_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, m8, ta, ma ; ZVFH-NEXT: vfabs.v v16, v8 ; ZVFH-NEXT: vmflt.vf v0, v16, fa5 @@ -325,8 +465,8 @@ declare @llvm.trunc.nxv16f32() define @trunc_nxv1f64( %x) { ; 
CHECK-LABEL: trunc_nxv1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI11_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI11_0)(a0) +; CHECK-NEXT: lui a0, %hi(.LCPI17_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI17_0)(a0) ; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 @@ -343,8 +483,8 @@ declare @llvm.trunc.nxv1f64() define @trunc_nxv2f64( %x) { ; CHECK-LABEL: trunc_nxv2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI12_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI12_0)(a0) +; CHECK-NEXT: lui a0, %hi(.LCPI18_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI18_0)(a0) ; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 @@ -361,8 +501,8 @@ declare @llvm.trunc.nxv2f64() define @trunc_nxv4f64( %x) { ; CHECK-LABEL: trunc_nxv4f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI13_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI13_0)(a0) +; CHECK-NEXT: lui a0, %hi(.LCPI19_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI19_0)(a0) ; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 @@ -379,8 +519,8 @@ declare @llvm.trunc.nxv4f64() define @trunc_nxv8f64( %x) { ; CHECK-LABEL: trunc_nxv8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI14_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI14_0)(a0) +; CHECK-NEXT: lui a0, %hi(.LCPI20_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI20_0)(a0) ; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v16, v8 ; CHECK-NEXT: vmflt.vf v0, v16, fa5 diff --git a/llvm/test/CodeGen/RISCV/rvv/nearbyint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/nearbyint-vp.ll index a3ea462b6a7376..5aa773b01e6926 100644 --- a/llvm/test/CodeGen/RISCV/rvv/nearbyint-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/nearbyint-vp.ll @@ -1,20 +1,420 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFHMIN +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFHMIN +declare @llvm.vp.nearbyint.nxv1bf16(, , i32) + +define @vp_nearbyint_nxv1bf16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_nearbyint_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, 
ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vfabs.v v8, v9, v0.t +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu +; CHECK-NEXT: vmflt.vf v0, v8, fa5, v0.t +; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu +; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: fsflags a0 +; CHECK-NEXT: ret + %v = call @llvm.vp.nearbyint.nxv1bf16( %va, %m, i32 %evl) + ret %v +} + +define @vp_nearbyint_nxv1bf16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_nearbyint_nxv1bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vfabs.v v8, v9 +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu +; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: fsflags a0 +; CHECK-NEXT: ret + %v = call @llvm.vp.nearbyint.nxv1bf16( %va, splat (i1 true), i32 %evl) + ret %v +} + +declare @llvm.vp.nearbyint.nxv2bf16(, , i32) + +define @vp_nearbyint_nxv2bf16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_nearbyint_nxv2bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfabs.v v8, v9, v0.t +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; CHECK-NEXT: vmflt.vf v0, v8, fa5, v0.t +; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: fsflags a0 +; CHECK-NEXT: ret + %v = call @llvm.vp.nearbyint.nxv2bf16( %va, %m, i32 %evl) + ret %v +} + +define @vp_nearbyint_nxv2bf16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_nearbyint_nxv2bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfabs.v v8, v9 +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: fsflags a0 +; CHECK-NEXT: ret + %v = call @llvm.vp.nearbyint.nxv2bf16( %va, splat (i1 true), i32 %evl) + ret %v +} + +declare @llvm.vp.nearbyint.nxv4bf16(, , i32) + +define @vp_nearbyint_nxv4bf16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_nearbyint_nxv4bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v9, v0 +; CHECK-NEXT: vsetvli 
a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vfabs.v v12, v10, v0.t +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; CHECK-NEXT: vmflt.vf v9, v12, fa5, v0.t +; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfcvt.x.f.v v12, v10, v0.t +; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; CHECK-NEXT: vfsgnj.vv v10, v12, v10, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: fsflags a0 +; CHECK-NEXT: ret + %v = call @llvm.vp.nearbyint.nxv4bf16( %va, %m, i32 %evl) + ret %v +} + +define @vp_nearbyint_nxv4bf16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_nearbyint_nxv4bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vfabs.v v8, v10 +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vfcvt.x.f.v v8, v10, v0.t +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; CHECK-NEXT: vfsgnj.vv v10, v8, v10, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: fsflags a0 +; CHECK-NEXT: ret + %v = call @llvm.vp.nearbyint.nxv4bf16( %va, splat (i1 true), i32 %evl) + ret %v +} + +declare @llvm.vp.nearbyint.nxv8bf16(, , i32) + +define @vp_nearbyint_nxv8bf16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_nearbyint_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v10, v0 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vfabs.v v16, v12, v0.t +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu +; CHECK-NEXT: vmflt.vf v10, v16, fa5, v0.t +; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vfcvt.x.f.v v16, v12, v0.t +; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu +; CHECK-NEXT: vfsgnj.vv v12, v16, v12, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: fsflags a0 +; CHECK-NEXT: ret + %v = call @llvm.vp.nearbyint.nxv8bf16( %va, %m, i32 %evl) + ret %v +} + +define @vp_nearbyint_nxv8bf16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_nearbyint_nxv8bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vfabs.v v8, v12 +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vfcvt.x.f.v v8, v12, v0.t +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu +; CHECK-NEXT: vfsgnj.vv v12, v8, v12, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: fsflags a0 +; CHECK-NEXT: ret + %v = call @llvm.vp.nearbyint.nxv8bf16( %va, splat (i1 true), i32 %evl) + ret %v +} + +declare @llvm.vp.nearbyint.nxv16bf16(, , i32) + +define @vp_nearbyint_nxv16bf16( 
%va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_nearbyint_nxv16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v12, v0 +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vfabs.v v24, v16, v0.t +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vmflt.vf v12, v24, fa5, v0.t +; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vmv1r.v v0, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t +; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: fsflags a0 +; CHECK-NEXT: ret + %v = call @llvm.vp.nearbyint.nxv16bf16( %va, %m, i32 %evl) + ret %v +} + +define @vp_nearbyint_nxv16bf16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_nearbyint_nxv16bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vfabs.v v8, v16 +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vfcvt.x.f.v v8, v16, v0.t +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vfsgnj.vv v16, v8, v16, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: fsflags a0 +; CHECK-NEXT: ret + %v = call @llvm.vp.nearbyint.nxv16bf16( %va, splat (i1 true), i32 %evl) + ret %v +} + +declare @llvm.vp.nearbyint.nxv32bf16(, , i32) + +define @vp_nearbyint_nxv32bf16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_nearbyint_nxv32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: vmv1r.v v7, v0 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a1, a2, 1 +; CHECK-NEXT: sub a3, a0, a1 +; CHECK-NEXT: sltu a4, a0, a3 +; CHECK-NEXT: addi a4, a4, -1 +; CHECK-NEXT: and a3, a4, a3 +; CHECK-NEXT: srli a2, a2, 2 +; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v17, v0, a2 +; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: vmv1r.v v0, v17 +; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; CHECK-NEXT: vfabs.v v8, v24, v0.t +; CHECK-NEXT: lui a2, 307200 +; CHECK-NEXT: fmv.w.x fa5, a2 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vmflt.vf v17, v8, fa5, v0.t +; CHECK-NEXT: frflags a2 +; CHECK-NEXT: vmv1r.v v0, v17 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfcvt.x.f.v v8, v24, v0.t +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: fsflags a2 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vfsgnj.vv v24, v8, v24, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24 +; CHECK-NEXT: bltu a0, a1, .LBB10_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a0, a1 +; CHECK-NEXT: .LBB10_2: +; CHECK-NEXT: 
addi a1, sp, 16 +; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16 +; CHECK-NEXT: vmv1r.v v0, v7 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vfabs.v v16, v24, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vmflt.vf v7, v16, fa5, v0.t +; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vmv1r.v v0, v7 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfcvt.x.f.v v16, v24, v0.t +; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vfsgnj.vv v24, v16, v24, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24 +; CHECK-NEXT: fsflags a0 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %v = call @llvm.vp.nearbyint.nxv32bf16( %va, %m, i32 %evl) + ret %v +} + +define @vp_nearbyint_nxv32bf16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_nearbyint_nxv32bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a1, a2, 1 +; CHECK-NEXT: sub a3, a0, a1 +; CHECK-NEXT: sltu a4, a0, a3 +; CHECK-NEXT: addi a4, a4, -1 +; CHECK-NEXT: and a3, a4, a3 +; CHECK-NEXT: srli a2, a2, 2 +; CHECK-NEXT: vsetvli a4, zero, e8, m4, ta, ma +; CHECK-NEXT: vmset.m v16 +; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v16, v16, a2 +; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: vmv1r.v v0, v16 +; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; CHECK-NEXT: vfabs.v v8, v24, v0.t +; CHECK-NEXT: lui a2, 307200 +; CHECK-NEXT: fmv.w.x fa5, a2 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vmflt.vf v16, v8, fa5, v0.t +; CHECK-NEXT: frflags a2 +; CHECK-NEXT: vmv1r.v v0, v16 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfcvt.x.f.v v8, v24, v0.t +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: fsflags a2 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vfsgnj.vv v24, v8, v24, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24 +; CHECK-NEXT: bltu a0, a1, .LBB11_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a0, a1 +; CHECK-NEXT: .LBB11_2: +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vfabs.v v24, v16 +; CHECK-NEXT: vmflt.vf v0, v24, fa5 +; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t +; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: fsflags a0 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %v = call @llvm.vp.nearbyint.nxv32bf16( %va, splat (i1 true), i32 %evl) + ret %v +} declare @llvm.vp.nearbyint.nxv1f16(, 
, i32) define @vp_nearbyint_nxv1f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_nearbyint_nxv1f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI0_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI0_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI12_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI12_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t ; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, mu @@ -55,8 +455,8 @@ define @vp_nearbyint_nxv1f16( %va, @vp_nearbyint_nxv1f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_nearbyint_nxv1f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI1_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI1_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI13_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI13_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 @@ -95,8 +495,8 @@ declare @llvm.vp.nearbyint.nxv2f16(, @vp_nearbyint_nxv2f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_nearbyint_nxv2f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI2_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI2_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI14_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI14_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t ; ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, mu @@ -137,8 +537,8 @@ define @vp_nearbyint_nxv2f16( %va, @vp_nearbyint_nxv2f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_nearbyint_nxv2f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI3_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI3_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI15_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI15_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 @@ -177,8 +577,8 @@ declare @llvm.vp.nearbyint.nxv4f16(, @vp_nearbyint_nxv4f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_nearbyint_nxv4f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI4_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI4_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI16_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI16_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t ; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, mu @@ -221,8 +621,8 @@ define @vp_nearbyint_nxv4f16( %va, @vp_nearbyint_nxv4f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_nearbyint_nxv4f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI5_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI5_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI17_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI17_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 @@ -261,8 +661,8 @@ declare @llvm.vp.nearbyint.nxv8f16(, @vp_nearbyint_nxv8f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_nearbyint_nxv8f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI6_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI6_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI18_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI18_0)(a1) ; ZVFH-NEXT: vmv1r.v v10, v0 ; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFH-NEXT: vfabs.v v12, v8, v0.t @@ -307,8 +707,8 @@ define @vp_nearbyint_nxv8f16( %va, @vp_nearbyint_nxv8f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_nearbyint_nxv8f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI7_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI7_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI19_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI19_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFH-NEXT: vfabs.v v10, v8 ; ZVFH-NEXT: vmflt.vf v0, v10, fa5 @@ -347,8 +747,8 @@ declare @llvm.vp.nearbyint.nxv16f16(, < 
define @vp_nearbyint_nxv16f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_nearbyint_nxv16f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI8_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI8_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI20_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI20_0)(a1) ; ZVFH-NEXT: vmv1r.v v12, v0 ; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZVFH-NEXT: vfabs.v v16, v8, v0.t @@ -393,8 +793,8 @@ define @vp_nearbyint_nxv16f16( %va, @vp_nearbyint_nxv16f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_nearbyint_nxv16f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI9_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI9_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI21_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI21_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZVFH-NEXT: vfabs.v v12, v8 ; ZVFH-NEXT: vmflt.vf v0, v12, fa5 @@ -433,8 +833,8 @@ declare @llvm.vp.nearbyint.nxv32f16(, < define @vp_nearbyint_nxv32f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_nearbyint_nxv32f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI10_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI10_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI22_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI22_0)(a1) ; ZVFH-NEXT: vmv1r.v v16, v0 ; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma ; ZVFH-NEXT: vfabs.v v24, v8, v0.t @@ -489,10 +889,10 @@ define @vp_nearbyint_nxv32f16( %va, @vp_nearbyint_nxv32f16( %va, @vp_nearbyint_nxv32f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_nearbyint_nxv32f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI11_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI11_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI23_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI23_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma ; ZVFH-NEXT: vfabs.v v16, v8 ; ZVFH-NEXT: vmflt.vf v0, v16, fa5 @@ -576,10 +976,10 @@ define @vp_nearbyint_nxv32f16_unmasked( ; ZVFHMIN-NEXT: vfsgnj.vv v24, v8, v24, v0.t ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24 -; ZVFHMIN-NEXT: bltu a0, a1, .LBB11_2 +; ZVFHMIN-NEXT: bltu a0, a1, .LBB23_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a0, a1 -; ZVFHMIN-NEXT: .LBB11_2: +; ZVFHMIN-NEXT: .LBB23_2: ; ZVFHMIN-NEXT: addi a1, sp, 16 ; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24 @@ -824,8 +1224,8 @@ declare @llvm.vp.nearbyint.nxv1f64(, define @vp_nearbyint_nxv1f64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_nearbyint_nxv1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI22_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI22_0)(a1) +; CHECK-NEXT: lui a1, %hi(.LCPI34_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI34_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; CHECK-NEXT: vfabs.v v9, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu @@ -845,8 +1245,8 @@ define @vp_nearbyint_nxv1f64( %va, @vp_nearbyint_nxv1f64_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_nearbyint_nxv1f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI23_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI23_0)(a1) +; CHECK-NEXT: lui a1, %hi(.LCPI35_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI35_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 @@ -866,8 +1266,8 @@ declare @llvm.vp.nearbyint.nxv2f64(, define @vp_nearbyint_nxv2f64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_nearbyint_nxv2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI24_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI24_0)(a1) +; CHECK-NEXT: lui a1, %hi(.LCPI36_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI36_0)(a1) ; CHECK-NEXT: vmv1r.v v10, v0 ; CHECK-NEXT: 
vsetvli zero, a0, e64, m2, ta, ma ; CHECK-NEXT: vfabs.v v12, v8, v0.t @@ -889,8 +1289,8 @@ define @vp_nearbyint_nxv2f64( %va, @vp_nearbyint_nxv2f64_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_nearbyint_nxv2f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI25_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI25_0)(a1) +; CHECK-NEXT: lui a1, %hi(.LCPI37_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI37_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 @@ -910,8 +1310,8 @@ declare @llvm.vp.nearbyint.nxv4f64(, define @vp_nearbyint_nxv4f64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_nearbyint_nxv4f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI26_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI26_0)(a1) +; CHECK-NEXT: lui a1, %hi(.LCPI38_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI38_0)(a1) ; CHECK-NEXT: vmv1r.v v12, v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; CHECK-NEXT: vfabs.v v16, v8, v0.t @@ -933,8 +1333,8 @@ define @vp_nearbyint_nxv4f64( %va, @vp_nearbyint_nxv4f64_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_nearbyint_nxv4f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI27_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI27_0)(a1) +; CHECK-NEXT: lui a1, %hi(.LCPI39_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI39_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 @@ -954,8 +1354,8 @@ declare @llvm.vp.nearbyint.nxv7f64(, define @vp_nearbyint_nxv7f64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_nearbyint_nxv7f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI28_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI28_0)(a1) +; CHECK-NEXT: lui a1, %hi(.LCPI40_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI40_0)(a1) ; CHECK-NEXT: vmv1r.v v16, v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v8, v0.t @@ -977,8 +1377,8 @@ define @vp_nearbyint_nxv7f64( %va, @vp_nearbyint_nxv7f64_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_nearbyint_nxv7f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI29_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI29_0)(a1) +; CHECK-NEXT: lui a1, %hi(.LCPI41_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI41_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v16, v8 ; CHECK-NEXT: vmflt.vf v0, v16, fa5 @@ -998,8 +1398,8 @@ declare @llvm.vp.nearbyint.nxv8f64(, define @vp_nearbyint_nxv8f64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_nearbyint_nxv8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI30_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI30_0)(a1) +; CHECK-NEXT: lui a1, %hi(.LCPI42_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI42_0)(a1) ; CHECK-NEXT: vmv1r.v v16, v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v8, v0.t @@ -1021,8 +1421,8 @@ define @vp_nearbyint_nxv8f64( %va, @vp_nearbyint_nxv8f64_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_nearbyint_nxv8f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI31_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI31_0)(a1) +; CHECK-NEXT: lui a1, %hi(.LCPI43_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI43_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v16, v8 ; CHECK-NEXT: vmflt.vf v0, v16, fa5 @@ -1049,8 +1449,8 @@ define @vp_nearbyint_nxv16f64( %va, ; CHECK-NEXT: vsetvli a3, zero, e8, mf4, ta, ma ; CHECK-NEXT: vslidedown.vx v6, v0, a2 ; CHECK-NEXT: sub a2, a0, a1 -; CHECK-NEXT: lui a3, %hi(.LCPI32_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI32_0)(a3) +; CHECK-NEXT: lui a3, %hi(.LCPI44_0) +; CHECK-NEXT: fld fa5, 
%lo(.LCPI44_0)(a3) ; CHECK-NEXT: sltu a3, a0, a2 ; CHECK-NEXT: addi a3, a3, -1 ; CHECK-NEXT: and a2, a3, a2 @@ -1067,10 +1467,10 @@ define @vp_nearbyint_nxv16f64( %va, ; CHECK-NEXT: fsflags a2 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t -; CHECK-NEXT: bltu a0, a1, .LBB32_2 +; CHECK-NEXT: bltu a0, a1, .LBB44_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a0, a1 -; CHECK-NEXT: .LBB32_2: +; CHECK-NEXT: .LBB44_2: ; CHECK-NEXT: vmv1r.v v0, v7 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v8, v0.t @@ -1094,8 +1494,8 @@ define @vp_nearbyint_nxv16f64_unmasked( @vp_nearbyint_nxv16f64_unmasked( @llvm.vp.rint.nxv1bf16(, , i32) + +define @vp_rint_nxv1bf16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_rint_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vfabs.v v8, v9, v0.t +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu +; CHECK-NEXT: vmflt.vf v0, v8, fa5, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu +; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %v = call @llvm.vp.rint.nxv1bf16( %va, %m, i32 %evl) + ret %v +} + +define @vp_rint_nxv1bf16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_rint_nxv1bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vfabs.v v8, v9 +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu +; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %v = call @llvm.vp.rint.nxv1bf16( %va, splat (i1 true), i32 %evl) + ret %v +} + +declare @llvm.vp.rint.nxv2bf16(, , i32) + +define @vp_rint_nxv2bf16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_rint_nxv2bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfabs.v v8, v9, v0.t +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; CHECK-NEXT: vmflt.vf v0, v8, fa5, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %v = call @llvm.vp.rint.nxv2bf16( %va, %m, i32 %evl) + ret %v +} + +define @vp_rint_nxv2bf16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_rint_nxv2bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfabs.v v8, v9 +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vmflt.vf 
v0, v8, fa5 +; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %v = call @llvm.vp.rint.nxv2bf16( %va, splat (i1 true), i32 %evl) + ret %v +} + +declare @llvm.vp.rint.nxv4bf16(, , i32) + +define @vp_rint_nxv4bf16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_rint_nxv4bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v9, v0 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vfabs.v v12, v10, v0.t +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; CHECK-NEXT: vmflt.vf v9, v12, fa5, v0.t +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfcvt.x.f.v v12, v10, v0.t +; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; CHECK-NEXT: vfsgnj.vv v10, v12, v10, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: ret + %v = call @llvm.vp.rint.nxv4bf16( %va, %m, i32 %evl) + ret %v +} + +define @vp_rint_nxv4bf16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_rint_nxv4bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vfabs.v v8, v10 +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: vfcvt.x.f.v v8, v10, v0.t +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; CHECK-NEXT: vfsgnj.vv v10, v8, v10, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: ret + %v = call @llvm.vp.rint.nxv4bf16( %va, splat (i1 true), i32 %evl) + ret %v +} + +declare @llvm.vp.rint.nxv8bf16(, , i32) + +define @vp_rint_nxv8bf16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_rint_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v10, v0 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vfabs.v v16, v12, v0.t +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu +; CHECK-NEXT: vmflt.vf v10, v16, fa5, v0.t +; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vfcvt.x.f.v v16, v12, v0.t +; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu +; CHECK-NEXT: vfsgnj.vv v12, v16, v12, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: ret + %v = call @llvm.vp.rint.nxv8bf16( %va, %m, i32 %evl) + ret %v +} + +define @vp_rint_nxv8bf16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_rint_nxv8bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vfabs.v v8, v12 +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: vfcvt.x.f.v v8, v12, v0.t +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, 
mu +; CHECK-NEXT: vfsgnj.vv v12, v8, v12, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: ret + %v = call @llvm.vp.rint.nxv8bf16( %va, splat (i1 true), i32 %evl) + ret %v +} + +declare @llvm.vp.rint.nxv16bf16(, , i32) + +define @vp_rint_nxv16bf16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_rint_nxv16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v12, v0 +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vfabs.v v24, v16, v0.t +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vmflt.vf v12, v24, fa5, v0.t +; CHECK-NEXT: vmv1r.v v0, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t +; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: ret + %v = call @llvm.vp.rint.nxv16bf16( %va, %m, i32 %evl) + ret %v +} + +define @vp_rint_nxv16bf16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_rint_nxv16bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vfabs.v v8, v16 +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: vfcvt.x.f.v v8, v16, v0.t +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vfsgnj.vv v16, v8, v16, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: ret + %v = call @llvm.vp.rint.nxv16bf16( %va, splat (i1 true), i32 %evl) + ret %v +} + +declare @llvm.vp.rint.nxv32bf16(, , i32) + +define @vp_rint_nxv32bf16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_rint_nxv32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: vmv1r.v v16, v0 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a1, a2, 1 +; CHECK-NEXT: sub a3, a0, a1 +; CHECK-NEXT: sltu a4, a0, a3 +; CHECK-NEXT: addi a4, a4, -1 +; CHECK-NEXT: and a3, a4, a3 +; CHECK-NEXT: srli a2, a2, 2 +; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v17, v0, a2 +; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: vmv1r.v v0, v17 +; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; CHECK-NEXT: vfabs.v v8, v24, v0.t +; CHECK-NEXT: lui a2, 307200 +; CHECK-NEXT: fmv.w.x fa5, a2 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vmflt.vf v17, v8, fa5, v0.t +; CHECK-NEXT: vmv1r.v v0, v17 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfcvt.x.f.v v8, v24, v0.t +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vfsgnj.vv v24, v8, v24, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24 +; CHECK-NEXT: 
bltu a0, a1, .LBB10_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a0, a1 +; CHECK-NEXT: .LBB10_2: +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v0 +; CHECK-NEXT: vmv1r.v v0, v16 +; CHECK-NEXT: vmv1r.v v8, v16 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vfabs.v v16, v24, v0.t +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vmflt.vf v8, v16, fa5, v0.t +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfcvt.x.f.v v16, v24, v0.t +; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vfsgnj.vv v24, v16, v24, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %v = call @llvm.vp.rint.nxv32bf16( %va, %m, i32 %evl) + ret %v +} + +define @vp_rint_nxv32bf16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_rint_nxv32bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a1, a2, 1 +; CHECK-NEXT: sub a3, a0, a1 +; CHECK-NEXT: sltu a4, a0, a3 +; CHECK-NEXT: addi a4, a4, -1 +; CHECK-NEXT: and a3, a4, a3 +; CHECK-NEXT: srli a2, a2, 2 +; CHECK-NEXT: vsetvli a4, zero, e8, m4, ta, ma +; CHECK-NEXT: vmset.m v16 +; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v16, v16, a2 +; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: vmv1r.v v0, v16 +; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; CHECK-NEXT: vfabs.v v8, v24, v0.t +; CHECK-NEXT: lui a2, 307200 +; CHECK-NEXT: fmv.w.x fa5, a2 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vmflt.vf v16, v8, fa5, v0.t +; CHECK-NEXT: vmv1r.v v0, v16 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfcvt.x.f.v v8, v24, v0.t +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vfsgnj.vv v24, v8, v24, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24 +; CHECK-NEXT: bltu a0, a1, .LBB11_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a0, a1 +; CHECK-NEXT: .LBB11_2: +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vfabs.v v24, v16 +; CHECK-NEXT: vmflt.vf v0, v24, fa5 +; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t +; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi 
sp, sp, 16 +; CHECK-NEXT: ret + %v = call @llvm.vp.rint.nxv32bf16( %va, splat (i1 true), i32 %evl) + ret %v +} declare @llvm.vp.rint.nxv1f16(, , i32) define @vp_rint_nxv1f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_rint_nxv1f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI0_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI0_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI12_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI12_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t ; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, mu @@ -51,8 +428,8 @@ define @vp_rint_nxv1f16( %va, @vp_rint_nxv1f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_rint_nxv1f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI1_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI1_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI13_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI13_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 @@ -87,8 +464,8 @@ declare @llvm.vp.rint.nxv2f16(, @vp_rint_nxv2f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_rint_nxv2f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI2_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI2_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI14_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI14_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t ; ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, mu @@ -125,8 +502,8 @@ define @vp_rint_nxv2f16( %va, @vp_rint_nxv2f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_rint_nxv2f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI3_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI3_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI15_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI15_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 @@ -161,8 +538,8 @@ declare @llvm.vp.rint.nxv4f16(, @vp_rint_nxv4f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_rint_nxv4f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI4_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI4_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI16_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI16_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t ; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, mu @@ -201,8 +578,8 @@ define @vp_rint_nxv4f16( %va, @vp_rint_nxv4f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_rint_nxv4f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI5_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI5_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI17_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI17_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 @@ -237,8 +614,8 @@ declare @llvm.vp.rint.nxv8f16(, @vp_rint_nxv8f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_rint_nxv8f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI6_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI6_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI18_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI18_0)(a1) ; ZVFH-NEXT: vmv1r.v v10, v0 ; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFH-NEXT: vfabs.v v12, v8, v0.t @@ -279,8 +656,8 @@ define @vp_rint_nxv8f16( %va, @vp_rint_nxv8f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_rint_nxv8f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI7_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI7_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI19_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI19_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFH-NEXT: vfabs.v v10, v8 ; ZVFH-NEXT: vmflt.vf v0, v10, fa5 @@ -315,8 +692,8 @@ declare 
@llvm.vp.rint.nxv16f16(, @vp_rint_nxv16f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_rint_nxv16f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI8_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI8_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI20_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI20_0)(a1) ; ZVFH-NEXT: vmv1r.v v12, v0 ; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZVFH-NEXT: vfabs.v v16, v8, v0.t @@ -357,8 +734,8 @@ define @vp_rint_nxv16f16( %va, @vp_rint_nxv16f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_rint_nxv16f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI9_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI9_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI21_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI21_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZVFH-NEXT: vfabs.v v12, v8 ; ZVFH-NEXT: vmflt.vf v0, v12, fa5 @@ -393,8 +770,8 @@ declare @llvm.vp.rint.nxv32f16(, @vp_rint_nxv32f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_rint_nxv32f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI10_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI10_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI22_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI22_0)(a1) ; ZVFH-NEXT: vmv1r.v v16, v0 ; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma ; ZVFH-NEXT: vfabs.v v24, v8, v0.t @@ -445,10 +822,10 @@ define @vp_rint_nxv32f16( %va, @vp_rint_nxv32f16( %va, @vp_rint_nxv32f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_rint_nxv32f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI11_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI11_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI23_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI23_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma ; ZVFH-NEXT: vfabs.v v16, v8 ; ZVFH-NEXT: vmflt.vf v0, v16, fa5 @@ -531,10 +908,10 @@ define @vp_rint_nxv32f16_unmasked( %va, ; ZVFHMIN-NEXT: vfsgnj.vv v24, v8, v24, v0.t ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24 -; ZVFHMIN-NEXT: bltu a0, a1, .LBB11_2 +; ZVFHMIN-NEXT: bltu a0, a1, .LBB23_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a0, a1 -; ZVFHMIN-NEXT: .LBB11_2: +; ZVFHMIN-NEXT: .LBB23_2: ; ZVFHMIN-NEXT: addi a1, sp, 16 ; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24 @@ -757,8 +1134,8 @@ declare @llvm.vp.rint.nxv1f64(, @vp_rint_nxv1f64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_rint_nxv1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI22_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI22_0)(a1) +; CHECK-NEXT: lui a1, %hi(.LCPI34_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI34_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; CHECK-NEXT: vfabs.v v9, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu @@ -776,8 +1153,8 @@ define @vp_rint_nxv1f64( %va, @vp_rint_nxv1f64_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_rint_nxv1f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI23_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI23_0)(a1) +; CHECK-NEXT: lui a1, %hi(.LCPI35_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI35_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 @@ -795,8 +1172,8 @@ declare @llvm.vp.rint.nxv2f64(, @vp_rint_nxv2f64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_rint_nxv2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI24_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI24_0)(a1) +; CHECK-NEXT: lui a1, %hi(.LCPI36_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI36_0)(a1) ; CHECK-NEXT: vmv1r.v v10, v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; CHECK-NEXT: vfabs.v v12, v8, v0.t @@ -816,8 +1193,8 @@ define 
@vp_rint_nxv2f64( %va, @vp_rint_nxv2f64_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_rint_nxv2f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI25_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI25_0)(a1) +; CHECK-NEXT: lui a1, %hi(.LCPI37_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI37_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 @@ -835,8 +1212,8 @@ declare @llvm.vp.rint.nxv4f64(, @vp_rint_nxv4f64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_rint_nxv4f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI26_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI26_0)(a1) +; CHECK-NEXT: lui a1, %hi(.LCPI38_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI38_0)(a1) ; CHECK-NEXT: vmv1r.v v12, v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; CHECK-NEXT: vfabs.v v16, v8, v0.t @@ -856,8 +1233,8 @@ define @vp_rint_nxv4f64( %va, @vp_rint_nxv4f64_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_rint_nxv4f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI27_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI27_0)(a1) +; CHECK-NEXT: lui a1, %hi(.LCPI39_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI39_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 @@ -875,8 +1252,8 @@ declare @llvm.vp.rint.nxv7f64(, @vp_rint_nxv7f64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_rint_nxv7f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI28_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI28_0)(a1) +; CHECK-NEXT: lui a1, %hi(.LCPI40_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI40_0)(a1) ; CHECK-NEXT: vmv1r.v v16, v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v8, v0.t @@ -896,8 +1273,8 @@ define @vp_rint_nxv7f64( %va, @vp_rint_nxv7f64_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_rint_nxv7f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI29_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI29_0)(a1) +; CHECK-NEXT: lui a1, %hi(.LCPI41_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI41_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v16, v8 ; CHECK-NEXT: vmflt.vf v0, v16, fa5 @@ -915,8 +1292,8 @@ declare @llvm.vp.rint.nxv8f64(, @vp_rint_nxv8f64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_rint_nxv8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI30_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI30_0)(a1) +; CHECK-NEXT: lui a1, %hi(.LCPI42_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI42_0)(a1) ; CHECK-NEXT: vmv1r.v v16, v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v8, v0.t @@ -936,8 +1313,8 @@ define @vp_rint_nxv8f64( %va, @vp_rint_nxv8f64_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_rint_nxv8f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI31_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI31_0)(a1) +; CHECK-NEXT: lui a1, %hi(.LCPI43_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI43_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v16, v8 ; CHECK-NEXT: vmflt.vf v0, v16, fa5 @@ -968,8 +1345,8 @@ define @vp_rint_nxv16f64( %va, @vp_rint_nxv16f64( %va, @vp_rint_nxv16f64_unmasked( ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: sub a2, a0, a1 -; CHECK-NEXT: lui a3, %hi(.LCPI33_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI33_0)(a3) +; CHECK-NEXT: lui a3, %hi(.LCPI45_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI45_0)(a3) ; CHECK-NEXT: sltu a3, a0, a2 ; CHECK-NEXT: addi a3, a3, -1 ; CHECK-NEXT: and a2, a3, a2 @@ -1028,10 +1405,10 @@ define @vp_rint_nxv16f64_unmasked( ; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t ; CHECK-NEXT: vsetvli 
zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t -; CHECK-NEXT: bltu a0, a1, .LBB33_2 +; CHECK-NEXT: bltu a0, a1, .LBB45_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a0, a1 -; CHECK-NEXT: .LBB33_2: +; CHECK-NEXT: .LBB45_2: ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v8 ; CHECK-NEXT: vmflt.vf v0, v24, fa5 diff --git a/llvm/test/CodeGen/RISCV/rvv/round-vp.ll b/llvm/test/CodeGen/RISCV/rvv/round-vp.ll index 1ddadcc4937361..a4936483e8a152 100644 --- a/llvm/test/CodeGen/RISCV/rvv/round-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/round-vp.ll @@ -1,20 +1,428 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFHMIN +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFHMIN +declare @llvm.vp.round.nxv1bf16(, , i32) + +define @vp_round_nxv1bf16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_round_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vfabs.v v8, v9, v0.t +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu +; CHECK-NEXT: vmflt.vf v0, v8, fa5, v0.t +; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t +; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu +; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %v = call @llvm.vp.round.nxv1bf16( %va, %m, i32 %evl) + ret %v +} + +define @vp_round_nxv1bf16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_round_nxv1bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vfabs.v v8, v9 +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t +; 
CHECK-NEXT: fsrm a0 +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu +; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %v = call @llvm.vp.round.nxv1bf16( %va, splat (i1 true), i32 %evl) + ret %v +} + +declare @llvm.vp.round.nxv2bf16(, , i32) + +define @vp_round_nxv2bf16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_round_nxv2bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfabs.v v8, v9, v0.t +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; CHECK-NEXT: vmflt.vf v0, v8, fa5, v0.t +; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t +; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %v = call @llvm.vp.round.nxv2bf16( %va, %m, i32 %evl) + ret %v +} + +define @vp_round_nxv2bf16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_round_nxv2bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfabs.v v8, v9 +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t +; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %v = call @llvm.vp.round.nxv2bf16( %va, splat (i1 true), i32 %evl) + ret %v +} + +declare @llvm.vp.round.nxv4bf16(, , i32) + +define @vp_round_nxv4bf16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_round_nxv4bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v9, v0 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vfabs.v v12, v10, v0.t +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; CHECK-NEXT: vmflt.vf v9, v12, fa5, v0.t +; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfcvt.x.f.v v12, v10, v0.t +; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; CHECK-NEXT: vfsgnj.vv v10, v12, v10, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: ret + %v = call @llvm.vp.round.nxv4bf16( %va, %m, i32 %evl) + ret %v +} + +define @vp_round_nxv4bf16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_round_nxv4bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vfabs.v v8, v10 +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vfcvt.x.f.v v8, v10, v0.t +; 
CHECK-NEXT: fsrm a0 +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; CHECK-NEXT: vfsgnj.vv v10, v8, v10, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: ret + %v = call @llvm.vp.round.nxv4bf16( %va, splat (i1 true), i32 %evl) + ret %v +} + +declare @llvm.vp.round.nxv8bf16(, , i32) + +define @vp_round_nxv8bf16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_round_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v10, v0 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vfabs.v v16, v12, v0.t +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu +; CHECK-NEXT: vmflt.vf v10, v16, fa5, v0.t +; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vfcvt.x.f.v v16, v12, v0.t +; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu +; CHECK-NEXT: vfsgnj.vv v12, v16, v12, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: ret + %v = call @llvm.vp.round.nxv8bf16( %va, %m, i32 %evl) + ret %v +} + +define @vp_round_nxv8bf16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_round_nxv8bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vfabs.v v8, v12 +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vfcvt.x.f.v v8, v12, v0.t +; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu +; CHECK-NEXT: vfsgnj.vv v12, v8, v12, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: ret + %v = call @llvm.vp.round.nxv8bf16( %va, splat (i1 true), i32 %evl) + ret %v +} + +declare @llvm.vp.round.nxv16bf16(, , i32) + +define @vp_round_nxv16bf16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_round_nxv16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v12, v0 +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vfabs.v v24, v16, v0.t +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vmflt.vf v12, v24, fa5, v0.t +; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vmv1r.v v0, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t +; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: ret + %v = call @llvm.vp.round.nxv16bf16( %va, %m, i32 %evl) + ret %v +} + +define @vp_round_nxv16bf16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_round_nxv16bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vfabs.v v8, v16 +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vmflt.vf 
v0, v8, fa5 +; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vfcvt.x.f.v v8, v16, v0.t +; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vfsgnj.vv v16, v8, v16, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: ret + %v = call @llvm.vp.round.nxv16bf16( %va, splat (i1 true), i32 %evl) + ret %v +} + +declare @llvm.vp.round.nxv32bf16(, , i32) + +define @vp_round_nxv32bf16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_round_nxv32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a1, a2, 1 +; CHECK-NEXT: sub a3, a0, a1 +; CHECK-NEXT: sltu a4, a0, a3 +; CHECK-NEXT: addi a4, a4, -1 +; CHECK-NEXT: and a3, a4, a3 +; CHECK-NEXT: srli a2, a2, 2 +; CHECK-NEXT: vmv1r.v v16, v0 +; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v17, v0, a2 +; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: vmv1r.v v0, v17 +; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; CHECK-NEXT: vfabs.v v8, v24, v0.t +; CHECK-NEXT: lui a2, 307200 +; CHECK-NEXT: fmv.w.x fa5, a2 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vmflt.vf v17, v8, fa5, v0.t +; CHECK-NEXT: fsrmi a2, 4 +; CHECK-NEXT: vmv1r.v v0, v17 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfcvt.x.f.v v8, v24, v0.t +; CHECK-NEXT: fsrm a2 +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vfsgnj.vv v24, v8, v24, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24 +; CHECK-NEXT: bltu a0, a1, .LBB10_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a0, a1 +; CHECK-NEXT: .LBB10_2: +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v0 +; CHECK-NEXT: vmv1r.v v8, v16 +; CHECK-NEXT: vmv1r.v v0, v16 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vfabs.v v16, v24, v0.t +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vmflt.vf v8, v16, fa5, v0.t +; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfcvt.x.f.v v16, v24, v0.t +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vfsgnj.vv v24, v16, v24, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %v = call @llvm.vp.round.nxv32bf16( %va, %m, i32 %evl) + ret %v +} + +define @vp_round_nxv32bf16_unmasked( %va, i32 zeroext 
%evl) { +; CHECK-LABEL: vp_round_nxv32bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a1, a2, 1 +; CHECK-NEXT: sub a3, a0, a1 +; CHECK-NEXT: sltu a4, a0, a3 +; CHECK-NEXT: addi a4, a4, -1 +; CHECK-NEXT: and a3, a4, a3 +; CHECK-NEXT: srli a2, a2, 2 +; CHECK-NEXT: vsetvli a4, zero, e8, m4, ta, ma +; CHECK-NEXT: vmset.m v16 +; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v16, v16, a2 +; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: vmv1r.v v0, v16 +; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; CHECK-NEXT: vfabs.v v8, v24, v0.t +; CHECK-NEXT: lui a2, 307200 +; CHECK-NEXT: fmv.w.x fa5, a2 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vmflt.vf v16, v8, fa5, v0.t +; CHECK-NEXT: fsrmi a2, 4 +; CHECK-NEXT: vmv1r.v v0, v16 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfcvt.x.f.v v8, v24, v0.t +; CHECK-NEXT: fsrm a2 +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vfsgnj.vv v24, v8, v24, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24 +; CHECK-NEXT: bltu a0, a1, .LBB11_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a0, a1 +; CHECK-NEXT: .LBB11_2: +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vfabs.v v24, v16 +; CHECK-NEXT: vmflt.vf v0, v24, fa5 +; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t +; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %v = call @llvm.vp.round.nxv32bf16( %va, splat (i1 true), i32 %evl) + ret %v +} declare @llvm.vp.round.nxv1f16(, , i32) define @vp_round_nxv1f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_round_nxv1f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI0_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI0_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI12_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI12_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t ; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, mu @@ -55,8 +463,8 @@ define @vp_round_nxv1f16( %va, @vp_round_nxv1f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_round_nxv1f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI1_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI1_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI13_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI13_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 @@ -95,8 +503,8 @@ declare @llvm.vp.round.nxv2f16(, @vp_round_nxv2f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_round_nxv2f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI2_0) -; 
ZVFH-NEXT: flh fa5, %lo(.LCPI2_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI14_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI14_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t ; ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, mu @@ -137,8 +545,8 @@ define @vp_round_nxv2f16( %va, @vp_round_nxv2f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_round_nxv2f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI3_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI3_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI15_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI15_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 @@ -177,8 +585,8 @@ declare @llvm.vp.round.nxv4f16(, @vp_round_nxv4f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_round_nxv4f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI4_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI4_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI16_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI16_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t ; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, mu @@ -221,8 +629,8 @@ define @vp_round_nxv4f16( %va, @vp_round_nxv4f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_round_nxv4f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI5_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI5_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI17_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI17_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 @@ -261,8 +669,8 @@ declare @llvm.vp.round.nxv8f16(, @vp_round_nxv8f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_round_nxv8f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI6_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI6_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI18_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI18_0)(a1) ; ZVFH-NEXT: vmv1r.v v10, v0 ; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFH-NEXT: vfabs.v v12, v8, v0.t @@ -307,8 +715,8 @@ define @vp_round_nxv8f16( %va, @vp_round_nxv8f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_round_nxv8f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI7_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI7_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI19_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI19_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFH-NEXT: vfabs.v v10, v8 ; ZVFH-NEXT: vmflt.vf v0, v10, fa5 @@ -347,8 +755,8 @@ declare @llvm.vp.round.nxv16f16(, @vp_round_nxv16f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_round_nxv16f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI8_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI8_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI20_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI20_0)(a1) ; ZVFH-NEXT: vmv1r.v v12, v0 ; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZVFH-NEXT: vfabs.v v16, v8, v0.t @@ -393,8 +801,8 @@ define @vp_round_nxv16f16( %va, @vp_round_nxv16f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_round_nxv16f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI9_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI9_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI21_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI21_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZVFH-NEXT: vfabs.v v12, v8 ; ZVFH-NEXT: vmflt.vf v0, v12, fa5 @@ -433,8 +841,8 @@ declare @llvm.vp.round.nxv32f16(, @vp_round_nxv32f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_round_nxv32f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI10_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI10_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI22_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI22_0)(a1) ; 
ZVFH-NEXT: vmv1r.v v16, v0 ; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma ; ZVFH-NEXT: vfabs.v v24, v8, v0.t @@ -489,10 +897,10 @@ define @vp_round_nxv32f16( %va, @vp_round_nxv32f16( %va, @vp_round_nxv32f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_round_nxv32f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI11_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI11_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI23_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI23_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma ; ZVFH-NEXT: vfabs.v v16, v8 ; ZVFH-NEXT: vmflt.vf v0, v16, fa5 @@ -584,10 +992,10 @@ define @vp_round_nxv32f16_unmasked( %va ; ZVFHMIN-NEXT: vfsgnj.vv v24, v8, v24, v0.t ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24 -; ZVFHMIN-NEXT: bltu a0, a1, .LBB11_2 +; ZVFHMIN-NEXT: bltu a0, a1, .LBB23_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a0, a1 -; ZVFHMIN-NEXT: .LBB11_2: +; ZVFHMIN-NEXT: .LBB23_2: ; ZVFHMIN-NEXT: addi a1, sp, 16 ; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24 @@ -832,8 +1240,8 @@ declare @llvm.vp.round.nxv1f64(, @vp_round_nxv1f64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_round_nxv1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI22_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI22_0)(a1) +; CHECK-NEXT: lui a1, %hi(.LCPI34_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI34_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; CHECK-NEXT: vfabs.v v9, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu @@ -853,8 +1261,8 @@ define @vp_round_nxv1f64( %va, @vp_round_nxv1f64_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_round_nxv1f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI23_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI23_0)(a1) +; CHECK-NEXT: lui a1, %hi(.LCPI35_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI35_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 @@ -874,8 +1282,8 @@ declare @llvm.vp.round.nxv2f64(, @vp_round_nxv2f64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_round_nxv2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI24_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI24_0)(a1) +; CHECK-NEXT: lui a1, %hi(.LCPI36_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI36_0)(a1) ; CHECK-NEXT: vmv1r.v v10, v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; CHECK-NEXT: vfabs.v v12, v8, v0.t @@ -897,8 +1305,8 @@ define @vp_round_nxv2f64( %va, @vp_round_nxv2f64_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_round_nxv2f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI25_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI25_0)(a1) +; CHECK-NEXT: lui a1, %hi(.LCPI37_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI37_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 @@ -918,8 +1326,8 @@ declare @llvm.vp.round.nxv4f64(, @vp_round_nxv4f64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_round_nxv4f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI26_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI26_0)(a1) +; CHECK-NEXT: lui a1, %hi(.LCPI38_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI38_0)(a1) ; CHECK-NEXT: vmv1r.v v12, v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; CHECK-NEXT: vfabs.v v16, v8, v0.t @@ -941,8 +1349,8 @@ define @vp_round_nxv4f64( %va, @vp_round_nxv4f64_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_round_nxv4f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI27_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI27_0)(a1) +; CHECK-NEXT: lui a1, %hi(.LCPI39_0) +; 
CHECK-NEXT: fld fa5, %lo(.LCPI39_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 @@ -962,8 +1370,8 @@ declare @llvm.vp.round.nxv7f64(, @vp_round_nxv7f64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_round_nxv7f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI28_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI28_0)(a1) +; CHECK-NEXT: lui a1, %hi(.LCPI40_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI40_0)(a1) ; CHECK-NEXT: vmv1r.v v16, v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v8, v0.t @@ -985,8 +1393,8 @@ define @vp_round_nxv7f64( %va, @vp_round_nxv7f64_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_round_nxv7f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI29_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI29_0)(a1) +; CHECK-NEXT: lui a1, %hi(.LCPI41_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI41_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v16, v8 ; CHECK-NEXT: vmflt.vf v0, v16, fa5 @@ -1006,8 +1414,8 @@ declare @llvm.vp.round.nxv8f64(, @vp_round_nxv8f64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_round_nxv8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI30_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI30_0)(a1) +; CHECK-NEXT: lui a1, %hi(.LCPI42_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI42_0)(a1) ; CHECK-NEXT: vmv1r.v v16, v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v8, v0.t @@ -1029,8 +1437,8 @@ define @vp_round_nxv8f64( %va, @vp_round_nxv8f64_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_round_nxv8f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI31_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI31_0)(a1) +; CHECK-NEXT: lui a1, %hi(.LCPI43_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI43_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v16, v8 ; CHECK-NEXT: vmflt.vf v0, v16, fa5 @@ -1063,8 +1471,8 @@ define @vp_round_nxv16f64( %va, @vp_round_nxv16f64( %va, @vp_round_nxv16f64_unmasked( ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: sub a2, a0, a1 -; CHECK-NEXT: lui a3, %hi(.LCPI33_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI33_0)(a3) +; CHECK-NEXT: lui a3, %hi(.LCPI45_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI45_0)(a3) ; CHECK-NEXT: sltu a3, a0, a2 ; CHECK-NEXT: addi a3, a3, -1 ; CHECK-NEXT: and a2, a3, a2 @@ -1130,10 +1538,10 @@ define @vp_round_nxv16f64_unmasked( ; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t -; CHECK-NEXT: bltu a0, a1, .LBB33_2 +; CHECK-NEXT: bltu a0, a1, .LBB45_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a0, a1 -; CHECK-NEXT: .LBB33_2: +; CHECK-NEXT: .LBB45_2: ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v8 ; CHECK-NEXT: vmflt.vf v0, v24, fa5 diff --git a/llvm/test/CodeGen/RISCV/rvv/roundeven-vp.ll b/llvm/test/CodeGen/RISCV/rvv/roundeven-vp.ll index 8c5a7bb2dea6aa..9857009002eb90 100644 --- a/llvm/test/CodeGen/RISCV/rvv/roundeven-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/roundeven-vp.ll @@ -1,20 +1,428 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \ -; RUN: 
-verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFHMIN +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFHMIN +declare @llvm.vp.roundeven.nxv1bf16(, , i32) + +define @vp_roundeven_nxv1bf16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_roundeven_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vfabs.v v8, v9, v0.t +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu +; CHECK-NEXT: vmflt.vf v0, v8, fa5, v0.t +; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t +; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu +; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %v = call @llvm.vp.roundeven.nxv1bf16( %va, %m, i32 %evl) + ret %v +} + +define @vp_roundeven_nxv1bf16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_roundeven_nxv1bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vfabs.v v8, v9 +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t +; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu +; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %v = call @llvm.vp.roundeven.nxv1bf16( %va, splat (i1 true), i32 %evl) + ret %v +} + +declare @llvm.vp.roundeven.nxv2bf16(, , i32) + +define @vp_roundeven_nxv2bf16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_roundeven_nxv2bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfabs.v v8, v9, v0.t +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; CHECK-NEXT: vmflt.vf v0, v8, fa5, v0.t +; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t +; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, 
zero, e32, m1, ta, mu +; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %v = call @llvm.vp.roundeven.nxv2bf16( %va, %m, i32 %evl) + ret %v +} + +define @vp_roundeven_nxv2bf16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_roundeven_nxv2bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfabs.v v8, v9 +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t +; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %v = call @llvm.vp.roundeven.nxv2bf16( %va, splat (i1 true), i32 %evl) + ret %v +} + +declare @llvm.vp.roundeven.nxv4bf16(, , i32) + +define @vp_roundeven_nxv4bf16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_roundeven_nxv4bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v9, v0 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vfabs.v v12, v10, v0.t +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; CHECK-NEXT: vmflt.vf v9, v12, fa5, v0.t +; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfcvt.x.f.v v12, v10, v0.t +; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; CHECK-NEXT: vfsgnj.vv v10, v12, v10, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: ret + %v = call @llvm.vp.roundeven.nxv4bf16( %va, %m, i32 %evl) + ret %v +} + +define @vp_roundeven_nxv4bf16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_roundeven_nxv4bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vfabs.v v8, v10 +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vfcvt.x.f.v v8, v10, v0.t +; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; CHECK-NEXT: vfsgnj.vv v10, v8, v10, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: ret + %v = call @llvm.vp.roundeven.nxv4bf16( %va, splat (i1 true), i32 %evl) + ret %v +} + +declare @llvm.vp.roundeven.nxv8bf16(, , i32) + +define @vp_roundeven_nxv8bf16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_roundeven_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v10, v0 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vfabs.v v16, v12, v0.t +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu +; CHECK-NEXT: vmflt.vf v10, v16, fa5, v0.t +; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; 
CHECK-NEXT: vfcvt.x.f.v v16, v12, v0.t +; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu +; CHECK-NEXT: vfsgnj.vv v12, v16, v12, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: ret + %v = call @llvm.vp.roundeven.nxv8bf16( %va, %m, i32 %evl) + ret %v +} + +define @vp_roundeven_nxv8bf16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_roundeven_nxv8bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vfabs.v v8, v12 +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vfcvt.x.f.v v8, v12, v0.t +; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu +; CHECK-NEXT: vfsgnj.vv v12, v8, v12, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: ret + %v = call @llvm.vp.roundeven.nxv8bf16( %va, splat (i1 true), i32 %evl) + ret %v +} + +declare @llvm.vp.roundeven.nxv16bf16(, , i32) + +define @vp_roundeven_nxv16bf16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_roundeven_nxv16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v12, v0 +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vfabs.v v24, v16, v0.t +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vmflt.vf v12, v24, fa5, v0.t +; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vmv1r.v v0, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t +; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: ret + %v = call @llvm.vp.roundeven.nxv16bf16( %va, %m, i32 %evl) + ret %v +} + +define @vp_roundeven_nxv16bf16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_roundeven_nxv16bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vfabs.v v8, v16 +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vfcvt.x.f.v v8, v16, v0.t +; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vfsgnj.vv v16, v8, v16, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: ret + %v = call @llvm.vp.roundeven.nxv16bf16( %va, splat (i1 true), i32 %evl) + ret %v +} + +declare @llvm.vp.roundeven.nxv32bf16(, , i32) + +define @vp_roundeven_nxv32bf16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_roundeven_nxv32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: csrr a2, vlenb +; 
CHECK-NEXT: slli a1, a2, 1 +; CHECK-NEXT: sub a3, a0, a1 +; CHECK-NEXT: sltu a4, a0, a3 +; CHECK-NEXT: addi a4, a4, -1 +; CHECK-NEXT: and a3, a4, a3 +; CHECK-NEXT: srli a2, a2, 2 +; CHECK-NEXT: vmv1r.v v16, v0 +; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v17, v0, a2 +; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: vmv1r.v v0, v17 +; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; CHECK-NEXT: vfabs.v v8, v24, v0.t +; CHECK-NEXT: lui a2, 307200 +; CHECK-NEXT: fmv.w.x fa5, a2 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vmflt.vf v17, v8, fa5, v0.t +; CHECK-NEXT: fsrmi a2, 0 +; CHECK-NEXT: vmv1r.v v0, v17 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfcvt.x.f.v v8, v24, v0.t +; CHECK-NEXT: fsrm a2 +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vfsgnj.vv v24, v8, v24, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24 +; CHECK-NEXT: bltu a0, a1, .LBB10_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a0, a1 +; CHECK-NEXT: .LBB10_2: +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v0 +; CHECK-NEXT: vmv1r.v v8, v16 +; CHECK-NEXT: vmv1r.v v0, v16 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vfabs.v v16, v24, v0.t +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vmflt.vf v8, v16, fa5, v0.t +; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfcvt.x.f.v v16, v24, v0.t +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vfsgnj.vv v24, v16, v24, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %v = call @llvm.vp.roundeven.nxv32bf16( %va, %m, i32 %evl) + ret %v +} + +define @vp_roundeven_nxv32bf16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_roundeven_nxv32bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a1, a2, 1 +; CHECK-NEXT: sub a3, a0, a1 +; CHECK-NEXT: sltu a4, a0, a3 +; CHECK-NEXT: addi a4, a4, -1 +; CHECK-NEXT: and a3, a4, a3 +; CHECK-NEXT: srli a2, a2, 2 +; CHECK-NEXT: vsetvli a4, zero, e8, m4, ta, ma +; CHECK-NEXT: vmset.m v16 +; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v16, v16, a2 +; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: 
vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: vmv1r.v v0, v16 +; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; CHECK-NEXT: vfabs.v v8, v24, v0.t +; CHECK-NEXT: lui a2, 307200 +; CHECK-NEXT: fmv.w.x fa5, a2 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vmflt.vf v16, v8, fa5, v0.t +; CHECK-NEXT: fsrmi a2, 0 +; CHECK-NEXT: vmv1r.v v0, v16 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfcvt.x.f.v v8, v24, v0.t +; CHECK-NEXT: fsrm a2 +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vfsgnj.vv v24, v8, v24, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24 +; CHECK-NEXT: bltu a0, a1, .LBB11_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a0, a1 +; CHECK-NEXT: .LBB11_2: +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vfabs.v v24, v16 +; CHECK-NEXT: vmflt.vf v0, v24, fa5 +; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t +; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %v = call @llvm.vp.roundeven.nxv32bf16( %va, splat (i1 true), i32 %evl) + ret %v +} declare @llvm.vp.roundeven.nxv1f16(, , i32) define @vp_roundeven_nxv1f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundeven_nxv1f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI0_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI0_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI12_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI12_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t ; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, mu @@ -55,8 +463,8 @@ define @vp_roundeven_nxv1f16( %va, @vp_roundeven_nxv1f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundeven_nxv1f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI1_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI1_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI13_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI13_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 @@ -95,8 +503,8 @@ declare @llvm.vp.roundeven.nxv2f16(, @vp_roundeven_nxv2f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundeven_nxv2f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI2_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI2_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI14_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI14_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t ; ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, mu @@ -137,8 +545,8 @@ define @vp_roundeven_nxv2f16( %va, @vp_roundeven_nxv2f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundeven_nxv2f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI3_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI3_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI15_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI15_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 @@ -177,8 +585,8 @@ declare @llvm.vp.roundeven.nxv4f16(, @vp_roundeven_nxv4f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundeven_nxv4f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui 
a1, %hi(.LCPI4_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI4_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI16_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI16_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t ; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, mu @@ -221,8 +629,8 @@ define @vp_roundeven_nxv4f16( %va, @vp_roundeven_nxv4f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundeven_nxv4f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI5_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI5_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI17_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI17_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 @@ -261,8 +669,8 @@ declare @llvm.vp.roundeven.nxv8f16(, @vp_roundeven_nxv8f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundeven_nxv8f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI6_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI6_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI18_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI18_0)(a1) ; ZVFH-NEXT: vmv1r.v v10, v0 ; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFH-NEXT: vfabs.v v12, v8, v0.t @@ -307,8 +715,8 @@ define @vp_roundeven_nxv8f16( %va, @vp_roundeven_nxv8f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundeven_nxv8f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI7_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI7_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI19_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI19_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFH-NEXT: vfabs.v v10, v8 ; ZVFH-NEXT: vmflt.vf v0, v10, fa5 @@ -347,8 +755,8 @@ declare @llvm.vp.roundeven.nxv16f16(, < define @vp_roundeven_nxv16f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundeven_nxv16f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI8_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI8_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI20_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI20_0)(a1) ; ZVFH-NEXT: vmv1r.v v12, v0 ; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZVFH-NEXT: vfabs.v v16, v8, v0.t @@ -393,8 +801,8 @@ define @vp_roundeven_nxv16f16( %va, @vp_roundeven_nxv16f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundeven_nxv16f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI9_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI9_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI21_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI21_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZVFH-NEXT: vfabs.v v12, v8 ; ZVFH-NEXT: vmflt.vf v0, v12, fa5 @@ -433,8 +841,8 @@ declare @llvm.vp.roundeven.nxv32f16(, < define @vp_roundeven_nxv32f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundeven_nxv32f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI10_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI10_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI22_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI22_0)(a1) ; ZVFH-NEXT: vmv1r.v v16, v0 ; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma ; ZVFH-NEXT: vfabs.v v24, v8, v0.t @@ -489,10 +897,10 @@ define @vp_roundeven_nxv32f16( %va, @vp_roundeven_nxv32f16( %va, @vp_roundeven_nxv32f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundeven_nxv32f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI11_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI11_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI23_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI23_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma ; ZVFH-NEXT: vfabs.v v16, v8 ; ZVFH-NEXT: vmflt.vf v0, v16, fa5 @@ -584,10 +992,10 @@ define @vp_roundeven_nxv32f16_unmasked( ; ZVFHMIN-NEXT: vfsgnj.vv v24, v8, v24, v0.t ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: 
vfncvt.f.f.w v12, v24 -; ZVFHMIN-NEXT: bltu a0, a1, .LBB11_2 +; ZVFHMIN-NEXT: bltu a0, a1, .LBB23_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a0, a1 -; ZVFHMIN-NEXT: .LBB11_2: +; ZVFHMIN-NEXT: .LBB23_2: ; ZVFHMIN-NEXT: addi a1, sp, 16 ; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24 @@ -832,8 +1240,8 @@ declare @llvm.vp.roundeven.nxv1f64(, define @vp_roundeven_nxv1f64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_roundeven_nxv1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI22_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI22_0)(a1) +; CHECK-NEXT: lui a1, %hi(.LCPI34_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI34_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; CHECK-NEXT: vfabs.v v9, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu @@ -853,8 +1261,8 @@ define @vp_roundeven_nxv1f64( %va, @vp_roundeven_nxv1f64_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_roundeven_nxv1f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI23_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI23_0)(a1) +; CHECK-NEXT: lui a1, %hi(.LCPI35_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI35_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 @@ -874,8 +1282,8 @@ declare @llvm.vp.roundeven.nxv2f64(, define @vp_roundeven_nxv2f64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_roundeven_nxv2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI24_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI24_0)(a1) +; CHECK-NEXT: lui a1, %hi(.LCPI36_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI36_0)(a1) ; CHECK-NEXT: vmv1r.v v10, v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; CHECK-NEXT: vfabs.v v12, v8, v0.t @@ -897,8 +1305,8 @@ define @vp_roundeven_nxv2f64( %va, @vp_roundeven_nxv2f64_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_roundeven_nxv2f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI25_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI25_0)(a1) +; CHECK-NEXT: lui a1, %hi(.LCPI37_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI37_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 @@ -918,8 +1326,8 @@ declare @llvm.vp.roundeven.nxv4f64(, define @vp_roundeven_nxv4f64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_roundeven_nxv4f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI26_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI26_0)(a1) +; CHECK-NEXT: lui a1, %hi(.LCPI38_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI38_0)(a1) ; CHECK-NEXT: vmv1r.v v12, v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; CHECK-NEXT: vfabs.v v16, v8, v0.t @@ -941,8 +1349,8 @@ define @vp_roundeven_nxv4f64( %va, @vp_roundeven_nxv4f64_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_roundeven_nxv4f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI27_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI27_0)(a1) +; CHECK-NEXT: lui a1, %hi(.LCPI39_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI39_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 @@ -962,8 +1370,8 @@ declare @llvm.vp.roundeven.nxv7f64(, define @vp_roundeven_nxv7f64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_roundeven_nxv7f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI28_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI28_0)(a1) +; CHECK-NEXT: lui a1, %hi(.LCPI40_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI40_0)(a1) ; CHECK-NEXT: vmv1r.v v16, v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v8, v0.t @@ -985,8 +1393,8 @@ define 
@vp_roundeven_nxv7f64( %va, @vp_roundeven_nxv7f64_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_roundeven_nxv7f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI29_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI29_0)(a1) +; CHECK-NEXT: lui a1, %hi(.LCPI41_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI41_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v16, v8 ; CHECK-NEXT: vmflt.vf v0, v16, fa5 @@ -1006,8 +1414,8 @@ declare @llvm.vp.roundeven.nxv8f64(, define @vp_roundeven_nxv8f64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_roundeven_nxv8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI30_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI30_0)(a1) +; CHECK-NEXT: lui a1, %hi(.LCPI42_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI42_0)(a1) ; CHECK-NEXT: vmv1r.v v16, v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v8, v0.t @@ -1029,8 +1437,8 @@ define @vp_roundeven_nxv8f64( %va, @vp_roundeven_nxv8f64_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_roundeven_nxv8f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI31_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI31_0)(a1) +; CHECK-NEXT: lui a1, %hi(.LCPI43_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI43_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v16, v8 ; CHECK-NEXT: vmflt.vf v0, v16, fa5 @@ -1063,8 +1471,8 @@ define @vp_roundeven_nxv16f64( %va, ; CHECK-NEXT: vsetvli a3, zero, e8, mf4, ta, ma ; CHECK-NEXT: vslidedown.vx v6, v0, a2 ; CHECK-NEXT: sub a2, a0, a1 -; CHECK-NEXT: lui a3, %hi(.LCPI32_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI32_0)(a3) +; CHECK-NEXT: lui a3, %hi(.LCPI44_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI44_0)(a3) ; CHECK-NEXT: sltu a3, a0, a2 ; CHECK-NEXT: addi a3, a3, -1 ; CHECK-NEXT: and a2, a3, a2 @@ -1085,10 +1493,10 @@ define @vp_roundeven_nxv16f64( %va, ; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t -; CHECK-NEXT: bltu a0, a1, .LBB32_2 +; CHECK-NEXT: bltu a0, a1, .LBB44_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a0, a1 -; CHECK-NEXT: .LBB32_2: +; CHECK-NEXT: .LBB44_2: ; CHECK-NEXT: vmv1r.v v0, v7 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v8, v0.t @@ -1116,8 +1524,8 @@ define @vp_roundeven_nxv16f64_unmasked( @vp_roundeven_nxv16f64_unmasked( @llvm.vp.roundtozero.nxv1bf16(, , i32) + +define @vp_roundtozero_nxv1bf16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_roundtozero_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vfabs.v v8, v9, v0.t +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu +; CHECK-NEXT: vmflt.vf v0, v8, fa5, v0.t +; CHECK-NEXT: fsrmi a0, 1 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t +; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu +; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %v = call @llvm.vp.roundtozero.nxv1bf16( %va, %m, i32 %evl) + ret %v +} + +define @vp_roundtozero_nxv1bf16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_roundtozero_nxv1bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, 
ma +; CHECK-NEXT: vfabs.v v8, v9 +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: fsrmi a0, 1 +; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t +; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu +; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %v = call @llvm.vp.roundtozero.nxv1bf16( %va, splat (i1 true), i32 %evl) + ret %v +} + +declare @llvm.vp.roundtozero.nxv2bf16(, , i32) + +define @vp_roundtozero_nxv2bf16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_roundtozero_nxv2bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfabs.v v8, v9, v0.t +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; CHECK-NEXT: vmflt.vf v0, v8, fa5, v0.t +; CHECK-NEXT: fsrmi a0, 1 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t +; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %v = call @llvm.vp.roundtozero.nxv2bf16( %va, %m, i32 %evl) + ret %v +} + +define @vp_roundtozero_nxv2bf16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_roundtozero_nxv2bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfabs.v v8, v9 +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: fsrmi a0, 1 +; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t +; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %v = call @llvm.vp.roundtozero.nxv2bf16( %va, splat (i1 true), i32 %evl) + ret %v +} + +declare @llvm.vp.roundtozero.nxv4bf16(, , i32) + +define @vp_roundtozero_nxv4bf16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_roundtozero_nxv4bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v9, v0 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vfabs.v v12, v10, v0.t +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; CHECK-NEXT: vmflt.vf v9, v12, fa5, v0.t +; CHECK-NEXT: fsrmi a0, 1 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfcvt.x.f.v v12, v10, v0.t +; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; CHECK-NEXT: vfsgnj.vv v10, v12, v10, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: ret + %v = call @llvm.vp.roundtozero.nxv4bf16( %va, %m, i32 %evl) + ret %v +} + +define @vp_roundtozero_nxv4bf16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_roundtozero_nxv4bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; 
CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vfabs.v v8, v10 +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: fsrmi a0, 1 +; CHECK-NEXT: vfcvt.x.f.v v8, v10, v0.t +; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; CHECK-NEXT: vfsgnj.vv v10, v8, v10, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: ret + %v = call @llvm.vp.roundtozero.nxv4bf16( %va, splat (i1 true), i32 %evl) + ret %v +} + +declare @llvm.vp.roundtozero.nxv8bf16(, , i32) + +define @vp_roundtozero_nxv8bf16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_roundtozero_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v10, v0 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vfabs.v v16, v12, v0.t +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu +; CHECK-NEXT: vmflt.vf v10, v16, fa5, v0.t +; CHECK-NEXT: fsrmi a0, 1 +; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vfcvt.x.f.v v16, v12, v0.t +; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu +; CHECK-NEXT: vfsgnj.vv v12, v16, v12, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: ret + %v = call @llvm.vp.roundtozero.nxv8bf16( %va, %m, i32 %evl) + ret %v +} + +define @vp_roundtozero_nxv8bf16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_roundtozero_nxv8bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vfabs.v v8, v12 +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: fsrmi a0, 1 +; CHECK-NEXT: vfcvt.x.f.v v8, v12, v0.t +; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu +; CHECK-NEXT: vfsgnj.vv v12, v8, v12, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: ret + %v = call @llvm.vp.roundtozero.nxv8bf16( %va, splat (i1 true), i32 %evl) + ret %v +} + +declare @llvm.vp.roundtozero.nxv16bf16(, , i32) + +define @vp_roundtozero_nxv16bf16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_roundtozero_nxv16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v12, v0 +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vfabs.v v24, v16, v0.t +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vmflt.vf v12, v24, fa5, v0.t +; CHECK-NEXT: fsrmi a0, 1 +; CHECK-NEXT: vmv1r.v v0, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t +; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: ret + %v = call @llvm.vp.roundtozero.nxv16bf16( %va, %m, i32 %evl) + ret %v +} + +define 
@vp_roundtozero_nxv16bf16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_roundtozero_nxv16bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vfabs.v v8, v16 +; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: fsrmi a0, 1 +; CHECK-NEXT: vfcvt.x.f.v v8, v16, v0.t +; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vfsgnj.vv v16, v8, v16, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: ret + %v = call @llvm.vp.roundtozero.nxv16bf16( %va, splat (i1 true), i32 %evl) + ret %v +} + +declare @llvm.vp.roundtozero.nxv32bf16(, , i32) + +define @vp_roundtozero_nxv32bf16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_roundtozero_nxv32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a1, a2, 1 +; CHECK-NEXT: sub a3, a0, a1 +; CHECK-NEXT: sltu a4, a0, a3 +; CHECK-NEXT: addi a4, a4, -1 +; CHECK-NEXT: and a3, a4, a3 +; CHECK-NEXT: srli a2, a2, 2 +; CHECK-NEXT: vmv1r.v v16, v0 +; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v17, v0, a2 +; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: vmv1r.v v0, v17 +; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; CHECK-NEXT: vfabs.v v8, v24, v0.t +; CHECK-NEXT: lui a2, 307200 +; CHECK-NEXT: fmv.w.x fa5, a2 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vmflt.vf v17, v8, fa5, v0.t +; CHECK-NEXT: fsrmi a2, 1 +; CHECK-NEXT: vmv1r.v v0, v17 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfcvt.x.f.v v8, v24, v0.t +; CHECK-NEXT: fsrm a2 +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vfsgnj.vv v24, v8, v24, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24 +; CHECK-NEXT: bltu a0, a1, .LBB10_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a0, a1 +; CHECK-NEXT: .LBB10_2: +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v0 +; CHECK-NEXT: vmv1r.v v8, v16 +; CHECK-NEXT: vmv1r.v v0, v16 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vfabs.v v16, v24, v0.t +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vmflt.vf v8, v16, fa5, v0.t +; CHECK-NEXT: fsrmi a0, 1 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfcvt.x.f.v v16, v24, v0.t +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, 
m8, ta, mu +; CHECK-NEXT: vfsgnj.vv v24, v16, v24, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %v = call @llvm.vp.roundtozero.nxv32bf16( %va, %m, i32 %evl) + ret %v +} + +define @vp_roundtozero_nxv32bf16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_roundtozero_nxv32bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a1, a2, 1 +; CHECK-NEXT: sub a3, a0, a1 +; CHECK-NEXT: sltu a4, a0, a3 +; CHECK-NEXT: addi a4, a4, -1 +; CHECK-NEXT: and a3, a4, a3 +; CHECK-NEXT: srli a2, a2, 2 +; CHECK-NEXT: vsetvli a4, zero, e8, m4, ta, ma +; CHECK-NEXT: vmset.m v16 +; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v16, v16, a2 +; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: vmv1r.v v0, v16 +; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; CHECK-NEXT: vfabs.v v8, v24, v0.t +; CHECK-NEXT: lui a2, 307200 +; CHECK-NEXT: fmv.w.x fa5, a2 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vmflt.vf v16, v8, fa5, v0.t +; CHECK-NEXT: fsrmi a2, 1 +; CHECK-NEXT: vmv1r.v v0, v16 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfcvt.x.f.v v8, v24, v0.t +; CHECK-NEXT: fsrm a2 +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vfsgnj.vv v24, v8, v24, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24 +; CHECK-NEXT: bltu a0, a1, .LBB11_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a0, a1 +; CHECK-NEXT: .LBB11_2: +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vfabs.v v24, v16 +; CHECK-NEXT: vmflt.vf v0, v24, fa5 +; CHECK-NEXT: fsrmi a0, 1 +; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t +; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %v = call @llvm.vp.roundtozero.nxv32bf16( %va, splat (i1 true), i32 %evl) + ret %v +} declare @llvm.vp.roundtozero.nxv1f16(, , i32) define @vp_roundtozero_nxv1f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundtozero_nxv1f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI0_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI0_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI12_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI12_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t ; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, mu @@ -55,8 +463,8 @@ define @vp_roundtozero_nxv1f16( %va, @vp_roundtozero_nxv1f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundtozero_nxv1f16_unmasked: ; 
ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI1_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI1_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI13_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI13_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 @@ -95,8 +503,8 @@ declare @llvm.vp.roundtozero.nxv2f16(, @vp_roundtozero_nxv2f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundtozero_nxv2f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI2_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI2_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI14_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI14_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t ; ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, mu @@ -137,8 +545,8 @@ define @vp_roundtozero_nxv2f16( %va, @vp_roundtozero_nxv2f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundtozero_nxv2f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI3_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI3_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI15_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI15_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 @@ -177,8 +585,8 @@ declare @llvm.vp.roundtozero.nxv4f16(, @vp_roundtozero_nxv4f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundtozero_nxv4f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI4_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI4_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI16_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI16_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t ; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, mu @@ -221,8 +629,8 @@ define @vp_roundtozero_nxv4f16( %va, @vp_roundtozero_nxv4f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundtozero_nxv4f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI5_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI5_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI17_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI17_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 @@ -261,8 +669,8 @@ declare @llvm.vp.roundtozero.nxv8f16(, @vp_roundtozero_nxv8f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundtozero_nxv8f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI6_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI6_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI18_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI18_0)(a1) ; ZVFH-NEXT: vmv1r.v v10, v0 ; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFH-NEXT: vfabs.v v12, v8, v0.t @@ -307,8 +715,8 @@ define @vp_roundtozero_nxv8f16( %va, @vp_roundtozero_nxv8f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundtozero_nxv8f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI7_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI7_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI19_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI19_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFH-NEXT: vfabs.v v10, v8 ; ZVFH-NEXT: vmflt.vf v0, v10, fa5 @@ -347,8 +755,8 @@ declare @llvm.vp.roundtozero.nxv16f16(, define @vp_roundtozero_nxv16f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundtozero_nxv16f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI8_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI8_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI20_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI20_0)(a1) ; ZVFH-NEXT: vmv1r.v v12, v0 ; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZVFH-NEXT: vfabs.v v16, v8, v0.t @@ -393,8 +801,8 @@ define @vp_roundtozero_nxv16f16( %va, < define @vp_roundtozero_nxv16f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: 
vp_roundtozero_nxv16f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI9_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI9_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI21_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI21_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZVFH-NEXT: vfabs.v v12, v8 ; ZVFH-NEXT: vmflt.vf v0, v12, fa5 @@ -433,8 +841,8 @@ declare @llvm.vp.roundtozero.nxv32f16(, define @vp_roundtozero_nxv32f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundtozero_nxv32f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI10_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI10_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI22_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI22_0)(a1) ; ZVFH-NEXT: vmv1r.v v16, v0 ; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma ; ZVFH-NEXT: vfabs.v v24, v8, v0.t @@ -489,10 +897,10 @@ define @vp_roundtozero_nxv32f16( %va, < ; ZVFHMIN-NEXT: vfsgnj.vv v24, v8, v24, v0.t ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24 -; ZVFHMIN-NEXT: bltu a0, a1, .LBB10_2 +; ZVFHMIN-NEXT: bltu a0, a1, .LBB22_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a0, a1 -; ZVFHMIN-NEXT: .LBB10_2: +; ZVFHMIN-NEXT: .LBB22_2: ; ZVFHMIN-NEXT: addi a1, sp, 16 ; ZVFHMIN-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v0 @@ -531,8 +939,8 @@ define @vp_roundtozero_nxv32f16( %va, < define @vp_roundtozero_nxv32f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundtozero_nxv32f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI11_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI11_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI23_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI23_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma ; ZVFH-NEXT: vfabs.v v16, v8 ; ZVFH-NEXT: vmflt.vf v0, v16, fa5 @@ -584,10 +992,10 @@ define @vp_roundtozero_nxv32f16_unmasked( @llvm.vp.roundtozero.nxv1f64( define @vp_roundtozero_nxv1f64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_roundtozero_nxv1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI22_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI22_0)(a1) +; CHECK-NEXT: lui a1, %hi(.LCPI34_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI34_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; CHECK-NEXT: vfabs.v v9, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu @@ -853,8 +1261,8 @@ define @vp_roundtozero_nxv1f64( %va, define @vp_roundtozero_nxv1f64_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_roundtozero_nxv1f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI23_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI23_0)(a1) +; CHECK-NEXT: lui a1, %hi(.LCPI35_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI35_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 @@ -874,8 +1282,8 @@ declare @llvm.vp.roundtozero.nxv2f64( define @vp_roundtozero_nxv2f64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_roundtozero_nxv2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI24_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI24_0)(a1) +; CHECK-NEXT: lui a1, %hi(.LCPI36_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI36_0)(a1) ; CHECK-NEXT: vmv1r.v v10, v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; CHECK-NEXT: vfabs.v v12, v8, v0.t @@ -897,8 +1305,8 @@ define @vp_roundtozero_nxv2f64( %va, define @vp_roundtozero_nxv2f64_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_roundtozero_nxv2f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI25_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI25_0)(a1) +; CHECK-NEXT: lui a1, %hi(.LCPI37_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI37_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, 
ma ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 @@ -918,8 +1326,8 @@ declare @llvm.vp.roundtozero.nxv4f64( define @vp_roundtozero_nxv4f64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_roundtozero_nxv4f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI26_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI26_0)(a1) +; CHECK-NEXT: lui a1, %hi(.LCPI38_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI38_0)(a1) ; CHECK-NEXT: vmv1r.v v12, v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; CHECK-NEXT: vfabs.v v16, v8, v0.t @@ -941,8 +1349,8 @@ define @vp_roundtozero_nxv4f64( %va, define @vp_roundtozero_nxv4f64_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_roundtozero_nxv4f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI27_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI27_0)(a1) +; CHECK-NEXT: lui a1, %hi(.LCPI39_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI39_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 @@ -962,8 +1370,8 @@ declare @llvm.vp.roundtozero.nxv7f64( define @vp_roundtozero_nxv7f64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_roundtozero_nxv7f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI28_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI28_0)(a1) +; CHECK-NEXT: lui a1, %hi(.LCPI40_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI40_0)(a1) ; CHECK-NEXT: vmv1r.v v16, v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v8, v0.t @@ -985,8 +1393,8 @@ define @vp_roundtozero_nxv7f64( %va, define @vp_roundtozero_nxv7f64_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_roundtozero_nxv7f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI29_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI29_0)(a1) +; CHECK-NEXT: lui a1, %hi(.LCPI41_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI41_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v16, v8 ; CHECK-NEXT: vmflt.vf v0, v16, fa5 @@ -1006,8 +1414,8 @@ declare @llvm.vp.roundtozero.nxv8f64( define @vp_roundtozero_nxv8f64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_roundtozero_nxv8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI30_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI30_0)(a1) +; CHECK-NEXT: lui a1, %hi(.LCPI42_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI42_0)(a1) ; CHECK-NEXT: vmv1r.v v16, v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v8, v0.t @@ -1029,8 +1437,8 @@ define @vp_roundtozero_nxv8f64( %va, define @vp_roundtozero_nxv8f64_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_roundtozero_nxv8f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI31_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI31_0)(a1) +; CHECK-NEXT: lui a1, %hi(.LCPI43_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI43_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v16, v8 ; CHECK-NEXT: vmflt.vf v0, v16, fa5 @@ -1063,8 +1471,8 @@ define @vp_roundtozero_nxv16f64( %v ; CHECK-NEXT: vsetvli a3, zero, e8, mf4, ta, ma ; CHECK-NEXT: vslidedown.vx v6, v0, a2 ; CHECK-NEXT: sub a2, a0, a1 -; CHECK-NEXT: lui a3, %hi(.LCPI32_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI32_0)(a3) +; CHECK-NEXT: lui a3, %hi(.LCPI44_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI44_0)(a3) ; CHECK-NEXT: sltu a3, a0, a2 ; CHECK-NEXT: addi a3, a3, -1 ; CHECK-NEXT: and a2, a3, a2 @@ -1085,10 +1493,10 @@ define @vp_roundtozero_nxv16f64( %v ; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t -; CHECK-NEXT: bltu a0, a1, .LBB32_2 +; CHECK-NEXT: bltu a0, a1, .LBB44_2 ; CHECK-NEXT: # %bb.1: ; 
CHECK-NEXT: mv a0, a1 -; CHECK-NEXT: .LBB32_2: +; CHECK-NEXT: .LBB44_2: ; CHECK-NEXT: vmv1r.v v0, v7 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v8, v0.t @@ -1116,8 +1524,8 @@ define @vp_roundtozero_nxv16f64_unmasked( @vp_roundtozero_nxv16f64_unmasked( @llvm.vp.fcmp.nxv1bf16(, , metadata, , i32) + +define @fcmp_oeq_vv_nxv1bf16( %va, %vb, %m, i32 zeroext %evl) { +; CHECK-LABEL: fcmp_oeq_vv_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v9, v10, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.fcmp.nxv1bf16( %va, %vb, metadata !"oeq", %m, i32 %evl) + ret %v +} + +define @fcmp_oeq_vf_nxv1bf16( %va, bfloat %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: fcmp_oeq_vf_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v10, v8, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fcmp.nxv1bf16( %va, %vb, metadata !"oeq", %m, i32 %evl) + ret %v +} + +define @fcmp_oeq_vf_swap_nxv1bf16( %va, bfloat %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: fcmp_oeq_vf_swap_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v8, v10, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fcmp.nxv1bf16( %vb, %va, metadata !"oeq", %m, i32 %evl) + ret %v +} + +define @fcmp_ogt_vv_nxv1bf16( %va, %vb, %m, i32 zeroext %evl) { +; CHECK-LABEL: fcmp_ogt_vv_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vmflt.vv v0, v8, v10, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.fcmp.nxv1bf16( %va, %vb, metadata !"ogt", %m, i32 %evl) + ret %v +} + +define @fcmp_ogt_vf_nxv1bf16( %va, bfloat %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: fcmp_ogt_vf_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vmflt.vv v0, v8, v10, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fcmp.nxv1bf16( %va, %vb, metadata !"ogt", %m, i32 %evl) + ret %v +} + +define @fcmp_ogt_vf_swap_nxv1bf16( %va, bfloat %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: fcmp_ogt_vf_swap_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vmflt.vv v0, v10, v8, 
v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fcmp.nxv1bf16( %vb, %va, metadata !"ogt", %m, i32 %evl) + ret %v +} + +define @fcmp_oge_vv_nxv1bf16( %va, %vb, %m, i32 zeroext %evl) { +; CHECK-LABEL: fcmp_oge_vv_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vmfle.vv v0, v8, v10, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.fcmp.nxv1bf16( %va, %vb, metadata !"oge", %m, i32 %evl) + ret %v +} + +define @fcmp_oge_vf_nxv1bf16( %va, bfloat %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: fcmp_oge_vf_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vmfle.vv v0, v8, v10, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fcmp.nxv1bf16( %va, %vb, metadata !"oge", %m, i32 %evl) + ret %v +} + +define @fcmp_oge_vf_swap_nxv1bf16( %va, bfloat %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: fcmp_oge_vf_swap_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vmfle.vv v0, v10, v8, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fcmp.nxv1bf16( %vb, %va, metadata !"oge", %m, i32 %evl) + ret %v +} + +define @fcmp_olt_vv_nxv1bf16( %va, %vb, %m, i32 zeroext %evl) { +; CHECK-LABEL: fcmp_olt_vv_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vmflt.vv v0, v9, v10, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.fcmp.nxv1bf16( %va, %vb, metadata !"olt", %m, i32 %evl) + ret %v +} + +define @fcmp_olt_vf_nxv1bf16( %va, bfloat %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: fcmp_olt_vf_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vmflt.vv v0, v10, v8, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fcmp.nxv1bf16( %va, %vb, metadata !"olt", %m, i32 %evl) + ret %v +} + +define @fcmp_olt_vf_swap_nxv1bf16( %va, bfloat %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: fcmp_olt_vf_swap_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vmflt.vv v0, v8, v10, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector 
%elt.head, poison, zeroinitializer + %v = call @llvm.vp.fcmp.nxv1bf16( %vb, %va, metadata !"olt", %m, i32 %evl) + ret %v +} + +define @fcmp_ole_vv_nxv1bf16( %va, %vb, %m, i32 zeroext %evl) { +; CHECK-LABEL: fcmp_ole_vv_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vmfle.vv v0, v9, v10, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.fcmp.nxv1bf16( %va, %vb, metadata !"ole", %m, i32 %evl) + ret %v +} + +define @fcmp_ole_vf_nxv1bf16( %va, bfloat %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: fcmp_ole_vf_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vmfle.vv v0, v10, v8, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fcmp.nxv1bf16( %va, %vb, metadata !"ole", %m, i32 %evl) + ret %v +} + +define @fcmp_ole_vf_swap_nxv1bf16( %va, bfloat %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: fcmp_ole_vf_swap_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vmfle.vv v0, v8, v10, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fcmp.nxv1bf16( %vb, %va, metadata !"ole", %m, i32 %evl) + ret %v +} + +define @fcmp_one_vv_nxv1bf16( %va, %vb, %m, i32 zeroext %evl) { +; CHECK-LABEL: fcmp_one_vv_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vmflt.vv v8, v9, v10, v0.t +; CHECK-NEXT: vmflt.vv v9, v10, v9, v0.t +; CHECK-NEXT: vmor.mm v0, v9, v8 +; CHECK-NEXT: ret + %v = call @llvm.vp.fcmp.nxv1bf16( %va, %vb, metadata !"one", %m, i32 %evl) + ret %v +} + +define @fcmp_one_vf_nxv1bf16( %va, bfloat %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: fcmp_one_vf_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vmflt.vv v9, v10, v8, v0.t +; CHECK-NEXT: vmflt.vv v8, v8, v10, v0.t +; CHECK-NEXT: vmor.mm v0, v8, v9 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fcmp.nxv1bf16( %va, %vb, metadata !"one", %m, i32 %evl) + ret %v +} + +define @fcmp_one_vf_swap_nxv1bf16( %va, bfloat %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: fcmp_one_vf_swap_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vmflt.vv v9, v8, v10, v0.t +; CHECK-NEXT: vmflt.vv v8, v10, v8, v0.t +; 
CHECK-NEXT: vmor.mm v0, v8, v9 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fcmp.nxv1bf16( %vb, %va, metadata !"one", %m, i32 %evl) + ret %v +} + +define @fcmp_ord_vv_nxv1bf16( %va, %vb, %m, i32 zeroext %evl) { +; CHECK-LABEL: fcmp_ord_vv_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vmfeq.vv v9, v10, v10, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vmfeq.vv v8, v10, v10, v0.t +; CHECK-NEXT: vmand.mm v0, v8, v9 +; CHECK-NEXT: ret + %v = call @llvm.vp.fcmp.nxv1bf16( %va, %vb, metadata !"ord", %m, i32 %evl) + ret %v +} + +define @fcmp_ord_vf_nxv1bf16( %va, bfloat %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: fcmp_ord_vf_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vmfeq.vv v8, v10, v10, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vmfeq.vv v9, v10, v10, v0.t +; CHECK-NEXT: vmand.mm v0, v8, v9 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fcmp.nxv1bf16( %va, %vb, metadata !"ord", %m, i32 %evl) + ret %v +} + +define @fcmp_ord_vf_swap_nxv1bf16( %va, bfloat %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: fcmp_ord_vf_swap_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vmfeq.vv v8, v10, v10, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vmfeq.vv v9, v10, v10, v0.t +; CHECK-NEXT: vmand.mm v0, v9, v8 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fcmp.nxv1bf16( %vb, %va, metadata !"ord", %m, i32 %evl) + ret %v +} + +define @fcmp_ueq_vv_nxv1bf16( %va, %vb, %m, i32 zeroext %evl) { +; CHECK-LABEL: fcmp_ueq_vv_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vmflt.vv v8, v9, v10, v0.t +; CHECK-NEXT: vmflt.vv v9, v10, v9, v0.t +; CHECK-NEXT: vmnor.mm v0, v9, v8 +; CHECK-NEXT: ret + %v = call @llvm.vp.fcmp.nxv1bf16( %va, %vb, metadata !"ueq", %m, i32 %evl) + ret %v +} + +define @fcmp_ueq_vf_nxv1bf16( %va, bfloat %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: fcmp_ueq_vf_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vmflt.vv v9, v10, v8, v0.t +; CHECK-NEXT: vmflt.vv v8, v8, v10, v0.t +; CHECK-NEXT: vmnor.mm v0, v8, v9 +; CHECK-NEXT: ret + 
%elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fcmp.nxv1bf16( %va, %vb, metadata !"ueq", %m, i32 %evl) + ret %v +} + +define @fcmp_ueq_vf_swap_nxv1bf16( %va, bfloat %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: fcmp_ueq_vf_swap_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vmflt.vv v9, v8, v10, v0.t +; CHECK-NEXT: vmflt.vv v8, v10, v8, v0.t +; CHECK-NEXT: vmnor.mm v0, v8, v9 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fcmp.nxv1bf16( %vb, %va, metadata !"ueq", %m, i32 %evl) + ret %v +} + +define @fcmp_ugt_vv_nxv1bf16( %va, %vb, %m, i32 zeroext %evl) { +; CHECK-LABEL: fcmp_ugt_vv_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vmfle.vv v8, v9, v10, v0.t +; CHECK-NEXT: vmnot.m v0, v8 +; CHECK-NEXT: ret + %v = call @llvm.vp.fcmp.nxv1bf16( %va, %vb, metadata !"ugt", %m, i32 %evl) + ret %v +} + +define @fcmp_ugt_vf_nxv1bf16( %va, bfloat %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: fcmp_ugt_vf_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vmfle.vv v8, v10, v8, v0.t +; CHECK-NEXT: vmnot.m v0, v8 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fcmp.nxv1bf16( %va, %vb, metadata !"ugt", %m, i32 %evl) + ret %v +} + +define @fcmp_ugt_vf_swap_nxv1bf16( %va, bfloat %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: fcmp_ugt_vf_swap_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vmfle.vv v8, v8, v10, v0.t +; CHECK-NEXT: vmnot.m v0, v8 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fcmp.nxv1bf16( %vb, %va, metadata !"ugt", %m, i32 %evl) + ret %v +} + +define @fcmp_uge_vv_nxv1bf16( %va, %vb, %m, i32 zeroext %evl) { +; CHECK-LABEL: fcmp_uge_vv_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vmflt.vv v8, v9, v10, v0.t +; CHECK-NEXT: vmnot.m v0, v8 +; CHECK-NEXT: ret + %v = call @llvm.vp.fcmp.nxv1bf16( %va, %vb, metadata !"uge", %m, i32 %evl) + ret %v +} + +define @fcmp_uge_vf_nxv1bf16( %va, bfloat %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: fcmp_uge_vf_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 +; CHECK-NEXT: vsetvli 
zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vmflt.vv v8, v10, v8, v0.t +; CHECK-NEXT: vmnot.m v0, v8 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fcmp.nxv1bf16( %va, %vb, metadata !"uge", %m, i32 %evl) + ret %v +} + +define @fcmp_uge_vf_swap_nxv1bf16( %va, bfloat %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: fcmp_uge_vf_swap_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vmflt.vv v8, v8, v10, v0.t +; CHECK-NEXT: vmnot.m v0, v8 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fcmp.nxv1bf16( %vb, %va, metadata !"uge", %m, i32 %evl) + ret %v +} + +define @fcmp_ult_vv_nxv1bf16( %va, %vb, %m, i32 zeroext %evl) { +; CHECK-LABEL: fcmp_ult_vv_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vmfle.vv v8, v8, v10, v0.t +; CHECK-NEXT: vmnot.m v0, v8 +; CHECK-NEXT: ret + %v = call @llvm.vp.fcmp.nxv1bf16( %va, %vb, metadata !"ult", %m, i32 %evl) + ret %v +} + +define @fcmp_ult_vf_nxv1bf16( %va, bfloat %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: fcmp_ult_vf_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vmfle.vv v8, v8, v10, v0.t +; CHECK-NEXT: vmnot.m v0, v8 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fcmp.nxv1bf16( %va, %vb, metadata !"ult", %m, i32 %evl) + ret %v +} + +define @fcmp_ult_vf_swap_nxv1bf16( %va, bfloat %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: fcmp_ult_vf_swap_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vmfle.vv v8, v10, v8, v0.t +; CHECK-NEXT: vmnot.m v0, v8 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fcmp.nxv1bf16( %vb, %va, metadata !"ult", %m, i32 %evl) + ret %v +} + +define @fcmp_ule_vv_nxv1bf16( %va, %vb, %m, i32 zeroext %evl) { +; CHECK-LABEL: fcmp_ule_vv_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vmflt.vv v8, v8, v10, v0.t +; CHECK-NEXT: vmnot.m v0, v8 +; CHECK-NEXT: ret + %v = call @llvm.vp.fcmp.nxv1bf16( %va, %vb, metadata !"ule", %m, i32 %evl) + ret %v +} + +define @fcmp_ule_vf_nxv1bf16( %va, bfloat %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: fcmp_ule_vf_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: 
vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vmflt.vv v8, v8, v10, v0.t +; CHECK-NEXT: vmnot.m v0, v8 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fcmp.nxv1bf16( %va, %vb, metadata !"ule", %m, i32 %evl) + ret %v +} + +define @fcmp_ule_vf_swap_nxv1bf16( %va, bfloat %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: fcmp_ule_vf_swap_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vmflt.vv v8, v10, v8, v0.t +; CHECK-NEXT: vmnot.m v0, v8 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fcmp.nxv1bf16( %vb, %va, metadata !"ule", %m, i32 %evl) + ret %v +} + +define @fcmp_une_vv_nxv1bf16( %va, %vb, %m, i32 zeroext %evl) { +; CHECK-LABEL: fcmp_une_vv_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vmfne.vv v0, v9, v10, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.fcmp.nxv1bf16( %va, %vb, metadata !"une", %m, i32 %evl) + ret %v +} + +define @fcmp_une_vf_nxv1bf16( %va, bfloat %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: fcmp_une_vf_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vmfne.vv v0, v10, v8, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fcmp.nxv1bf16( %va, %vb, metadata !"une", %m, i32 %evl) + ret %v +} + +define @fcmp_une_vf_swap_nxv1bf16( %va, bfloat %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: fcmp_une_vf_swap_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vmfne.vv v0, v8, v10, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fcmp.nxv1bf16( %vb, %va, metadata !"une", %m, i32 %evl) + ret %v +} + +define @fcmp_uno_vv_nxv1bf16( %va, %vb, %m, i32 zeroext %evl) { +; CHECK-LABEL: fcmp_uno_vv_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vmfne.vv v9, v10, v10, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vmfne.vv v8, v10, v10, v0.t +; CHECK-NEXT: vmor.mm v0, v8, v9 +; CHECK-NEXT: ret + %v = call @llvm.vp.fcmp.nxv1bf16( %va, %vb, metadata !"uno", %m, i32 %evl) + ret %v +} + +define @fcmp_uno_vf_nxv1bf16( %va, bfloat %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: fcmp_uno_vf_nxv1bf16: +; CHECK: # 
%bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vmfne.vv v8, v10, v10, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vmfne.vv v9, v10, v10, v0.t +; CHECK-NEXT: vmor.mm v0, v8, v9 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fcmp.nxv1bf16( %va, %vb, metadata !"uno", %m, i32 %evl) + ret %v +} + +define @fcmp_uno_vf_swap_nxv1bf16( %va, bfloat %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: fcmp_uno_vf_swap_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vmfne.vv v8, v10, v10, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vmfne.vv v9, v10, v10, v0.t +; CHECK-NEXT: vmor.mm v0, v9, v8 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fcmp.nxv1bf16( %vb, %va, metadata !"uno", %m, i32 %evl) + ret %v +} + +declare @llvm.vp.fcmp.nxv3bf16(, , metadata, , i32) + +define @fcmp_oeq_vv_nxv3bf16( %va, %vb, %m, i32 zeroext %evl) { +; CHECK-LABEL: fcmp_oeq_vv_nxv3bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vmfeq.vv v8, v12, v10, v0.t +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: ret + %v = call @llvm.vp.fcmp.nxv3bf16( %va, %vb, metadata !"oeq", %m, i32 %evl) + ret %v +} + +declare @llvm.vp.fcmp.nxv8bf16(, , metadata, , i32) + +define @fcmp_oeq_vv_nxv8bf16( %va, %vb, %m, i32 zeroext %evl) { +; CHECK-LABEL: fcmp_oeq_vv_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vmfeq.vv v8, v16, v12, v0.t +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: ret + %v = call @llvm.vp.fcmp.nxv8bf16( %va, %vb, metadata !"oeq", %m, i32 %evl) + ret %v +} + +define @fcmp_oeq_vf_nxv8bf16( %va, bfloat %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: fcmp_oeq_vf_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vmfeq.vv v8, v12, v16, v0.t +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fcmp.nxv8bf16( %va, %vb, metadata !"oeq", %m, i32 %evl) + ret %v +} + +define @fcmp_oeq_vf_swap_nxv8bf16( %va, bfloat %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: fcmp_oeq_vf_swap_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: 
vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vmfeq.vv v8, v16, v12, v0.t +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fcmp.nxv8bf16( %vb, %va, metadata !"oeq", %m, i32 %evl) + ret %v +} + +define @fcmp_ogt_vv_nxv8bf16( %va, %vb, %m, i32 zeroext %evl) { +; CHECK-LABEL: fcmp_ogt_vv_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vmflt.vv v8, v16, v12, v0.t +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: ret + %v = call @llvm.vp.fcmp.nxv8bf16( %va, %vb, metadata !"ogt", %m, i32 %evl) + ret %v +} + +define @fcmp_ogt_vf_nxv8bf16( %va, bfloat %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: fcmp_ogt_vf_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vmflt.vv v8, v16, v12, v0.t +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fcmp.nxv8bf16( %va, %vb, metadata !"ogt", %m, i32 %evl) + ret %v +} + +define @fcmp_ogt_vf_swap_nxv8bf16( %va, bfloat %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: fcmp_ogt_vf_swap_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vmflt.vv v8, v12, v16, v0.t +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fcmp.nxv8bf16( %vb, %va, metadata !"ogt", %m, i32 %evl) + ret %v +} + +define @fcmp_oge_vv_nxv8bf16( %va, %vb, %m, i32 zeroext %evl) { +; CHECK-LABEL: fcmp_oge_vv_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vmfle.vv v8, v16, v12, v0.t +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: ret + %v = call @llvm.vp.fcmp.nxv8bf16( %va, %vb, metadata !"oge", %m, i32 %evl) + ret %v +} + +define @fcmp_oge_vf_nxv8bf16( %va, bfloat %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: fcmp_oge_vf_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vmfle.vv v8, v16, v12, v0.t +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fcmp.nxv8bf16( %va, %vb, metadata !"oge", %m, i32 %evl) + ret %v +} + +define @fcmp_oge_vf_swap_nxv8bf16( %va, bfloat %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: fcmp_oge_vf_swap_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; 
CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vmfle.vv v8, v12, v16, v0.t +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fcmp.nxv8bf16( %vb, %va, metadata !"oge", %m, i32 %evl) + ret %v +} + +define @fcmp_olt_vv_nxv8bf16( %va, %vb, %m, i32 zeroext %evl) { +; CHECK-LABEL: fcmp_olt_vv_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vmflt.vv v8, v16, v12, v0.t +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: ret + %v = call @llvm.vp.fcmp.nxv8bf16( %va, %vb, metadata !"olt", %m, i32 %evl) + ret %v +} + +define @fcmp_olt_vf_nxv8bf16( %va, bfloat %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: fcmp_olt_vf_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vmflt.vv v8, v12, v16, v0.t +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fcmp.nxv8bf16( %va, %vb, metadata !"olt", %m, i32 %evl) + ret %v +} + +define @fcmp_olt_vf_swap_nxv8bf16( %va, bfloat %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: fcmp_olt_vf_swap_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vmflt.vv v8, v16, v12, v0.t +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fcmp.nxv8bf16( %vb, %va, metadata !"olt", %m, i32 %evl) + ret %v +} + +define @fcmp_ole_vv_nxv8bf16( %va, %vb, %m, i32 zeroext %evl) { +; CHECK-LABEL: fcmp_ole_vv_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vmfle.vv v8, v16, v12, v0.t +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: ret + %v = call @llvm.vp.fcmp.nxv8bf16( %va, %vb, metadata !"ole", %m, i32 %evl) + ret %v +} + +define @fcmp_ole_vf_nxv8bf16( %va, bfloat %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: fcmp_ole_vf_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vmfle.vv v8, v12, v16, v0.t +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fcmp.nxv8bf16( %va, %vb, metadata !"ole", %m, i32 %evl) + ret %v +} + +define @fcmp_ole_vf_swap_nxv8bf16( %va, bfloat %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: fcmp_ole_vf_swap_nxv8bf16: +; CHECK: # %bb.0: 
+; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vmfle.vv v8, v16, v12, v0.t +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fcmp.nxv8bf16( %vb, %va, metadata !"ole", %m, i32 %evl) + ret %v +} + +define @fcmp_one_vv_nxv8bf16( %va, %vb, %m, i32 zeroext %evl) { +; CHECK-LABEL: fcmp_one_vv_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vmflt.vv v8, v16, v12, v0.t +; CHECK-NEXT: vmflt.vv v9, v12, v16, v0.t +; CHECK-NEXT: vmor.mm v0, v9, v8 +; CHECK-NEXT: ret + %v = call @llvm.vp.fcmp.nxv8bf16( %va, %vb, metadata !"one", %m, i32 %evl) + ret %v +} + +define @fcmp_one_vf_nxv8bf16( %va, bfloat %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: fcmp_one_vf_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vmflt.vv v8, v12, v16, v0.t +; CHECK-NEXT: vmflt.vv v9, v16, v12, v0.t +; CHECK-NEXT: vmor.mm v0, v9, v8 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fcmp.nxv8bf16( %va, %vb, metadata !"one", %m, i32 %evl) + ret %v +} + +define @fcmp_one_vf_swap_nxv8bf16( %va, bfloat %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: fcmp_one_vf_swap_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vmflt.vv v8, v16, v12, v0.t +; CHECK-NEXT: vmflt.vv v9, v12, v16, v0.t +; CHECK-NEXT: vmor.mm v0, v9, v8 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fcmp.nxv8bf16( %vb, %va, metadata !"one", %m, i32 %evl) + ret %v +} + +define @fcmp_ord_vv_nxv8bf16( %va, %vb, %m, i32 zeroext %evl) { +; CHECK-LABEL: fcmp_ord_vv_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vmfeq.vv v10, v12, v12, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vmfeq.vv v8, v12, v12, v0.t +; CHECK-NEXT: vmand.mm v0, v8, v10 +; CHECK-NEXT: ret + %v = call @llvm.vp.fcmp.nxv8bf16( %va, %vb, metadata !"ord", %m, i32 %evl) + ret %v +} + +define @fcmp_ord_vf_nxv8bf16( %va, bfloat %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: fcmp_ord_vf_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vmfeq.vv v8, v12, v12, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; 
CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vmfeq.vv v9, v12, v12, v0.t +; CHECK-NEXT: vmand.mm v0, v8, v9 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fcmp.nxv8bf16( %va, %vb, metadata !"ord", %m, i32 %evl) + ret %v +} + +define @fcmp_ord_vf_swap_nxv8bf16( %va, bfloat %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: fcmp_ord_vf_swap_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vmfeq.vv v8, v12, v12, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vmfeq.vv v9, v12, v12, v0.t +; CHECK-NEXT: vmand.mm v0, v9, v8 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fcmp.nxv8bf16( %vb, %va, metadata !"ord", %m, i32 %evl) + ret %v +} + +define @fcmp_ueq_vv_nxv8bf16( %va, %vb, %m, i32 zeroext %evl) { +; CHECK-LABEL: fcmp_ueq_vv_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vmflt.vv v8, v16, v12, v0.t +; CHECK-NEXT: vmflt.vv v9, v12, v16, v0.t +; CHECK-NEXT: vmnor.mm v0, v9, v8 +; CHECK-NEXT: ret + %v = call @llvm.vp.fcmp.nxv8bf16( %va, %vb, metadata !"ueq", %m, i32 %evl) + ret %v +} + +define @fcmp_ueq_vf_nxv8bf16( %va, bfloat %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: fcmp_ueq_vf_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vmflt.vv v8, v12, v16, v0.t +; CHECK-NEXT: vmflt.vv v9, v16, v12, v0.t +; CHECK-NEXT: vmnor.mm v0, v9, v8 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fcmp.nxv8bf16( %va, %vb, metadata !"ueq", %m, i32 %evl) + ret %v +} + +define @fcmp_ueq_vf_swap_nxv8bf16( %va, bfloat %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: fcmp_ueq_vf_swap_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vmflt.vv v8, v16, v12, v0.t +; CHECK-NEXT: vmflt.vv v9, v12, v16, v0.t +; CHECK-NEXT: vmnor.mm v0, v9, v8 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fcmp.nxv8bf16( %vb, %va, metadata !"ueq", %m, i32 %evl) + ret %v +} + +define @fcmp_ugt_vv_nxv8bf16( %va, %vb, %m, i32 zeroext %evl) { +; CHECK-LABEL: fcmp_ugt_vv_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vmfle.vv v8, v16, v12, v0.t +; CHECK-NEXT: vmnot.m v0, v8 +; CHECK-NEXT: 
ret + %v = call @llvm.vp.fcmp.nxv8bf16( %va, %vb, metadata !"ugt", %m, i32 %evl) + ret %v +} + +define @fcmp_ugt_vf_nxv8bf16( %va, bfloat %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: fcmp_ugt_vf_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vmfle.vv v8, v12, v16, v0.t +; CHECK-NEXT: vmnot.m v0, v8 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fcmp.nxv8bf16( %va, %vb, metadata !"ugt", %m, i32 %evl) + ret %v +} + +define @fcmp_ugt_vf_swap_nxv8bf16( %va, bfloat %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: fcmp_ugt_vf_swap_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vmfle.vv v8, v16, v12, v0.t +; CHECK-NEXT: vmnot.m v0, v8 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fcmp.nxv8bf16( %vb, %va, metadata !"ugt", %m, i32 %evl) + ret %v +} + +define @fcmp_uge_vv_nxv8bf16( %va, %vb, %m, i32 zeroext %evl) { +; CHECK-LABEL: fcmp_uge_vv_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vmflt.vv v8, v16, v12, v0.t +; CHECK-NEXT: vmnot.m v0, v8 +; CHECK-NEXT: ret + %v = call @llvm.vp.fcmp.nxv8bf16( %va, %vb, metadata !"uge", %m, i32 %evl) + ret %v +} + +define @fcmp_uge_vf_nxv8bf16( %va, bfloat %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: fcmp_uge_vf_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vmflt.vv v8, v12, v16, v0.t +; CHECK-NEXT: vmnot.m v0, v8 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fcmp.nxv8bf16( %va, %vb, metadata !"uge", %m, i32 %evl) + ret %v +} + +define @fcmp_uge_vf_swap_nxv8bf16( %va, bfloat %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: fcmp_uge_vf_swap_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vmflt.vv v8, v16, v12, v0.t +; CHECK-NEXT: vmnot.m v0, v8 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fcmp.nxv8bf16( %vb, %va, metadata !"uge", %m, i32 %evl) + ret %v +} + +define @fcmp_ult_vv_nxv8bf16( %va, %vb, %m, i32 zeroext %evl) { +; CHECK-LABEL: fcmp_ult_vv_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; 
CHECK-NEXT: vmfle.vv v8, v16, v12, v0.t +; CHECK-NEXT: vmnot.m v0, v8 +; CHECK-NEXT: ret + %v = call @llvm.vp.fcmp.nxv8bf16( %va, %vb, metadata !"ult", %m, i32 %evl) + ret %v +} + +define @fcmp_ult_vf_nxv8bf16( %va, bfloat %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: fcmp_ult_vf_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vmfle.vv v8, v16, v12, v0.t +; CHECK-NEXT: vmnot.m v0, v8 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fcmp.nxv8bf16( %va, %vb, metadata !"ult", %m, i32 %evl) + ret %v +} + +define @fcmp_ult_vf_swap_nxv8bf16( %va, bfloat %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: fcmp_ult_vf_swap_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vmfle.vv v8, v12, v16, v0.t +; CHECK-NEXT: vmnot.m v0, v8 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fcmp.nxv8bf16( %vb, %va, metadata !"ult", %m, i32 %evl) + ret %v +} + +define @fcmp_ule_vv_nxv8bf16( %va, %vb, %m, i32 zeroext %evl) { +; CHECK-LABEL: fcmp_ule_vv_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vmflt.vv v8, v16, v12, v0.t +; CHECK-NEXT: vmnot.m v0, v8 +; CHECK-NEXT: ret + %v = call @llvm.vp.fcmp.nxv8bf16( %va, %vb, metadata !"ule", %m, i32 %evl) + ret %v +} + +define @fcmp_ule_vf_nxv8bf16( %va, bfloat %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: fcmp_ule_vf_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vmflt.vv v8, v16, v12, v0.t +; CHECK-NEXT: vmnot.m v0, v8 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fcmp.nxv8bf16( %va, %vb, metadata !"ule", %m, i32 %evl) + ret %v +} + +define @fcmp_ule_vf_swap_nxv8bf16( %va, bfloat %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: fcmp_ule_vf_swap_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vmflt.vv v8, v12, v16, v0.t +; CHECK-NEXT: vmnot.m v0, v8 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fcmp.nxv8bf16( %vb, %va, metadata !"ule", %m, i32 %evl) + ret %v +} + +define @fcmp_une_vv_nxv8bf16( %va, %vb, %m, i32 zeroext %evl) { +; CHECK-LABEL: fcmp_une_vv_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; 
CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vmfne.vv v8, v16, v12, v0.t +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: ret + %v = call @llvm.vp.fcmp.nxv8bf16( %va, %vb, metadata !"une", %m, i32 %evl) + ret %v +} + +define @fcmp_une_vf_nxv8bf16( %va, bfloat %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: fcmp_une_vf_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vmfne.vv v8, v12, v16, v0.t +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fcmp.nxv8bf16( %va, %vb, metadata !"une", %m, i32 %evl) + ret %v +} + +define @fcmp_une_vf_swap_nxv8bf16( %va, bfloat %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: fcmp_une_vf_swap_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vmfne.vv v8, v16, v12, v0.t +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fcmp.nxv8bf16( %vb, %va, metadata !"une", %m, i32 %evl) + ret %v +} + +define @fcmp_uno_vv_nxv8bf16( %va, %vb, %m, i32 zeroext %evl) { +; CHECK-LABEL: fcmp_uno_vv_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vmfne.vv v10, v12, v12, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vmfne.vv v8, v12, v12, v0.t +; CHECK-NEXT: vmor.mm v0, v8, v10 +; CHECK-NEXT: ret + %v = call @llvm.vp.fcmp.nxv8bf16( %va, %vb, metadata !"uno", %m, i32 %evl) + ret %v +} + +define @fcmp_uno_vf_nxv8bf16( %va, bfloat %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: fcmp_uno_vf_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vmfne.vv v8, v12, v12, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vmfne.vv v9, v12, v12, v0.t +; CHECK-NEXT: vmor.mm v0, v8, v9 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fcmp.nxv8bf16( %va, %vb, metadata !"uno", %m, i32 %evl) + ret %v +} + +define @fcmp_uno_vf_swap_nxv8bf16( %va, bfloat %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: fcmp_uno_vf_swap_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vmfne.vv v8, v12, v12, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, 
ta, ma +; CHECK-NEXT: vmfne.vv v9, v12, v12, v0.t +; CHECK-NEXT: vmor.mm v0, v9, v8 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fcmp.nxv8bf16( %vb, %va, metadata !"uno", %m, i32 %evl) + ret %v +} + +declare @llvm.vp.fcmp.nxv64bf16(, , metadata, , i32) + +define @fcmp_oeq_vv_nxv64bf16( %va, %vb, %m, i32 zeroext %evl) { +; CHECK-LABEL: fcmp_oeq_vv_nxv64bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: mv a3, a1 +; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: add a1, a1, a3 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x22, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 34 * vlenb +; CHECK-NEXT: vmv8r.v v24, v16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: mv a3, a1 +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, a1, a3 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a3, vlenb +; CHECK-NEXT: slli a1, a3, 3 +; CHECK-NEXT: add a1, a0, a1 +; CHECK-NEXT: vl8re16.v v16, (a1) +; CHECK-NEXT: slli a5, a3, 2 +; CHECK-NEXT: sub a1, a2, a5 +; CHECK-NEXT: sltu a4, a2, a1 +; CHECK-NEXT: addi a4, a4, -1 +; CHECK-NEXT: and a6, a4, a1 +; CHECK-NEXT: slli a4, a3, 1 +; CHECK-NEXT: sub a1, a6, a4 +; CHECK-NEXT: sltu a7, a6, a1 +; CHECK-NEXT: addi a7, a7, -1 +; CHECK-NEXT: and a7, a7, a1 +; CHECK-NEXT: srli a1, a3, 1 +; CHECK-NEXT: csrr t0, vlenb +; CHECK-NEXT: add t0, sp, t0 +; CHECK-NEXT: addi t0, t0, 16 +; CHECK-NEXT: vs1r.v v0, (t0) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli t0, zero, e8, m1, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v0, a1 +; CHECK-NEXT: srli a3, a3, 2 +; CHECK-NEXT: addi t0, sp, 16 +; CHECK-NEXT: vs1r.v v8, (t0) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli t0, zero, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v0, v8, a3 +; CHECK-NEXT: vl8re16.v v8, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: mv t0, a0 +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: add t0, t0, a0 +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: add a0, a0, t0 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v20 +; CHECK-NEXT: vmv4r.v v16, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: mv t0, a0 +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: add a0, a0, t0 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v28 +; CHECK-NEXT: vsetvli zero, a7, e32, m8, ta, ma +; CHECK-NEXT: vmfeq.vv v26, v16, v8, v0.t +; CHECK-NEXT: bltu a6, a4, .LBB85_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a6, a4 +; CHECK-NEXT: .LBB85_2: +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v16 +; CHECK-NEXT: csrr a0, vlenb +; 
CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: mv a7, a0 +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: add a0, a0, a7 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl1r.v v0, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli zero, a6, e32, m8, ta, ma +; CHECK-NEXT: vmfeq.vv v6, v16, v8, v0.t +; CHECK-NEXT: add a0, a3, a3 +; CHECK-NEXT: bltu a2, a5, .LBB85_4 +; CHECK-NEXT: # %bb.3: +; CHECK-NEXT: mv a2, a5 +; CHECK-NEXT: .LBB85_4: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vslideup.vx v6, v26, a3 +; CHECK-NEXT: sub a5, a2, a4 +; CHECK-NEXT: sltu a6, a2, a5 +; CHECK-NEXT: addi a6, a6, -1 +; CHECK-NEXT: and a5, a6, a5 +; CHECK-NEXT: csrr a6, vlenb +; CHECK-NEXT: add a6, sp, a6 +; CHECK-NEXT: addi a6, a6, 16 +; CHECK-NEXT: vl1r.v v8, (a6) # Unknown-size Folded Reload +; CHECK-NEXT: vmv1r.v v7, v8 +; CHECK-NEXT: vsetvli a6, zero, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v0, v8, a3 +; CHECK-NEXT: csrr a6, vlenb +; CHECK-NEXT: slli a6, a6, 1 +; CHECK-NEXT: mv a7, a6 +; CHECK-NEXT: slli a6, a6, 3 +; CHECK-NEXT: add a6, a6, a7 +; CHECK-NEXT: add a6, sp, a6 +; CHECK-NEXT: addi a6, a6, 16 +; CHECK-NEXT: vl8r.v v24, (a6) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli a6, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v28 +; CHECK-NEXT: csrr a6, vlenb +; CHECK-NEXT: slli a6, a6, 1 +; CHECK-NEXT: mv a7, a6 +; CHECK-NEXT: slli a6, a6, 2 +; CHECK-NEXT: add a6, a6, a7 +; CHECK-NEXT: add a6, sp, a6 +; CHECK-NEXT: addi a6, a6, 16 +; CHECK-NEXT: vs8r.v v8, (a6) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a6, vlenb +; CHECK-NEXT: slli a6, a6, 1 +; CHECK-NEXT: mv a7, a6 +; CHECK-NEXT: slli a6, a6, 2 +; CHECK-NEXT: add a7, a7, a6 +; CHECK-NEXT: slli a6, a6, 1 +; CHECK-NEXT: add a6, a6, a7 +; CHECK-NEXT: add a6, sp, a6 +; CHECK-NEXT: addi a6, a6, 16 +; CHECK-NEXT: vl8r.v v16, (a6) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v20 +; CHECK-NEXT: csrr a6, vlenb +; CHECK-NEXT: slli a6, a6, 1 +; CHECK-NEXT: mv a7, a6 +; CHECK-NEXT: slli a6, a6, 2 +; CHECK-NEXT: add a6, a6, a7 +; CHECK-NEXT: add a6, sp, a6 +; CHECK-NEXT: addi a6, a6, 16 +; CHECK-NEXT: vl8r.v v16, (a6) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli zero, a5, e32, m8, ta, ma +; CHECK-NEXT: vmfeq.vv v5, v16, v8, v0.t +; CHECK-NEXT: bltu a2, a4, .LBB85_6 +; CHECK-NEXT: # %bb.5: +; CHECK-NEXT: mv a2, a4 +; CHECK-NEXT: .LBB85_6: +; CHECK-NEXT: vsetvli a4, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24 +; CHECK-NEXT: csrr a4, vlenb +; CHECK-NEXT: slli a4, a4, 1 +; CHECK-NEXT: mv a5, a4 +; CHECK-NEXT: slli a4, a4, 2 +; CHECK-NEXT: add a5, a5, a4 +; CHECK-NEXT: slli a4, a4, 1 +; CHECK-NEXT: add a4, a4, a5 +; CHECK-NEXT: add a4, sp, a4 +; CHECK-NEXT: addi a4, a4, 16 +; CHECK-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 +; CHECK-NEXT: vmv1r.v v0, v7 +; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; CHECK-NEXT: vmfeq.vv v8, v16, v24, v0.t +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vslideup.vx v8, v5, a3 +; CHECK-NEXT: add a0, a1, a1 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vslideup.vx v8, v6, a1 +; CHECK-NEXT: vmv.v.v v0, v8 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: mv a1, a0 +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; 
CHECK-NEXT: ret + %v = call @llvm.vp.fcmp.nxv64bf16( %va, %vb, metadata !"oeq", %m, i32 %evl) + ret %v +} declare @llvm.vp.fcmp.nxv1f16(, , metadata, , i32) @@ -2108,10 +3760,10 @@ define @fcmp_oeq_vv_nxv64f16( %va, @fcmp_oeq_vv_nxv64f16( %va, @fcmp_oeq_vv_nxv64f16( %va, @fcmp_oeq_vv_nxv64f16( %va, @fcmp_oeq_vv_nxv64f16( %va, @fcmp_oeq_vv_nxv64f16( %va, @fcmp_oeq_vv_nxv64f16( %va, @fcmp_oeq_vv_nxv64f16( %va, @fcmp_uno_vf_swap_nxv8f64( %va, do declare @llvm.vp.fcmp.nxv32f64(, , metadata, , i32) define @fcmp_oeq_vv_nxv32f64( %va, %vb, %m, i32 zeroext %evl) { -; CHECK-LABEL: fcmp_oeq_vv_nxv32f64: -; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: li a3, 48 -; CHECK-NEXT: mul a1, a1, a3 -; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 48 * vlenb -; CHECK-NEXT: vmv1r.v v24, v0 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 5 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: li a3, 40 -; CHECK-NEXT: mul a1, a1, a3 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: csrr a4, vlenb -; CHECK-NEXT: slli t0, a4, 3 -; CHECK-NEXT: slli a1, a4, 5 -; CHECK-NEXT: sub t1, a1, t0 -; CHECK-NEXT: srli a1, a4, 2 -; CHECK-NEXT: vsetvli a3, zero, e8, mf2, ta, ma -; CHECK-NEXT: vslidedown.vx v7, v0, a1 -; CHECK-NEXT: srli a3, a4, 3 -; CHECK-NEXT: add a5, a2, t0 -; CHECK-NEXT: vl8re64.v v8, (a5) -; CHECK-NEXT: slli t3, a4, 4 -; CHECK-NEXT: slli a5, a4, 1 -; CHECK-NEXT: vsetvli a7, zero, e8, mf4, ta, ma -; CHECK-NEXT: vslidedown.vx v0, v0, a3 -; CHECK-NEXT: mv a7, a6 -; CHECK-NEXT: bltu a6, a5, .LBB171_2 -; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: mv a7, a5 -; CHECK-NEXT: .LBB171_2: -; CHECK-NEXT: add t2, a0, t0 -; CHECK-NEXT: add t1, a2, t1 -; CHECK-NEXT: add t0, a2, t3 -; CHECK-NEXT: vl8re64.v v16, (a2) -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 4 -; CHECK-NEXT: add a2, sp, a2 -; CHECK-NEXT: addi a2, a2, 16 -; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill -; CHECK-NEXT: sub a2, a7, a4 -; CHECK-NEXT: sltu t3, a7, a2 -; CHECK-NEXT: addi t3, t3, -1 -; CHECK-NEXT: and a2, t3, a2 -; CHECK-NEXT: csrr t3, vlenb -; CHECK-NEXT: slli t3, t3, 5 -; CHECK-NEXT: add t3, sp, t3 -; CHECK-NEXT: addi t3, t3, 16 -; CHECK-NEXT: vl8r.v v16, (t3) # Unknown-size Folded Reload -; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; CHECK-NEXT: vmfeq.vv v6, v16, v8, v0.t -; CHECK-NEXT: bltu a7, a4, .LBB171_4 -; CHECK-NEXT: # %bb.3: -; CHECK-NEXT: mv a7, a4 -; CHECK-NEXT: .LBB171_4: -; CHECK-NEXT: vl8re64.v v8, (t2) -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 5 -; CHECK-NEXT: add a2, sp, a2 -; CHECK-NEXT: addi a2, a2, 16 -; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; CHECK-NEXT: vl8re64.v v8, (t1) -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: li t1, 24 -; CHECK-NEXT: mul a2, a2, t1 -; CHECK-NEXT: add a2, sp, a2 -; CHECK-NEXT: addi a2, a2, 16 -; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; CHECK-NEXT: vsetvli a2, zero, e8, mf4, ta, ma -; CHECK-NEXT: vslidedown.vx v18, v7, a3 -; CHECK-NEXT: vl8re64.v v8, (t0) -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 -; CHECK-NEXT: add a2, sp, a2 -; CHECK-NEXT: addi a2, a2, 16 -; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; 
CHECK-NEXT:    vl8re64.v v8, (a0)
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a2, 40
-; CHECK-NEXT:    mul a0, a0, a2
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vsetvli zero, a7, e64, m8, ta, ma
-; CHECK-NEXT:    vmfeq.vv v17, v24, v8, v0.t
-; CHECK-NEXT:    add a2, a3, a3
-; CHECK-NEXT:    sub a0, a6, a5
-; CHECK-NEXT:    sltu a5, a6, a0
-; CHECK-NEXT:    addi a5, a5, -1
-; CHECK-NEXT:    and a0, a5, a0
-; CHECK-NEXT:    vsetvli zero, a2, e8, mf2, tu, ma
-; CHECK-NEXT:    vslideup.vx v17, v6, a3
-; CHECK-NEXT:    mv a2, a0
-; CHECK-NEXT:    bltu a0, a4, .LBB171_6
-; CHECK-NEXT:  # %bb.5:
-; CHECK-NEXT:    mv a2, a4
-; CHECK-NEXT:  .LBB171_6:
-; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    csrr a5, vlenb
-; CHECK-NEXT:    slli a5, a5, 3
-; CHECK-NEXT:    add a5, sp, a5
-; CHECK-NEXT:    addi a5, a5, 16
-; CHECK-NEXT:    vl8r.v v8, (a5) # Unknown-size Folded Reload
-; CHECK-NEXT:    addi a5, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a5) # Unknown-size Folded Reload
-; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-NEXT:    vmfeq.vv v16, v24, v8, v0.t
-; CHECK-NEXT:    sub a2, a0, a4
-; CHECK-NEXT:    sltu a0, a0, a2
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a2
-; CHECK-NEXT:    vmv1r.v v0, v18
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 5
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    li a4, 24
-; CHECK-NEXT:    mul a2, a2, a4
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
-; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vmfeq.vv v18, v24, v8, v0.t
-; CHECK-NEXT:    add a0, a1, a3
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, tu, ma
-; CHECK-NEXT:    vslideup.vx v17, v16, a1
-; CHECK-NEXT:    slli a0, a3, 1
-; CHECK-NEXT:    add a0, a0, a3
-; CHECK-NEXT:    add a3, a0, a3
-; CHECK-NEXT:    vsetvli zero, a3, e8, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vx v17, v18, a0
-; CHECK-NEXT:    vmv1r.v v0, v17
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 48
-; CHECK-NEXT:    mul a0, a0, a1
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    ret
   %v = call <vscale x 32 x i1> @llvm.vp.fcmp.nxv32f64(<vscale x 32 x double> %va, <vscale x 32 x double> %vb, metadata !"oeq", <vscale x 32 x i1> %m, i32 %evl)
   ret <vscale x 32 x i1> %v
 }
diff --git a/llvm/test/CodeGen/RISCV/rvv/setcc-fp.ll b/llvm/test/CodeGen/RISCV/rvv/setcc-fp.ll
index c2c977bec60555..23d73481aed2d7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/setcc-fp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/setcc-fp.ll
@@ -1,16 +1,1162 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zfh,+zvfh,+v -target-abi=ilp32d \
-; RUN:   -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH,RV32
-; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zfh,+zvfh,+v -target-abi=lp64d \
-; RUN:   -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH,RV64
-; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zfh,+zvfhmin,+v -target-abi=ilp32d \
-; RUN:   -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN,ZVFHMIN32
-; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zfh,+zvfhmin,+v -target-abi=lp64d \
-; RUN:   -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN,ZVFHMIN64
+; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \
+; RUN:   -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
+; RUN:   --check-prefixes=CHECK,ZVFH,RV32
+; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \
+; RUN:   -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
+; RUN:   --check-prefixes=CHECK,ZVFH,RV64
+; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zfh,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
+; RUN:   -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
+; RUN:   --check-prefixes=CHECK,ZVFHMIN,ZVFHMIN32
+; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zfh,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
+; RUN:   -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
+; RUN:   --check-prefixes=CHECK,ZVFHMIN,ZVFHMIN64
 
 ; FIXME: The scalar/vector operations ('fv' tests) should swap operands and
 ; condition codes accordingly in order to generate a 'vf' instruction.
 
+define <vscale x 8 x i1> @fcmp_oeq_vv_nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb) {
+; CHECK-LABEL: fcmp_oeq_vv_nxv8bf16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT:    vmfeq.vv v0, v16, v12
+; CHECK-NEXT:    ret
+  %vc = fcmp oeq <vscale x 8 x bfloat> %va, %vb
+  ret <vscale x 8 x i1> %vc
+}
+
+define <vscale x 8 x i1> @fcmp_oeq_vf_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat %b) {
+; CHECK-LABEL: fcmp_oeq_vf_nxv8bf16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    fmv.x.h a0, fa0
+; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vmv.v.x v10, a0
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT:    vmfeq.vv v0, v12, v16
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+  %splat = shufflevector <vscale x 8 x bfloat> %head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+  %vc = fcmp oeq <vscale x 8 x bfloat> %va, %splat
+  ret <vscale x 8 x i1> %vc
+}
+
+define <vscale x 8 x i1> @fcmp_oeq_fv_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat %b) {
+; CHECK-LABEL: fcmp_oeq_fv_nxv8bf16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    fmv.x.h a0, fa0
+; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vmv.v.x v10, a0
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT:    vmfeq.vv v0, v16, v12
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+  %splat = shufflevector <vscale x 8 x bfloat> %head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+  %vc = fcmp oeq <vscale x 8 x bfloat> %splat, %va
+  ret <vscale x 8 x i1> %vc
+}
+
+define <vscale x 8 x i1> @fcmp_oeq_vv_nxv8bf16_nonans(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb) #0 {
+; CHECK-LABEL: fcmp_oeq_vv_nxv8bf16_nonans:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT:    vmfeq.vv v0, v16, v12
+; CHECK-NEXT:    ret
+  %vc = fcmp oeq <vscale x 8 x bfloat> %va, %vb
+  ret <vscale x 8 x i1> %vc
+}
+
+define <vscale x 8 x i1> @fcmp_oeq_vf_nxv8bf16_nonans(<vscale x 8 x bfloat> %va, bfloat %b) #0 {
+; CHECK-LABEL: fcmp_oeq_vf_nxv8bf16_nonans:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    fmv.x.h a0, fa0
+; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vmv.v.x v10, a0
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT:    vmfeq.vv v0, v12, v16
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+  %splat = shufflevector <vscale x 8 x bfloat> %head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+  %vc = fcmp oeq <vscale x 8 x bfloat> %va, %splat
+  ret <vscale x 8 x i1> %vc
+}
+
+define <vscale x 8 x i1> @fcmp_ogt_vv_nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb) {
+; CHECK-LABEL: fcmp_ogt_vv_nxv8bf16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli
a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vmflt.vv v0, v16, v12 +; CHECK-NEXT: ret + %vc = fcmp ogt %va, %vb + ret %vc +} + +define @fcmp_ogt_vf_nxv8bf16( %va, bfloat %b) { +; CHECK-LABEL: fcmp_ogt_vf_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vmflt.vv v0, v16, v12 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vc = fcmp ogt %va, %splat + ret %vc +} + +define @fcmp_ogt_fv_nxv8bf16( %va, bfloat %b) { +; CHECK-LABEL: fcmp_ogt_fv_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vmflt.vv v0, v12, v16 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vc = fcmp ogt %splat, %va + ret %vc +} + +define @fcmp_ogt_vv_nxv8bf16_nonans( %va, %vb) #0 { +; CHECK-LABEL: fcmp_ogt_vv_nxv8bf16_nonans: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vmflt.vv v0, v16, v12 +; CHECK-NEXT: ret + %vc = fcmp ogt %va, %vb + ret %vc +} + +define @fcmp_ogt_vf_nxv8bf16_nonans( %va, bfloat %b) #0 { +; CHECK-LABEL: fcmp_ogt_vf_nxv8bf16_nonans: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vmflt.vv v0, v16, v12 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vc = fcmp ogt %va, %splat + ret %vc +} + +define @fcmp_oge_vv_nxv8bf16( %va, %vb) { +; CHECK-LABEL: fcmp_oge_vv_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vmfle.vv v0, v16, v12 +; CHECK-NEXT: ret + %vc = fcmp oge %va, %vb + ret %vc +} + +define @fcmp_oge_vf_nxv8bf16( %va, bfloat %b) { +; CHECK-LABEL: fcmp_oge_vf_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vmfle.vv v0, v16, v12 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vc = fcmp oge %va, %splat + ret %vc +} + +define @fcmp_oge_fv_nxv8bf16( %va, bfloat %b) { +; CHECK-LABEL: fcmp_oge_fv_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, 
v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vmfle.vv v0, v12, v16 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vc = fcmp oge %splat, %va + ret %vc +} + +define @fcmp_oge_vv_nxv8bf16_nonans( %va, %vb) #0 { +; CHECK-LABEL: fcmp_oge_vv_nxv8bf16_nonans: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vmfle.vv v0, v16, v12 +; CHECK-NEXT: ret + %vc = fcmp oge %va, %vb + ret %vc +} + +define @fcmp_oge_vf_nxv8bf16_nonans( %va, bfloat %b) #0 { +; CHECK-LABEL: fcmp_oge_vf_nxv8bf16_nonans: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vmfle.vv v0, v16, v12 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vc = fcmp oge %va, %splat + ret %vc +} + +define @fcmp_olt_vv_nxv8bf16( %va, %vb) { +; CHECK-LABEL: fcmp_olt_vv_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vmflt.vv v0, v16, v12 +; CHECK-NEXT: ret + %vc = fcmp olt %va, %vb + ret %vc +} + +define @fcmp_olt_vf_nxv8bf16( %va, bfloat %b) { +; CHECK-LABEL: fcmp_olt_vf_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vmflt.vv v0, v12, v16 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vc = fcmp olt %va, %splat + ret %vc +} + +define @fcmp_olt_fv_nxv8bf16( %va, bfloat %b) { +; CHECK-LABEL: fcmp_olt_fv_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vmflt.vv v0, v16, v12 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vc = fcmp olt %splat, %va + ret %vc +} + +define @fcmp_olt_vv_nxv8bf16_nonans( %va, %vb) #0 { +; CHECK-LABEL: fcmp_olt_vv_nxv8bf16_nonans: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vmflt.vv v0, v16, v12 +; CHECK-NEXT: ret + %vc = fcmp olt %va, %vb + ret %vc +} + +define @fcmp_olt_vf_nxv8bf16_nonans( %va, bfloat %b) #0 { +; CHECK-LABEL: fcmp_olt_vf_nxv8bf16_nonans: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vmflt.vv v0, v12, v16 +; CHECK-NEXT: ret + %head = insertelement 
poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vc = fcmp olt %va, %splat + ret %vc +} + +define @fcmp_ole_vv_nxv8bf16( %va, %vb) { +; CHECK-LABEL: fcmp_ole_vv_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vmfle.vv v0, v16, v12 +; CHECK-NEXT: ret + %vc = fcmp ole %va, %vb + ret %vc +} + +define @fcmp_ole_vf_nxv8bf16( %va, bfloat %b) { +; CHECK-LABEL: fcmp_ole_vf_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vmfle.vv v0, v12, v16 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vc = fcmp ole %va, %splat + ret %vc +} + +define @fcmp_ole_fv_nxv8bf16( %va, bfloat %b) { +; CHECK-LABEL: fcmp_ole_fv_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vmfle.vv v0, v16, v12 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vc = fcmp ole %splat, %va + ret %vc +} + +define @fcmp_ole_vv_nxv8bf16_nonans( %va, %vb) #0 { +; CHECK-LABEL: fcmp_ole_vv_nxv8bf16_nonans: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vmfle.vv v0, v16, v12 +; CHECK-NEXT: ret + %vc = fcmp ole %va, %vb + ret %vc +} + +define @fcmp_ole_vf_nxv8bf16_nonans( %va, bfloat %b) #0 { +; CHECK-LABEL: fcmp_ole_vf_nxv8bf16_nonans: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vmfle.vv v0, v12, v16 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vc = fcmp ole %va, %splat + ret %vc +} + +define @fcmp_one_vv_nxv8bf16( %va, %vb) { +; CHECK-LABEL: fcmp_one_vv_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vmflt.vv v8, v16, v12 +; CHECK-NEXT: vmflt.vv v9, v12, v16 +; CHECK-NEXT: vmor.mm v0, v9, v8 +; CHECK-NEXT: ret + %vc = fcmp one %va, %vb + ret %vc +} + +define @fcmp_one_vf_nxv8bf16( %va, bfloat %b) { +; CHECK-LABEL: fcmp_one_vf_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vmflt.vv v8, v12, v16 +; CHECK-NEXT: vmflt.vv v9, v16, v12 +; CHECK-NEXT: vmor.mm v0, v9, v8 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + 
%splat = shufflevector %head, poison, zeroinitializer + %vc = fcmp one %va, %splat + ret %vc +} + +define @fcmp_one_fv_nxv8bf16( %va, bfloat %b) { +; CHECK-LABEL: fcmp_one_fv_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vmflt.vv v8, v16, v12 +; CHECK-NEXT: vmflt.vv v9, v12, v16 +; CHECK-NEXT: vmor.mm v0, v9, v8 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vc = fcmp one %splat, %va + ret %vc +} + +define @fcmp_one_vv_nxv8bf16_nonans( %va, %vb) #0 { +; CHECK-LABEL: fcmp_one_vv_nxv8bf16_nonans: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vmfne.vv v0, v16, v12 +; CHECK-NEXT: ret + %vc = fcmp one %va, %vb + ret %vc +} + +define @fcmp_one_vf_nxv8bf16_nonans( %va, bfloat %b) #0 { +; CHECK-LABEL: fcmp_one_vf_nxv8bf16_nonans: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vmfne.vv v0, v12, v16 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vc = fcmp one %va, %splat + ret %vc +} + +define @fcmp_ord_vv_nxv8bf16( %va, %vb) { +; CHECK-LABEL: fcmp_ord_vv_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vmfeq.vv v10, v12, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vmfeq.vv v8, v12, v12 +; CHECK-NEXT: vmand.mm v0, v8, v10 +; CHECK-NEXT: ret + %vc = fcmp ord %va, %vb + ret %vc +} + +define @fcmp_ord_vf_nxv8bf16( %va, bfloat %b) { +; CHECK-LABEL: fcmp_ord_vf_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vmfeq.vv v8, v12, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vmfeq.vv v9, v12, v12 +; CHECK-NEXT: vmand.mm v0, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vc = fcmp ord %va, %splat + ret %vc +} + +define @fcmp_ord_fv_nxv8bf16( %va, bfloat %b) { +; CHECK-LABEL: fcmp_ord_fv_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vmfeq.vv v8, v12, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vmfeq.vv v9, v12, v12 +; CHECK-NEXT: vmand.mm v0, v9, v8 +; CHECK-NEXT: ret + 
%head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vc = fcmp ord %splat, %va + ret %vc +} + +define @fcmp_ord_vv_nxv8bf16_nonans( %va, %vb) #0 { +; CHECK-LABEL: fcmp_ord_vv_nxv8bf16_nonans: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vmfeq.vv v10, v12, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vmfeq.vv v8, v12, v12 +; CHECK-NEXT: vmand.mm v0, v8, v10 +; CHECK-NEXT: ret + %vc = fcmp ord %va, %vb + ret %vc +} + +define @fcmp_ord_vf_nxv8bf16_nonans( %va, bfloat %b) #0 { +; CHECK-LABEL: fcmp_ord_vf_nxv8bf16_nonans: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vmfeq.vv v8, v12, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vmfeq.vv v9, v12, v12 +; CHECK-NEXT: vmand.mm v0, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vc = fcmp ord %va, %splat + ret %vc +} + +define @fcmp_ueq_vv_nxv8bf16( %va, %vb) { +; CHECK-LABEL: fcmp_ueq_vv_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vmflt.vv v8, v16, v12 +; CHECK-NEXT: vmflt.vv v9, v12, v16 +; CHECK-NEXT: vmnor.mm v0, v9, v8 +; CHECK-NEXT: ret + %vc = fcmp ueq %va, %vb + ret %vc +} + +define @fcmp_ueq_vf_nxv8bf16( %va, bfloat %b) { +; CHECK-LABEL: fcmp_ueq_vf_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vmflt.vv v8, v12, v16 +; CHECK-NEXT: vmflt.vv v9, v16, v12 +; CHECK-NEXT: vmnor.mm v0, v9, v8 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vc = fcmp ueq %va, %splat + ret %vc +} + +define @fcmp_ueq_fv_nxv8bf16( %va, bfloat %b) { +; CHECK-LABEL: fcmp_ueq_fv_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vmflt.vv v8, v16, v12 +; CHECK-NEXT: vmflt.vv v9, v12, v16 +; CHECK-NEXT: vmnor.mm v0, v9, v8 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vc = fcmp ueq %splat, %va + ret %vc +} + +define @fcmp_ueq_vv_nxv8bf16_nonans( %va, %vb) #0 { +; CHECK-LABEL: fcmp_ueq_vv_nxv8bf16_nonans: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v16, v12 +; CHECK-NEXT: ret + %vc = fcmp ueq %va, %vb + ret %vc 
+} + +define @fcmp_ueq_vf_nxv8bf16_nonans( %va, bfloat %b) #0 { +; CHECK-LABEL: fcmp_ueq_vf_nxv8bf16_nonans: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v12, v16 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vc = fcmp ueq %va, %splat + ret %vc +} + +define @fcmp_ugt_vv_nxv8bf16( %va, %vb) { +; CHECK-LABEL: fcmp_ugt_vv_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vmfle.vv v8, v16, v12 +; CHECK-NEXT: vmnot.m v0, v8 +; CHECK-NEXT: ret + %vc = fcmp ugt %va, %vb + ret %vc +} + +define @fcmp_ugt_vf_nxv8bf16( %va, bfloat %b) { +; CHECK-LABEL: fcmp_ugt_vf_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vmfle.vv v8, v12, v16 +; CHECK-NEXT: vmnot.m v0, v8 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vc = fcmp ugt %va, %splat + ret %vc +} + +define @fcmp_ugt_fv_nxv8bf16( %va, bfloat %b) { +; CHECK-LABEL: fcmp_ugt_fv_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vmfle.vv v8, v16, v12 +; CHECK-NEXT: vmnot.m v0, v8 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vc = fcmp ugt %splat, %va + ret %vc +} + +define @fcmp_ugt_vv_nxv8bf16_nonans( %va, %vb) #0 { +; CHECK-LABEL: fcmp_ugt_vv_nxv8bf16_nonans: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vmflt.vv v0, v16, v12 +; CHECK-NEXT: ret + %vc = fcmp ugt %va, %vb + ret %vc +} + +define @fcmp_ugt_vf_nxv8bf16_nonans( %va, bfloat %b) #0 { +; CHECK-LABEL: fcmp_ugt_vf_nxv8bf16_nonans: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vmflt.vv v0, v16, v12 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vc = fcmp ugt %va, %splat + ret %vc +} + +define @fcmp_uge_vv_nxv8bf16( %va, %vb) { +; CHECK-LABEL: fcmp_uge_vv_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vmflt.vv v8, v16, v12 +; CHECK-NEXT: vmnot.m v0, v8 +; CHECK-NEXT: ret + %vc = fcmp uge %va, %vb + ret %vc +} + +define 
@fcmp_uge_vf_nxv8bf16( %va, bfloat %b) { +; CHECK-LABEL: fcmp_uge_vf_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vmflt.vv v8, v12, v16 +; CHECK-NEXT: vmnot.m v0, v8 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vc = fcmp uge %va, %splat + ret %vc +} + +define @fcmp_uge_fv_nxv8bf16( %va, bfloat %b) { +; CHECK-LABEL: fcmp_uge_fv_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vmflt.vv v8, v16, v12 +; CHECK-NEXT: vmnot.m v0, v8 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vc = fcmp uge %splat, %va + ret %vc +} + +define @fcmp_uge_vv_nxv8bf16_nonans( %va, %vb) #0 { +; CHECK-LABEL: fcmp_uge_vv_nxv8bf16_nonans: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vmfle.vv v0, v16, v12 +; CHECK-NEXT: ret + %vc = fcmp uge %va, %vb + ret %vc +} + +define @fcmp_uge_vf_nxv8bf16_nonans( %va, bfloat %b) #0 { +; CHECK-LABEL: fcmp_uge_vf_nxv8bf16_nonans: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vmfle.vv v0, v16, v12 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vc = fcmp uge %va, %splat + ret %vc +} + +define @fcmp_ult_vv_nxv8bf16( %va, %vb) { +; CHECK-LABEL: fcmp_ult_vv_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vmfle.vv v8, v16, v12 +; CHECK-NEXT: vmnot.m v0, v8 +; CHECK-NEXT: ret + %vc = fcmp ult %va, %vb + ret %vc +} + +define @fcmp_ult_vf_nxv8bf16( %va, bfloat %b) { +; CHECK-LABEL: fcmp_ult_vf_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vmfle.vv v8, v16, v12 +; CHECK-NEXT: vmnot.m v0, v8 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vc = fcmp ult %va, %splat + ret %vc +} + +define @fcmp_ult_fv_nxv8bf16( %va, bfloat %b) { +; CHECK-LABEL: fcmp_ult_fv_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vmfle.vv v8, v12, v16 +; CHECK-NEXT: vmnot.m v0, v8 +; CHECK-NEXT: ret + 
%head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vc = fcmp ult %splat, %va + ret %vc +} + +define @fcmp_ult_vv_nxv8bf16_nonans( %va, %vb) #0 { +; CHECK-LABEL: fcmp_ult_vv_nxv8bf16_nonans: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vmflt.vv v0, v16, v12 +; CHECK-NEXT: ret + %vc = fcmp ult %va, %vb + ret %vc +} + +define @fcmp_ult_vf_nxv8bf16_nonans( %va, bfloat %b) #0 { +; CHECK-LABEL: fcmp_ult_vf_nxv8bf16_nonans: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vmflt.vv v0, v12, v16 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vc = fcmp ult %va, %splat + ret %vc +} + +define @fcmp_ule_vv_nxv8bf16( %va, %vb) { +; CHECK-LABEL: fcmp_ule_vv_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vmflt.vv v8, v16, v12 +; CHECK-NEXT: vmnot.m v0, v8 +; CHECK-NEXT: ret + %vc = fcmp ule %va, %vb + ret %vc +} + +define @fcmp_ule_vf_nxv8bf16( %va, bfloat %b) { +; CHECK-LABEL: fcmp_ule_vf_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vmflt.vv v8, v16, v12 +; CHECK-NEXT: vmnot.m v0, v8 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vc = fcmp ule %va, %splat + ret %vc +} + +define @fcmp_ule_fv_nxv8bf16( %va, bfloat %b) { +; CHECK-LABEL: fcmp_ule_fv_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vmflt.vv v8, v12, v16 +; CHECK-NEXT: vmnot.m v0, v8 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vc = fcmp ule %splat, %va + ret %vc +} + +define @fcmp_ule_vv_nxv8bf16_nonans( %va, %vb) #0 { +; CHECK-LABEL: fcmp_ule_vv_nxv8bf16_nonans: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vmfle.vv v0, v16, v12 +; CHECK-NEXT: ret + %vc = fcmp ule %va, %vb + ret %vc +} + +define @fcmp_ule_vf_nxv8bf16_nonans( %va, bfloat %b) #0 { +; CHECK-LABEL: fcmp_ule_vf_nxv8bf16_nonans: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vmfle.vv v0, v12, v16 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 
+ %splat = shufflevector %head, poison, zeroinitializer + %vc = fcmp ule %va, %splat + ret %vc +} + +define @fcmp_une_vv_nxv8bf16( %va, %vb) { +; CHECK-LABEL: fcmp_une_vv_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vmfne.vv v0, v16, v12 +; CHECK-NEXT: ret + %vc = fcmp une %va, %vb + ret %vc +} + +define @fcmp_une_vf_nxv8bf16( %va, bfloat %b) { +; CHECK-LABEL: fcmp_une_vf_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vmfne.vv v0, v12, v16 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vc = fcmp une %va, %splat + ret %vc +} + +define @fcmp_une_fv_nxv8bf16( %va, bfloat %b) { +; CHECK-LABEL: fcmp_une_fv_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vmfne.vv v0, v16, v12 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vc = fcmp une %splat, %va + ret %vc +} + +define @fcmp_une_vv_nxv8bf16_nonans( %va, %vb) #0 { +; CHECK-LABEL: fcmp_une_vv_nxv8bf16_nonans: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vmfne.vv v0, v16, v12 +; CHECK-NEXT: ret + %vc = fcmp une %va, %vb + ret %vc +} + +define @fcmp_une_vf_nxv8bf16_nonans( %va, bfloat %b) #0 { +; CHECK-LABEL: fcmp_une_vf_nxv8bf16_nonans: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vmfne.vv v0, v12, v16 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vc = fcmp une %va, %splat + ret %vc +} + +define @fcmp_uno_vv_nxv8bf16( %va, %vb) { +; CHECK-LABEL: fcmp_uno_vv_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vmfne.vv v10, v12, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vmfne.vv v8, v12, v12 +; CHECK-NEXT: vmor.mm v0, v8, v10 +; CHECK-NEXT: ret + %vc = fcmp uno %va, %vb + ret %vc +} + +define @fcmp_uno_vf_nxv8bf16( %va, bfloat %b) { +; CHECK-LABEL: fcmp_uno_vf_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vmfne.vv v8, v12, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; 
CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT:    vmfne.vv v9, v12, v12
+; CHECK-NEXT:    vmor.mm v0, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+  %splat = shufflevector <vscale x 8 x bfloat> %head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+  %vc = fcmp uno <vscale x 8 x bfloat> %va, %splat
+  ret <vscale x 8 x i1> %vc
+}
+
+define <vscale x 8 x i1> @fcmp_uno_fv_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat %b) {
+; CHECK-LABEL: fcmp_uno_fv_nxv8bf16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    fmv.x.h a0, fa0
+; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vmv.v.x v10, a0
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT:    vmfne.vv v8, v12, v12
+; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT:    vmfne.vv v9, v12, v12
+; CHECK-NEXT:    vmor.mm v0, v9, v8
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+  %splat = shufflevector <vscale x 8 x bfloat> %head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+  %vc = fcmp uno <vscale x 8 x bfloat> %splat, %va
+  ret <vscale x 8 x i1> %vc
+}
+
+define <vscale x 8 x i1> @fcmp_uno_vv_nxv8bf16_nonans(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb) #0 {
+; CHECK-LABEL: fcmp_uno_vv_nxv8bf16_nonans:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT:    vmfne.vv v10, v12, v12
+; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT:    vmfne.vv v8, v12, v12
+; CHECK-NEXT:    vmor.mm v0, v8, v10
+; CHECK-NEXT:    ret
+  %vc = fcmp uno <vscale x 8 x bfloat> %va, %vb
+  ret <vscale x 8 x i1> %vc
+}
+
+define <vscale x 8 x i1> @fcmp_uno_vf_nxv8bf16_nonans(<vscale x 8 x bfloat> %va, bfloat %b) #0 {
+; CHECK-LABEL: fcmp_uno_vf_nxv8bf16_nonans:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    fmv.x.h a0, fa0
+; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vmv.v.x v10, a0
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT:    vmfne.vv v8, v12, v12
+; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT:    vmfne.vv v9, v12, v12
+; CHECK-NEXT:    vmor.mm v0, v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+  %splat = shufflevector <vscale x 8 x bfloat> %head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+  %vc = fcmp uno <vscale x 8 x bfloat> %va, %splat
+  ret <vscale x 8 x i1> %vc
+}
+
 define <vscale x 8 x i1> @fcmp_oeq_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x 8 x half> %vb) {
 ; ZVFH-LABEL: fcmp_oeq_vv_nxv8f16:
 ; ZVFH:       # %bb.0:
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfadd-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfadd-constrained-sdnode.ll
index af80e627b43fa1..53be153f8ff2da 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfadd-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfadd-constrained-sdnode.ll
@@ -1,12 +1,239 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \
-; RUN:   -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \
-; RUN:   -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \
-; RUN:   -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=lp64d \
-; RUN:   -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \
+; RUN:   -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
+; RUN:   --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \
+; RUN:   -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
+; RUN:   --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
+; RUN:   -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
+; RUN:   --check-prefixes=CHECK,ZVFHMIN
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
+; RUN:   -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
+; RUN:   --check-prefixes=CHECK,ZVFHMIN
+
+define <vscale x 1 x bfloat> @vfadd_vv_nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb) strictfp {
+; CHECK-LABEL: vfadd_vv_nxv1bf16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT:    vfadd.vv v9, v9, v10
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT:    ret
+entry:
+  %vc = call <vscale x 1 x bfloat> @llvm.experimental.constrained.fadd.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+  ret <vscale x 1 x bfloat> %vc
+}
+
+define <vscale x 1 x bfloat> @vfadd_vf_nxv1bf16(<vscale x 1 x bfloat> %va, bfloat %b) strictfp {
+; CHECK-LABEL: vfadd_vf_nxv1bf16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    fmv.x.h a0, fa0
+; CHECK-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT:    vmv.v.x v9, a0
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v9
+; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT:    vfadd.vv v9, v10, v8
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 1 x bfloat> poison, bfloat %b, i32 0
+  %splat = shufflevector <vscale x 1 x bfloat> %head, <vscale x 1 x bfloat> poison, <vscale x 1 x i32> zeroinitializer
+  %vc = call <vscale x 1 x bfloat> @llvm.experimental.constrained.fadd.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %splat, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+  ret <vscale x 1 x bfloat> %vc
+}
+
+define <vscale x 2 x bfloat> @vfadd_vv_nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %vb) strictfp {
+; CHECK-LABEL: vfadd_vv_nxv2bf16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vfadd.vv v9, v9, v10
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT:    ret
+entry:
+  %vc = call <vscale x 2 x bfloat> @llvm.experimental.constrained.fadd.nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %vb, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+  ret <vscale x 2 x bfloat> %vc
+}
+
+define <vscale x 2 x bfloat> @vfadd_vf_nxv2bf16(<vscale x 2 x bfloat> %va, bfloat %b) strictfp {
+; CHECK-LABEL: vfadd_vf_nxv2bf16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    fmv.x.h a0, fa0
+; CHECK-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vmv.v.x v9, a0
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v9
+; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vfadd.vv v9, v10, v8
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT:    ret
+  %head = insertelement <vscale x 2 x bfloat> poison, bfloat %b, i32 0
+  %splat = shufflevector <vscale x 2 x bfloat> %head, <vscale x 2 x bfloat> poison, <vscale x 2 x i32> zeroinitializer
+  %vc = call <vscale x 2 x bfloat> @llvm.experimental.constrained.fadd.nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %splat, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+  ret <vscale x 2 x bfloat> %vc
+}
+
+define <vscale x 4 x bfloat> @vfadd_vv_nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %vb) strictfp {
+; CHECK-LABEL: vfadd_vv_nxv4bf16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
+;
CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfadd.vv v10, v12, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: ret +entry: + %vc = call @llvm.experimental.constrained.fadd.nxv4bf16( %va, %vb, metadata !"round.dynamic", metadata !"fpexcept.ignore") + ret %vc +} + +define @vfadd_vf_nxv4bf16( %va, bfloat %b) strictfp { +; CHECK-LABEL: vfadd_vf_nxv4bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vmv.v.x v9, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfadd.vv v10, v10, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vc = call @llvm.experimental.constrained.fadd.nxv4bf16( %va, %splat, metadata !"round.dynamic", metadata !"fpexcept.ignore") + ret %vc +} + +define @vfadd_vv_nxv8bf16( %va, %vb) strictfp { +; CHECK-LABEL: vfadd_vv_nxv8bf16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vfadd.vv v12, v16, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: ret +entry: + %vc = call @llvm.experimental.constrained.fadd.nxv8bf16( %va, %vb, metadata !"round.dynamic", metadata !"fpexcept.ignore") + ret %vc +} + +define @vfadd_vf_nxv8bf16( %va, bfloat %b) strictfp { +; CHECK-LABEL: vfadd_vf_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vfadd.vv v12, v12, v16 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vc = call @llvm.experimental.constrained.fadd.nxv8bf16( %va, %splat, metadata !"round.dynamic", metadata !"fpexcept.ignore") + ret %vc +} + +define @vfadd_vv_nxv16bf16( %va, %vb) strictfp { +; CHECK-LABEL: vfadd_vv_nxv16bf16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfadd.vv v16, v24, v16 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: ret +entry: + %vc = call @llvm.experimental.constrained.fadd.nxv16bf16( %va, %vb, metadata !"round.dynamic", metadata !"fpexcept.ignore") + ret %vc +} + +define @vfadd_vf_nxv16bf16( %va, bfloat %b) strictfp { +; CHECK-LABEL: vfadd_vf_nxv16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vmv.v.x v12, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfadd.vv v16, v16, v24 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: ret + %head = insertelement poison, 
bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vc = call @llvm.experimental.constrained.fadd.nxv16bf16( %va, %splat, metadata !"round.dynamic", metadata !"fpexcept.ignore") + ret %vc +} + +define @vfadd_vv_nxv32bf16( %va, %vb) strictfp { +; CHECK-LABEL: vfadd_vv_nxv32bf16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16 +; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfadd.vv v24, v0, v24 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfadd.vv v16, v16, v24 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: ret +entry: + %vc = call @llvm.experimental.constrained.fadd.nxv32bf16( %va, %vb, metadata !"round.dynamic", metadata !"fpexcept.ignore") + ret %vc +} + +define @vfadd_vf_nxv32bf16( %va, bfloat %b) strictfp { +; CHECK-LABEL: vfadd_vf_nxv32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma +; CHECK-NEXT: vmv.v.x v16, a0 +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v16 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfadd.vv v24, v24, v0 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfadd.vv v16, v16, v24 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vc = call @llvm.experimental.constrained.fadd.nxv32bf16( %va, %splat, metadata !"round.dynamic", metadata !"fpexcept.ignore") + ret %vc +} declare @llvm.experimental.constrained.fadd.nxv1f16(, , metadata, metadata) define @vfadd_vv_nxv1f16( %va, %vb) strictfp { diff --git a/llvm/test/CodeGen/RISCV/rvv/vfadd-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfadd-sdnode.ll index 8f21e326e68790..c3c0958f7096d9 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfadd-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfadd-sdnode.ll @@ -1,12 +1,252 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: 
--check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFHMIN +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFHMIN + +define @vfadd_vv_nxv1bf16( %va, %vb) { +; CHECK-LABEL: vfadd_vv_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfadd.vv v9, v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %vc = fadd %va, %vb + ret %vc +} + +define @vfadd_vf_nxv1bf16( %va, bfloat %b) { +; CHECK-LABEL: vfadd_vf_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vmv.v.x v9, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfadd.vv v9, v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vc = fadd %va, %splat + ret %vc +} + +define @vfadd_vv_nxv2bf16( %va, %vb) { +; CHECK-LABEL: vfadd_vv_nxv2bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vfadd.vv v9, v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %vc = fadd %va, %vb + ret %vc +} + +define @vfadd_vf_nxv2bf16( %va, bfloat %b) { +; CHECK-LABEL: vfadd_vf_nxv2bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vmv.v.x v9, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vfadd.vv v9, v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vc = fadd %va, %splat + ret %vc +} + +define @vfadd_vv_nxv4bf16( %va, %vb) { +; CHECK-LABEL: vfadd_vv_nxv4bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfadd.vv v10, v12, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: ret + %vc = fadd %va, %vb + ret %vc +} + +define @vfadd_vf_nxv4bf16( %va, bfloat %b) { +; CHECK-LABEL: vfadd_vf_nxv4bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vmv.v.x v9, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfadd.vv v10, v10, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: ret + %head = 
insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vc = fadd %va, %splat + ret %vc +} + +define @vfadd_vv_nxv8bf16( %va, %vb) { +; CHECK-LABEL: vfadd_vv_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vfadd.vv v12, v16, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: ret + %vc = fadd %va, %vb + ret %vc +} + +define @vfadd_vf_nxv8bf16( %va, bfloat %b) { +; CHECK-LABEL: vfadd_vf_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vfadd.vv v12, v12, v16 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vc = fadd %va, %splat + ret %vc +} + +define @vfadd_fv_nxv8bf16( %va, bfloat %b) { +; CHECK-LABEL: vfadd_fv_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vfadd.vv v12, v16, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vc = fadd %splat, %va + ret %vc +} + +define @vfadd_vv_nxv16bf16( %va, %vb) { +; CHECK-LABEL: vfadd_vv_nxv16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfadd.vv v16, v24, v16 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: ret + %vc = fadd %va, %vb + ret %vc +} + +define @vfadd_vf_nxv16bf16( %va, bfloat %b) { +; CHECK-LABEL: vfadd_vf_nxv16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vmv.v.x v12, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfadd.vv v16, v16, v24 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vc = fadd %va, %splat + ret %vc +} + +define @vfadd_vv_nxv32bf16( %va, %vb) { +; CHECK-LABEL: vfadd_vv_nxv32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16 +; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfadd.vv v24, v0, v24 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfadd.vv v16, v16, v24 +; 
CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: ret + %vc = fadd %va, %vb + ret %vc +} + +define @vfadd_vf_nxv32bf16( %va, bfloat %b) { +; CHECK-LABEL: vfadd_vf_nxv32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma +; CHECK-NEXT: vmv.v.x v16, a0 +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v16 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfadd.vv v24, v24, v0 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v20 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfadd.vv v16, v24, v0 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vc = fadd %va, %splat + ret %vc +} define @vfadd_vv_nxv1f16( %va, %vb) { ; ZVFH-LABEL: vfadd_vv_nxv1f16: diff --git a/llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll index 395f1a7c382bff..b3de904d20622b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll @@ -1,13 +1,660 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFHMIN +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFHMIN + +declare @llvm.vp.fadd.nxv1bf16(, , , i32) + +define @vfadd_vv_nxv1bf16( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfadd_vv_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vfadd.vv v9, v9, v10, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %v = call @llvm.vp.fadd.nxv1bf16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vfadd_vv_nxv1bf16_unmasked( 
%va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vfadd_vv_nxv1bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vfadd.vv v9, v9, v10 +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %v = call @llvm.vp.fadd.nxv1bf16( %va, %b, splat (i1 true), i32 %evl) + ret %v +} + +define @vfadd_vf_nxv1bf16( %va, bfloat %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfadd_vf_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vfadd.vv v9, v10, v8, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fadd.nxv1bf16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vfadd_vf_nxv1bf16_commute( %va, bfloat %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfadd_vf_nxv1bf16_commute: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vfadd.vv v9, v8, v10, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fadd.nxv1bf16( %vb, %va, %m, i32 %evl) + ret %v +} +define @vfadd_vf_nxv1bf16_unmasked( %va, bfloat %b, i32 zeroext %evl) { +; CHECK-LABEL: vfadd_vf_nxv1bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vfadd.vv v9, v10, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fadd.nxv1bf16( %va, %vb, splat (i1 true), i32 %evl) + ret %v +} + +define @vfadd_vf_nxv1bf16_unmasked_commute( %va, bfloat %b, i32 zeroext %evl) { +; CHECK-LABEL: vfadd_vf_nxv1bf16_unmasked_commute: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vfadd.vv v9, v8, v10 +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fadd.nxv1bf16( %vb, %va, splat (i1 true), i32 %evl) + ret %v +} + +declare @llvm.vp.fadd.nxv2bf16(, , , i32) + +define @vfadd_vv_nxv2bf16( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfadd_vv_nxv2bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, 
mf2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfadd.vv v9, v9, v10, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %v = call @llvm.vp.fadd.nxv2bf16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vfadd_vv_nxv2bf16_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vfadd_vv_nxv2bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfadd.vv v9, v9, v10 +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %v = call @llvm.vp.fadd.nxv2bf16( %va, %b, splat (i1 true), i32 %evl) + ret %v +} + +define @vfadd_vf_nxv2bf16( %va, bfloat %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfadd_vf_nxv2bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, mf2, ta, ma +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfadd.vv v9, v10, v8, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fadd.nxv2bf16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vfadd_vf_nxv2bf16_unmasked( %va, bfloat %b, i32 zeroext %evl) { +; CHECK-LABEL: vfadd_vf_nxv2bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, mf2, ta, ma +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfadd.vv v9, v10, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fadd.nxv2bf16( %va, %vb, splat (i1 true), i32 %evl) + ret %v +} + +declare @llvm.vp.fadd.nxv4bf16(, , , i32) + +define @vfadd_vv_nxv4bf16( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfadd_vv_nxv4bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vfadd.vv v10, v12, v10, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: ret + %v = call @llvm.vp.fadd.nxv4bf16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vfadd_vv_nxv4bf16_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vfadd_vv_nxv4bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vfadd.vv v10, v12, v10 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: ret + %v = call @llvm.vp.fadd.nxv4bf16( %va, %b, splat (i1 true), i32 %evl) + ret %v +} + +define @vfadd_vf_nxv4bf16( %va, bfloat %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfadd_vf_nxv4bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: 
fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vfadd.vv v10, v10, v12, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fadd.nxv4bf16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vfadd_vf_nxv4bf16_unmasked( %va, bfloat %b, i32 zeroext %evl) { +; CHECK-LABEL: vfadd_vf_nxv4bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vfadd.vv v10, v10, v12 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fadd.nxv4bf16( %va, %vb, splat (i1 true), i32 %evl) + ret %v +} + +declare @llvm.vp.fadd.nxv8bf16(, , , i32) + +define @vfadd_vv_nxv8bf16( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfadd_vv_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vfadd.vv v12, v16, v12, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: ret + %v = call @llvm.vp.fadd.nxv8bf16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vfadd_vv_nxv8bf16_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vfadd_vv_nxv8bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vfadd.vv v12, v16, v12 +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: ret + %v = call @llvm.vp.fadd.nxv8bf16( %va, %b, splat (i1 true), i32 %evl) + ret %v +} + +define @vfadd_vf_nxv8bf16( %va, bfloat %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfadd_vf_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vfadd.vv v12, v12, v16, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fadd.nxv8bf16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vfadd_vf_nxv8bf16_unmasked( %va, bfloat %b, i32 zeroext %evl) { +; CHECK-LABEL: vfadd_vf_nxv8bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vfadd.vv v12, v12, v16 +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; 
CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fadd.nxv8bf16( %va, %vb, splat (i1 true), i32 %evl) + ret %v +} + +declare @llvm.vp.fadd.nxv16bf16(, , , i32) + +define @vfadd_vv_nxv16bf16( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfadd_vv_nxv16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vfadd.vv v16, v24, v16, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: ret + %v = call @llvm.vp.fadd.nxv16bf16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vfadd_vv_nxv16bf16_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vfadd_vv_nxv16bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vfadd.vv v16, v24, v16 +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: ret + %v = call @llvm.vp.fadd.nxv16bf16( %va, %b, splat (i1 true), i32 %evl) + ret %v +} + +define @vfadd_vf_nxv16bf16( %va, bfloat %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfadd_vf_nxv16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vmv.v.x v12, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vfadd.vv v16, v16, v24, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fadd.nxv16bf16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vfadd_vf_nxv16bf16_unmasked( %va, bfloat %b, i32 zeroext %evl) { +; CHECK-LABEL: vfadd_vf_nxv16bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vmv.v.x v12, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vfadd.vv v16, v16, v24 +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fadd.nxv16bf16( %va, %vb, splat (i1 true), i32 %evl) + ret %v +} + +declare @llvm.vp.fadd.nxv32bf16(, , , i32) + +define @vfadd_vv_nxv32bf16( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfadd_vv_nxv32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: vmv1r.v v7, v0 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a1, a2, 1 +; CHECK-NEXT: sub a3, a0, a1 +; CHECK-NEXT: sltu a4, a0, a3 +; CHECK-NEXT: addi a4, a4, -1 +; CHECK-NEXT: and a3, a4, a3 +; CHECK-NEXT: srli a2, a2, 2 +; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma +; 
CHECK-NEXT: vslidedown.vx v0, v0, a2 +; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; CHECK-NEXT: vfadd.vv v16, v16, v24, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: bltu a0, a1, .LBB22_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a0, a1 +; CHECK-NEXT: .LBB22_2: +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 +; CHECK-NEXT: vmv1r.v v0, v7 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vfadd.vv v16, v24, v16, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %v = call @llvm.vp.fadd.nxv32bf16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vfadd_vv_nxv32bf16_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vfadd_vv_nxv32bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a1, a2, 1 +; CHECK-NEXT: sub a3, a0, a1 +; CHECK-NEXT: sltu a4, a0, a3 +; CHECK-NEXT: addi a4, a4, -1 +; CHECK-NEXT: and a3, a4, a3 +; CHECK-NEXT: srli a2, a2, 2 +; CHECK-NEXT: vsetvli a4, zero, e8, m4, ta, ma +; CHECK-NEXT: vmset.m v24 +; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v0, v24, a2 +; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; CHECK-NEXT: vfadd.vv v16, v16, v24, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: bltu a0, a1, .LBB23_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a0, a1 +; CHECK-NEXT: .LBB23_2: +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vfadd.vv v16, v24, v16 +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %v = call @llvm.vp.fadd.nxv32bf16( %va, %b, splat (i1 true), i32 %evl) + ret %v +} + +define @vfadd_vf_nxv32bf16( %va, bfloat %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfadd_vf_nxv32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: mv a2, a1 +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x12, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 18 * 
vlenb +; CHECK-NEXT: vmv8r.v v24, v8 +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, m8, ta, ma +; CHECK-NEXT: vmv.v.x v16, a1 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a2, a1, 3 +; CHECK-NEXT: add a1, a2, a1 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a1, a2, 1 +; CHECK-NEXT: sub a3, a0, a1 +; CHECK-NEXT: sltu a4, a0, a3 +; CHECK-NEXT: addi a4, a4, -1 +; CHECK-NEXT: and a3, a4, a3 +; CHECK-NEXT: srli a2, a2, 2 +; CHECK-NEXT: csrr a4, vlenb +; CHECK-NEXT: slli a4, a4, 3 +; CHECK-NEXT: add a4, sp, a4 +; CHECK-NEXT: addi a4, a4, 16 +; CHECK-NEXT: vs1r.v v0, (a4) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v0, v0, a2 +; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v28 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a4, a2, 3 +; CHECK-NEXT: add a2, a4, a2 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v28 +; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; CHECK-NEXT: vfadd.vv v16, v8, v16, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: bltu a0, a1, .LBB24_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a0, a1 +; CHECK-NEXT: .LBB24_2: +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a2, a1, 3 +; CHECK-NEXT: add a1, a2, a1 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v0 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vfadd.vv v16, v16, v24, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: mv a1, a0 +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fadd.nxv32bf16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vfadd_vf_nxv32bf16_unmasked( %va, bfloat %b, i32 zeroext %evl) { +; CHECK-LABEL: vfadd_vf_nxv32bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; CHECK-NEXT: vmv8r.v v16, v8 +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, m8, ta, ma +; CHECK-NEXT: vmv.v.x v8, a1 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a1, a2, 1 +; CHECK-NEXT: sub a3, a0, a1 +; CHECK-NEXT: sltu a4, a0, a3 +; CHECK-NEXT: addi a4, a4, -1 +; CHECK-NEXT: and a3, a4, a3 +; CHECK-NEXT: srli a2, a2, 2 +; CHECK-NEXT: vmset.m v24 +; CHECK-NEXT: 
vsetvli a4, zero, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v0, v24, a2 +; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20 +; CHECK-NEXT: vmv4r.v v16, v8 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; CHECK-NEXT: vfadd.vv v16, v24, v16, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: bltu a0, a1, .LBB25_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a0, a1 +; CHECK-NEXT: .LBB25_2: +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v0 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vfadd.vv v16, v16, v24 +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fadd.nxv32bf16( %va, %vb, splat (i1 true), i32 %evl) + ret %v +} declare @llvm.vp.fadd.nxv1f16(, , , i32) define @vfadd_vv_nxv1f16( %va, %b, %m, i32 zeroext %evl) { @@ -564,10 +1211,10 @@ define @vfadd_vv_nxv32f16( %va, @vfadd_vv_nxv32f16_unmasked( %va ; ZVFHMIN-NEXT: vfadd.vv v16, v16, v24, v0.t ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 -; ZVFHMIN-NEXT: bltu a0, a1, .LBB23_2 +; ZVFHMIN-NEXT: bltu a0, a1, .LBB49_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a0, a1 -; ZVFHMIN-NEXT: .LBB23_2: +; ZVFHMIN-NEXT: .LBB49_2: ; ZVFHMIN-NEXT: addi a1, sp, 16 ; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24 @@ -699,10 +1346,10 @@ define @vfadd_vf_nxv32f16( %va, half %b ; ZVFHMIN-NEXT: vfadd.vv v16, v8, v16, v0.t ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 -; ZVFHMIN-NEXT: bltu a0, a1, .LBB24_2 +; ZVFHMIN-NEXT: bltu a0, a1, .LBB50_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a0, a1 -; ZVFHMIN-NEXT: .LBB24_2: +; ZVFHMIN-NEXT: .LBB50_2: ; ZVFHMIN-NEXT: addi a1, sp, 16 ; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24 @@ -780,10 +1427,10 @@ define @vfadd_vf_nxv32f16_unmasked( %va ; ZVFHMIN-NEXT: vfadd.vv v16, v24, v16, v0.t ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 -; ZVFHMIN-NEXT: bltu a0, a1, .LBB25_2 +; ZVFHMIN-NEXT: bltu a0, a1, .LBB51_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a0, a1 -; ZVFHMIN-NEXT: .LBB25_2: +; ZVFHMIN-NEXT: .LBB51_2: ; ZVFHMIN-NEXT: addi a1, sp, 16 ; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24 diff --git a/llvm/test/CodeGen/RISCV/rvv/vfclass-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfclass-sdnode.ll index f2af8ac3b02d4c..c97278480f1a81 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfclass-sdnode.ll +++ 
b/llvm/test/CodeGen/RISCV/rvv/vfclass-sdnode.ll @@ -1,18 +1,51 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFHMIN +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFHMIN -define @isnan_nxv2f16( %x) { -; CHECK-LABEL: isnan_nxv2f16: +define @isnan_nxv2bf16( %x) { +; CHECK-LABEL: isnan_nxv2bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma -; CHECK-NEXT: vfclass.v v8, v8 -; CHECK-NEXT: li a0, 768 -; CHECK-NEXT: vand.vx v8, v8, a0 -; CHECK-NEXT: vmsne.vi v0, v8, 0 +; CHECK-NEXT: lui a0, 8 +; CHECK-NEXT: addi a1, a0, -1 +; CHECK-NEXT: vsetvli a2, zero, e16, mf2, ta, ma +; CHECK-NEXT: vand.vx v8, v8, a1 +; CHECK-NEXT: addi a0, a0, -128 +; CHECK-NEXT: vmsgt.vx v0, v8, a0 ; CHECK-NEXT: ret + %1 = call @llvm.is.fpclass.nxv2bf16( %x, i32 3) ; nan + ret %1 +} + +define @isnan_nxv2f16( %x) { +; ZVFH-LABEL: isnan_nxv2f16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; ZVFH-NEXT: vfclass.v v8, v8 +; ZVFH-NEXT: li a0, 768 +; ZVFH-NEXT: vand.vx v8, v8, a0 +; ZVFH-NEXT: vmsne.vi v0, v8, 0 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: isnan_nxv2f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: addi a0, a0, -1 +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vand.vx v8, v8, a0 +; ZVFHMIN-NEXT: li a0, 31 +; ZVFHMIN-NEXT: slli a0, a0, 10 +; ZVFHMIN-NEXT: vmsgt.vx v0, v8, a0 +; ZVFHMIN-NEXT: ret %1 = call @llvm.is.fpclass.nxv2f16( %x, i32 3) ; nan ret %1 } diff --git a/llvm/test/CodeGen/RISCV/rvv/vfdiv-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfdiv-constrained-sdnode.ll index 69095a0b21bb09..aa59732e1e1e52 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfdiv-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfdiv-constrained-sdnode.ll @@ -1,12 +1,258 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck 
%s \ +; RUN: --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFHMIN +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFHMIN + +define @vfdiv_vv_nxv1bf16( %va, %vb) strictfp { +; CHECK-LABEL: vfdiv_vv_nxv1bf16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfdiv.vv v9, v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret +entry: + %vc = call @llvm.experimental.constrained.fdiv.nxv1bf16( %va, %vb, metadata !"round.dynamic", metadata !"fpexcept.ignore") + ret %vc +} + +define @vfdiv_vf_nxv1bf16( %va, bfloat %b) strictfp { +; CHECK-LABEL: vfdiv_vf_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vmv.v.x v9, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfdiv.vv v9, v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vc = call @llvm.experimental.constrained.fdiv.nxv1bf16( %va, %splat, metadata !"round.dynamic", metadata !"fpexcept.ignore") + ret %vc +} + +define @vfdiv_vv_nxv2bf16( %va, %vb) strictfp { +; CHECK-LABEL: vfdiv_vv_nxv2bf16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vfdiv.vv v9, v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret +entry: + %vc = call @llvm.experimental.constrained.fdiv.nxv2bf16( %va, %vb, metadata !"round.dynamic", metadata !"fpexcept.ignore") + ret %vc +} + +define @vfdiv_vf_nxv2bf16( %va, bfloat %b) strictfp { +; CHECK-LABEL: vfdiv_vf_nxv2bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vmv.v.x v9, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vfdiv.vv v9, v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vc = call @llvm.experimental.constrained.fdiv.nxv2bf16( %va, %splat, metadata !"round.dynamic", metadata !"fpexcept.ignore") + ret %vc +} + +define @vfdiv_vv_nxv4bf16( %va, %vb) strictfp { +; CHECK-LABEL: vfdiv_vv_nxv4bf16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, 
ma +; CHECK-NEXT: vfdiv.vv v10, v12, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: ret +entry: + %vc = call @llvm.experimental.constrained.fdiv.nxv4bf16( %va, %vb, metadata !"round.dynamic", metadata !"fpexcept.ignore") + ret %vc +} + +define @vfdiv_vf_nxv4bf16( %va, bfloat %b) strictfp { +; CHECK-LABEL: vfdiv_vf_nxv4bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vmv.v.x v9, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfdiv.vv v10, v10, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vc = call @llvm.experimental.constrained.fdiv.nxv4bf16( %va, %splat, metadata !"round.dynamic", metadata !"fpexcept.ignore") + ret %vc +} + +define @vfdiv_vv_nxv8bf16( %va, %vb) strictfp { +; CHECK-LABEL: vfdiv_vv_nxv8bf16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vfdiv.vv v12, v16, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: ret +entry: + %vc = call @llvm.experimental.constrained.fdiv.nxv8bf16( %va, %vb, metadata !"round.dynamic", metadata !"fpexcept.ignore") + ret %vc +} + +define @vfdiv_vf_nxv8bf16( %va, bfloat %b) strictfp { +; CHECK-LABEL: vfdiv_vf_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vfdiv.vv v12, v12, v16 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vc = call @llvm.experimental.constrained.fdiv.nxv8bf16( %va, %splat, metadata !"round.dynamic", metadata !"fpexcept.ignore") + ret %vc +} + +define @vfdiv_fv_nxv8bf16( %va, bfloat %b) strictfp { +; CHECK-LABEL: vfdiv_fv_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vfdiv.vv v12, v16, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vc = call @llvm.experimental.constrained.fdiv.nxv8bf16( %splat, %va, metadata !"round.dynamic", metadata !"fpexcept.ignore") + ret %vc +} + +define @vfdiv_vv_nxv16bf16( %va, %vb) strictfp { +; CHECK-LABEL: vfdiv_vv_nxv16bf16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfdiv.vv v16, v24, v16 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 
+; CHECK-NEXT: ret +entry: + %vc = call @llvm.experimental.constrained.fdiv.nxv16bf16( %va, %vb, metadata !"round.dynamic", metadata !"fpexcept.ignore") + ret %vc +} + +define @vfdiv_vf_nxv16bf16( %va, bfloat %b) strictfp { +; CHECK-LABEL: vfdiv_vf_nxv16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vmv.v.x v12, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfdiv.vv v16, v16, v24 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vc = call @llvm.experimental.constrained.fdiv.nxv16bf16( %va, %splat, metadata !"round.dynamic", metadata !"fpexcept.ignore") + ret %vc +} + +define @vfdiv_vv_nxv32bf16( %va, %vb) strictfp { +; CHECK-LABEL: vfdiv_vv_nxv32bf16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16 +; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfdiv.vv v24, v0, v24 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfdiv.vv v16, v16, v24 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: ret +entry: + %vc = call @llvm.experimental.constrained.fdiv.nxv32bf16( %va, %vb, metadata !"round.dynamic", metadata !"fpexcept.ignore") + ret %vc +} + +define @vfdiv_vf_nxv32bf16( %va, bfloat %b) strictfp { +; CHECK-LABEL: vfdiv_vf_nxv32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma +; CHECK-NEXT: vmv.v.x v16, a0 +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v16 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfdiv.vv v24, v24, v0 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfdiv.vv v16, v16, v24 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vc = call @llvm.experimental.constrained.fdiv.nxv32bf16( %va, %splat, metadata !"round.dynamic", metadata !"fpexcept.ignore") + ret %vc +} declare @llvm.experimental.constrained.fdiv.nxv1f16(, , metadata, metadata) define @vfdiv_vv_nxv1f16( %va, %vb) strictfp { diff --git a/llvm/test/CodeGen/RISCV/rvv/vfdiv-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfdiv-sdnode.ll index 9f5434dd34727d..f7db2be35d720b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfdiv-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfdiv-sdnode.ll @@ -1,13 +1,249 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc 
-mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN +define @vfdiv_vv_nxv1bf16( %va, %vb) { +; CHECK-LABEL: vfdiv_vv_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfdiv.vv v9, v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %vc = fdiv %va, %vb + ret %vc +} + +define @vfdiv_vf_nxv1bf16( %va, bfloat %b) { +; CHECK-LABEL: vfdiv_vf_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vmv.v.x v9, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfdiv.vv v9, v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vc = fdiv %va, %splat + ret %vc +} + +define @vfdiv_vv_nxv2bf16( %va, %vb) { +; CHECK-LABEL: vfdiv_vv_nxv2bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vfdiv.vv v9, v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %vc = fdiv %va, %vb + ret %vc +} + +define @vfdiv_vf_nxv2bf16( %va, bfloat %b) { +; CHECK-LABEL: vfdiv_vf_nxv2bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vmv.v.x v9, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vfdiv.vv v9, v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vc = fdiv %va, %splat + ret %vc +} + +define @vfdiv_vv_nxv4bf16( %va, %vb) { +; CHECK-LABEL: vfdiv_vv_nxv4bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfdiv.vv v10, v12, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: ret + %vc = fdiv %va, %vb + ret %vc +} + +define @vfdiv_vf_nxv4bf16( %va, bfloat %b) { +; CHECK-LABEL: vfdiv_vf_nxv4bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, 
m1, ta, ma +; CHECK-NEXT: vmv.v.x v9, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfdiv.vv v10, v10, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vc = fdiv %va, %splat + ret %vc +} + +define @vfdiv_vv_nxv8bf16( %va, %vb) { +; CHECK-LABEL: vfdiv_vv_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vfdiv.vv v12, v16, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: ret + %vc = fdiv %va, %vb + ret %vc +} + +define @vfdiv_vf_nxv8bf16( %va, bfloat %b) { +; CHECK-LABEL: vfdiv_vf_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vfdiv.vv v12, v12, v16 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vc = fdiv %va, %splat + ret %vc +} + +define @vfdiv_fv_nxv8bf16( %va, bfloat %b) { +; CHECK-LABEL: vfdiv_fv_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vfdiv.vv v12, v16, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vc = fdiv %splat, %va + ret %vc +} + +define @vfdiv_vv_nxv16bf16( %va, %vb) { +; CHECK-LABEL: vfdiv_vv_nxv16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfdiv.vv v16, v24, v16 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: ret + %vc = fdiv %va, %vb + ret %vc +} + +define @vfdiv_vf_nxv16bf16( %va, bfloat %b) { +; CHECK-LABEL: vfdiv_vf_nxv16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vmv.v.x v12, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfdiv.vv v16, v16, v24 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vc = fdiv %va, %splat + ret %vc +} + +define @vfdiv_vv_nxv32bf16( %va, %vb) { +; CHECK-LABEL: vfdiv_vv_nxv32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16 +; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8 +; CHECK-NEXT: vsetvli zero, 
zero, e32, m8, ta, ma +; CHECK-NEXT: vfdiv.vv v24, v0, v24 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfdiv.vv v16, v16, v24 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: ret + %vc = fdiv %va, %vb + ret %vc +} + +define @vfdiv_vf_nxv32bf16( %va, bfloat %b) { +; CHECK-LABEL: vfdiv_vf_nxv32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma +; CHECK-NEXT: vmv.v.x v16, a0 +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v16 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfdiv.vv v24, v24, v0 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v20 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfdiv.vv v16, v24, v0 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vc = fdiv %va, %splat + ret %vc +} + define @vfdiv_vv_nxv1f16( %va, %vb) { ; ZVFH-LABEL: vfdiv_vv_nxv1f16: ; ZVFH: # %bb.0: diff --git a/llvm/test/CodeGen/RISCV/rvv/vfdiv-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfdiv-vp.ll index 52e2a9535ef603..aa39fe5b5ec851 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfdiv-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfdiv-vp.ll @@ -1,13 +1,622 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFHMIN +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFHMIN + +declare @llvm.vp.fdiv.nxv1bf16(, , , i32) + +define @vfdiv_vv_nxv1bf16( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfdiv_vv_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; 
CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vfdiv.vv v9, v9, v10, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %v = call @llvm.vp.fdiv.nxv1bf16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vfdiv_vv_nxv1bf16_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vfdiv_vv_nxv1bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vfdiv.vv v9, v9, v10 +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %v = call @llvm.vp.fdiv.nxv1bf16( %va, %b, splat (i1 true), i32 %evl) + ret %v +} + +define @vfdiv_vf_nxv1bf16( %va, bfloat %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfdiv_vf_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vfdiv.vv v9, v10, v8, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fdiv.nxv1bf16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vfdiv_vf_nxv1bf16_unmasked( %va, bfloat %b, i32 zeroext %evl) { +; CHECK-LABEL: vfdiv_vf_nxv1bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vfdiv.vv v9, v10, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fdiv.nxv1bf16( %va, %vb, splat (i1 true), i32 %evl) + ret %v +} + +declare @llvm.vp.fdiv.nxv2bf16(, , , i32) + +define @vfdiv_vv_nxv2bf16( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfdiv_vv_nxv2bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfdiv.vv v9, v9, v10, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %v = call @llvm.vp.fdiv.nxv2bf16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vfdiv_vv_nxv2bf16_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vfdiv_vv_nxv2bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfdiv.vv v9, v9, v10 +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %v = call @llvm.vp.fdiv.nxv2bf16( %va, %b, splat (i1 true), i32 %evl) + ret %v +} + +define @vfdiv_vf_nxv2bf16( %va, bfloat %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfdiv_vf_nxv2bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, mf2, 
ta, ma +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfdiv.vv v9, v10, v8, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fdiv.nxv2bf16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vfdiv_vf_nxv2bf16_unmasked( %va, bfloat %b, i32 zeroext %evl) { +; CHECK-LABEL: vfdiv_vf_nxv2bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, mf2, ta, ma +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfdiv.vv v9, v10, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fdiv.nxv2bf16( %va, %vb, splat (i1 true), i32 %evl) + ret %v +} + +declare @llvm.vp.fdiv.nxv4bf16(, , , i32) + +define @vfdiv_vv_nxv4bf16( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfdiv_vv_nxv4bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vfdiv.vv v10, v12, v10, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: ret + %v = call @llvm.vp.fdiv.nxv4bf16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vfdiv_vv_nxv4bf16_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vfdiv_vv_nxv4bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vfdiv.vv v10, v12, v10 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: ret + %v = call @llvm.vp.fdiv.nxv4bf16( %va, %b, splat (i1 true), i32 %evl) + ret %v +} + +define @vfdiv_vf_nxv4bf16( %va, bfloat %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfdiv_vf_nxv4bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vfdiv.vv v10, v10, v12, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fdiv.nxv4bf16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vfdiv_vf_nxv4bf16_unmasked( %va, bfloat %b, i32 zeroext %evl) { +; CHECK-LABEL: vfdiv_vf_nxv4bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vfdiv.vv v10, v10, v12 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: ret + %elt.head = 
insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fdiv.nxv4bf16( %va, %vb, splat (i1 true), i32 %evl) + ret %v +} + +declare @llvm.vp.fdiv.nxv8bf16(, , , i32) +define @vfdiv_vv_nxv8bf16( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfdiv_vv_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vfdiv.vv v12, v16, v12, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: ret + %v = call @llvm.vp.fdiv.nxv8bf16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vfdiv_vv_nxv8bf16_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vfdiv_vv_nxv8bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vfdiv.vv v12, v16, v12 +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: ret + %v = call @llvm.vp.fdiv.nxv8bf16( %va, %b, splat (i1 true), i32 %evl) + ret %v +} + +define @vfdiv_vf_nxv8bf16( %va, bfloat %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfdiv_vf_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vfdiv.vv v12, v12, v16, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fdiv.nxv8bf16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vfdiv_vf_nxv8bf16_unmasked( %va, bfloat %b, i32 zeroext %evl) { +; CHECK-LABEL: vfdiv_vf_nxv8bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vfdiv.vv v12, v12, v16 +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fdiv.nxv8bf16( %va, %vb, splat (i1 true), i32 %evl) + ret %v +} + +declare @llvm.vp.fdiv.nxv16bf16(, , , i32) + +define @vfdiv_vv_nxv16bf16( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfdiv_vv_nxv16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vfdiv.vv v16, v24, v16, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: ret + %v = call @llvm.vp.fdiv.nxv16bf16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vfdiv_vv_nxv16bf16_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vfdiv_vv_nxv16bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 +; CHECK-NEXT: 
vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vfdiv.vv v16, v24, v16 +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: ret + %v = call @llvm.vp.fdiv.nxv16bf16( %va, %b, splat (i1 true), i32 %evl) + ret %v +} + +define @vfdiv_vf_nxv16bf16( %va, bfloat %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfdiv_vf_nxv16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vmv.v.x v12, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vfdiv.vv v16, v16, v24, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fdiv.nxv16bf16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vfdiv_vf_nxv16bf16_unmasked( %va, bfloat %b, i32 zeroext %evl) { +; CHECK-LABEL: vfdiv_vf_nxv16bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vmv.v.x v12, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vfdiv.vv v16, v16, v24 +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fdiv.nxv16bf16( %va, %vb, splat (i1 true), i32 %evl) + ret %v +} + +declare @llvm.vp.fdiv.nxv32bf16(, , , i32) + +define @vfdiv_vv_nxv32bf16( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfdiv_vv_nxv32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: vmv1r.v v7, v0 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a1, a2, 1 +; CHECK-NEXT: sub a3, a0, a1 +; CHECK-NEXT: sltu a4, a0, a3 +; CHECK-NEXT: addi a4, a4, -1 +; CHECK-NEXT: and a3, a4, a3 +; CHECK-NEXT: srli a2, a2, 2 +; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v0, v0, a2 +; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; CHECK-NEXT: vfdiv.vv v16, v16, v24, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: bltu a0, a1, .LBB20_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a0, a1 +; CHECK-NEXT: .LBB20_2: +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 +; CHECK-NEXT: vmv1r.v v0, v7 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vfdiv.vv v16, v24, v16, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %v = call 
@llvm.vp.fdiv.nxv32bf16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vfdiv_vv_nxv32bf16_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vfdiv_vv_nxv32bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a1, a2, 1 +; CHECK-NEXT: sub a3, a0, a1 +; CHECK-NEXT: sltu a4, a0, a3 +; CHECK-NEXT: addi a4, a4, -1 +; CHECK-NEXT: and a3, a4, a3 +; CHECK-NEXT: srli a2, a2, 2 +; CHECK-NEXT: vsetvli a4, zero, e8, m4, ta, ma +; CHECK-NEXT: vmset.m v24 +; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v0, v24, a2 +; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; CHECK-NEXT: vfdiv.vv v16, v16, v24, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: bltu a0, a1, .LBB21_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a0, a1 +; CHECK-NEXT: .LBB21_2: +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vfdiv.vv v16, v24, v16 +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %v = call @llvm.vp.fdiv.nxv32bf16( %va, %b, splat (i1 true), i32 %evl) + ret %v +} + +define @vfdiv_vf_nxv32bf16( %va, bfloat %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfdiv_vf_nxv32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: mv a2, a1 +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x12, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 18 * vlenb +; CHECK-NEXT: vmv8r.v v24, v8 +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, m8, ta, ma +; CHECK-NEXT: vmv.v.x v16, a1 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a2, a1, 3 +; CHECK-NEXT: add a1, a2, a1 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a1, a2, 1 +; CHECK-NEXT: sub a3, a0, a1 +; CHECK-NEXT: sltu a4, a0, a3 +; CHECK-NEXT: addi a4, a4, -1 +; CHECK-NEXT: and a3, a4, a3 +; CHECK-NEXT: srli a2, a2, 2 +; CHECK-NEXT: csrr a4, vlenb +; CHECK-NEXT: slli a4, a4, 3 +; CHECK-NEXT: add a4, sp, a4 +; CHECK-NEXT: addi a4, a4, 16 +; CHECK-NEXT: vs1r.v v0, (a4) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v0, v0, a2 +; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v28 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a4, a2, 3 +; 
CHECK-NEXT: add a2, a4, a2 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v28 +; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; CHECK-NEXT: vfdiv.vv v16, v8, v16, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: bltu a0, a1, .LBB22_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a0, a1 +; CHECK-NEXT: .LBB22_2: +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a2, a1, 3 +; CHECK-NEXT: add a1, a2, a1 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v0 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vfdiv.vv v16, v16, v24, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: mv a1, a0 +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fdiv.nxv32bf16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vfdiv_vf_nxv32bf16_unmasked( %va, bfloat %b, i32 zeroext %evl) { +; CHECK-LABEL: vfdiv_vf_nxv32bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; CHECK-NEXT: vmv8r.v v16, v8 +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, m8, ta, ma +; CHECK-NEXT: vmv.v.x v8, a1 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a1, a2, 1 +; CHECK-NEXT: sub a3, a0, a1 +; CHECK-NEXT: sltu a4, a0, a3 +; CHECK-NEXT: addi a4, a4, -1 +; CHECK-NEXT: and a3, a4, a3 +; CHECK-NEXT: srli a2, a2, 2 +; CHECK-NEXT: vmset.m v24 +; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v0, v24, a2 +; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20 +; CHECK-NEXT: vmv4r.v v16, v8 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; CHECK-NEXT: vfdiv.vv v16, v24, v16, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: bltu a0, a1, .LBB23_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a0, a1 +; CHECK-NEXT: .LBB23_2: +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vl8r.v v0, 
(a1) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v0 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vfdiv.vv v16, v16, v24 +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fdiv.nxv32bf16( %va, %vb, splat (i1 true), i32 %evl) + ret %v +} declare @llvm.vp.fdiv.nxv1f16(, , , i32) define @vfdiv_vv_nxv1f16( %va, %b, %m, i32 zeroext %evl) { @@ -514,10 +1123,10 @@ define @vfdiv_vv_nxv32f16( %va, @vfdiv_vv_nxv32f16_unmasked( %va ; ZVFHMIN-NEXT: vfdiv.vv v16, v16, v24, v0.t ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 -; ZVFHMIN-NEXT: bltu a0, a1, .LBB21_2 +; ZVFHMIN-NEXT: bltu a0, a1, .LBB45_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a0, a1 -; ZVFHMIN-NEXT: .LBB21_2: +; ZVFHMIN-NEXT: .LBB45_2: ; ZVFHMIN-NEXT: addi a1, sp, 16 ; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24 @@ -649,10 +1258,10 @@ define @vfdiv_vf_nxv32f16( %va, half %b ; ZVFHMIN-NEXT: vfdiv.vv v16, v8, v16, v0.t ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 -; ZVFHMIN-NEXT: bltu a0, a1, .LBB22_2 +; ZVFHMIN-NEXT: bltu a0, a1, .LBB46_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a0, a1 -; ZVFHMIN-NEXT: .LBB22_2: +; ZVFHMIN-NEXT: .LBB46_2: ; ZVFHMIN-NEXT: addi a1, sp, 16 ; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24 @@ -730,10 +1339,10 @@ define @vfdiv_vf_nxv32f16_unmasked( %va ; ZVFHMIN-NEXT: vfdiv.vv v16, v24, v16, v0.t ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 -; ZVFHMIN-NEXT: bltu a0, a1, .LBB23_2 +; ZVFHMIN-NEXT: bltu a0, a1, .LBB47_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a0, a1 -; ZVFHMIN-NEXT: .LBB23_2: +; ZVFHMIN-NEXT: .LBB47_2: ; ZVFHMIN-NEXT: addi a1, sp, 16 ; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24 diff --git a/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll index b6bb0371121b4f..baecb7bb7d2483 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll @@ -1,15 +1,1429 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+v,+m -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: 
--check-prefixes=CHECK,ZVFHMIN -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+v,+m -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFHMIN +declare @llvm.vp.fma.nxv1bf16(, , , , i32) + +define @vfma_vv_nxv1bf16( %va, %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfma_vv_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vfmadd.vv v12, v10, v11, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: ret + %v = call @llvm.vp.fma.nxv1bf16( %va, %b, %c, %m, i32 %evl) + ret %v +} + +define @vfma_vv_nxv1bf16_unmasked( %va, %b, %c, i32 zeroext %evl) { +; CHECK-LABEL: vfma_vv_nxv1bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vfmadd.vv v12, v10, v11 +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: ret + %v = call @llvm.vp.fma.nxv1bf16( %va, %b, %c, splat (i1 true), i32 %evl) + ret %v +} + +define @vfma_vf_nxv1bf16( %va, bfloat %b, %vc, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfma_vf_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vfmadd.vv v12, v9, v11, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fma.nxv1bf16( %va, %vb, %vc, %m, i32 %evl) + ret %v +} + +define @vfma_vf_nxv1bf16_commute( %va, bfloat %b, %vc, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfma_vf_nxv1bf16_commute: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vfmadd.vv v9, v8, v11, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fma.nxv1bf16( %vb, %va, %vc, %m, i32 %evl) + ret %v +} + +define @vfma_vf_nxv1bf16_unmasked( %va, bfloat %b, %vc, i32 zeroext %evl) { +; CHECK-LABEL: vfma_vf_nxv1bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vfmadd.vv v12, v9, v11 +; CHECK-NEXT: vsetvli a0, zero, 
e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fma.nxv1bf16( %va, %vb, %vc, splat (i1 true), i32 %evl) + ret %v +} + +define @vfma_vf_nxv1bf16_unmasked_commute( %va, bfloat %b, %vc, i32 zeroext %evl) { +; CHECK-LABEL: vfma_vf_nxv1bf16_unmasked_commute: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vfmadd.vv v12, v9, v11 +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fma.nxv1bf16( %vb, %va, %vc, splat (i1 true), i32 %evl) + ret %v +} + +declare @llvm.vp.fma.nxv2bf16(, , , , i32) + +define @vfma_vv_nxv2bf16( %va, %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfma_vv_nxv2bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfmadd.vv v12, v10, v11, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: ret + %v = call @llvm.vp.fma.nxv2bf16( %va, %b, %c, %m, i32 %evl) + ret %v +} + +define @vfma_vv_nxv2bf16_unmasked( %va, %b, %c, i32 zeroext %evl) { +; CHECK-LABEL: vfma_vv_nxv2bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfmadd.vv v12, v10, v11 +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: ret + %v = call @llvm.vp.fma.nxv2bf16( %va, %b, %c, splat (i1 true), i32 %evl) + ret %v +} + +define @vfma_vf_nxv2bf16( %va, bfloat %b, %vc, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfma_vf_nxv2bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, mf2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfmadd.vv v12, v9, v11, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fma.nxv2bf16( %va, %vb, %vc, %m, i32 %evl) + ret %v +} + +define @vfma_vf_nxv2bf16_commute( %va, bfloat %b, %vc, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfma_vf_nxv2bf16_commute: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, mf2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfmadd.vv v9, v8, v11, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: 
vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fma.nxv2bf16( %vb, %va, %vc, %m, i32 %evl) + ret %v +} + +define @vfma_vf_nxv2bf16_unmasked( %va, bfloat %b, %vc, i32 zeroext %evl) { +; CHECK-LABEL: vfma_vf_nxv2bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, mf2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfmadd.vv v12, v9, v11 +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fma.nxv2bf16( %va, %vb, %vc, splat (i1 true), i32 %evl) + ret %v +} + +define @vfma_vf_nxv2bf16_unmasked_commute( %va, bfloat %b, %vc, i32 zeroext %evl) { +; CHECK-LABEL: vfma_vf_nxv2bf16_unmasked_commute: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, mf2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfmadd.vv v12, v9, v11 +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fma.nxv2bf16( %vb, %va, %vc, splat (i1 true), i32 %evl) + ret %v +} + +declare @llvm.vp.fma.nxv4bf16(, , , , i32) + +define @vfma_vv_nxv4bf16( %va, %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfma_vv_nxv4bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vfmadd.vv v14, v10, v12, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v14 +; CHECK-NEXT: ret + %v = call @llvm.vp.fma.nxv4bf16( %va, %b, %c, %m, i32 %evl) + ret %v +} + +define @vfma_vv_nxv4bf16_unmasked( %va, %b, %c, i32 zeroext %evl) { +; CHECK-LABEL: vfma_vv_nxv4bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vfmadd.vv v14, v10, v12 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v14 +; CHECK-NEXT: ret + %v = call @llvm.vp.fma.nxv4bf16( %va, %b, %c, splat (i1 true), i32 %evl) + ret %v +} + +define @vfma_vf_nxv4bf16( %va, bfloat %b, %vc, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfma_vf_nxv4bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vfmadd.vv v16, v14, v12, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: ret + %elt.head = 
insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fma.nxv4bf16( %va, %vb, %vc, %m, i32 %evl) + ret %v +} + +define @vfma_vf_nxv4bf16_commute( %va, bfloat %b, %vc, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfma_vf_nxv4bf16_commute: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vfmadd.vv v14, v8, v12, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v14 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fma.nxv4bf16( %vb, %va, %vc, %m, i32 %evl) + ret %v +} + +define @vfma_vf_nxv4bf16_unmasked( %va, bfloat %b, %vc, i32 zeroext %evl) { +; CHECK-LABEL: vfma_vf_nxv4bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vfmadd.vv v16, v14, v12 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fma.nxv4bf16( %va, %vb, %vc, splat (i1 true), i32 %evl) + ret %v +} + +define @vfma_vf_nxv4bf16_unmasked_commute( %va, bfloat %b, %vc, i32 zeroext %evl) { +; CHECK-LABEL: vfma_vf_nxv4bf16_unmasked_commute: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vfmadd.vv v16, v14, v12 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fma.nxv4bf16( %vb, %va, %vc, splat (i1 true), i32 %evl) + ret %v +} + +declare @llvm.vp.fma.nxv8bf16(, , , , i32) + +define @vfma_vv_nxv8bf16( %va, %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfma_vv_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v20, v10 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vfmadd.vv v20, v12, v16, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v20 +; CHECK-NEXT: ret + %v = call @llvm.vp.fma.nxv8bf16( %va, %b, %c, %m, i32 %evl) + ret %v +} + +define @vfma_vv_nxv8bf16_unmasked( %va, %b, %c, i32 zeroext %evl) { +; CHECK-LABEL: vfma_vv_nxv8bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v20, v10 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vfmadd.vv v20, v12, v16 +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: 
vfncvtbf16.f.f.w v8, v20 +; CHECK-NEXT: ret + %v = call @llvm.vp.fma.nxv8bf16( %va, %b, %c, splat (i1 true), i32 %evl) + ret %v +} + +define @vfma_vf_nxv8bf16( %va, bfloat %b, %vc, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfma_vf_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v12, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v20, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vfmadd.vv v24, v20, v16, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fma.nxv8bf16( %va, %vb, %vc, %m, i32 %evl) + ret %v +} + +define @vfma_vf_nxv8bf16_commute( %va, bfloat %b, %vc, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfma_vf_nxv8bf16_commute: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v12, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v20, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v12 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vfmadd.vv v20, v8, v16, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v20 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fma.nxv8bf16( %vb, %va, %vc, %m, i32 %evl) + ret %v +} + +define @vfma_vf_nxv8bf16_unmasked( %va, bfloat %b, %vc, i32 zeroext %evl) { +; CHECK-LABEL: vfma_vf_nxv8bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v12, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v20, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vfmadd.vv v24, v20, v16 +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fma.nxv8bf16( %va, %vb, %vc, splat (i1 true), i32 %evl) + ret %v +} + +define @vfma_vf_nxv8bf16_unmasked_commute( %va, bfloat %b, %vc, i32 zeroext %evl) { +; CHECK-LABEL: vfma_vf_nxv8bf16_unmasked_commute: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v12, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v20, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vfmadd.vv v24, v20, v16 +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fma.nxv8bf16( %vb, %va, %vc, splat (i1 true), i32 %evl) + ret %v +} + +declare @llvm.vp.fma.nxv16bf16(, , , , i32) + +define @vfma_vv_nxv16bf16( %va, %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfma_vv_nxv16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 
0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vfmadd.vv v16, v24, v8, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %v = call @llvm.vp.fma.nxv16bf16( %va, %b, %c, %m, i32 %evl) + ret %v +} + +define @vfma_vv_nxv16bf16_unmasked( %va, %b, %c, i32 zeroext %evl) { +; CHECK-LABEL: vfma_vv_nxv16bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v12 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vfmadd.vv v0, v16, v24 +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v0 +; CHECK-NEXT: ret + %v = call @llvm.vp.fma.nxv16bf16( %va, %b, %c, splat (i1 true), i32 %evl) + ret %v +} + +define @vfma_vf_nxv16bf16( %va, bfloat %b, %vc, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfma_vf_nxv16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vmv.v.x v4, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v4 +; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vfmadd.vv v16, v24, v8, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fma.nxv16bf16( %va, %vb, %vc, %m, i32 %evl) + ret %v +} + +define @vfma_vf_nxv16bf16_commute( %va, bfloat %b, %vc, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfma_vf_nxv16bf16_commute: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vmv.v.x v4, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v4 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vfmadd.vv v16, v8, v24, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fma.nxv16bf16( %vb, %va, %vc, %m, i32 %evl) + ret %v +} + +define @vfma_vf_nxv16bf16_unmasked( %va, bfloat %b, 
%vc, i32 zeroext %evl) { +; CHECK-LABEL: vfma_vf_nxv16bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 2 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vmv.v.x v16, a1 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs4r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8 +; CHECK-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vfmadd.vv v16, v0, v24 +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fma.nxv16bf16( %va, %vb, %vc, splat (i1 true), i32 %evl) + ret %v +} + +define @vfma_vf_nxv16bf16_unmasked_commute( %va, bfloat %b, %vc, i32 zeroext %evl) { +; CHECK-LABEL: vfma_vf_nxv16bf16_unmasked_commute: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 2 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vmv.v.x v16, a1 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs4r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8 +; CHECK-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vfmadd.vv v16, v0, v24 +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fma.nxv16bf16( %vb, %va, %vc, splat (i1 true), i32 %evl) + ret %v +} + +declare @llvm.vp.fma.nxv32bf16(, , , , i32) + +define @vfma_vv_nxv32bf16( %va, %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfma_vv_nxv32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 1 +; CHECK-NEXT: mv a3, a2 +; CHECK-NEXT: slli a2, a2, 2 +; CHECK-NEXT: add a3, a3, a2 +; CHECK-NEXT: slli a2, a2, 2 +; CHECK-NEXT: add a2, a2, a3 +; CHECK-NEXT: sub sp, sp, a2 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x2a, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 42 * vlenb +; CHECK-NEXT: vmv1r.v v24, v0 +; CHECK-NEXT: vl8re16.v v0, (a0) +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a0, a2, 1 +; CHECK-NEXT: sub a3, a1, a0 +; CHECK-NEXT: sltu a4, a1, a3 +; CHECK-NEXT: addi a4, a4, -1 +; CHECK-NEXT: and a3, a4, a3 +; CHECK-NEXT: srli a2, a2, 2 +; CHECK-NEXT: csrr a4, 
vlenb +; CHECK-NEXT: slli a4, a4, 4 +; CHECK-NEXT: add a4, sp, a4 +; CHECK-NEXT: addi a4, a4, 16 +; CHECK-NEXT: vs1r.v v24, (a4) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v24, v24, a2 +; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: vs1r.v v24, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: mv a4, a2 +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: add a4, a4, a2 +; CHECK-NEXT: slli a2, a2, 1 +; CHECK-NEXT: add a2, a2, a4 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vmv8r.v v8, v16 +; CHECK-NEXT: vmv4r.v v24, v8 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a4, a2, 5 +; CHECK-NEXT: add a2, a4, a2 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a4, a2, 4 +; CHECK-NEXT: add a2, a4, a2 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: vs8r.v v0, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v4 +; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: vl1r.v v0, (a2) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; CHECK-NEXT: vfmadd.vv v16, v24, v8, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: bltu a1, a0, .LBB30_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a1, a0 +; CHECK-NEXT: .LBB30_2: +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: mv a2, a0 +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a2, a2, a0 +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: add a0, a0, a2 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a2, a0, 5 +; CHECK-NEXT: add a0, a2, a0 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a2, a0, 4 +; CHECK-NEXT: add a0, a2, a0 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v0 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a2, a0, 5 +; CHECK-NEXT: add a0, a2, a0 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl1r.v v0, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: 
csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a2, a0, 5 +; CHECK-NEXT: add a0, a2, a0 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vfmadd.vv v8, v16, v24, v0.t +; CHECK-NEXT: vmv.v.v v16, v8 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: mv a1, a0 +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: add a1, a1, a0 +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %v = call @llvm.vp.fma.nxv32bf16( %va, %b, %c, %m, i32 %evl) + ret %v +} + +define @vfma_vv_nxv32bf16_unmasked( %va, %b, %c, i32 zeroext %evl) { +; CHECK-LABEL: vfma_vv_nxv32bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 5 +; CHECK-NEXT: sub sp, sp, a2 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb +; CHECK-NEXT: vmv8r.v v24, v16 +; CHECK-NEXT: vl8re16.v v16, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: mv a2, a0 +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: add a0, a0, a2 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a0, a2, 1 +; CHECK-NEXT: sub a3, a1, a0 +; CHECK-NEXT: sltu a4, a1, a3 +; CHECK-NEXT: addi a4, a4, -1 +; CHECK-NEXT: and a3, a4, a3 +; CHECK-NEXT: srli a2, a2, 2 +; CHECK-NEXT: vsetvli a4, zero, e8, m4, ta, ma +; CHECK-NEXT: vmset.m v7 +; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v0, v7, a2 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 4 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v28 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: mv a4, a2 +; CHECK-NEXT: slli a2, a2, 1 +; CHECK-NEXT: add a2, a2, a4 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v28 +; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; CHECK-NEXT: vfmadd.vv v16, v24, v8, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: bltu a1, a0, .LBB31_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a1, a0 +; CHECK-NEXT: .LBB31_2: +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; 
CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v0 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: mv a2, a0 +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: add a0, a0, a2 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v16 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vfmacc.vv v0, v16, v24 +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v0 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 5 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %v = call @llvm.vp.fma.nxv32bf16( %va, %b, %c, splat (i1 true), i32 %evl) + ret %v +} + +define @vfma_vf_nxv32bf16( %va, bfloat %b, %vc, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfma_vf_nxv32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: mv a2, a1 +; CHECK-NEXT: slli a1, a1, 2 +; CHECK-NEXT: add a2, a2, a1 +; CHECK-NEXT: slli a1, a1, 2 +; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x2a, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 42 * vlenb +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, m8, ta, ma +; CHECK-NEXT: vmv.v.x v24, a1 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a2, a1, 5 +; CHECK-NEXT: add a1, a2, a1 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a1, a2, 1 +; CHECK-NEXT: sub a3, a0, a1 +; CHECK-NEXT: sltu a4, a0, a3 +; CHECK-NEXT: addi a4, a4, -1 +; CHECK-NEXT: and a3, a4, a3 +; CHECK-NEXT: srli a2, a2, 2 +; CHECK-NEXT: csrr a4, vlenb +; CHECK-NEXT: slli a4, a4, 4 +; CHECK-NEXT: add a4, sp, a4 +; CHECK-NEXT: addi a4, a4, 16 +; CHECK-NEXT: vs1r.v v0, (a4) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v0, v0, a2 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: mv a4, a2 +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: add a4, a4, a2 +; CHECK-NEXT: slli a2, a2, 1 +; CHECK-NEXT: add a2, a2, a4 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a4, a2, 4 +; CHECK-NEXT: add a2, a4, a2 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a4, a2, 5 +; CHECK-NEXT: add a2, a4, a2 +; CHECK-NEXT: add a2, sp, 
a2 +; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v28 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; CHECK-NEXT: vfmadd.vv v8, v16, v24, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v20, v8 +; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: bltu a0, a1, .LBB32_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a0, a1 +; CHECK-NEXT: .LBB32_2: +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: mv a2, a1 +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a2, a2, a1 +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a2, a1, 4 +; CHECK-NEXT: add a1, a2, a1 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: mv a2, a1 +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a2, a2, a1 +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a2, a1, 5 +; CHECK-NEXT: add a1, a2, a1 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v0 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: mv a2, a1 +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a2, a2, a1 +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vfmadd.vv v8, v16, v24, v0.t +; CHECK-NEXT: vmv.v.v v16, v8 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: mv a1, a0 +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: add a1, a1, a0 +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fma.nxv32bf16( %va, %vb, %vc, %m, i32 %evl) + ret %v +} + +define @vfma_vf_nxv32bf16_commute( %va, bfloat %b, %vc, %m, i32 zeroext %evl) { +; CHECK-LABEL: 
vfma_vf_nxv32bf16_commute: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: mv a2, a1 +; CHECK-NEXT: slli a1, a1, 2 +; CHECK-NEXT: add a2, a2, a1 +; CHECK-NEXT: slli a1, a1, 2 +; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x2a, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 42 * vlenb +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, m8, ta, ma +; CHECK-NEXT: vmv.v.x v24, a1 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a2, a1, 5 +; CHECK-NEXT: add a1, a2, a1 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a1, a2, 1 +; CHECK-NEXT: sub a3, a0, a1 +; CHECK-NEXT: sltu a4, a0, a3 +; CHECK-NEXT: addi a4, a4, -1 +; CHECK-NEXT: and a3, a4, a3 +; CHECK-NEXT: srli a2, a2, 2 +; CHECK-NEXT: csrr a4, vlenb +; CHECK-NEXT: slli a4, a4, 4 +; CHECK-NEXT: add a4, sp, a4 +; CHECK-NEXT: addi a4, a4, 16 +; CHECK-NEXT: vs1r.v v0, (a4) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v0, v0, a2 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: mv a4, a2 +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: add a4, a4, a2 +; CHECK-NEXT: slli a2, a2, 1 +; CHECK-NEXT: add a2, a2, a4 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a4, a2, 4 +; CHECK-NEXT: add a2, a4, a2 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a4, a2, 5 +; CHECK-NEXT: add a2, a4, a2 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v28 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; CHECK-NEXT: vfmadd.vv v16, v8, v24, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: bltu a0, a1, .LBB33_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a0, a1 +; CHECK-NEXT: .LBB33_2: +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: mv a2, a1 +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a2, a2, a1 +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a2, a1, 4 +; CHECK-NEXT: add a1, a2, a1 +; CHECK-NEXT: 
add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a2, a1, 5 +; CHECK-NEXT: add a1, a2, a1 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v0 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: mv a2, a1 +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a2, a2, a1 +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: mv a2, a1 +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a2, a2, a1 +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vfmadd.vv v8, v24, v16, v0.t +; CHECK-NEXT: vmv.v.v v16, v8 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: mv a1, a0 +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: add a1, a1, a0 +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fma.nxv32bf16( %vb, %va, %vc, %m, i32 %evl) + ret %v +} + +define @vfma_vf_nxv32bf16_unmasked( %va, bfloat %b, %vc, i32 zeroext %evl) { +; CHECK-LABEL: vfma_vf_nxv32bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 5 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, m8, ta, ma +; CHECK-NEXT: vmv.v.x v24, a1 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: mv a2, a1 +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a1, a2, 1 +; CHECK-NEXT: sub a3, a0, a1 +; CHECK-NEXT: sltu a4, a0, a3 +; CHECK-NEXT: addi a4, a4, -1 +; CHECK-NEXT: and a3, a4, a3 +; CHECK-NEXT: srli a2, a2, 2 +; CHECK-NEXT: vmset.m v7 +; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v0, v7, a2 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 4 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20 +; 
CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: mv a4, a2 +; CHECK-NEXT: slli a2, a2, 1 +; CHECK-NEXT: add a2, a2, a4 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v28 +; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; CHECK-NEXT: vfmadd.vv v8, v16, v24, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v20, v8 +; CHECK-NEXT: bltu a0, a1, .LBB34_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a0, a1 +; CHECK-NEXT: .LBB34_2: +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v24 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v0 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: mv a2, a1 +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vfmadd.vv v0, v24, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v16, v0 +; CHECK-NEXT: vmv8r.v v8, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 5 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fma.nxv32bf16( %va, %vb, %vc, splat (i1 true), i32 %evl) + ret %v +} + +define @vfma_vf_nxv32bf16_unmasked_commute( %va, bfloat %b, %vc, i32 zeroext %evl) { +; CHECK-LABEL: vfma_vf_nxv32bf16_unmasked_commute: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 5 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, m8, ta, ma +; CHECK-NEXT: vmv.v.x v24, a1 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: mv a2, a1 +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a1, a2, 1 +; CHECK-NEXT: sub a3, a0, a1 +; CHECK-NEXT: sltu a4, a0, a3 +; CHECK-NEXT: addi a4, a4, -1 +; CHECK-NEXT: and a3, a4, a3 +; CHECK-NEXT: srli a2, a2, 2 +; CHECK-NEXT: vmset.m v7 +; 
CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v0, v7, a2 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 4 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20 +; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: mv a4, a2 +; CHECK-NEXT: slli a2, a2, 1 +; CHECK-NEXT: add a2, a2, a4 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v28 +; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; CHECK-NEXT: vfmadd.vv v16, v8, v24, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: bltu a0, a1, .LBB35_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a0, a1 +; CHECK-NEXT: .LBB35_2: +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v0 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: mv a2, a1 +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v16 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vfmadd.vv v0, v24, v16 +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v0 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 5 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fma.nxv32bf16( %vb, %va, %vc, splat (i1 true), i32 %evl) + ret %v +} + declare @llvm.vp.fma.nxv1f16(, , , , i32) define @vfma_vv_nxv1f16( %va, %b, %c, %m, i32 zeroext %evl) { @@ -833,8 +2247,12 @@ define @vfma_vv_nxv32f16( %va, @vfma_vv_nxv32f16( %va, @vfma_vv_nxv32f16( %va, @vfma_vv_nxv32f16( %va, @vfma_vv_nxv32f16_unmasked( %va, ; ZVFHMIN-NEXT: vmv8r.v v24, v16 ; ZVFHMIN-NEXT: vl8re16.v v16, (a0) ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: li a2, 24 -; ZVFHMIN-NEXT: mul a0, a0, a2 +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: mv a2, a0 +; ZVFHMIN-NEXT: slli a0, a0, 1 +; ZVFHMIN-NEXT: add a0, a0, a2 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 ; ZVFHMIN-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill @@ -1017,8 +2447,10 @@ define @vfma_vv_nxv32f16_unmasked( %va, ; 
ZVFHMIN-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v28 ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: li a4, 24 -; ZVFHMIN-NEXT: mul a2, a2, a4 +; ZVFHMIN-NEXT: slli a2, a2, 3 +; ZVFHMIN-NEXT: mv a4, a2 +; ZVFHMIN-NEXT: slli a2, a2, 1 +; ZVFHMIN-NEXT: add a2, a2, a4 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 ; ZVFHMIN-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload @@ -1029,10 +2461,10 @@ define @vfma_vv_nxv32f16_unmasked( %va, ; ZVFHMIN-NEXT: vfmadd.vv v16, v24, v8, v0.t ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 -; ZVFHMIN-NEXT: bltu a1, a0, .LBB31_2 +; ZVFHMIN-NEXT: bltu a1, a0, .LBB67_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a1, a0 -; ZVFHMIN-NEXT: .LBB31_2: +; ZVFHMIN-NEXT: .LBB67_2: ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 3 ; ZVFHMIN-NEXT: add a0, sp, a0 @@ -1048,8 +2480,10 @@ define @vfma_vv_nxv32f16_unmasked( %va, ; ZVFHMIN-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v0 ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: li a2, 24 -; ZVFHMIN-NEXT: mul a0, a0, a2 +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: mv a2, a0 +; ZVFHMIN-NEXT: slli a0, a0, 1 +; ZVFHMIN-NEXT: add a0, a0, a2 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 ; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload @@ -1081,8 +2515,12 @@ define @vfma_vf_nxv32f16( %va, half %b, ; ZVFHMIN-NEXT: addi sp, sp, -16 ; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: li a2, 42 -; ZVFHMIN-NEXT: mul a1, a1, a2 +; ZVFHMIN-NEXT: slli a1, a1, 1 +; ZVFHMIN-NEXT: mv a2, a1 +; ZVFHMIN-NEXT: slli a1, a1, 2 +; ZVFHMIN-NEXT: add a2, a2, a1 +; ZVFHMIN-NEXT: slli a1, a1, 2 +; ZVFHMIN-NEXT: add a1, a1, a2 ; ZVFHMIN-NEXT: sub sp, sp, a1 ; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x2a, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 42 * vlenb ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 @@ -1109,8 +2547,11 @@ define @vfma_vf_nxv32f16( %va, half %b, ; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma ; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a2 ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: li a4, 25 -; ZVFHMIN-NEXT: mul a2, a2, a4 +; ZVFHMIN-NEXT: mv a4, a2 +; ZVFHMIN-NEXT: slli a2, a2, 3 +; ZVFHMIN-NEXT: add a4, a4, a2 +; ZVFHMIN-NEXT: slli a2, a2, 1 +; ZVFHMIN-NEXT: add a2, a2, a4 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 ; ZVFHMIN-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill @@ -1146,13 +2587,16 @@ define @vfma_vf_nxv32f16( %va, half %b, ; ZVFHMIN-NEXT: vfncvt.f.f.w v20, v8 ; ZVFHMIN-NEXT: addi a2, sp, 16 ; ZVFHMIN-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: bltu a0, a1, .LBB32_2 +; ZVFHMIN-NEXT: bltu a0, a1, .LBB68_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a0, a1 -; ZVFHMIN-NEXT: .LBB32_2: +; ZVFHMIN-NEXT: .LBB68_2: ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: li a2, 25 -; ZVFHMIN-NEXT: mul a1, a1, a2 +; ZVFHMIN-NEXT: mv a2, a1 +; ZVFHMIN-NEXT: slli a1, a1, 3 +; ZVFHMIN-NEXT: add a2, a2, a1 +; ZVFHMIN-NEXT: slli a1, a1, 1 +; ZVFHMIN-NEXT: add a1, a1, a2 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 ; ZVFHMIN-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload @@ -1170,8 +2614,11 @@ define @vfma_vf_nxv32f16( %va, half %b, ; ZVFHMIN-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: li a2, 25 -; ZVFHMIN-NEXT: mul a1, a1, a2 +; 
ZVFHMIN-NEXT: mv a2, a1 +; ZVFHMIN-NEXT: slli a1, a1, 3 +; ZVFHMIN-NEXT: add a2, a2, a1 +; ZVFHMIN-NEXT: slli a1, a1, 1 +; ZVFHMIN-NEXT: add a1, a1, a2 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 ; ZVFHMIN-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill @@ -1193,8 +2640,11 @@ define @vfma_vf_nxv32f16( %va, half %b, ; ZVFHMIN-NEXT: addi a1, a1, 16 ; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: li a2, 25 -; ZVFHMIN-NEXT: mul a1, a1, a2 +; ZVFHMIN-NEXT: mv a2, a1 +; ZVFHMIN-NEXT: slli a1, a1, 3 +; ZVFHMIN-NEXT: add a2, a2, a1 +; ZVFHMIN-NEXT: slli a1, a1, 1 +; ZVFHMIN-NEXT: add a1, a1, a2 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 ; ZVFHMIN-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload @@ -1206,8 +2656,12 @@ define @vfma_vf_nxv32f16( %va, half %b, ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: li a1, 42 -; ZVFHMIN-NEXT: mul a0, a0, a1 +; ZVFHMIN-NEXT: slli a0, a0, 1 +; ZVFHMIN-NEXT: mv a1, a0 +; ZVFHMIN-NEXT: slli a0, a0, 2 +; ZVFHMIN-NEXT: add a1, a1, a0 +; ZVFHMIN-NEXT: slli a0, a0, 2 +; ZVFHMIN-NEXT: add a0, a0, a1 ; ZVFHMIN-NEXT: add sp, sp, a0 ; ZVFHMIN-NEXT: addi sp, sp, 16 ; ZVFHMIN-NEXT: ret @@ -1229,8 +2683,12 @@ define @vfma_vf_nxv32f16_commute( %va, ; ZVFHMIN-NEXT: addi sp, sp, -16 ; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: li a2, 42 -; ZVFHMIN-NEXT: mul a1, a1, a2 +; ZVFHMIN-NEXT: slli a1, a1, 1 +; ZVFHMIN-NEXT: mv a2, a1 +; ZVFHMIN-NEXT: slli a1, a1, 2 +; ZVFHMIN-NEXT: add a2, a2, a1 +; ZVFHMIN-NEXT: slli a1, a1, 2 +; ZVFHMIN-NEXT: add a1, a1, a2 ; ZVFHMIN-NEXT: sub sp, sp, a1 ; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x2a, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 42 * vlenb ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 @@ -1257,8 +2715,11 @@ define @vfma_vf_nxv32f16_commute( %va, ; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma ; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a2 ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: li a4, 25 -; ZVFHMIN-NEXT: mul a2, a2, a4 +; ZVFHMIN-NEXT: mv a4, a2 +; ZVFHMIN-NEXT: slli a2, a2, 3 +; ZVFHMIN-NEXT: add a4, a4, a2 +; ZVFHMIN-NEXT: slli a2, a2, 1 +; ZVFHMIN-NEXT: add a2, a2, a4 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 ; ZVFHMIN-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill @@ -1294,13 +2755,16 @@ define @vfma_vf_nxv32f16_commute( %va, ; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 ; ZVFHMIN-NEXT: addi a2, sp, 16 ; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: bltu a0, a1, .LBB33_2 +; ZVFHMIN-NEXT: bltu a0, a1, .LBB69_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a0, a1 -; ZVFHMIN-NEXT: .LBB33_2: +; ZVFHMIN-NEXT: .LBB69_2: ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: li a2, 25 -; ZVFHMIN-NEXT: mul a1, a1, a2 +; ZVFHMIN-NEXT: mv a2, a1 +; ZVFHMIN-NEXT: slli a1, a1, 3 +; ZVFHMIN-NEXT: add a2, a2, a1 +; ZVFHMIN-NEXT: slli a1, a1, 1 +; ZVFHMIN-NEXT: add a1, a1, a2 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 ; ZVFHMIN-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload @@ -1325,8 +2789,11 @@ define @vfma_vf_nxv32f16_commute( %va, ; ZVFHMIN-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v0 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: li a2, 25 -; ZVFHMIN-NEXT: mul a1, a1, a2 +; ZVFHMIN-NEXT: mv a2, a1 +; ZVFHMIN-NEXT: slli a1, a1, 3 +; ZVFHMIN-NEXT: add a2, a2, a1 +; ZVFHMIN-NEXT: 
slli a1, a1, 1 +; ZVFHMIN-NEXT: add a1, a1, a2 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 ; ZVFHMIN-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill @@ -1341,8 +2808,11 @@ define @vfma_vf_nxv32f16_commute( %va, ; ZVFHMIN-NEXT: addi a1, a1, 16 ; ZVFHMIN-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: li a2, 25 -; ZVFHMIN-NEXT: mul a1, a1, a2 +; ZVFHMIN-NEXT: mv a2, a1 +; ZVFHMIN-NEXT: slli a1, a1, 3 +; ZVFHMIN-NEXT: add a2, a2, a1 +; ZVFHMIN-NEXT: slli a1, a1, 1 +; ZVFHMIN-NEXT: add a1, a1, a2 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 ; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload @@ -1354,8 +2824,12 @@ define @vfma_vf_nxv32f16_commute( %va, ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: li a1, 42 -; ZVFHMIN-NEXT: mul a0, a0, a1 +; ZVFHMIN-NEXT: slli a0, a0, 1 +; ZVFHMIN-NEXT: mv a1, a0 +; ZVFHMIN-NEXT: slli a0, a0, 2 +; ZVFHMIN-NEXT: add a1, a1, a0 +; ZVFHMIN-NEXT: slli a0, a0, 2 +; ZVFHMIN-NEXT: add a0, a0, a1 ; ZVFHMIN-NEXT: add sp, sp, a0 ; ZVFHMIN-NEXT: addi sp, sp, 16 ; ZVFHMIN-NEXT: ret @@ -1384,8 +2858,10 @@ define @vfma_vf_nxv32f16_unmasked( %va, ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m8, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v24, a1 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: li a2, 24 -; ZVFHMIN-NEXT: mul a1, a1, a2 +; ZVFHMIN-NEXT: slli a1, a1, 3 +; ZVFHMIN-NEXT: mv a2, a1 +; ZVFHMIN-NEXT: slli a1, a1, 1 +; ZVFHMIN-NEXT: add a1, a1, a2 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 ; ZVFHMIN-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill @@ -1415,8 +2891,10 @@ define @vfma_vf_nxv32f16_unmasked( %va, ; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: li a4, 24 -; ZVFHMIN-NEXT: mul a2, a2, a4 +; ZVFHMIN-NEXT: slli a2, a2, 3 +; ZVFHMIN-NEXT: mv a4, a2 +; ZVFHMIN-NEXT: slli a2, a2, 1 +; ZVFHMIN-NEXT: add a2, a2, a4 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 ; ZVFHMIN-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload @@ -1427,10 +2905,10 @@ define @vfma_vf_nxv32f16_unmasked( %va, ; ZVFHMIN-NEXT: vfmadd.vv v8, v16, v24, v0.t ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v20, v8 -; ZVFHMIN-NEXT: bltu a0, a1, .LBB34_2 +; ZVFHMIN-NEXT: bltu a0, a1, .LBB70_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a0, a1 -; ZVFHMIN-NEXT: .LBB34_2: +; ZVFHMIN-NEXT: .LBB70_2: ; ZVFHMIN-NEXT: csrr a1, vlenb ; ZVFHMIN-NEXT: slli a1, a1, 4 ; ZVFHMIN-NEXT: add a1, sp, a1 @@ -1446,8 +2924,10 @@ define @vfma_vf_nxv32f16_unmasked( %va, ; ZVFHMIN-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v0 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: li a2, 24 -; ZVFHMIN-NEXT: mul a1, a1, a2 +; ZVFHMIN-NEXT: slli a1, a1, 3 +; ZVFHMIN-NEXT: mv a2, a1 +; ZVFHMIN-NEXT: slli a1, a1, 1 +; ZVFHMIN-NEXT: add a1, a1, a2 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 ; ZVFHMIN-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload @@ -1489,8 +2969,10 @@ define @vfma_vf_nxv32f16_unmasked_commute( @vfma_vf_nxv32f16_unmasked_commute( @vfma_vf_nxv32f16_unmasked_commute( @vfma_vf_nxv32f16_unmasked_commute( @vfma_vv_nxv16f64( %va, @vfma_vv_nxv16f64( %va, @vfma_vv_nxv16f64( %va, @vfma_vv_nxv16f64( %va, @vfma_vv_nxv16f64_unmasked( ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a1, vlenb -; 
CHECK-NEXT: li a3, 24 -; CHECK-NEXT: mul a1, a1, a3 +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: mv a3, a1 +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: add a1, a1, a3 ; CHECK-NEXT: sub sp, sp, a1 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb ; CHECK-NEXT: csrr a1, vlenb @@ -2389,10 +3889,10 @@ define @vfma_vv_nxv16f64_unmasked( ; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma ; CHECK-NEXT: vfmadd.vv v16, v8, v24 -; CHECK-NEXT: bltu a4, a1, .LBB93_2 +; CHECK-NEXT: bltu a4, a1, .LBB129_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a4, a1 -; CHECK-NEXT: .LBB93_2: +; CHECK-NEXT: .LBB129_2: ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add a0, sp, a0 @@ -2404,8 +3904,10 @@ define @vfma_vv_nxv16f64_unmasked( ; CHECK-NEXT: vfmadd.vv v0, v24, v8 ; CHECK-NEXT: vmv.v.v v8, v0 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: li a1, 24 -; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: mv a1, a0 +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: add a0, a0, a1 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret @@ -7161,8 +8663,12 @@ define @vfmsub_vv_nxv32f16( %va, @vfmsub_vv_nxv32f16( %va, @vfmsub_vv_nxv32f16( %va, @vfmsub_vv_nxv32f16( %va, @vfmsub_vv_nxv32f16( %va, @vfmsub_vv_nxv32f16( %va, @vfmsub_vv_nxv32f16( %va, @vfmsub_vv_nxv32f16_unmasked( %v ; ZVFHMIN-NEXT: addi sp, sp, -16 ; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: li a3, 40 -; ZVFHMIN-NEXT: mul a2, a2, a3 +; ZVFHMIN-NEXT: slli a2, a2, 3 +; ZVFHMIN-NEXT: mv a3, a2 +; ZVFHMIN-NEXT: slli a2, a2, 2 +; ZVFHMIN-NEXT: add a2, a2, a3 ; ZVFHMIN-NEXT: sub sp, sp, a2 ; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb ; ZVFHMIN-NEXT: vl8re16.v v24, (a0) @@ -7316,8 +8840,10 @@ define @vfmsub_vv_nxv32f16_unmasked( %v ; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma ; ZVFHMIN-NEXT: vxor.vx v0, v24, a0 ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: li a2, 24 -; ZVFHMIN-NEXT: mul a0, a0, a2 +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: mv a2, a0 +; ZVFHMIN-NEXT: slli a0, a0, 1 +; ZVFHMIN-NEXT: add a0, a0, a2 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 ; ZVFHMIN-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill @@ -7369,8 +8895,10 @@ define @vfmsub_vv_nxv32f16_unmasked( %v ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v4, v16 ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: li a3, 24 -; ZVFHMIN-NEXT: mul a2, a2, a3 +; ZVFHMIN-NEXT: slli a2, a2, 3 +; ZVFHMIN-NEXT: mv a3, a2 +; ZVFHMIN-NEXT: slli a2, a2, 1 +; ZVFHMIN-NEXT: add a2, a2, a3 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 ; ZVFHMIN-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload @@ -7380,10 +8908,10 @@ define @vfmsub_vv_nxv32f16_unmasked( %v ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 ; ZVFHMIN-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: bltu a1, a0, .LBB245_2 +; ZVFHMIN-NEXT: bltu a1, a0, .LBB281_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a1, a0 -; ZVFHMIN-NEXT: .LBB245_2: +; ZVFHMIN-NEXT: .LBB281_2: ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 5 ; ZVFHMIN-NEXT: add a0, sp, a0 @@ -7404,8 +8932,10 @@ define @vfmsub_vv_nxv32f16_unmasked( %v ; ZVFHMIN-NEXT: vfncvt.f.f.w v0, v24 ; ZVFHMIN-NEXT: vmv8r.v v8, v0 ; ZVFHMIN-NEXT: csrr 
a0, vlenb -; ZVFHMIN-NEXT: li a1, 40 -; ZVFHMIN-NEXT: mul a0, a0, a1 +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: mv a1, a0 +; ZVFHMIN-NEXT: slli a0, a0, 2 +; ZVFHMIN-NEXT: add a0, a0, a1 ; ZVFHMIN-NEXT: add sp, sp, a0 ; ZVFHMIN-NEXT: addi sp, sp, 16 ; ZVFHMIN-NEXT: ret @@ -7433,8 +8963,10 @@ define @vfmsub_vf_nxv32f16( %va, half % ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m8, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v24, a1 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: li a2, 24 -; ZVFHMIN-NEXT: mul a1, a1, a2 +; ZVFHMIN-NEXT: slli a1, a1, 3 +; ZVFHMIN-NEXT: mv a2, a1 +; ZVFHMIN-NEXT: slli a1, a1, 1 +; ZVFHMIN-NEXT: add a1, a1, a2 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 ; ZVFHMIN-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill @@ -7456,15 +8988,17 @@ define @vfmsub_vf_nxv32f16( %va, half % ; ZVFHMIN-NEXT: addi a3, a3, 16 ; ZVFHMIN-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: mv a3, a0 -; ZVFHMIN-NEXT: bltu a0, a2, .LBB246_2 +; ZVFHMIN-NEXT: bltu a0, a2, .LBB282_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a3, a2 -; ZVFHMIN-NEXT: .LBB246_2: +; ZVFHMIN-NEXT: .LBB282_2: ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: vmv4r.v v4, v12 ; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: li a5, 24 -; ZVFHMIN-NEXT: mul a4, a4, a5 +; ZVFHMIN-NEXT: slli a4, a4, 3 +; ZVFHMIN-NEXT: mv a5, a4 +; ZVFHMIN-NEXT: slli a4, a4, 1 +; ZVFHMIN-NEXT: add a4, a4, a5 ; ZVFHMIN-NEXT: add a4, sp, a4 ; ZVFHMIN-NEXT: addi a4, a4, 16 ; ZVFHMIN-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload @@ -7505,8 +9039,10 @@ define @vfmsub_vf_nxv32f16( %va, half % ; ZVFHMIN-NEXT: addi a1, a1, 16 ; ZVFHMIN-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: li a2, 24 -; ZVFHMIN-NEXT: mul a1, a1, a2 +; ZVFHMIN-NEXT: slli a1, a1, 3 +; ZVFHMIN-NEXT: mv a2, a1 +; ZVFHMIN-NEXT: slli a1, a1, 1 +; ZVFHMIN-NEXT: add a1, a1, a2 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 ; ZVFHMIN-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload @@ -7559,8 +9095,10 @@ define @vfmsub_vf_nxv32f16_commute( %va ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m8, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v24, a1 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: li a2, 24 -; ZVFHMIN-NEXT: mul a1, a1, a2 +; ZVFHMIN-NEXT: slli a1, a1, 3 +; ZVFHMIN-NEXT: mv a2, a1 +; ZVFHMIN-NEXT: slli a1, a1, 1 +; ZVFHMIN-NEXT: add a1, a1, a2 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 ; ZVFHMIN-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill @@ -7582,15 +9120,17 @@ define @vfmsub_vf_nxv32f16_commute( %va ; ZVFHMIN-NEXT: addi a3, a3, 16 ; ZVFHMIN-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: mv a3, a0 -; ZVFHMIN-NEXT: bltu a0, a2, .LBB247_2 +; ZVFHMIN-NEXT: bltu a0, a2, .LBB283_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a3, a2 -; ZVFHMIN-NEXT: .LBB247_2: +; ZVFHMIN-NEXT: .LBB283_2: ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: vmv4r.v v4, v12 ; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: li a5, 24 -; ZVFHMIN-NEXT: mul a4, a4, a5 +; ZVFHMIN-NEXT: slli a4, a4, 3 +; ZVFHMIN-NEXT: mv a5, a4 +; ZVFHMIN-NEXT: slli a4, a4, 1 +; ZVFHMIN-NEXT: add a4, a4, a5 ; ZVFHMIN-NEXT: add a4, sp, a4 ; ZVFHMIN-NEXT: addi a4, a4, 16 ; ZVFHMIN-NEXT: vl8r.v v24, (a4) # Unknown-size Folded Reload @@ -7626,8 +9166,10 @@ define @vfmsub_vf_nxv32f16_commute( %va ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v4 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: li a2, 24 -; ZVFHMIN-NEXT: mul a1, a1, a2 +; ZVFHMIN-NEXT: slli a1, a1, 
3 +; ZVFHMIN-NEXT: mv a2, a1 +; ZVFHMIN-NEXT: slli a1, a1, 1 +; ZVFHMIN-NEXT: add a1, a1, a2 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 ; ZVFHMIN-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload @@ -7685,8 +9227,10 @@ define @vfmsub_vf_nxv32f16_unmasked( %v ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m8, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v24, a1 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: li a2, 24 -; ZVFHMIN-NEXT: mul a1, a1, a2 +; ZVFHMIN-NEXT: slli a1, a1, 3 +; ZVFHMIN-NEXT: mv a2, a1 +; ZVFHMIN-NEXT: slli a1, a1, 1 +; ZVFHMIN-NEXT: add a1, a1, a2 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 ; ZVFHMIN-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill @@ -7720,8 +9264,10 @@ define @vfmsub_vf_nxv32f16_unmasked( %v ; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: li a4, 24 -; ZVFHMIN-NEXT: mul a2, a2, a4 +; ZVFHMIN-NEXT: slli a2, a2, 3 +; ZVFHMIN-NEXT: mv a4, a2 +; ZVFHMIN-NEXT: slli a2, a2, 1 +; ZVFHMIN-NEXT: add a2, a2, a4 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 ; ZVFHMIN-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload @@ -7743,10 +9289,10 @@ define @vfmsub_vf_nxv32f16_unmasked( %v ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 ; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: bltu a0, a1, .LBB248_2 +; ZVFHMIN-NEXT: bltu a0, a1, .LBB284_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a0, a1 -; ZVFHMIN-NEXT: .LBB248_2: +; ZVFHMIN-NEXT: .LBB284_2: ; ZVFHMIN-NEXT: csrr a1, vlenb ; ZVFHMIN-NEXT: slli a1, a1, 4 ; ZVFHMIN-NEXT: add a1, sp, a1 @@ -7754,8 +9300,10 @@ define @vfmsub_vf_nxv32f16_unmasked( %v ; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v24 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: li a2, 24 -; ZVFHMIN-NEXT: mul a1, a1, a2 +; ZVFHMIN-NEXT: slli a1, a1, 3 +; ZVFHMIN-NEXT: mv a2, a1 +; ZVFHMIN-NEXT: slli a1, a1, 1 +; ZVFHMIN-NEXT: add a1, a1, a2 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 ; ZVFHMIN-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload @@ -7803,8 +9351,10 @@ define @vfmsub_vf_nxv32f16_unmasked_commute( @vfmsub_vf_nxv32f16_unmasked_commute( @vfmsub_vf_nxv32f16_unmasked_commute( @vfmsub_vf_nxv32f16_unmasked_commute( @vfnmadd_vv_nxv32f16( %va, @vfnmadd_vv_nxv32f16( %va, @vfnmadd_vv_nxv32f16( %va, @vfnmadd_vv_nxv32f16_commuted( % ; ZVFHMIN-NEXT: addi sp, sp, -16 ; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: li a3, 40 -; ZVFHMIN-NEXT: mul a2, a2, a3 +; ZVFHMIN-NEXT: slli a2, a2, 3 +; ZVFHMIN-NEXT: mv a3, a2 +; ZVFHMIN-NEXT: slli a2, a2, 2 +; ZVFHMIN-NEXT: add a2, a2, a3 ; ZVFHMIN-NEXT: sub sp, sp, a2 ; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb ; ZVFHMIN-NEXT: vl8re16.v v24, (a0) @@ -8061,15 +9623,17 @@ define @vfnmadd_vv_nxv32f16_commuted( % ; ZVFHMIN-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v16 ; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: li a5, 24 -; ZVFHMIN-NEXT: mul a4, a4, a5 +; ZVFHMIN-NEXT: slli a4, a4, 3 +; ZVFHMIN-NEXT: mv a5, a4 +; ZVFHMIN-NEXT: slli a4, a4, 1 +; ZVFHMIN-NEXT: add a4, a4, a5 ; ZVFHMIN-NEXT: add a4, sp, a4 ; ZVFHMIN-NEXT: addi a4, a4, 16 ; ZVFHMIN-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: bltu a1, a2, .LBB251_2 +; ZVFHMIN-NEXT: bltu a1, a2, .LBB287_2 ; ZVFHMIN-NEXT: # %bb.1: 
; ZVFHMIN-NEXT: mv a3, a2 -; ZVFHMIN-NEXT: .LBB251_2: +; ZVFHMIN-NEXT: .LBB287_2: ; ZVFHMIN-NEXT: csrr a4, vlenb ; ZVFHMIN-NEXT: slli a4, a4, 5 ; ZVFHMIN-NEXT: add a4, sp, a4 @@ -8077,8 +9641,10 @@ define @vfnmadd_vv_nxv32f16_commuted( % ; ZVFHMIN-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: li a5, 24 -; ZVFHMIN-NEXT: mul a4, a4, a5 +; ZVFHMIN-NEXT: slli a4, a4, 3 +; ZVFHMIN-NEXT: mv a5, a4 +; ZVFHMIN-NEXT: slli a4, a4, 1 +; ZVFHMIN-NEXT: add a4, a4, a5 ; ZVFHMIN-NEXT: add a4, sp, a4 ; ZVFHMIN-NEXT: addi a4, a4, 16 ; ZVFHMIN-NEXT: vl8r.v v24, (a4) # Unknown-size Folded Reload @@ -8094,8 +9660,10 @@ define @vfnmadd_vv_nxv32f16_commuted( % ; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v4 ; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: li a4, 24 -; ZVFHMIN-NEXT: mul a3, a3, a4 +; ZVFHMIN-NEXT: slli a3, a3, 3 +; ZVFHMIN-NEXT: mv a4, a3 +; ZVFHMIN-NEXT: slli a3, a3, 1 +; ZVFHMIN-NEXT: add a3, a3, a4 ; ZVFHMIN-NEXT: add a3, sp, a3 ; ZVFHMIN-NEXT: addi a3, a3, 16 ; ZVFHMIN-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill @@ -8125,8 +9693,10 @@ define @vfnmadd_vv_nxv32f16_commuted( % ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: li a2, 24 -; ZVFHMIN-NEXT: mul a0, a0, a2 +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: mv a2, a0 +; ZVFHMIN-NEXT: slli a0, a0, 1 +; ZVFHMIN-NEXT: add a0, a0, a2 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 ; ZVFHMIN-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload @@ -8143,8 +9713,10 @@ define @vfnmadd_vv_nxv32f16_commuted( % ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 ; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24 ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: li a1, 40 -; ZVFHMIN-NEXT: mul a0, a0, a1 +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: mv a1, a0 +; ZVFHMIN-NEXT: slli a0, a0, 2 +; ZVFHMIN-NEXT: add a0, a0, a1 ; ZVFHMIN-NEXT: add sp, sp, a0 ; ZVFHMIN-NEXT: addi sp, sp, 16 ; ZVFHMIN-NEXT: ret @@ -8171,8 +9743,10 @@ define @vfnmadd_vv_nxv32f16_unmasked( % ; ZVFHMIN-NEXT: sub sp, sp, a2 ; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: li a3, 24 -; ZVFHMIN-NEXT: mul a2, a2, a3 +; ZVFHMIN-NEXT: slli a2, a2, 3 +; ZVFHMIN-NEXT: mv a3, a2 +; ZVFHMIN-NEXT: slli a2, a2, 1 +; ZVFHMIN-NEXT: add a2, a2, a3 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 ; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill @@ -8209,8 +9783,10 @@ define @vfnmadd_vv_nxv32f16_unmasked( % ; ZVFHMIN-NEXT: addi a2, sp, 16 ; ZVFHMIN-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: li a4, 24 -; ZVFHMIN-NEXT: mul a2, a2, a4 +; ZVFHMIN-NEXT: slli a2, a2, 3 +; ZVFHMIN-NEXT: mv a4, a2 +; ZVFHMIN-NEXT: slli a2, a2, 1 +; ZVFHMIN-NEXT: add a2, a2, a4 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 ; ZVFHMIN-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload @@ -8238,13 +9814,15 @@ define @vfnmadd_vv_nxv32f16_unmasked( % ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 ; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: bltu a1, a0, .LBB252_2 +; ZVFHMIN-NEXT: bltu a1, a0, .LBB288_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a1, a0 -; ZVFHMIN-NEXT: .LBB252_2: +; ZVFHMIN-NEXT: .LBB288_2: ; ZVFHMIN-NEXT: csrr a0, vlenb -; 
ZVFHMIN-NEXT: li a2, 24 -; ZVFHMIN-NEXT: mul a0, a0, a2 +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: mv a2, a0 +; ZVFHMIN-NEXT: slli a0, a0, 1 +; ZVFHMIN-NEXT: add a0, a0, a2 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 ; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload @@ -8287,8 +9865,10 @@ define @vfnmadd_vv_nxv32f16_unmasked_commuted( @vfnmadd_vv_nxv32f16_unmasked_commuted( @vfnmadd_vv_nxv32f16_unmasked_commuted( @vfnmadd_vf_nxv32f16( %va, half ; ZVFHMIN-NEXT: addi sp, sp, -16 ; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: li a2, 40 -; ZVFHMIN-NEXT: mul a1, a1, a2 +; ZVFHMIN-NEXT: slli a1, a1, 3 +; ZVFHMIN-NEXT: mv a2, a1 +; ZVFHMIN-NEXT: slli a1, a1, 2 +; ZVFHMIN-NEXT: add a1, a1, a2 ; ZVFHMIN-NEXT: sub sp, sp, a1 ; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 @@ -8422,8 +10008,10 @@ define @vfnmadd_vf_nxv32f16( %va, half ; ZVFHMIN-NEXT: vsetvli a4, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16 ; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: li a5, 24 -; ZVFHMIN-NEXT: mul a4, a4, a5 +; ZVFHMIN-NEXT: slli a4, a4, 3 +; ZVFHMIN-NEXT: mv a5, a4 +; ZVFHMIN-NEXT: slli a4, a4, 1 +; ZVFHMIN-NEXT: add a4, a4, a5 ; ZVFHMIN-NEXT: add a4, sp, a4 ; ZVFHMIN-NEXT: addi a4, a4, 16 ; ZVFHMIN-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill @@ -8434,10 +10022,10 @@ define @vfnmadd_vf_nxv32f16( %va, half ; ZVFHMIN-NEXT: add a4, sp, a4 ; ZVFHMIN-NEXT: addi a4, a4, 16 ; ZVFHMIN-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: bltu a0, a2, .LBB254_2 +; ZVFHMIN-NEXT: bltu a0, a2, .LBB290_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a3, a2 -; ZVFHMIN-NEXT: .LBB254_2: +; ZVFHMIN-NEXT: .LBB290_2: ; ZVFHMIN-NEXT: csrr a4, vlenb ; ZVFHMIN-NEXT: slli a4, a4, 5 ; ZVFHMIN-NEXT: add a4, sp, a4 @@ -8445,8 +10033,10 @@ define @vfnmadd_vf_nxv32f16( %va, half ; ZVFHMIN-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8 ; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: li a5, 24 -; ZVFHMIN-NEXT: mul a4, a4, a5 +; ZVFHMIN-NEXT: slli a4, a4, 3 +; ZVFHMIN-NEXT: mv a5, a4 +; ZVFHMIN-NEXT: slli a4, a4, 1 +; ZVFHMIN-NEXT: add a4, a4, a5 ; ZVFHMIN-NEXT: add a4, sp, a4 ; ZVFHMIN-NEXT: addi a4, a4, 16 ; ZVFHMIN-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload @@ -8467,8 +10057,10 @@ define @vfnmadd_vf_nxv32f16( %va, half ; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20 ; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: li a4, 24 -; ZVFHMIN-NEXT: mul a3, a3, a4 +; ZVFHMIN-NEXT: slli a3, a3, 3 +; ZVFHMIN-NEXT: mv a4, a3 +; ZVFHMIN-NEXT: slli a3, a3, 1 +; ZVFHMIN-NEXT: add a3, a3, a4 ; ZVFHMIN-NEXT: add a3, sp, a3 ; ZVFHMIN-NEXT: addi a3, a3, 16 ; ZVFHMIN-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill @@ -8493,8 +10085,10 @@ define @vfnmadd_vf_nxv32f16( %va, half ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: li a2, 24 -; ZVFHMIN-NEXT: mul a1, a1, a2 +; ZVFHMIN-NEXT: slli a1, a1, 3 +; ZVFHMIN-NEXT: mv a2, a1 +; ZVFHMIN-NEXT: slli a1, a1, 1 +; ZVFHMIN-NEXT: add a1, a1, a2 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 ; ZVFHMIN-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload @@ -8514,8 +10108,10 @@ define @vfnmadd_vf_nxv32f16( %va, half ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24 ; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 ; 
ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: li a1, 40 -; ZVFHMIN-NEXT: mul a0, a0, a1 +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: mv a1, a0 +; ZVFHMIN-NEXT: slli a0, a0, 2 +; ZVFHMIN-NEXT: add a0, a0, a1 ; ZVFHMIN-NEXT: add sp, sp, a0 ; ZVFHMIN-NEXT: addi sp, sp, 16 ; ZVFHMIN-NEXT: ret @@ -8546,8 +10142,10 @@ define @vfnmadd_vf_nxv32f16_commute( %v ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m8, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v24, a1 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: li a2, 24 -; ZVFHMIN-NEXT: mul a1, a1, a2 +; ZVFHMIN-NEXT: slli a1, a1, 3 +; ZVFHMIN-NEXT: mv a2, a1 +; ZVFHMIN-NEXT: slli a1, a1, 1 +; ZVFHMIN-NEXT: add a1, a1, a2 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 ; ZVFHMIN-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill @@ -8572,13 +10170,15 @@ define @vfnmadd_vf_nxv32f16_commute( %v ; ZVFHMIN-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: vmv4r.v v4, v12 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: bltu a0, a2, .LBB255_2 +; ZVFHMIN-NEXT: bltu a0, a2, .LBB291_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a3, a2 -; ZVFHMIN-NEXT: .LBB255_2: +; ZVFHMIN-NEXT: .LBB291_2: ; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: li a5, 24 -; ZVFHMIN-NEXT: mul a4, a4, a5 +; ZVFHMIN-NEXT: slli a4, a4, 3 +; ZVFHMIN-NEXT: mv a5, a4 +; ZVFHMIN-NEXT: slli a4, a4, 1 +; ZVFHMIN-NEXT: add a4, a4, a5 ; ZVFHMIN-NEXT: add a4, sp, a4 ; ZVFHMIN-NEXT: addi a4, a4, 16 ; ZVFHMIN-NEXT: vl8r.v v24, (a4) # Unknown-size Folded Reload @@ -8613,8 +10213,10 @@ define @vfnmadd_vf_nxv32f16_commute( %v ; ZVFHMIN-NEXT: vsetvli a2, zero, e8, mf2, ta, ma ; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a1 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: li a2, 24 -; ZVFHMIN-NEXT: mul a1, a1, a2 +; ZVFHMIN-NEXT: slli a1, a1, 3 +; ZVFHMIN-NEXT: mv a2, a1 +; ZVFHMIN-NEXT: slli a1, a1, 1 +; ZVFHMIN-NEXT: add a1, a1, a2 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 ; ZVFHMIN-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload @@ -8674,8 +10276,10 @@ define @vfnmadd_vf_nxv32f16_unmasked( % ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m8, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v24, a1 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: li a2, 24 -; ZVFHMIN-NEXT: mul a1, a1, a2 +; ZVFHMIN-NEXT: slli a1, a1, 3 +; ZVFHMIN-NEXT: mv a2, a1 +; ZVFHMIN-NEXT: slli a1, a1, 1 +; ZVFHMIN-NEXT: add a1, a1, a2 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 ; ZVFHMIN-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill @@ -8711,8 +10315,10 @@ define @vfnmadd_vf_nxv32f16_unmasked( % ; ZVFHMIN-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: li a4, 24 -; ZVFHMIN-NEXT: mul a2, a2, a4 +; ZVFHMIN-NEXT: slli a2, a2, 3 +; ZVFHMIN-NEXT: mv a4, a2 +; ZVFHMIN-NEXT: slli a2, a2, 1 +; ZVFHMIN-NEXT: add a2, a2, a4 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 ; ZVFHMIN-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload @@ -8740,13 +10346,15 @@ define @vfnmadd_vf_nxv32f16_unmasked( % ; ZVFHMIN-NEXT: addi a2, a2, 16 ; ZVFHMIN-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: bltu a0, a1, .LBB256_2 +; ZVFHMIN-NEXT: bltu a0, a1, .LBB292_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a0, a1 -; ZVFHMIN-NEXT: .LBB256_2: +; ZVFHMIN-NEXT: .LBB292_2: ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: li a2, 24 -; ZVFHMIN-NEXT: mul a1, a1, a2 +; ZVFHMIN-NEXT: slli a1, a1, 3 +; ZVFHMIN-NEXT: mv a2, a1 +; ZVFHMIN-NEXT: slli a1, a1, 1 +; ZVFHMIN-NEXT: 
add a1, a1, a2 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 ; ZVFHMIN-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload @@ -8793,8 +10401,10 @@ define @vfnmadd_vf_nxv32f16_unmasked_commute( @vfnmadd_vf_nxv32f16_unmasked_commute( @vfnmadd_vf_nxv32f16_unmasked_commute( @vfnmadd_vf_nxv32f16_neg_splat( ; ZVFHMIN-NEXT: sub sp, sp, a1 ; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: li a2, 24 -; ZVFHMIN-NEXT: mul a1, a1, a2 +; ZVFHMIN-NEXT: slli a1, a1, 3 +; ZVFHMIN-NEXT: mv a2, a1 +; ZVFHMIN-NEXT: slli a1, a1, 1 +; ZVFHMIN-NEXT: add a1, a1, a2 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 ; ZVFHMIN-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill @@ -8939,13 +10555,15 @@ define @vfnmadd_vf_nxv32f16_neg_splat( ; ZVFHMIN-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: vmv4r.v v4, v12 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: bltu a0, a2, .LBB258_2 +; ZVFHMIN-NEXT: bltu a0, a2, .LBB294_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a3, a2 -; ZVFHMIN-NEXT: .LBB258_2: +; ZVFHMIN-NEXT: .LBB294_2: ; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: li a5, 24 -; ZVFHMIN-NEXT: mul a4, a4, a5 +; ZVFHMIN-NEXT: slli a4, a4, 3 +; ZVFHMIN-NEXT: mv a5, a4 +; ZVFHMIN-NEXT: slli a4, a4, 1 +; ZVFHMIN-NEXT: add a4, a4, a5 ; ZVFHMIN-NEXT: add a4, sp, a4 ; ZVFHMIN-NEXT: addi a4, a4, 16 ; ZVFHMIN-NEXT: vl8r.v v24, (a4) # Unknown-size Folded Reload @@ -8984,8 +10602,10 @@ define @vfnmadd_vf_nxv32f16_neg_splat( ; ZVFHMIN-NEXT: vsetvli a2, zero, e8, mf2, ta, ma ; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a1 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: li a2, 24 -; ZVFHMIN-NEXT: mul a1, a1, a2 +; ZVFHMIN-NEXT: slli a1, a1, 3 +; ZVFHMIN-NEXT: mv a2, a1 +; ZVFHMIN-NEXT: slli a1, a1, 1 +; ZVFHMIN-NEXT: add a1, a1, a2 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 ; ZVFHMIN-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload @@ -9032,8 +10652,10 @@ define @vfnmadd_vf_nxv32f16_neg_splat_commute( @vfnmadd_vf_nxv32f16_neg_splat_commute( @vfnmadd_vf_nxv32f16_neg_splat_commute( @vfnmadd_vf_nxv32f16_neg_splat_unmasked( @vfnmadd_vf_nxv32f16_neg_splat_unmasked( @vfnmadd_vf_nxv32f16_neg_splat_unmasked( @vfnmadd_vf_nxv32f16_neg_splat_unmasked_commute( @vfnmadd_vf_nxv32f16_neg_splat_unmasked_commute( @vfnmadd_vf_nxv32f16_neg_splat_unmasked_commute( @vfnmsub_vv_nxv32f16( %va, @vfnmsub_vv_nxv32f16( %va, @vfnmsub_vv_nxv32f16( %va, @vfnmsub_vv_nxv32f16_commuted( % ; ZVFHMIN-NEXT: addi sp, sp, -16 ; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: li a3, 40 -; ZVFHMIN-NEXT: mul a2, a2, a3 +; ZVFHMIN-NEXT: slli a2, a2, 3 +; ZVFHMIN-NEXT: mv a3, a2 +; ZVFHMIN-NEXT: slli a2, a2, 2 +; ZVFHMIN-NEXT: add a2, a2, a3 ; ZVFHMIN-NEXT: sub sp, sp, a2 ; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb ; ZVFHMIN-NEXT: vl8re16.v v24, (a0) @@ -9544,15 +11190,17 @@ define @vfnmsub_vv_nxv32f16_commuted( % ; ZVFHMIN-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v16 ; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: li a5, 24 -; ZVFHMIN-NEXT: mul a4, a4, a5 +; ZVFHMIN-NEXT: slli a4, a4, 3 +; ZVFHMIN-NEXT: mv a5, a4 +; ZVFHMIN-NEXT: slli a4, a4, 1 +; ZVFHMIN-NEXT: add a4, a4, a5 ; ZVFHMIN-NEXT: add a4, sp, a4 ; ZVFHMIN-NEXT: addi a4, a4, 16 ; ZVFHMIN-NEXT: vs8r.v v8, (a4) # Unknown-size 
Folded Spill -; ZVFHMIN-NEXT: bltu a1, a2, .LBB263_2 +; ZVFHMIN-NEXT: bltu a1, a2, .LBB299_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a3, a2 -; ZVFHMIN-NEXT: .LBB263_2: +; ZVFHMIN-NEXT: .LBB299_2: ; ZVFHMIN-NEXT: csrr a4, vlenb ; ZVFHMIN-NEXT: slli a4, a4, 5 ; ZVFHMIN-NEXT: add a4, sp, a4 @@ -9560,8 +11208,10 @@ define @vfnmsub_vv_nxv32f16_commuted( % ; ZVFHMIN-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: li a5, 24 -; ZVFHMIN-NEXT: mul a4, a4, a5 +; ZVFHMIN-NEXT: slli a4, a4, 3 +; ZVFHMIN-NEXT: mv a5, a4 +; ZVFHMIN-NEXT: slli a4, a4, 1 +; ZVFHMIN-NEXT: add a4, a4, a5 ; ZVFHMIN-NEXT: add a4, sp, a4 ; ZVFHMIN-NEXT: addi a4, a4, 16 ; ZVFHMIN-NEXT: vl8r.v v24, (a4) # Unknown-size Folded Reload @@ -9577,8 +11227,10 @@ define @vfnmsub_vv_nxv32f16_commuted( % ; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v4 ; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: li a4, 24 -; ZVFHMIN-NEXT: mul a3, a3, a4 +; ZVFHMIN-NEXT: slli a3, a3, 3 +; ZVFHMIN-NEXT: mv a4, a3 +; ZVFHMIN-NEXT: slli a3, a3, 1 +; ZVFHMIN-NEXT: add a3, a3, a4 ; ZVFHMIN-NEXT: add a3, sp, a3 ; ZVFHMIN-NEXT: addi a3, a3, 16 ; ZVFHMIN-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill @@ -9608,8 +11260,10 @@ define @vfnmsub_vv_nxv32f16_commuted( % ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: li a2, 24 -; ZVFHMIN-NEXT: mul a0, a0, a2 +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: mv a2, a0 +; ZVFHMIN-NEXT: slli a0, a0, 1 +; ZVFHMIN-NEXT: add a0, a0, a2 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 ; ZVFHMIN-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload @@ -9626,8 +11280,10 @@ define @vfnmsub_vv_nxv32f16_commuted( % ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 ; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24 ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: li a1, 40 -; ZVFHMIN-NEXT: mul a0, a0, a1 +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: mv a1, a0 +; ZVFHMIN-NEXT: slli a0, a0, 2 +; ZVFHMIN-NEXT: add a0, a0, a1 ; ZVFHMIN-NEXT: add sp, sp, a0 ; ZVFHMIN-NEXT: addi sp, sp, 16 ; ZVFHMIN-NEXT: ret @@ -9654,8 +11310,10 @@ define @vfnmsub_vv_nxv32f16_unmasked( % ; ZVFHMIN-NEXT: sub sp, sp, a2 ; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: li a3, 24 -; ZVFHMIN-NEXT: mul a2, a2, a3 +; ZVFHMIN-NEXT: slli a2, a2, 3 +; ZVFHMIN-NEXT: mv a3, a2 +; ZVFHMIN-NEXT: slli a2, a2, 1 +; ZVFHMIN-NEXT: add a2, a2, a3 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 ; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill @@ -9692,8 +11350,10 @@ define @vfnmsub_vv_nxv32f16_unmasked( % ; ZVFHMIN-NEXT: addi a2, sp, 16 ; ZVFHMIN-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: li a4, 24 -; ZVFHMIN-NEXT: mul a2, a2, a4 +; ZVFHMIN-NEXT: slli a2, a2, 3 +; ZVFHMIN-NEXT: mv a4, a2 +; ZVFHMIN-NEXT: slli a2, a2, 1 +; ZVFHMIN-NEXT: add a2, a2, a4 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 ; ZVFHMIN-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload @@ -9721,13 +11381,15 @@ define @vfnmsub_vv_nxv32f16_unmasked( % ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 ; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: bltu a1, a0, .LBB264_2 +; ZVFHMIN-NEXT: bltu a1, a0, .LBB300_2 ; ZVFHMIN-NEXT: 
# %bb.1: ; ZVFHMIN-NEXT: mv a1, a0 -; ZVFHMIN-NEXT: .LBB264_2: +; ZVFHMIN-NEXT: .LBB300_2: ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: li a2, 24 -; ZVFHMIN-NEXT: mul a0, a0, a2 +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: mv a2, a0 +; ZVFHMIN-NEXT: slli a0, a0, 1 +; ZVFHMIN-NEXT: add a0, a0, a2 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 ; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload @@ -9770,8 +11432,10 @@ define @vfnmsub_vv_nxv32f16_unmasked_commuted( @vfnmsub_vv_nxv32f16_unmasked_commuted( @vfnmsub_vv_nxv32f16_unmasked_commuted( @vfnmsub_vf_nxv32f16( %va, half ; ZVFHMIN-NEXT: addi sp, sp, -16 ; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: li a2, 40 -; ZVFHMIN-NEXT: mul a1, a1, a2 +; ZVFHMIN-NEXT: slli a1, a1, 3 +; ZVFHMIN-NEXT: mv a2, a1 +; ZVFHMIN-NEXT: slli a1, a1, 2 +; ZVFHMIN-NEXT: add a1, a1, a2 ; ZVFHMIN-NEXT: sub sp, sp, a1 ; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 @@ -9907,14 +11577,16 @@ define @vfnmsub_vf_nxv32f16( %va, half ; ZVFHMIN-NEXT: addi a3, a3, 16 ; ZVFHMIN-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: mv a3, a0 -; ZVFHMIN-NEXT: bltu a0, a2, .LBB266_2 +; ZVFHMIN-NEXT: bltu a0, a2, .LBB302_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a3, a2 -; ZVFHMIN-NEXT: .LBB266_2: +; ZVFHMIN-NEXT: .LBB302_2: ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v16 ; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: li a5, 24 -; ZVFHMIN-NEXT: mul a4, a4, a5 +; ZVFHMIN-NEXT: slli a4, a4, 3 +; ZVFHMIN-NEXT: mv a5, a4 +; ZVFHMIN-NEXT: slli a4, a4, 1 +; ZVFHMIN-NEXT: add a4, a4, a5 ; ZVFHMIN-NEXT: add a4, sp, a4 ; ZVFHMIN-NEXT: addi a4, a4, 16 ; ZVFHMIN-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill @@ -9948,8 +11620,10 @@ define @vfnmsub_vf_nxv32f16( %va, half ; ZVFHMIN-NEXT: vsetvli a2, zero, e8, mf2, ta, ma ; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a1 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: li a2, 24 -; ZVFHMIN-NEXT: mul a1, a1, a2 +; ZVFHMIN-NEXT: slli a1, a1, 3 +; ZVFHMIN-NEXT: mv a2, a1 +; ZVFHMIN-NEXT: slli a1, a1, 1 +; ZVFHMIN-NEXT: add a1, a1, a2 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 ; ZVFHMIN-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload @@ -9984,8 +11658,10 @@ define @vfnmsub_vf_nxv32f16( %va, half ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24 ; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: li a1, 40 -; ZVFHMIN-NEXT: mul a0, a0, a1 +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: mv a1, a0 +; ZVFHMIN-NEXT: slli a0, a0, 2 +; ZVFHMIN-NEXT: add a0, a0, a1 ; ZVFHMIN-NEXT: add sp, sp, a0 ; ZVFHMIN-NEXT: addi sp, sp, 16 ; ZVFHMIN-NEXT: ret @@ -10020,8 +11696,10 @@ define @vfnmsub_vf_nxv32f16_commute( %v ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m8, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v24, a1 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: li a2, 24 -; ZVFHMIN-NEXT: mul a1, a1, a2 +; ZVFHMIN-NEXT: slli a1, a1, 3 +; ZVFHMIN-NEXT: mv a2, a1 +; ZVFHMIN-NEXT: slli a1, a1, 1 +; ZVFHMIN-NEXT: add a1, a1, a2 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 ; ZVFHMIN-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill @@ -10034,10 +11712,10 @@ define @vfnmsub_vf_nxv32f16_commute( %v ; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: mv a3, a0 -; ZVFHMIN-NEXT: bltu a0, a2, .LBB267_2 +; ZVFHMIN-NEXT: bltu a0, a2, .LBB303_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: 
mv a3, a2 -; ZVFHMIN-NEXT: .LBB267_2: +; ZVFHMIN-NEXT: .LBB303_2: ; ZVFHMIN-NEXT: csrr a4, vlenb ; ZVFHMIN-NEXT: slli a4, a4, 4 ; ZVFHMIN-NEXT: add a4, sp, a4 @@ -10050,8 +11728,10 @@ define @vfnmsub_vf_nxv32f16_commute( %v ; ZVFHMIN-NEXT: addi a4, a4, 16 ; ZVFHMIN-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: li a5, 24 -; ZVFHMIN-NEXT: mul a4, a4, a5 +; ZVFHMIN-NEXT: slli a4, a4, 3 +; ZVFHMIN-NEXT: mv a5, a4 +; ZVFHMIN-NEXT: slli a4, a4, 1 +; ZVFHMIN-NEXT: add a4, a4, a5 ; ZVFHMIN-NEXT: add a4, sp, a4 ; ZVFHMIN-NEXT: addi a4, a4, 16 ; ZVFHMIN-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload @@ -10087,8 +11767,10 @@ define @vfnmsub_vf_nxv32f16_commute( %v ; ZVFHMIN-NEXT: addi a1, a1, 16 ; ZVFHMIN-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: li a2, 24 -; ZVFHMIN-NEXT: mul a1, a1, a2 +; ZVFHMIN-NEXT: slli a1, a1, 3 +; ZVFHMIN-NEXT: mv a2, a1 +; ZVFHMIN-NEXT: slli a1, a1, 1 +; ZVFHMIN-NEXT: add a1, a1, a2 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 ; ZVFHMIN-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload @@ -10136,8 +11818,10 @@ define @vfnmsub_vf_nxv32f16_unmasked( % ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m8, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v24, a1 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: li a2, 24 -; ZVFHMIN-NEXT: mul a1, a1, a2 +; ZVFHMIN-NEXT: slli a1, a1, 3 +; ZVFHMIN-NEXT: mv a2, a1 +; ZVFHMIN-NEXT: slli a1, a1, 1 +; ZVFHMIN-NEXT: add a1, a1, a2 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 ; ZVFHMIN-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill @@ -10172,8 +11856,10 @@ define @vfnmsub_vf_nxv32f16_unmasked( % ; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20 ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: li a4, 24 -; ZVFHMIN-NEXT: mul a2, a2, a4 +; ZVFHMIN-NEXT: slli a2, a2, 3 +; ZVFHMIN-NEXT: mv a4, a2 +; ZVFHMIN-NEXT: slli a2, a2, 1 +; ZVFHMIN-NEXT: add a2, a2, a4 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 ; ZVFHMIN-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload @@ -10195,10 +11881,10 @@ define @vfnmsub_vf_nxv32f16_unmasked( % ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 ; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: bltu a0, a1, .LBB268_2 +; ZVFHMIN-NEXT: bltu a0, a1, .LBB304_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a0, a1 -; ZVFHMIN-NEXT: .LBB268_2: +; ZVFHMIN-NEXT: .LBB304_2: ; ZVFHMIN-NEXT: csrr a1, vlenb ; ZVFHMIN-NEXT: slli a1, a1, 4 ; ZVFHMIN-NEXT: add a1, sp, a1 @@ -10206,8 +11892,10 @@ define @vfnmsub_vf_nxv32f16_unmasked( % ; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v24 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: li a2, 24 -; ZVFHMIN-NEXT: mul a1, a1, a2 +; ZVFHMIN-NEXT: slli a1, a1, 3 +; ZVFHMIN-NEXT: mv a2, a1 +; ZVFHMIN-NEXT: slli a1, a1, 1 +; ZVFHMIN-NEXT: add a1, a1, a2 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 ; ZVFHMIN-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload @@ -10253,8 +11941,10 @@ define @vfnmsub_vf_nxv32f16_unmasked_commute( @vfnmsub_vf_nxv32f16_unmasked_commute( @vfnmsub_vf_nxv32f16_unmasked_commute( @vfnmsub_vf_nxv32f16_unmasked_commute( @vfnmsub_vf_nxv32f16_neg_splat( ; ZVFHMIN-NEXT: sub sp, sp, a1 ; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: li a2, 24 
-; ZVFHMIN-NEXT: mul a1, a1, a2 +; ZVFHMIN-NEXT: slli a1, a1, 3 +; ZVFHMIN-NEXT: mv a2, a1 +; ZVFHMIN-NEXT: slli a1, a1, 1 +; ZVFHMIN-NEXT: add a1, a1, a2 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 ; ZVFHMIN-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill @@ -10393,13 +12089,15 @@ define @vfnmsub_vf_nxv32f16_neg_splat( ; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8 ; ZVFHMIN-NEXT: mv a3, a0 -; ZVFHMIN-NEXT: bltu a0, a2, .LBB270_2 +; ZVFHMIN-NEXT: bltu a0, a2, .LBB306_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a3, a2 -; ZVFHMIN-NEXT: .LBB270_2: +; ZVFHMIN-NEXT: .LBB306_2: ; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: li a5, 24 -; ZVFHMIN-NEXT: mul a4, a4, a5 +; ZVFHMIN-NEXT: slli a4, a4, 3 +; ZVFHMIN-NEXT: mv a5, a4 +; ZVFHMIN-NEXT: slli a4, a4, 1 +; ZVFHMIN-NEXT: add a4, a4, a5 ; ZVFHMIN-NEXT: add a4, sp, a4 ; ZVFHMIN-NEXT: addi a4, a4, 16 ; ZVFHMIN-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload @@ -10434,8 +12132,10 @@ define @vfnmsub_vf_nxv32f16_neg_splat( ; ZVFHMIN-NEXT: vsetvli a2, zero, e8, mf2, ta, ma ; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a1 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: li a2, 24 -; ZVFHMIN-NEXT: mul a1, a1, a2 +; ZVFHMIN-NEXT: slli a1, a1, 3 +; ZVFHMIN-NEXT: mv a2, a1 +; ZVFHMIN-NEXT: slli a1, a1, 1 +; ZVFHMIN-NEXT: add a1, a1, a2 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 ; ZVFHMIN-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload @@ -10483,13 +12183,18 @@ define @vfnmsub_vf_nxv32f16_neg_splat_commute( @vfnmsub_vf_nxv32f16_neg_splat_commute( @vfnmsub_vf_nxv32f16_neg_splat_commute( @vfnmsub_vf_nxv32f16_neg_splat_commute( @vfnmsub_vf_nxv32f16_neg_splat_unmasked( @vfnmsub_vf_nxv32f16_neg_splat_unmasked( @vfnmsub_vf_nxv32f16_neg_splat_unmasked( @vfnmsub_vf_nxv32f16_neg_splat_unmasked( @vfnmsub_vf_nxv32f16_neg_splat_unmasked_commute( @vfnmsub_vf_nxv32f16_neg_splat_unmasked_commute( @vfnmsub_vf_nxv32f16_neg_splat_unmasked_commute( @vfnmsub_vf_nxv32f16_neg_splat_unmasked_commute( @vfmadd_vv_nxv1bf16( %va, %vb, %vc) strictfp { +; CHECK-LABEL: vfmadd_vv_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfmadd.vv v9, v10, v11 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %vd = call @llvm.experimental.constrained.fma.nxv1bf16( %va, %vb, %vc, metadata !"round.dynamic", metadata !"fpexcept.strict") + ret %vd +} + +define @vfmadd_vf_nxv1bf16( %va, %vb, bfloat %c) strictfp { +; CHECK-LABEL: vfmadd_vf_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vmv.v.x v10, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfmadd.vv v12, v9, v11 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %c, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vd = call @llvm.experimental.constrained.fma.nxv1bf16( %va, %splat, %vb, metadata !"round.dynamic", metadata !"fpexcept.strict") + ret %vd +} + + +define @vfmadd_vv_nxv2bf16( %va, %vb, %vc) strictfp { +; CHECK-LABEL: vfmadd_vv_nxv2bf16: +; CHECK: # %bb.0: 
+; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vfmadd.vv v10, v9, v11 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: ret + %vd = call @llvm.experimental.constrained.fma.nxv2bf16( %va, %vc, %vb, metadata !"round.dynamic", metadata !"fpexcept.strict") + ret %vd +} + +define @vfmadd_vf_nxv2bf16( %va, %vb, bfloat %c) strictfp { +; CHECK-LABEL: vfmadd_vf_nxv2bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vfmadd.vv v9, v8, v11 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %c, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vd = call @llvm.experimental.constrained.fma.nxv2bf16( %vb, %splat, %va, metadata !"round.dynamic", metadata !"fpexcept.strict") + ret %vd +} + + +define @vfmadd_vv_nxv4bf16( %va, %vb, %vc) strictfp { +; CHECK-LABEL: vfmadd_vv_nxv4bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfmadd.vv v14, v10, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v14 +; CHECK-NEXT: ret + %vd = call @llvm.experimental.constrained.fma.nxv4bf16( %vb, %va, %vc, metadata !"round.dynamic", metadata !"fpexcept.strict") + ret %vd +} + +define @vfmadd_vf_nxv4bf16( %va, %vb, bfloat %c) strictfp { +; CHECK-LABEL: vfmadd_vf_nxv4bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vmv.v.x v10, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfmadd.vv v16, v14, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %c, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vd = call @llvm.experimental.constrained.fma.nxv4bf16( %va, %splat, %vb, metadata !"round.dynamic", metadata !"fpexcept.strict") + ret %vd +} + + +define @vfmadd_vv_nxv8bf16( %va, %vb, %vc) strictfp { +; CHECK-LABEL: vfmadd_vv_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v20, v12 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vfmadd.vv v12, v20, v16 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: ret + %vd = call @llvm.experimental.constrained.fma.nxv8bf16( %vb, %vc, %va, metadata !"round.dynamic", metadata !"fpexcept.strict") + ret %vd +} + +define @vfmadd_vf_nxv8bf16( %va, %vb, bfloat %c) strictfp { +; CHECK-LABEL: vfmadd_vf_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, 
ta, ma +; CHECK-NEXT: vmv.v.x v12, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v20, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vfmadd.vv v24, v20, v16 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %c, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vd = call @llvm.experimental.constrained.fma.nxv8bf16( %vb, %splat, %va, metadata !"round.dynamic", metadata !"fpexcept.strict") + ret %vd +} + + +define @vfmadd_vv_nxv16bf16( %va, %vb, %vc) strictfp { +; CHECK-LABEL: vfmadd_vv_nxv16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs4r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl4r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfmadd.vv v24, v0, v16 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %vd = call @llvm.experimental.constrained.fma.nxv16bf16( %vc, %va, %vb, metadata !"round.dynamic", metadata !"fpexcept.strict") + ret %vd +} + +define @vfmadd_vf_nxv16bf16( %va, %vb, bfloat %c) strictfp { +; CHECK-LABEL: vfmadd_vf_nxv16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vmv.v.x v16, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs4r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8 +; CHECK-NEXT: vl4r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfmadd.vv v16, v0, v24 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %c, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vd = call @llvm.experimental.constrained.fma.nxv16bf16( %va, %splat, %vb, metadata !"round.dynamic", metadata !"fpexcept.strict") + ret %vd +} + + +define @vfmadd_vv_nxv32bf16( %va, %vb, %vc) strictfp { +; CHECK-LABEL: vfmadd_vv_nxv32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 5 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 
0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb +; CHECK-NEXT: vl8re16.v v0, (a0) +; CHECK-NEXT: vmv8r.v v24, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vmv8r.v v16, v8 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 24 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vmv8r.v v8, v0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 24 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfmadd.vv v0, v16, v24 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 24 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 24 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfmadd.vv v16, v8, v24 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v0 +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 5 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %vd = call @llvm.experimental.constrained.fma.nxv32bf16( %vc, %vb, %va, metadata !"round.dynamic", metadata !"fpexcept.strict") + ret %vd +} + +define @vfmadd_vf_nxv32bf16( %va, %vb, bfloat %c) strictfp { +; CHECK-LABEL: vfmadd_vf_nxv32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 24 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: 
.cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb +; CHECK-NEXT: vmv8r.v v24, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma +; CHECK-NEXT: vmv.v.x v16, a0 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v24 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfmadd.vv v0, v16, v24 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v20 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v28 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfmadd.vv v16, v8, v24 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v0 +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 24 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %c, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vd = call @llvm.experimental.constrained.fma.nxv32bf16( %vb, %splat, %va, metadata !"round.dynamic", metadata !"fpexcept.strict") + ret %vd +} + declare @llvm.experimental.constrained.fma.nxv1f16(, , , metadata, metadata) define @vfmadd_vv_nxv1f16( %va, %vb, %vc) strictfp { diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmadd-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfmadd-sdnode.ll index a80a943c2e1dbe..2df2212c43db09 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfmadd-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfmadd-sdnode.ll @@ -1,16 +1,573 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: 
llc -mtriple=riscv32 -mattr=+m,+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN -; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zfhmin,+zvfhmin,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN ; This tests a mix of vfmacc and vfmadd by using different operand orders to ; trigger commuting in TwoAddressInstructionPass. +define @vfmadd_vv_nxv1bf16( %va, %vb, %vc) { +; CHECK-LABEL: vfmadd_vv_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfmadd.vv v12, v10, v11 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: ret + %vd = call @llvm.fma.v1bf16( %va, %vb, %vc) + ret %vd +} + +define @vfmadd_vv_nxv1bf16_commuted( %va, %vb, %vc) { +; CHECK-LABEL: vfmadd_vv_nxv1bf16_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfmadd.vv v9, v8, v11 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %vd = call @llvm.fma.v1bf16( %vb, %vc, %va) + ret %vd +} + +define @vfmadd_vf_nxv1bf16( %va, %vb, bfloat %c) { +; CHECK-LABEL: vfmadd_vf_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vmv.v.x v10, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfmadd.vv v12, v9, v11 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %c, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vd = call @llvm.fma.v1bf16( %va, %splat, %vb) + ret %vd +} + +declare @llvm.fma.v2bf16(, , ) + +define @vfmadd_vv_nxv2bf16( %va, %vb, %vc) { +; CHECK-LABEL: vfmadd_vv_nxv2bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vfmadd.vv v12, v9, v11 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: ret + %vd = call @llvm.fma.v2bf16( %va, %vc, %vb) + ret %vd +} + +define @vfmadd_vf_nxv2bf16( %va, %vb, bfloat %c) { +; CHECK-LABEL: vfmadd_vf_nxv2bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vfmadd.vv v9, v8, v11 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %head = 
insertelement poison, bfloat %c, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vd = call @llvm.fma.v2bf16( %vb, %splat, %va) + ret %vd +} + +declare @llvm.fma.v4bf16(, , ) + +define @vfmadd_vv_nxv4bf16( %va, %vb, %vc) { +; CHECK-LABEL: vfmadd_vv_nxv4bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfmadd.vv v14, v10, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v14 +; CHECK-NEXT: ret + %vd = call @llvm.fma.v4bf16( %vb, %va, %vc) + ret %vd +} + +define @vfmadd_vf_nxv4bf16( %va, %vb, bfloat %c) { +; CHECK-LABEL: vfmadd_vf_nxv4bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vmv.v.x v10, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfmadd.vv v16, v14, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %c, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vd = call @llvm.fma.v4bf16( %va, %splat, %vb) + ret %vd +} + +declare @llvm.fma.v8bf16(, , ) + +define @vfmadd_vv_nxv8bf16( %va, %vb, %vc) { +; CHECK-LABEL: vfmadd_vv_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v20, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vfmadd.vv v24, v20, v16 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24 +; CHECK-NEXT: ret + %vd = call @llvm.fma.v8bf16( %vb, %vc, %va) + ret %vd +} + +define @vfmadd_vf_nxv8bf16( %va, %vb, bfloat %c) { +; CHECK-LABEL: vfmadd_vf_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v12, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v20, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vfmadd.vv v24, v20, v16 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %c, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vd = call @llvm.fma.v8bf16( %vb, %splat, %va) + ret %vd +} + +declare @llvm.fma.v16bf16(, , ) + +define @vfmadd_vv_nxv16bf16( %va, %vb, %vc) { +; CHECK-LABEL: vfmadd_vv_nxv16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v16 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfmadd.vv v16, v0, v24 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: ret + %vd = call @llvm.fma.v16bf16( %vc, %va, %vb) + ret %vd +} + +define @vfmadd_vf_nxv16bf16( %va, %vb, bfloat %c) { +; CHECK-LABEL: vfmadd_vf_nxv16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: sub sp, sp, a0 +; 
CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vmv.v.x v16, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs4r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8 +; CHECK-NEXT: vl4r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfmadd.vv v16, v0, v24 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %c, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vd = call @llvm.fma.v16bf16( %va, %splat, %vb) + ret %vd +} + +declare @llvm.fma.v32bf16(, , ) + +define @vfmadd_vv_nxv32bf16( %va, %vb, %vc) { +; ZVFH-LABEL: vfmadd_vv_nxv32bf16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: addi sp, sp, -16 +; ZVFH-NEXT: .cfi_def_cfa_offset 16 +; ZVFH-NEXT: csrr a1, vlenb +; ZVFH-NEXT: slli a1, a1, 5 +; ZVFH-NEXT: sub sp, sp, a1 +; ZVFH-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb +; ZVFH-NEXT: vl8re16.v v0, (a0) +; ZVFH-NEXT: vmv8r.v v24, v16 +; ZVFH-NEXT: csrr a0, vlenb +; ZVFH-NEXT: slli a0, a0, 4 +; ZVFH-NEXT: add a0, sp, a0 +; ZVFH-NEXT: addi a0, a0, 16 +; ZVFH-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; ZVFH-NEXT: vmv8r.v v16, v8 +; ZVFH-NEXT: addi a0, sp, 16 +; ZVFH-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; ZVFH-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFH-NEXT: vfwcvtbf16.f.f.v v8, v16 +; ZVFH-NEXT: csrr a0, vlenb +; ZVFH-NEXT: slli a0, a0, 3 +; ZVFH-NEXT: mv a1, a0 +; ZVFH-NEXT: slli a0, a0, 1 +; ZVFH-NEXT: add a0, a0, a1 +; ZVFH-NEXT: add a0, sp, a0 +; ZVFH-NEXT: addi a0, a0, 16 +; ZVFH-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; ZVFH-NEXT: vfwcvtbf16.f.f.v v8, v24 +; ZVFH-NEXT: csrr a0, vlenb +; ZVFH-NEXT: slli a0, a0, 3 +; ZVFH-NEXT: add a0, sp, a0 +; ZVFH-NEXT: addi a0, a0, 16 +; ZVFH-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; ZVFH-NEXT: vmv8r.v v8, v0 +; ZVFH-NEXT: vfwcvtbf16.f.f.v v0, v8 +; ZVFH-NEXT: csrr a0, vlenb +; ZVFH-NEXT: slli a0, a0, 3 +; ZVFH-NEXT: mv a1, a0 +; ZVFH-NEXT: slli a0, a0, 1 +; ZVFH-NEXT: add a0, a0, a1 +; ZVFH-NEXT: add a0, sp, a0 +; ZVFH-NEXT: addi a0, a0, 16 +; ZVFH-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; ZVFH-NEXT: csrr a0, vlenb +; ZVFH-NEXT: slli a0, a0, 3 +; ZVFH-NEXT: add a0, sp, a0 +; ZVFH-NEXT: addi a0, a0, 16 +; ZVFH-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFH-NEXT: vfmadd.vv v0, v16, v24 +; ZVFH-NEXT: addi a0, sp, 16 +; ZVFH-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFH-NEXT: vfwcvtbf16.f.f.v v24, v20 +; ZVFH-NEXT: csrr a0, vlenb +; ZVFH-NEXT: slli a0, a0, 3 +; ZVFH-NEXT: add a0, sp, a0 +; ZVFH-NEXT: addi a0, a0, 16 +; ZVFH-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; ZVFH-NEXT: csrr a0, vlenb +; ZVFH-NEXT: slli a0, a0, 4 +; ZVFH-NEXT: add a0, sp, a0 +; ZVFH-NEXT: addi a0, a0, 16 +; ZVFH-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; ZVFH-NEXT: vfwcvtbf16.f.f.v v24, v20 +; ZVFH-NEXT: csrr a0, vlenb +; 
ZVFH-NEXT: slli a0, a0, 3 +; ZVFH-NEXT: mv a1, a0 +; ZVFH-NEXT: slli a0, a0, 1 +; ZVFH-NEXT: add a0, a0, a1 +; ZVFH-NEXT: add a0, sp, a0 +; ZVFH-NEXT: addi a0, a0, 16 +; ZVFH-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v12 +; ZVFH-NEXT: csrr a0, vlenb +; ZVFH-NEXT: slli a0, a0, 3 +; ZVFH-NEXT: mv a1, a0 +; ZVFH-NEXT: slli a0, a0, 1 +; ZVFH-NEXT: add a0, a0, a1 +; ZVFH-NEXT: add a0, sp, a0 +; ZVFH-NEXT: addi a0, a0, 16 +; ZVFH-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; ZVFH-NEXT: csrr a0, vlenb +; ZVFH-NEXT: slli a0, a0, 3 +; ZVFH-NEXT: add a0, sp, a0 +; ZVFH-NEXT: addi a0, a0, 16 +; ZVFH-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFH-NEXT: vfmadd.vv v16, v8, v24 +; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v0 +; ZVFH-NEXT: vfncvtbf16.f.f.w v12, v16 +; ZVFH-NEXT: csrr a0, vlenb +; ZVFH-NEXT: slli a0, a0, 5 +; ZVFH-NEXT: add sp, sp, a0 +; ZVFH-NEXT: addi sp, sp, 16 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfmadd_vv_nxv32bf16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: addi sp, sp, -16 +; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-NEXT: csrr a1, vlenb +; ZVFHMIN-NEXT: slli a1, a1, 5 +; ZVFHMIN-NEXT: sub sp, sp, a1 +; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb +; ZVFHMIN-NEXT: vl8re16.v v0, (a0) +; ZVFHMIN-NEXT: vmv8r.v v24, v16 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 4 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vmv8r.v v16, v8 +; ZVFHMIN-NEXT: addi a0, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v8, v16 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: li a1, 24 +; ZVFHMIN-NEXT: mul a0, a0, a1 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v8, v24 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vmv8r.v v8, v0 +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v0, v8 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: li a1, 24 +; ZVFHMIN-NEXT: mul a0, a0, a1 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfmadd.vv v0, v16, v24 +; ZVFHMIN-NEXT: addi a0, sp, 16 +; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v24, v20 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 4 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v24, v20 +; 
ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: li a1, 24 +; ZVFHMIN-NEXT: mul a0, a0, a1 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v12 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: li a1, 24 +; ZVFHMIN-NEXT: mul a0, a0, a1 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfmadd.vv v16, v8, v24 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v0 +; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v12, v16 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 5 +; ZVFHMIN-NEXT: add sp, sp, a0 +; ZVFHMIN-NEXT: addi sp, sp, 16 +; ZVFHMIN-NEXT: ret + %vd = call @llvm.fma.v32bf16( %vc, %vb, %va) + ret %vd +} + +define @vfmadd_vf_nxv32bf16( %va, %vb, bfloat %c) { +; ZVFH-LABEL: vfmadd_vf_nxv32bf16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: addi sp, sp, -16 +; ZVFH-NEXT: .cfi_def_cfa_offset 16 +; ZVFH-NEXT: csrr a0, vlenb +; ZVFH-NEXT: slli a0, a0, 3 +; ZVFH-NEXT: mv a1, a0 +; ZVFH-NEXT: slli a0, a0, 1 +; ZVFH-NEXT: add a0, a0, a1 +; ZVFH-NEXT: sub sp, sp, a0 +; ZVFH-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb +; ZVFH-NEXT: vmv8r.v v24, v16 +; ZVFH-NEXT: csrr a0, vlenb +; ZVFH-NEXT: slli a0, a0, 3 +; ZVFH-NEXT: add a0, sp, a0 +; ZVFH-NEXT: addi a0, a0, 16 +; ZVFH-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; ZVFH-NEXT: fmv.x.h a0, fa0 +; ZVFH-NEXT: vsetvli a1, zero, e16, m8, ta, ma +; ZVFH-NEXT: vmv.v.x v16, a0 +; ZVFH-NEXT: csrr a0, vlenb +; ZVFH-NEXT: slli a0, a0, 4 +; ZVFH-NEXT: add a0, sp, a0 +; ZVFH-NEXT: addi a0, a0, 16 +; ZVFH-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; ZVFH-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v8 +; ZVFH-NEXT: addi a0, sp, 16 +; ZVFH-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v24 +; ZVFH-NEXT: csrr a0, vlenb +; ZVFH-NEXT: slli a0, a0, 4 +; ZVFH-NEXT: add a0, sp, a0 +; ZVFH-NEXT: addi a0, a0, 16 +; ZVFH-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; ZVFH-NEXT: vfwcvtbf16.f.f.v v0, v24 +; ZVFH-NEXT: addi a0, sp, 16 +; ZVFH-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFH-NEXT: vfmadd.vv v0, v16, v24 +; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v12 +; ZVFH-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; ZVFH-NEXT: csrr a0, vlenb +; ZVFH-NEXT: slli a0, a0, 3 +; ZVFH-NEXT: add a0, sp, a0 +; ZVFH-NEXT: addi a0, a0, 16 +; ZVFH-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; ZVFH-NEXT: vfwcvtbf16.f.f.v v8, v20 +; ZVFH-NEXT: csrr a0, vlenb +; ZVFH-NEXT: slli a0, a0, 4 +; ZVFH-NEXT: add a0, sp, a0 +; ZVFH-NEXT: addi a0, a0, 16 +; ZVFH-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; ZVFH-NEXT: vfwcvtbf16.f.f.v v24, v20 +; ZVFH-NEXT: addi a0, sp, 16 +; ZVFH-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFH-NEXT: vfmadd.vv v24, v8, v16 +; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v0 +; ZVFH-NEXT: 
vfncvtbf16.f.f.w v12, v24 +; ZVFH-NEXT: csrr a0, vlenb +; ZVFH-NEXT: slli a0, a0, 3 +; ZVFH-NEXT: mv a1, a0 +; ZVFH-NEXT: slli a0, a0, 1 +; ZVFH-NEXT: add a0, a0, a1 +; ZVFH-NEXT: add sp, sp, a0 +; ZVFH-NEXT: addi sp, sp, 16 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfmadd_vf_nxv32bf16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: addi sp, sp, -16 +; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: li a1, 24 +; ZVFHMIN-NEXT: mul a0, a0, a1 +; ZVFHMIN-NEXT: sub sp, sp, a0 +; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb +; ZVFHMIN-NEXT: vmv8r.v v24, v16 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: fmv.x.h a0, fa0 +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v16, a0 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 4 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v8 +; ZVFHMIN-NEXT: addi a0, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v24 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 4 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v0, v24 +; ZVFHMIN-NEXT: addi a0, sp, 16 +; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfmadd.vv v0, v16, v24 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v12 +; ZVFHMIN-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v8, v20 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 4 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v24, v20 +; ZVFHMIN-NEXT: addi a0, sp, 16 +; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfmadd.vv v24, v8, v16 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v0 +; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v12, v24 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: li a1, 24 +; ZVFHMIN-NEXT: mul a0, a0, a1 +; ZVFHMIN-NEXT: add sp, sp, a0 +; ZVFHMIN-NEXT: addi sp, sp, 16 +; ZVFHMIN-NEXT: ret + %head = insertelement poison, bfloat %c, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vd = call @llvm.fma.v32bf16( %vb, %splat, %va) + ret %vd +} + declare @llvm.fma.v1f16(, , ) define @vfmadd_vv_nxv1f16( %va, %vb, %vc) { diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmax-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfmax-sdnode.ll index caf37b7a0a12bd..b5604add6d25bd 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfmax-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfmax-sdnode.ll @@ -1,12 +1,243 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; 
RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFHMIN +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFHMIN + +define @vfmax_nxv1bf16_vv( %a, %b) { +; CHECK-LABEL: vfmax_nxv1bf16_vv: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfmax.vv v9, v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %v = call @llvm.maxnum.nxv1bf16( %a, %b) + ret %v +} + +define @vfmax_nxv1bf16_vf( %a, bfloat %b) { +; CHECK-LABEL: vfmax_nxv1bf16_vf: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vmv.v.x v9, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfmax.vv v9, v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %v = call @llvm.maxnum.nxv1bf16( %a, %splat) + ret %v +} + +declare @llvm.maxnum.nxv2bf16(, ) + +define @vfmax_nxv2bf16_vv( %a, %b) { +; CHECK-LABEL: vfmax_nxv2bf16_vv: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vfmax.vv v9, v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %v = call @llvm.maxnum.nxv2bf16( %a, %b) + ret %v +} + +define @vfmax_nxv2bf16_vf( %a, bfloat %b) { +; CHECK-LABEL: vfmax_nxv2bf16_vf: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vmv.v.x v9, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vfmax.vv v9, v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = 
shufflevector %head, poison, zeroinitializer + %v = call @llvm.maxnum.nxv2bf16( %a, %splat) + ret %v +} + +declare @llvm.maxnum.nxv4bf16(, ) + +define @vfmax_nxv4bf16_vv( %a, %b) { +; CHECK-LABEL: vfmax_nxv4bf16_vv: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfmax.vv v10, v12, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: ret + %v = call @llvm.maxnum.nxv4bf16( %a, %b) + ret %v +} + +define @vfmax_nxv4bf16_vf( %a, bfloat %b) { +; CHECK-LABEL: vfmax_nxv4bf16_vf: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vmv.v.x v9, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfmax.vv v10, v10, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %v = call @llvm.maxnum.nxv4bf16( %a, %splat) + ret %v +} + +declare @llvm.maxnum.nxv8bf16(, ) + +define @vfmax_nxv8bf16_vv( %a, %b) { +; CHECK-LABEL: vfmax_nxv8bf16_vv: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vfmax.vv v12, v16, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: ret + %v = call @llvm.maxnum.nxv8bf16( %a, %b) + ret %v +} + +define @vfmax_nxv8bf16_vf( %a, bfloat %b) { +; CHECK-LABEL: vfmax_nxv8bf16_vf: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vfmax.vv v12, v12, v16 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %v = call @llvm.maxnum.nxv8bf16( %a, %splat) + ret %v +} + +declare @llvm.maxnum.nxv16bf16(, ) + +define @vfmax_nxv16bf16_vv( %a, %b) { +; CHECK-LABEL: vfmax_nxv16bf16_vv: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfmax.vv v16, v24, v16 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: ret + %v = call @llvm.maxnum.nxv16bf16( %a, %b) + ret %v +} + +define @vfmax_nxv16bf16_vf( %a, bfloat %b) { +; CHECK-LABEL: vfmax_nxv16bf16_vf: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vmv.v.x v12, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfmax.vv v16, v16, v24 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, 
zeroinitializer + %v = call @llvm.maxnum.nxv16bf16( %a, %splat) + ret %v +} + +declare @llvm.maxnum.nxv32bf16(, ) + +define @vfmax_nxv32bf16_vv( %a, %b) { +; CHECK-LABEL: vfmax_nxv32bf16_vv: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16 +; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfmax.vv v24, v0, v24 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfmax.vv v16, v16, v24 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: ret + %v = call @llvm.maxnum.nxv32bf16( %a, %b) + ret %v +} + +define @vfmax_nxv32bf16_vf( %a, bfloat %b) { +; CHECK-LABEL: vfmax_nxv32bf16_vf: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma +; CHECK-NEXT: vmv.v.x v16, a0 +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v16 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfmax.vv v24, v24, v0 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v20 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfmax.vv v16, v24, v0 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %v = call @llvm.maxnum.nxv32bf16( %a, %splat) + ret %v +} declare @llvm.maxnum.nxv1f16(, ) diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmax-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfmax-vp.ll index 7ab999ea4fa7ee..6e38881b4d60fb 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfmax-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfmax-vp.ll @@ -1,13 +1,278 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFHMIN +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ +; RUN: 
-target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFHMIN + +declare @llvm.vp.maxnum.nxv1bf16(, , , i32) + +define @vfmax_vv_nxv1bf16( %va, %vb, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfmax_vv_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vfmax.vv v9, v9, v10, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %v = call @llvm.vp.maxnum.nxv1bf16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vfmax_vv_nxv1bf16_unmasked( %va, %vb, i32 zeroext %evl) { +; CHECK-LABEL: vfmax_vv_nxv1bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vfmax.vv v9, v9, v10 +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %v = call @llvm.vp.maxnum.nxv1bf16( %va, %vb, splat (i1 true), i32 %evl) + ret %v +} + +declare @llvm.vp.maxnum.nxv2bf16(, , , i32) + +define @vfmax_vv_nxv2bf16( %va, %vb, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfmax_vv_nxv2bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfmax.vv v9, v9, v10, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %v = call @llvm.vp.maxnum.nxv2bf16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vfmax_vv_nxv2bf16_unmasked( %va, %vb, i32 zeroext %evl) { +; CHECK-LABEL: vfmax_vv_nxv2bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfmax.vv v9, v9, v10 +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %v = call @llvm.vp.maxnum.nxv2bf16( %va, %vb, splat (i1 true), i32 %evl) + ret %v +} + +declare @llvm.vp.maxnum.nxv4bf16(, , , i32) + +define @vfmax_vv_nxv4bf16( %va, %vb, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfmax_vv_nxv4bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vfmax.vv v10, v12, v10, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: ret + %v = call @llvm.vp.maxnum.nxv4bf16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vfmax_vv_nxv4bf16_unmasked( %va, %vb, i32 zeroext %evl) { +; CHECK-LABEL: vfmax_vv_nxv4bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vfmax.vv v10, v12, v10 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: ret + %v = call @llvm.vp.maxnum.nxv4bf16( %va, %vb, splat (i1 true), i32 %evl) + ret %v +} + +declare @llvm.vp.maxnum.nxv8bf16(, , , i32) +define @vfmax_vv_nxv8bf16( %va, %vb, %m, i32 zeroext %evl) { +; 
CHECK-LABEL: vfmax_vv_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vfmax.vv v12, v16, v12, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: ret + %v = call @llvm.vp.maxnum.nxv8bf16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vfmax_vv_nxv8bf16_unmasked( %va, %vb, i32 zeroext %evl) { +; CHECK-LABEL: vfmax_vv_nxv8bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vfmax.vv v12, v16, v12 +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: ret + %v = call @llvm.vp.maxnum.nxv8bf16( %va, %vb, splat (i1 true), i32 %evl) + ret %v +} + +declare @llvm.vp.maxnum.nxv16bf16(, , , i32) + +define @vfmax_vv_nxv16bf16( %va, %vb, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfmax_vv_nxv16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vfmax.vv v16, v24, v16, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: ret + %v = call @llvm.vp.maxnum.nxv16bf16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vfmax_vv_nxv16bf16_unmasked( %va, %vb, i32 zeroext %evl) { +; CHECK-LABEL: vfmax_vv_nxv16bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vfmax.vv v16, v24, v16 +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: ret + %v = call @llvm.vp.maxnum.nxv16bf16( %va, %vb, splat (i1 true), i32 %evl) + ret %v +} + +declare @llvm.vp.maxnum.nxv32bf16(, , , i32) + +define @vfmax_vv_nxv32bf16( %va, %vb, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfmax_vv_nxv32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: vmv1r.v v7, v0 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a1, a2, 1 +; CHECK-NEXT: sub a3, a0, a1 +; CHECK-NEXT: sltu a4, a0, a3 +; CHECK-NEXT: addi a4, a4, -1 +; CHECK-NEXT: and a3, a4, a3 +; CHECK-NEXT: srli a2, a2, 2 +; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v0, v0, a2 +; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; CHECK-NEXT: vfmax.vv v16, v16, v24, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: bltu a0, a1, .LBB10_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a0, a1 +; CHECK-NEXT: .LBB10_2: +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: 
vfwcvtbf16.f.f.v v16, v24 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 +; CHECK-NEXT: vmv1r.v v0, v7 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vfmax.vv v16, v24, v16, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %v = call @llvm.vp.maxnum.nxv32bf16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vfmax_vv_nxv32bf16_unmasked( %va, %vb, i32 zeroext %evl) { +; CHECK-LABEL: vfmax_vv_nxv32bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a1, a2, 1 +; CHECK-NEXT: sub a3, a0, a1 +; CHECK-NEXT: sltu a4, a0, a3 +; CHECK-NEXT: addi a4, a4, -1 +; CHECK-NEXT: and a3, a4, a3 +; CHECK-NEXT: srli a2, a2, 2 +; CHECK-NEXT: vsetvli a4, zero, e8, m4, ta, ma +; CHECK-NEXT: vmset.m v24 +; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v0, v24, a2 +; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; CHECK-NEXT: vfmax.vv v16, v16, v24, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: bltu a0, a1, .LBB11_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a0, a1 +; CHECK-NEXT: .LBB11_2: +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vfmax.vv v16, v24, v16 +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %v = call @llvm.vp.maxnum.nxv32bf16( %va, %vb, splat (i1 true), i32 %evl) + ret %v +} declare @llvm.vp.maxnum.nxv1f16(, , , i32) define @vfmax_vv_nxv1f16( %va, %vb, %m, i32 zeroext %evl) { @@ -264,10 +529,10 @@ define @vfmax_vv_nxv32f16( %va, @vfmax_vv_nxv32f16_unmasked( %va ; ZVFHMIN-NEXT: vfmax.vv v16, v16, v24, v0.t ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 -; ZVFHMIN-NEXT: bltu a0, a1, .LBB11_2 +; ZVFHMIN-NEXT: bltu a0, a1, .LBB23_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a0, a1 -; ZVFHMIN-NEXT: .LBB11_2: +; ZVFHMIN-NEXT: .LBB23_2: ; ZVFHMIN-NEXT: addi a1, sp, 16 ; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24 diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmin-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfmin-sdnode.ll index b47e14f4f26be6..9212ddab5b1ebf 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfmin-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfmin-sdnode.ll @@ -1,12 +1,243 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc 
-mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFHMIN +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFHMIN + +define @vfmin_nxv1bf16_vv( %a, %b) { +; CHECK-LABEL: vfmin_nxv1bf16_vv: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfmin.vv v9, v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %v = call @llvm.minnum.nxv1bf16( %a, %b) + ret %v +} + +define @vfmin_nxv1bf16_vf( %a, bfloat %b) { +; CHECK-LABEL: vfmin_nxv1bf16_vf: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vmv.v.x v9, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfmin.vv v9, v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %v = call @llvm.minnum.nxv1bf16( %a, %splat) + ret %v +} + +declare @llvm.minnum.nxv2bf16(, ) + +define @vfmin_nxv2bf16_vv( %a, %b) { +; CHECK-LABEL: vfmin_nxv2bf16_vv: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vfmin.vv v9, v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %v = call @llvm.minnum.nxv2bf16( %a, %b) + ret %v +} + +define @vfmin_nxv2bf16_vf( %a, bfloat %b) { +; CHECK-LABEL: vfmin_nxv2bf16_vf: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vmv.v.x v9, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vfmin.vv v9, v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %v = call @llvm.minnum.nxv2bf16( %a, %splat) + ret %v +} + +declare @llvm.minnum.nxv4bf16(, ) + +define 
@vfmin_nxv4bf16_vv( %a, %b) { +; CHECK-LABEL: vfmin_nxv4bf16_vv: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfmin.vv v10, v12, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: ret + %v = call @llvm.minnum.nxv4bf16( %a, %b) + ret %v +} + +define @vfmin_nxv4bf16_vf( %a, bfloat %b) { +; CHECK-LABEL: vfmin_nxv4bf16_vf: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vmv.v.x v9, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfmin.vv v10, v10, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %v = call @llvm.minnum.nxv4bf16( %a, %splat) + ret %v +} + +declare @llvm.minnum.nxv8bf16(, ) + +define @vfmin_nxv8bf16_vv( %a, %b) { +; CHECK-LABEL: vfmin_nxv8bf16_vv: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vfmin.vv v12, v16, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: ret + %v = call @llvm.minnum.nxv8bf16( %a, %b) + ret %v +} + +define @vfmin_nxv8bf16_vf( %a, bfloat %b) { +; CHECK-LABEL: vfmin_nxv8bf16_vf: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vfmin.vv v12, v12, v16 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %v = call @llvm.minnum.nxv8bf16( %a, %splat) + ret %v +} + +declare @llvm.minnum.nxv16bf16(, ) + +define @vfmin_nxv16bf16_vv( %a, %b) { +; CHECK-LABEL: vfmin_nxv16bf16_vv: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfmin.vv v16, v24, v16 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: ret + %v = call @llvm.minnum.nxv16bf16( %a, %b) + ret %v +} + +define @vfmin_nxv16bf16_vf( %a, bfloat %b) { +; CHECK-LABEL: vfmin_nxv16bf16_vf: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vmv.v.x v12, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfmin.vv v16, v16, v24 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %v = call @llvm.minnum.nxv16bf16( %a, %splat) + ret %v +} + +declare @llvm.minnum.nxv32bf16(, ) + +define @vfmin_nxv32bf16_vv( %a, %b) { +; 
CHECK-LABEL: vfmin_nxv32bf16_vv: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16 +; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfmin.vv v24, v0, v24 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfmin.vv v16, v16, v24 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: ret + %v = call @llvm.minnum.nxv32bf16( %a, %b) + ret %v +} + +define @vfmin_nxv32bf16_vf( %a, bfloat %b) { +; CHECK-LABEL: vfmin_nxv32bf16_vf: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma +; CHECK-NEXT: vmv.v.x v16, a0 +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v16 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfmin.vv v24, v24, v0 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v20 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfmin.vv v16, v24, v0 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %v = call @llvm.minnum.nxv32bf16( %a, %splat) + ret %v +} declare @llvm.minnum.nxv1f16(, ) diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmin-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfmin-vp.ll index e928df85b5bb56..f1d6b2100ae980 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfmin-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfmin-vp.ll @@ -1,13 +1,278 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFHMIN +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFHMIN + +declare @llvm.vp.minnum.nxv1bf16(, , , i32) + +define 
@vfmin_vv_nxv1bf16( %va, %vb, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfmin_vv_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vfmin.vv v9, v9, v10, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %v = call @llvm.vp.minnum.nxv1bf16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vfmin_vv_nxv1bf16_unmasked( %va, %vb, i32 zeroext %evl) { +; CHECK-LABEL: vfmin_vv_nxv1bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vfmin.vv v9, v9, v10 +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %v = call @llvm.vp.minnum.nxv1bf16( %va, %vb, splat (i1 true), i32 %evl) + ret %v +} + +declare @llvm.vp.minnum.nxv2bf16(, , , i32) + +define @vfmin_vv_nxv2bf16( %va, %vb, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfmin_vv_nxv2bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfmin.vv v9, v9, v10, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %v = call @llvm.vp.minnum.nxv2bf16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vfmin_vv_nxv2bf16_unmasked( %va, %vb, i32 zeroext %evl) { +; CHECK-LABEL: vfmin_vv_nxv2bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfmin.vv v9, v9, v10 +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %v = call @llvm.vp.minnum.nxv2bf16( %va, %vb, splat (i1 true), i32 %evl) + ret %v +} + +declare @llvm.vp.minnum.nxv4bf16(, , , i32) + +define @vfmin_vv_nxv4bf16( %va, %vb, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfmin_vv_nxv4bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vfmin.vv v10, v12, v10, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: ret + %v = call @llvm.vp.minnum.nxv4bf16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vfmin_vv_nxv4bf16_unmasked( %va, %vb, i32 zeroext %evl) { +; CHECK-LABEL: vfmin_vv_nxv4bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vfmin.vv v10, v12, v10 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: ret + %v = call @llvm.vp.minnum.nxv4bf16( %va, %vb, splat (i1 true), i32 %evl) + ret %v +} + +declare @llvm.vp.minnum.nxv8bf16(, , , i32) +define @vfmin_vv_nxv8bf16( %va, %vb, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfmin_vv_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: 
vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vfmin.vv v12, v16, v12, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: ret + %v = call @llvm.vp.minnum.nxv8bf16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vfmin_vv_nxv8bf16_unmasked( %va, %vb, i32 zeroext %evl) { +; CHECK-LABEL: vfmin_vv_nxv8bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vfmin.vv v12, v16, v12 +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: ret + %v = call @llvm.vp.minnum.nxv8bf16( %va, %vb, splat (i1 true), i32 %evl) + ret %v +} + +declare @llvm.vp.minnum.nxv16bf16(, , , i32) + +define @vfmin_vv_nxv16bf16( %va, %vb, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfmin_vv_nxv16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vfmin.vv v16, v24, v16, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: ret + %v = call @llvm.vp.minnum.nxv16bf16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vfmin_vv_nxv16bf16_unmasked( %va, %vb, i32 zeroext %evl) { +; CHECK-LABEL: vfmin_vv_nxv16bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vfmin.vv v16, v24, v16 +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: ret + %v = call @llvm.vp.minnum.nxv16bf16( %va, %vb, splat (i1 true), i32 %evl) + ret %v +} + +declare @llvm.vp.minnum.nxv32bf16(, , , i32) + +define @vfmin_vv_nxv32bf16( %va, %vb, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfmin_vv_nxv32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: vmv1r.v v7, v0 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a1, a2, 1 +; CHECK-NEXT: sub a3, a0, a1 +; CHECK-NEXT: sltu a4, a0, a3 +; CHECK-NEXT: addi a4, a4, -1 +; CHECK-NEXT: and a3, a4, a3 +; CHECK-NEXT: srli a2, a2, 2 +; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v0, v0, a2 +; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; CHECK-NEXT: vfmin.vv v16, v16, v24, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: bltu a0, a1, .LBB10_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a0, a1 +; CHECK-NEXT: .LBB10_2: +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 +; CHECK-NEXT: vmv1r.v v0, v7 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vfmin.vv 
v16, v24, v16, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %v = call @llvm.vp.minnum.nxv32bf16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vfmin_vv_nxv32bf16_unmasked( %va, %vb, i32 zeroext %evl) { +; CHECK-LABEL: vfmin_vv_nxv32bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a1, a2, 1 +; CHECK-NEXT: sub a3, a0, a1 +; CHECK-NEXT: sltu a4, a0, a3 +; CHECK-NEXT: addi a4, a4, -1 +; CHECK-NEXT: and a3, a4, a3 +; CHECK-NEXT: srli a2, a2, 2 +; CHECK-NEXT: vsetvli a4, zero, e8, m4, ta, ma +; CHECK-NEXT: vmset.m v24 +; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v0, v24, a2 +; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; CHECK-NEXT: vfmin.vv v16, v16, v24, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: bltu a0, a1, .LBB11_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a0, a1 +; CHECK-NEXT: .LBB11_2: +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vfmin.vv v16, v24, v16 +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %v = call @llvm.vp.minnum.nxv32bf16( %va, %vb, splat (i1 true), i32 %evl) + ret %v +} declare @llvm.vp.minnum.nxv1f16(, , , i32) define @vfmin_vv_nxv1f16( %va, %vb, %m, i32 zeroext %evl) { @@ -264,10 +529,10 @@ define @vfmin_vv_nxv32f16( %va, @vfmin_vv_nxv32f16_unmasked( %va ; ZVFHMIN-NEXT: vfmin.vv v16, v16, v24, v0.t ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 -; ZVFHMIN-NEXT: bltu a0, a1, .LBB11_2 +; ZVFHMIN-NEXT: bltu a0, a1, .LBB23_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a0, a1 -; ZVFHMIN-NEXT: .LBB11_2: +; ZVFHMIN-NEXT: .LBB23_2: ; ZVFHMIN-NEXT: addi a1, sp, 16 ; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24 diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmul-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfmul-constrained-sdnode.ll index e82fdf065574f9..999b06ba5a5791 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfmul-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfmul-constrained-sdnode.ll @@ -1,12 +1,239 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s 
--check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFHMIN +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFHMIN + +define @vfmul_vv_nxv1bf16( %va, %vb) strictfp { +; CHECK-LABEL: vfmul_vv_nxv1bf16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfmul.vv v9, v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret +entry: + %vc = call @llvm.experimental.constrained.fmul.nxv1bf16( %va, %vb, metadata !"round.dynamic", metadata !"fpexcept.ignore") + ret %vc +} + +define @vfmul_vf_nxv1bf16( %va, bfloat %b) strictfp { +; CHECK-LABEL: vfmul_vf_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vmv.v.x v9, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfmul.vv v9, v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vc = call @llvm.experimental.constrained.fmul.nxv1bf16( %va, %splat, metadata !"round.dynamic", metadata !"fpexcept.ignore") + ret %vc +} + +define @vfmul_vv_nxv2bf16( %va, %vb) strictfp { +; CHECK-LABEL: vfmul_vv_nxv2bf16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vfmul.vv v9, v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret +entry: + %vc = call @llvm.experimental.constrained.fmul.nxv2bf16( %va, %vb, metadata !"round.dynamic", metadata !"fpexcept.ignore") + ret %vc +} + +define @vfmul_vf_nxv2bf16( %va, bfloat %b) strictfp { +; CHECK-LABEL: vfmul_vf_nxv2bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vmv.v.x v9, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vfmul.vv v9, v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 
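+  ; The insertelement/shufflevector pair is the canonical IR splat idiom; as
+  ; the CHECK lines above show, the scalar bf16 travels through a GPR
+  ; (fmv.x.h + vmv.v.x), since zvfbfmin supplies only conversions and no
+  ; bf16 vector arithmetic or FP-to-vector moves.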
+ %splat = shufflevector %head, poison, zeroinitializer + %vc = call @llvm.experimental.constrained.fmul.nxv2bf16( %va, %splat, metadata !"round.dynamic", metadata !"fpexcept.ignore") + ret %vc +} + +define @vfmul_vv_nxv4bf16( %va, %vb) strictfp { +; CHECK-LABEL: vfmul_vv_nxv4bf16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfmul.vv v10, v12, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: ret +entry: + %vc = call @llvm.experimental.constrained.fmul.nxv4bf16( %va, %vb, metadata !"round.dynamic", metadata !"fpexcept.ignore") + ret %vc +} + +define @vfmul_vf_nxv4bf16( %va, bfloat %b) strictfp { +; CHECK-LABEL: vfmul_vf_nxv4bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vmv.v.x v9, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfmul.vv v10, v10, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vc = call @llvm.experimental.constrained.fmul.nxv4bf16( %va, %splat, metadata !"round.dynamic", metadata !"fpexcept.ignore") + ret %vc +} + +define @vfmul_vv_nxv8bf16( %va, %vb) strictfp { +; CHECK-LABEL: vfmul_vv_nxv8bf16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vfmul.vv v12, v16, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: ret +entry: + %vc = call @llvm.experimental.constrained.fmul.nxv8bf16( %va, %vb, metadata !"round.dynamic", metadata !"fpexcept.ignore") + ret %vc +} + +define @vfmul_vf_nxv8bf16( %va, bfloat %b) strictfp { +; CHECK-LABEL: vfmul_vf_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vfmul.vv v12, v12, v16 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vc = call @llvm.experimental.constrained.fmul.nxv8bf16( %va, %splat, metadata !"round.dynamic", metadata !"fpexcept.ignore") + ret %vc +} + +define @vfmul_vv_nxv16bf16( %va, %vb) strictfp { +; CHECK-LABEL: vfmul_vv_nxv16bf16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfmul.vv v16, v24, v16 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: ret +entry: + %vc = call @llvm.experimental.constrained.fmul.nxv16bf16( %va, %vb, metadata !"round.dynamic", metadata !"fpexcept.ignore") + ret %vc +} + +define @vfmul_vf_nxv16bf16( %va, bfloat %b) strictfp { +; CHECK-LABEL: vfmul_vf_nxv16bf16: 
+; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vmv.v.x v12, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfmul.vv v16, v16, v24 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vc = call @llvm.experimental.constrained.fmul.nxv16bf16( %va, %splat, metadata !"round.dynamic", metadata !"fpexcept.ignore") + ret %vc +} + +define @vfmul_vv_nxv32bf16( %va, %vb) strictfp { +; CHECK-LABEL: vfmul_vv_nxv32bf16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16 +; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfmul.vv v24, v0, v24 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfmul.vv v16, v16, v24 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: ret +entry: + %vc = call @llvm.experimental.constrained.fmul.nxv32bf16( %va, %vb, metadata !"round.dynamic", metadata !"fpexcept.ignore") + ret %vc +} + +define @vfmul_vf_nxv32bf16( %va, bfloat %b) strictfp { +; CHECK-LABEL: vfmul_vf_nxv32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma +; CHECK-NEXT: vmv.v.x v16, a0 +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v16 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfmul.vv v24, v24, v0 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfmul.vv v16, v16, v24 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vc = call @llvm.experimental.constrained.fmul.nxv32bf16( %va, %splat, metadata !"round.dynamic", metadata !"fpexcept.ignore") + ret %vc +} declare @llvm.experimental.constrained.fmul.nxv1f16(, , metadata, metadata) define @vfmul_vv_nxv1f16( %va, %vb) strictfp { diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmul-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfmul-sdnode.ll index 70d664aa50ec4f..2ab04a45c8183d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfmul-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfmul-sdnode.ll @@ -1,12 +1,252 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN -; RUN: llc -mtriple=riscv64 
-mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFHMIN +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFHMIN + +define @vfmul_vv_nxv1bf16( %va, %vb) { +; CHECK-LABEL: vfmul_vv_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfmul.vv v9, v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %vc = fmul %va, %vb + ret %vc +} + +define @vfmul_vf_nxv1bf16( %va, bfloat %b) { +; CHECK-LABEL: vfmul_vf_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vmv.v.x v9, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfmul.vv v9, v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vc = fmul %va, %splat + ret %vc +} + +define @vfmul_vv_nxv2bf16( %va, %vb) { +; CHECK-LABEL: vfmul_vv_nxv2bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vfmul.vv v9, v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %vc = fmul %va, %vb + ret %vc +} + +define @vfmul_vf_nxv2bf16( %va, bfloat %b) { +; CHECK-LABEL: vfmul_vf_nxv2bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vmv.v.x v9, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vfmul.vv v9, v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vc = fmul %va, %splat + ret %vc +} + +define @vfmul_vv_nxv4bf16( %va, %vb) { +; CHECK-LABEL: vfmul_vv_nxv4bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfmul.vv v10, v12, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: ret + %vc = fmul %va, %vb + ret %vc +} + +define @vfmul_vf_nxv4bf16( %va, bfloat 
%b) { +; CHECK-LABEL: vfmul_vf_nxv4bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vmv.v.x v9, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfmul.vv v10, v10, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vc = fmul %va, %splat + ret %vc +} + +define @vfmul_vv_nxv8bf16( %va, %vb) { +; CHECK-LABEL: vfmul_vv_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vfmul.vv v12, v16, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: ret + %vc = fmul %va, %vb + ret %vc +} + +define @vfmul_vf_nxv8bf16( %va, bfloat %b) { +; CHECK-LABEL: vfmul_vf_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vfmul.vv v12, v12, v16 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vc = fmul %va, %splat + ret %vc +} + +define @vfmul_fv_nxv8bf16( %va, bfloat %b) { +; CHECK-LABEL: vfmul_fv_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vfmul.vv v12, v16, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vc = fmul %splat, %va + ret %vc +} + +define @vfmul_vv_nxv16bf16( %va, %vb) { +; CHECK-LABEL: vfmul_vv_nxv16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfmul.vv v16, v24, v16 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: ret + %vc = fmul %va, %vb + ret %vc +} + +define @vfmul_vf_nxv16bf16( %va, bfloat %b) { +; CHECK-LABEL: vfmul_vf_nxv16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vmv.v.x v12, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfmul.vv v16, v16, v24 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vc = fmul %va, %splat + ret %vc +} + +define @vfmul_vv_nxv32bf16( %va, %vb) { +; CHECK-LABEL: vfmul_vv_nxv32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, 
e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16 +; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfmul.vv v24, v0, v24 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfmul.vv v16, v16, v24 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: ret + %vc = fmul %va, %vb + ret %vc +} + +define @vfmul_vf_nxv32bf16( %va, bfloat %b) { +; CHECK-LABEL: vfmul_vf_nxv32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma +; CHECK-NEXT: vmv.v.x v16, a0 +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v16 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfmul.vv v24, v24, v0 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v20 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfmul.vv v16, v24, v0 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vc = fmul %va, %splat + ret %vc +} define @vfmul_vv_nxv1f16( %va, %vb) { ; ZVFH-LABEL: vfmul_vv_nxv1f16: diff --git a/llvm/test/CodeGen/RISCV/rvv/vfsqrt-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfsqrt-constrained-sdnode.ll index 806b817fd6c4a2..9da1e0a576d5b7 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfsqrt-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfsqrt-constrained-sdnode.ll @@ -1,12 +1,110 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFHMIN +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFHMIN + +define @vfsqrt_nxv1bf16( %v) strictfp { +; CHECK-LABEL: vfsqrt_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; 
CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfsqrt.v v9, v9 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %r = call @llvm.experimental.constrained.sqrt.nxv1bf16( %v, metadata !"round.dynamic", metadata !"fpexcept.strict") + ret %r +} + + +define @vfsqrt_nxv2bf16( %v) strictfp { +; CHECK-LABEL: vfsqrt_nxv2bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vfsqrt.v v9, v9 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %r = call @llvm.experimental.constrained.sqrt.nxv2bf16( %v, metadata !"round.dynamic", metadata !"fpexcept.strict") + ret %r +} + + +define @vfsqrt_nxv4bf16( %v) strictfp { +; CHECK-LABEL: vfsqrt_nxv4bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfsqrt.v v10, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: ret + %r = call @llvm.experimental.constrained.sqrt.nxv4bf16( %v, metadata !"round.dynamic", metadata !"fpexcept.strict") + ret %r +} + + +define @vfsqrt_nxv8bf16( %v) strictfp { +; CHECK-LABEL: vfsqrt_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vfsqrt.v v12, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: ret + %r = call @llvm.experimental.constrained.sqrt.nxv8bf16( %v, metadata !"round.dynamic", metadata !"fpexcept.strict") + ret %r +} + + +define @vfsqrt_nxv16bf16( %v) strictfp { +; CHECK-LABEL: vfsqrt_nxv16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfsqrt.v v16, v16 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: ret + %r = call @llvm.experimental.constrained.sqrt.nxv16bf16( %v, metadata !"round.dynamic", metadata !"fpexcept.strict") + ret %r +} + + +define @vfsqrt_nxv32bf16( %v) strictfp { +; CHECK-LABEL: vfsqrt_nxv32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfsqrt.v v16, v16 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfsqrt.v v16, v16 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: ret + %r = call @llvm.experimental.constrained.sqrt.nxv32bf16( %v, metadata !"round.dynamic", metadata !"fpexcept.strict") + ret %r +} declare @llvm.experimental.constrained.sqrt.nxv1f16(, metadata, metadata) diff --git a/llvm/test/CodeGen/RISCV/rvv/vfsqrt-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfsqrt-sdnode.ll index 329a078cd16633..de31a02cd15452 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfsqrt-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfsqrt-sdnode.ll @@ -1,12 +1,105 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 
-mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFHMIN +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFHMIN + +define @vfsqrt_nxv1bf16( %v) { +; CHECK-LABEL: vfsqrt_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfsqrt.v v9, v9 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %r = call @llvm.sqrt.nxv1bf16( %v) + ret %r +} + +define @vfsqrt_nxv2bf16( %v) { +; CHECK-LABEL: vfsqrt_nxv2bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vfsqrt.v v9, v9 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %r = call @llvm.sqrt.nxv2bf16( %v) + ret %r +} + +define @vfsqrt_nxv4bf16( %v) { +; CHECK-LABEL: vfsqrt_nxv4bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfsqrt.v v10, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: ret + %r = call @llvm.sqrt.nxv4bf16( %v) + ret %r +} + +define @vfsqrt_nxv8bf16( %v) { +; CHECK-LABEL: vfsqrt_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vfsqrt.v v12, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: ret + %r = call @llvm.sqrt.nxv8bf16( %v) + ret %r +} + +define @vfsqrt_nxv16bf16( %v) { +; CHECK-LABEL: vfsqrt_nxv16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfsqrt.v v16, v16 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: ret + %r = call @llvm.sqrt.nxv16bf16( %v) + ret %r +} + +define @vfsqrt_nxv32bf16( %v) { +; CHECK-LABEL: vfsqrt_nxv32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m4, 
ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfsqrt.v v16, v16 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfsqrt.v v16, v16 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: ret + %r = call @llvm.sqrt.nxv32bf16( %v) + ret %r +} declare @llvm.sqrt.nxv1f16() diff --git a/llvm/test/CodeGen/RISCV/rvv/vfsqrt-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfsqrt-vp.ll index bd229e0220a4b6..574c2e05263015 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfsqrt-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfsqrt-vp.ll @@ -1,13 +1,236 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFHMIN +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFHMIN + +declare @llvm.vp.sqrt.nxv1bf16(, , i32) + +define @vfsqrt_vv_nxv1bf16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfsqrt_vv_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vfsqrt.v v9, v9, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %v = call @llvm.vp.sqrt.nxv1bf16( %va, %m, i32 %evl) + ret %v +} + +define @vfsqrt_vv_nxv1bf16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vfsqrt_vv_nxv1bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vfsqrt.v v9, v9 +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %v = call @llvm.vp.sqrt.nxv1bf16( %va, splat (i1 true), i32 %evl) + ret %v +} + +declare @llvm.vp.sqrt.nxv2bf16(, , i32) + +define @vfsqrt_vv_nxv2bf16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfsqrt_vv_nxv2bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: 
vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfsqrt.v v9, v9, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %v = call @llvm.vp.sqrt.nxv2bf16( %va, %m, i32 %evl) + ret %v +} + +define @vfsqrt_vv_nxv2bf16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vfsqrt_vv_nxv2bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfsqrt.v v9, v9 +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %v = call @llvm.vp.sqrt.nxv2bf16( %va, splat (i1 true), i32 %evl) + ret %v +} + +declare @llvm.vp.sqrt.nxv4bf16(, , i32) + +define @vfsqrt_vv_nxv4bf16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfsqrt_vv_nxv4bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vfsqrt.v v10, v10, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: ret + %v = call @llvm.vp.sqrt.nxv4bf16( %va, %m, i32 %evl) + ret %v +} + +define @vfsqrt_vv_nxv4bf16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vfsqrt_vv_nxv4bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vfsqrt.v v10, v10 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: ret + %v = call @llvm.vp.sqrt.nxv4bf16( %va, splat (i1 true), i32 %evl) + ret %v +} + +declare @llvm.vp.sqrt.nxv8bf16(, , i32) + +define @vfsqrt_vv_nxv8bf16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfsqrt_vv_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vfsqrt.v v12, v12, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: ret + %v = call @llvm.vp.sqrt.nxv8bf16( %va, %m, i32 %evl) + ret %v +} + +define @vfsqrt_vv_nxv8bf16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vfsqrt_vv_nxv8bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vfsqrt.v v12, v12 +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: ret + %v = call @llvm.vp.sqrt.nxv8bf16( %va, splat (i1 true), i32 %evl) + ret %v +} + +declare @llvm.vp.sqrt.nxv16bf16(, , i32) + +define @vfsqrt_vv_nxv16bf16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfsqrt_vv_nxv16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vfsqrt.v v16, v16, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: ret + %v = call @llvm.vp.sqrt.nxv16bf16( %va, %m, i32 %evl) + ret %v +} + +define @vfsqrt_vv_nxv16bf16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vfsqrt_vv_nxv16bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; 
CHECK-NEXT: vfsqrt.v v16, v16 +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: ret + %v = call @llvm.vp.sqrt.nxv16bf16( %va, splat (i1 true), i32 %evl) + ret %v +} + +declare @llvm.vp.sqrt.nxv32bf16(, , i32) + +define @vfsqrt_vv_nxv32bf16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfsqrt_vv_nxv32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v16, v0 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a1, a2, 1 +; CHECK-NEXT: sub a3, a0, a1 +; CHECK-NEXT: sltu a4, a0, a3 +; CHECK-NEXT: addi a4, a4, -1 +; CHECK-NEXT: and a3, a4, a3 +; CHECK-NEXT: srli a2, a2, 2 +; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v0, v0, a2 +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; CHECK-NEXT: vfsqrt.v v24, v24, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24 +; CHECK-NEXT: bltu a0, a1, .LBB10_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a0, a1 +; CHECK-NEXT: .LBB10_2: +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 +; CHECK-NEXT: vmv1r.v v0, v16 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vfsqrt.v v16, v24, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: ret + %v = call @llvm.vp.sqrt.nxv32bf16( %va, %m, i32 %evl) + ret %v +} +define @vfsqrt_vv_nxv32bf16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vfsqrt_vv_nxv32bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a1, a2, 1 +; CHECK-NEXT: sub a3, a0, a1 +; CHECK-NEXT: sltu a4, a0, a3 +; CHECK-NEXT: addi a4, a4, -1 +; CHECK-NEXT: and a3, a4, a3 +; CHECK-NEXT: srli a2, a2, 2 +; CHECK-NEXT: vsetvli a4, zero, e8, m4, ta, ma +; CHECK-NEXT: vmset.m v16 +; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v0, v16, a2 +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; CHECK-NEXT: vfsqrt.v v16, v16, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: bltu a0, a1, .LBB11_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a0, a1 +; CHECK-NEXT: .LBB11_2: +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vfsqrt.v v16, v16 +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: ret + %v = call @llvm.vp.sqrt.nxv32bf16( %va, splat (i1 true), i32 %evl) + ret %v +} declare @llvm.vp.sqrt.nxv1f16(, , i32) define @vfsqrt_vv_nxv1f16( %va, %m, i32 zeroext %evl) { @@ -245,10 +468,10 @@ define @vfsqrt_vv_nxv32f16( %va, @vfsqrt_vv_nxv32f16_unmasked( %v ; ZVFHMIN-NEXT: vfsqrt.v v16, v16, v0.t ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 -; ZVFHMIN-NEXT: bltu a0, a1, .LBB11_2 +; ZVFHMIN-NEXT: bltu a0, a1, .LBB23_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a0, a1 -; ZVFHMIN-NEXT: .LBB11_2: +; ZVFHMIN-NEXT: .LBB23_2: ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfsqrt.v v16, v16 @@ -537,10 +760,10 @@ define @vfsqrt_vv_nxv16f64( %va, @vfsqrt_vv_nxv16f64_unmasked( @vfsub_vv_nxv1bf16( %va, %vb) strictfp { +; CHECK-LABEL: vfsub_vv_nxv1bf16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfsub.vv v9, v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret +entry: + %vc = call @llvm.experimental.constrained.fsub.nxv1bf16( %va, %vb, metadata !"round.dynamic", metadata !"fpexcept.ignore") + ret %vc +} + +define @vfsub_vf_nxv1bf16( %va, bfloat %b) strictfp { +; CHECK-LABEL: vfsub_vf_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vmv.v.x v9, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfsub.vv v9, v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vc = call @llvm.experimental.constrained.fsub.nxv1bf16( %va, %splat, metadata !"round.dynamic", metadata !"fpexcept.ignore") + ret %vc +} + +define @vfsub_vv_nxv2bf16( %va, %vb) strictfp { +; CHECK-LABEL: vfsub_vv_nxv2bf16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vfsub.vv v9, v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret +entry: + %vc = call @llvm.experimental.constrained.fsub.nxv2bf16( %va, %vb, metadata !"round.dynamic", metadata !"fpexcept.ignore") + ret %vc +} + +define @vfsub_vf_nxv2bf16( %va, bfloat %b) strictfp { +; CHECK-LABEL: vfsub_vf_nxv2bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vmv.v.x v9, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vfsub.vv v9, v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vc = call @llvm.experimental.constrained.fsub.nxv2bf16( %va, %splat, metadata !"round.dynamic", metadata !"fpexcept.ignore") + ret %vc +} + +define @vfsub_vv_nxv4bf16( %va, %vb) strictfp { +; CHECK-LABEL: vfsub_vv_nxv4bf16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfsub.vv v10, v12, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: ret +entry: + %vc = call @llvm.experimental.constrained.fsub.nxv4bf16( %va, %vb, metadata !"round.dynamic", metadata !"fpexcept.ignore") + ret %vc +} + +define @vfsub_vf_nxv4bf16( %va, bfloat %b) strictfp { +; CHECK-LABEL: vfsub_vf_nxv4bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vmv.v.x v9, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfsub.vv v10, v10, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 
0 + %splat = shufflevector %head, poison, zeroinitializer + %vc = call @llvm.experimental.constrained.fsub.nxv4bf16( %va, %splat, metadata !"round.dynamic", metadata !"fpexcept.ignore") + ret %vc +} + +define @vfsub_vv_nxv8bf16( %va, %vb) strictfp { +; CHECK-LABEL: vfsub_vv_nxv8bf16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vfsub.vv v12, v16, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: ret +entry: + %vc = call @llvm.experimental.constrained.fsub.nxv8bf16( %va, %vb, metadata !"round.dynamic", metadata !"fpexcept.ignore") + ret %vc +} + +define @vfsub_vf_nxv8bf16( %va, bfloat %b) strictfp { +; CHECK-LABEL: vfsub_vf_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vfsub.vv v12, v12, v16 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vc = call @llvm.experimental.constrained.fsub.nxv8bf16( %va, %splat, metadata !"round.dynamic", metadata !"fpexcept.ignore") + ret %vc +} + +define @vfsub_fv_nxv8bf16( %va, bfloat %b) strictfp { +; CHECK-LABEL: vfsub_fv_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vfsub.vv v12, v16, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vc = call @llvm.experimental.constrained.fsub.nxv8bf16( %splat, %va, metadata !"round.dynamic", metadata !"fpexcept.ignore") + ret %vc +} + +define @vfsub_vv_nxv16bf16( %va, %vb) strictfp { +; CHECK-LABEL: vfsub_vv_nxv16bf16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfsub.vv v16, v24, v16 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: ret +entry: + %vc = call @llvm.experimental.constrained.fsub.nxv16bf16( %va, %vb, metadata !"round.dynamic", metadata !"fpexcept.ignore") + ret %vc +} + +define @vfsub_vf_nxv16bf16( %va, bfloat %b) strictfp { +; CHECK-LABEL: vfsub_vf_nxv16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vmv.v.x v12, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfsub.vv v16, v16, v24 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vc = call @llvm.experimental.constrained.fsub.nxv16bf16( %va, 
%splat, metadata !"round.dynamic", metadata !"fpexcept.ignore") + ret %vc +} + +define @vfsub_vv_nxv32bf16( %va, %vb) strictfp { +; CHECK-LABEL: vfsub_vv_nxv32bf16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16 +; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfsub.vv v24, v0, v24 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfsub.vv v16, v16, v24 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: ret +entry: + %vc = call @llvm.experimental.constrained.fsub.nxv32bf16( %va, %vb, metadata !"round.dynamic", metadata !"fpexcept.ignore") + ret %vc +} + +define @vfsub_vf_nxv32bf16( %va, bfloat %b) strictfp { +; CHECK-LABEL: vfsub_vf_nxv32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma +; CHECK-NEXT: vmv.v.x v16, a0 +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v16 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfsub.vv v24, v24, v0 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfsub.vv v16, v16, v24 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vc = call @llvm.experimental.constrained.fsub.nxv32bf16( %va, %splat, metadata !"round.dynamic", metadata !"fpexcept.ignore") + ret %vc +} declare @llvm.experimental.constrained.fsub.nxv1f16(, , metadata, metadata) define @vfsub_vv_nxv1f16( %va, %vb) strictfp { diff --git a/llvm/test/CodeGen/RISCV/rvv/vfsub-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfsub-sdnode.ll index bd73398fd04b56..e56cfd9ee4eb1a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfsub-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfsub-sdnode.ll @@ -1,12 +1,252 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=ilp32d 
-verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFHMIN +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFHMIN + +define @vfsub_vv_nxv1bf16( %va, %vb) { +; CHECK-LABEL: vfsub_vv_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfsub.vv v9, v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %vc = fsub %va, %vb + ret %vc +} + +define @vfsub_vf_nxv1bf16( %va, bfloat %b) { +; CHECK-LABEL: vfsub_vf_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vmv.v.x v9, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfsub.vv v9, v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vc = fsub %va, %splat + ret %vc +} + +define @vfsub_vv_nxv2bf16( %va, %vb) { +; CHECK-LABEL: vfsub_vv_nxv2bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vfsub.vv v9, v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %vc = fsub %va, %vb + ret %vc +} + +define @vfsub_vf_nxv2bf16( %va, bfloat %b) { +; CHECK-LABEL: vfsub_vf_nxv2bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vmv.v.x v9, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vfsub.vv v9, v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vc = fsub %va, %splat + ret %vc +} + +define @vfsub_vv_nxv4bf16( %va, %vb) { +; CHECK-LABEL: vfsub_vv_nxv4bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfsub.vv v10, v12, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: ret + %vc = fsub %va, %vb + ret %vc +} + +define @vfsub_vf_nxv4bf16( %va, bfloat %b) { +; CHECK-LABEL: vfsub_vf_nxv4bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vmv.v.x v9, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfsub.vv v10, v10, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vc = fsub %va, %splat + ret %vc +} + +define 
@vfsub_vv_nxv8bf16( %va, %vb) { +; CHECK-LABEL: vfsub_vv_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vfsub.vv v12, v16, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: ret + %vc = fsub %va, %vb + ret %vc +} + +define @vfsub_vf_nxv8bf16( %va, bfloat %b) { +; CHECK-LABEL: vfsub_vf_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vfsub.vv v12, v12, v16 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vc = fsub %va, %splat + ret %vc +} + +define @vfsub_fv_nxv8bf16( %va, bfloat %b) { +; CHECK-LABEL: vfsub_fv_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vfsub.vv v12, v16, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vc = fsub %splat, %va + ret %vc +} + +define @vfsub_vv_nxv16bf16( %va, %vb) { +; CHECK-LABEL: vfsub_vv_nxv16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfsub.vv v16, v24, v16 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: ret + %vc = fsub %va, %vb + ret %vc +} + +define @vfsub_vf_nxv16bf16( %va, bfloat %b) { +; CHECK-LABEL: vfsub_vf_nxv16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vmv.v.x v12, a0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfsub.vv v16, v16, v24 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vc = fsub %va, %splat + ret %vc +} + +define @vfsub_vv_nxv32bf16( %va, %vb) { +; CHECK-LABEL: vfsub_vv_nxv32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16 +; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfsub.vv v24, v0, v24 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfsub.vv v16, v16, v24 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: ret + %vc = fsub %va, %vb + ret %vc +} + +define 
@vfsub_vf_nxv32bf16( %va, bfloat %b) { +; CHECK-LABEL: vfsub_vf_nxv32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a0, fa0 +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma +; CHECK-NEXT: vmv.v.x v16, a0 +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v16 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfsub.vv v24, v24, v0 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v20 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfsub.vv v16, v24, v0 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: ret + %head = insertelement poison, bfloat %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %vc = fsub %va, %splat + ret %vc +} define @vfsub_vv_nxv1f16( %va, %vb) { ; ZVFH-LABEL: vfsub_vv_nxv1f16: diff --git a/llvm/test/CodeGen/RISCV/rvv/vfsub-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfsub-vp.ll index fda6d0c48d4a6e..449130e59876f0 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfsub-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfsub-vp.ll @@ -1,13 +1,622 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFHMIN +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFHMIN + +declare @llvm.vp.fsub.nxv1bf16(, , , i32) + +define @vfsub_vv_nxv1bf16( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfsub_vv_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vfsub.vv v9, v9, v10, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %v = call @llvm.vp.fsub.nxv1bf16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vfsub_vv_nxv1bf16_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vfsub_vv_nxv1bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; 
CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vfsub.vv v9, v9, v10 +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %v = call @llvm.vp.fsub.nxv1bf16( %va, %b, splat (i1 true), i32 %evl) + ret %v +} + +define @vfsub_vf_nxv1bf16( %va, bfloat %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfsub_vf_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vfsub.vv v9, v10, v8, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fsub.nxv1bf16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vfsub_vf_nxv1bf16_unmasked( %va, bfloat %b, i32 zeroext %evl) { +; CHECK-LABEL: vfsub_vf_nxv1bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vfsub.vv v9, v10, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fsub.nxv1bf16( %va, %vb, splat (i1 true), i32 %evl) + ret %v +} + +declare @llvm.vp.fsub.nxv2bf16(, , , i32) + +define @vfsub_vv_nxv2bf16( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfsub_vv_nxv2bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfsub.vv v9, v9, v10, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %v = call @llvm.vp.fsub.nxv2bf16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vfsub_vv_nxv2bf16_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vfsub_vv_nxv2bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfsub.vv v9, v9, v10 +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %v = call @llvm.vp.fsub.nxv2bf16( %va, %b, splat (i1 true), i32 %evl) + ret %v +} + +define @vfsub_vf_nxv2bf16( %va, bfloat %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfsub_vf_nxv2bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, mf2, ta, ma +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfsub.vv v9, v10, v8, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fsub.nxv2bf16( %va, %vb, %m, i32 %evl) + ret 
%v +} + +define @vfsub_vf_nxv2bf16_unmasked( %va, bfloat %b, i32 zeroext %evl) { +; CHECK-LABEL: vfsub_vf_nxv2bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, mf2, ta, ma +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfsub.vv v9, v10, v8 +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fsub.nxv2bf16( %va, %vb, splat (i1 true), i32 %evl) + ret %v +} + +declare @llvm.vp.fsub.nxv4bf16(, , , i32) + +define @vfsub_vv_nxv4bf16( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfsub_vv_nxv4bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vfsub.vv v10, v12, v10, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: ret + %v = call @llvm.vp.fsub.nxv4bf16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vfsub_vv_nxv4bf16_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vfsub_vv_nxv4bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vfsub.vv v10, v12, v10 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: ret + %v = call @llvm.vp.fsub.nxv4bf16( %va, %b, splat (i1 true), i32 %evl) + ret %v +} + +define @vfsub_vf_nxv4bf16( %va, bfloat %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfsub_vf_nxv4bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vfsub.vv v10, v10, v12, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fsub.nxv4bf16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vfsub_vf_nxv4bf16_unmasked( %va, bfloat %b, i32 zeroext %evl) { +; CHECK-LABEL: vfsub_vf_nxv4bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vfsub.vv v10, v10, v12 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fsub.nxv4bf16( %va, %vb, splat (i1 true), i32 %evl) + ret %v +} + +declare @llvm.vp.fsub.nxv8bf16(, , , i32) +define @vfsub_vv_nxv8bf16( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfsub_vv_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vsetvli 
zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vfsub.vv v12, v16, v12, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: ret + %v = call @llvm.vp.fsub.nxv8bf16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vfsub_vv_nxv8bf16_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vfsub_vv_nxv8bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vfsub.vv v12, v16, v12 +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: ret + %v = call @llvm.vp.fsub.nxv8bf16( %va, %b, splat (i1 true), i32 %evl) + ret %v +} + +define @vfsub_vf_nxv8bf16( %va, bfloat %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfsub_vf_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vfsub.vv v12, v12, v16, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fsub.nxv8bf16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vfsub_vf_nxv8bf16_unmasked( %va, bfloat %b, i32 zeroext %evl) { +; CHECK-LABEL: vfsub_vf_nxv8bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vfsub.vv v12, v12, v16 +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fsub.nxv8bf16( %va, %vb, splat (i1 true), i32 %evl) + ret %v +} + +declare @llvm.vp.fsub.nxv16bf16(, , , i32) + +define @vfsub_vv_nxv16bf16( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfsub_vv_nxv16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vfsub.vv v16, v24, v16, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: ret + %v = call @llvm.vp.fsub.nxv16bf16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vfsub_vv_nxv16bf16_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vfsub_vv_nxv16bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vfsub.vv v16, v24, v16 +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: ret + %v = call @llvm.vp.fsub.nxv16bf16( %va, %b, splat (i1 true), i32 %evl) + ret %v +} + +define @vfsub_vf_nxv16bf16( %va, bfloat %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfsub_vf_nxv16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vmv.v.x v12, 
a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vfsub.vv v16, v16, v24, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fsub.nxv16bf16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vfsub_vf_nxv16bf16_unmasked( %va, bfloat %b, i32 zeroext %evl) { +; CHECK-LABEL: vfsub_vf_nxv16bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vmv.v.x v12, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vfsub.vv v16, v16, v24 +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fsub.nxv16bf16( %va, %vb, splat (i1 true), i32 %evl) + ret %v +} + +declare @llvm.vp.fsub.nxv32bf16(, , , i32) + +define @vfsub_vv_nxv32bf16( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfsub_vv_nxv32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: vmv1r.v v7, v0 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a1, a2, 1 +; CHECK-NEXT: sub a3, a0, a1 +; CHECK-NEXT: sltu a4, a0, a3 +; CHECK-NEXT: addi a4, a4, -1 +; CHECK-NEXT: and a3, a4, a3 +; CHECK-NEXT: srli a2, a2, 2 +; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v0, v0, a2 +; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; CHECK-NEXT: vfsub.vv v16, v16, v24, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: bltu a0, a1, .LBB20_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a0, a1 +; CHECK-NEXT: .LBB20_2: +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 +; CHECK-NEXT: vmv1r.v v0, v7 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vfsub.vv v16, v24, v16, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %v = call @llvm.vp.fsub.nxv32bf16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vfsub_vv_nxv32bf16_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vfsub_vv_nxv32bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: csrr a2, 
vlenb +; CHECK-NEXT: slli a1, a2, 1 +; CHECK-NEXT: sub a3, a0, a1 +; CHECK-NEXT: sltu a4, a0, a3 +; CHECK-NEXT: addi a4, a4, -1 +; CHECK-NEXT: and a3, a4, a3 +; CHECK-NEXT: srli a2, a2, 2 +; CHECK-NEXT: vsetvli a4, zero, e8, m4, ta, ma +; CHECK-NEXT: vmset.m v24 +; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v0, v24, a2 +; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; CHECK-NEXT: vfsub.vv v16, v16, v24, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: bltu a0, a1, .LBB21_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a0, a1 +; CHECK-NEXT: .LBB21_2: +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vfsub.vv v16, v24, v16 +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %v = call @llvm.vp.fsub.nxv32bf16( %va, %b, splat (i1 true), i32 %evl) + ret %v +} + +define @vfsub_vf_nxv32bf16( %va, bfloat %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfsub_vf_nxv32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: mv a2, a1 +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x12, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 18 * vlenb +; CHECK-NEXT: vmv8r.v v24, v8 +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, m8, ta, ma +; CHECK-NEXT: vmv.v.x v16, a1 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a2, a1, 3 +; CHECK-NEXT: add a1, a2, a1 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a1, a2, 1 +; CHECK-NEXT: sub a3, a0, a1 +; CHECK-NEXT: sltu a4, a0, a3 +; CHECK-NEXT: addi a4, a4, -1 +; CHECK-NEXT: and a3, a4, a3 +; CHECK-NEXT: srli a2, a2, 2 +; CHECK-NEXT: csrr a4, vlenb +; CHECK-NEXT: slli a4, a4, 3 +; CHECK-NEXT: add a4, sp, a4 +; CHECK-NEXT: addi a4, a4, 16 +; CHECK-NEXT: vs1r.v v0, (a4) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v0, v0, a2 +; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v28 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a4, a2, 3 +; CHECK-NEXT: add a2, a4, a2 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v28 +; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; CHECK-NEXT: vfsub.vv v16, v8, v16, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: bltu a0, a1, .LBB22_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a0, a1 +; CHECK-NEXT: .LBB22_2: +; CHECK-NEXT: addi a1, sp, 16 +; 
CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a2, a1, 3 +; CHECK-NEXT: add a1, a2, a1 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v0 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vfsub.vv v16, v16, v24, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: mv a1, a0 +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.fsub.nxv32bf16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vfsub_vf_nxv32bf16_unmasked( %va, bfloat %b, i32 zeroext %evl) { +; CHECK-LABEL: vfsub_vf_nxv32bf16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; CHECK-NEXT: vmv8r.v v16, v8 +; CHECK-NEXT: fmv.x.h a1, fa0 +; CHECK-NEXT: vsetvli a2, zero, e16, m8, ta, ma +; CHECK-NEXT: vmv.v.x v8, a1 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a1, a2, 1 +; CHECK-NEXT: sub a3, a0, a1 +; CHECK-NEXT: sltu a4, a0, a3 +; CHECK-NEXT: addi a4, a4, -1 +; CHECK-NEXT: and a3, a4, a3 +; CHECK-NEXT: srli a2, a2, 2 +; CHECK-NEXT: vmset.m v24 +; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v0, v24, a2 +; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20 +; CHECK-NEXT: vmv4r.v v16, v8 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; CHECK-NEXT: vfsub.vv v16, v24, v16, v0.t +; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: bltu a0, a1, .LBB23_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a0, a1 +; CHECK-NEXT: .LBB23_2: +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v0 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vfsub.vv v16, v16, v24 +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %elt.head = insertelement poison, bfloat %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call 
@llvm.vp.fsub.nxv32bf16( %va, %vb, splat (i1 true), i32 %evl) + ret %v +} declare @llvm.vp.fsub.nxv1f16(, , , i32) define @vfsub_vv_nxv1f16( %va, %b, %m, i32 zeroext %evl) { @@ -514,10 +1123,10 @@ define @vfsub_vv_nxv32f16( %va, @vfsub_vv_nxv32f16_unmasked( %va ; ZVFHMIN-NEXT: vfsub.vv v16, v16, v24, v0.t ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 -; ZVFHMIN-NEXT: bltu a0, a1, .LBB21_2 +; ZVFHMIN-NEXT: bltu a0, a1, .LBB45_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a0, a1 -; ZVFHMIN-NEXT: .LBB21_2: +; ZVFHMIN-NEXT: .LBB45_2: ; ZVFHMIN-NEXT: addi a1, sp, 16 ; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24 @@ -649,10 +1258,10 @@ define @vfsub_vf_nxv32f16( %va, half %b ; ZVFHMIN-NEXT: vfsub.vv v16, v8, v16, v0.t ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 -; ZVFHMIN-NEXT: bltu a0, a1, .LBB22_2 +; ZVFHMIN-NEXT: bltu a0, a1, .LBB46_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a0, a1 -; ZVFHMIN-NEXT: .LBB22_2: +; ZVFHMIN-NEXT: .LBB46_2: ; ZVFHMIN-NEXT: addi a1, sp, 16 ; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24 @@ -730,10 +1339,10 @@ define @vfsub_vf_nxv32f16_unmasked( %va ; ZVFHMIN-NEXT: vfsub.vv v16, v24, v16, v0.t ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 -; ZVFHMIN-NEXT: bltu a0, a1, .LBB23_2 +; ZVFHMIN-NEXT: bltu a0, a1, .LBB47_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a0, a1 -; ZVFHMIN-NEXT: .LBB23_2: +; ZVFHMIN-NEXT: .LBB47_2: ; ZVFHMIN-NEXT: addi a1, sp, 16 ; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24 diff --git a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-vp.ll index 7d78fa5a8f3ef2..0f8e74942d58d0 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-vp.ll @@ -1,12 +1,288 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFHMIN +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ +; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFHMIN + +declare bfloat @llvm.vp.reduce.fadd.nxv1bf16(bfloat, , , i32) + +define bfloat 
@vpreduce_fadd_nxv1bf16(bfloat %s, %v, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_fadd_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: fcvt.s.bf16 fa5, fa0 +; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; CHECK-NEXT: vfmv.s.f v8, fa5 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vfredusum.vs v8, v9, v8, v0.t +; CHECK-NEXT: vfmv.f.s fa5, v8 +; CHECK-NEXT: fcvt.bf16.s fa0, fa5 +; CHECK-NEXT: ret + %r = call reassoc bfloat @llvm.vp.reduce.fadd.nxv1bf16(bfloat %s, %v, %m, i32 %evl) + ret bfloat %r +} + +define bfloat @vpreduce_ord_fadd_nxv1bf16(bfloat %s, %v, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_ord_fadd_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: fcvt.s.bf16 fa5, fa0 +; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; CHECK-NEXT: vfmv.s.f v8, fa5 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vfredosum.vs v8, v9, v8, v0.t +; CHECK-NEXT: vfmv.f.s fa5, v8 +; CHECK-NEXT: fcvt.bf16.s fa0, fa5 +; CHECK-NEXT: ret + %r = call bfloat @llvm.vp.reduce.fadd.nxv1bf16(bfloat %s, %v, %m, i32 %evl) + ret bfloat %r +} + +declare bfloat @llvm.vp.reduce.fadd.nxv2bf16(bfloat, , , i32) + +define bfloat @vpreduce_fadd_nxv2bf16(bfloat %s, %v, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_fadd_nxv2bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: fcvt.s.bf16 fa5, fa0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; CHECK-NEXT: vfmv.s.f v8, fa5 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfredusum.vs v8, v9, v8, v0.t +; CHECK-NEXT: vfmv.f.s fa5, v8 +; CHECK-NEXT: fcvt.bf16.s fa0, fa5 +; CHECK-NEXT: ret + %r = call reassoc bfloat @llvm.vp.reduce.fadd.nxv2bf16(bfloat %s, %v, %m, i32 %evl) + ret bfloat %r +} + +define bfloat @vpreduce_ord_fadd_nxv2bf16(bfloat %s, %v, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_ord_fadd_nxv2bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: fcvt.s.bf16 fa5, fa0 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; CHECK-NEXT: vfmv.s.f v8, fa5 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfredosum.vs v8, v9, v8, v0.t +; CHECK-NEXT: vfmv.f.s fa5, v8 +; CHECK-NEXT: fcvt.bf16.s fa0, fa5 +; CHECK-NEXT: ret + %r = call bfloat @llvm.vp.reduce.fadd.nxv2bf16(bfloat %s, %v, %m, i32 %evl) + ret bfloat %r +} + +declare bfloat @llvm.vp.reduce.fadd.nxv4bf16(bfloat, , , i32) + +define bfloat @vpreduce_fadd_nxv4bf16(bfloat %s, %v, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_fadd_nxv4bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: fcvt.s.bf16 fa5, fa0 +; CHECK-NEXT: vsetivli zero, 1, e32, m2, ta, ma +; CHECK-NEXT: vfmv.s.f v8, fa5 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vfredusum.vs v8, v10, v8, v0.t +; CHECK-NEXT: vfmv.f.s fa5, v8 +; CHECK-NEXT: fcvt.bf16.s fa0, fa5 +; CHECK-NEXT: ret + %r = call reassoc bfloat @llvm.vp.reduce.fadd.nxv4bf16(bfloat %s, %v, %m, i32 %evl) + ret bfloat %r +} + +define bfloat @vpreduce_ord_fadd_nxv4bf16(bfloat %s, %v, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_ord_fadd_nxv4bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: fcvt.s.bf16 fa5, fa0 +; 
CHECK-NEXT: vsetivli zero, 1, e32, m2, ta, ma +; CHECK-NEXT: vfmv.s.f v8, fa5 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vfredosum.vs v8, v10, v8, v0.t +; CHECK-NEXT: vfmv.f.s fa5, v8 +; CHECK-NEXT: fcvt.bf16.s fa0, fa5 +; CHECK-NEXT: ret + %r = call bfloat @llvm.vp.reduce.fadd.nxv4bf16(bfloat %s, %v, %m, i32 %evl) + ret bfloat %r +} + +declare bfloat @llvm.vp.reduce.fadd.nxv64bf16(bfloat, , , i32) + +define bfloat @vpreduce_fadd_nxv64bf16(bfloat %s, %v, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_fadd_nxv64bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: csrr a3, vlenb +; CHECK-NEXT: srli a1, a3, 1 +; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma +; CHECK-NEXT: vslidedown.vx v7, v0, a1 +; CHECK-NEXT: slli a5, a3, 2 +; CHECK-NEXT: sub a1, a0, a5 +; CHECK-NEXT: sltu a2, a0, a1 +; CHECK-NEXT: addi a2, a2, -1 +; CHECK-NEXT: and a1, a2, a1 +; CHECK-NEXT: slli a4, a3, 1 +; CHECK-NEXT: sub a2, a1, a4 +; CHECK-NEXT: sltu a6, a1, a2 +; CHECK-NEXT: bltu a1, a4, .LBB6_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a1, a4 +; CHECK-NEXT: .LBB6_2: +; CHECK-NEXT: addi a6, a6, -1 +; CHECK-NEXT: bltu a0, a5, .LBB6_4 +; CHECK-NEXT: # %bb.3: +; CHECK-NEXT: mv a0, a5 +; CHECK-NEXT: .LBB6_4: +; CHECK-NEXT: and a2, a6, a2 +; CHECK-NEXT: sub a5, a0, a4 +; CHECK-NEXT: sltu a6, a0, a5 +; CHECK-NEXT: addi a6, a6, -1 +; CHECK-NEXT: and a5, a6, a5 +; CHECK-NEXT: srli a3, a3, 2 +; CHECK-NEXT: vsetvli a6, zero, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v6, v0, a3 +; CHECK-NEXT: bltu a0, a4, .LBB6_6 +; CHECK-NEXT: # %bb.5: +; CHECK-NEXT: mv a0, a4 +; CHECK-NEXT: .LBB6_6: +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 +; CHECK-NEXT: fcvt.s.bf16 fa5, fa0 +; CHECK-NEXT: vsetivli zero, 1, e32, m8, ta, ma +; CHECK-NEXT: vfmv.s.f v8, fa5 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vfredusum.vs v8, v24, v8, v0.t +; CHECK-NEXT: vfmv.f.s fa5, v8 +; CHECK-NEXT: fcvt.bf16.s fa5, fa5 +; CHECK-NEXT: fcvt.s.bf16 fa5, fa5 +; CHECK-NEXT: vsetivli zero, 1, e32, m8, ta, ma +; CHECK-NEXT: vfmv.s.f v8, fa5 +; CHECK-NEXT: vsetvli zero, a5, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: vmv1r.v v0, v6 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfredusum.vs v8, v24, v8, v0.t +; CHECK-NEXT: vfmv.f.s fa5, v8 +; CHECK-NEXT: fcvt.bf16.s fa5, fa5 +; CHECK-NEXT: fcvt.s.bf16 fa5, fa5 +; CHECK-NEXT: vsetivli zero, 1, e32, m8, ta, ma +; CHECK-NEXT: vfmv.s.f v8, fa5 +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16 +; CHECK-NEXT: vmv1r.v v0, v7 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfredusum.vs v8, v24, v8, v0.t +; CHECK-NEXT: vfmv.f.s fa5, v8 +; CHECK-NEXT: fcvt.bf16.s fa5, fa5 +; CHECK-NEXT: fcvt.s.bf16 fa5, fa5 +; CHECK-NEXT: vsetivli zero, 1, e32, m8, ta, ma +; CHECK-NEXT: vfmv.s.f v8, fa5 +; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v0, v7, a3 +; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfredusum.vs v8, v24, v8, v0.t +; CHECK-NEXT: vfmv.f.s fa5, v8 +; CHECK-NEXT: fcvt.bf16.s fa0, fa5 +; CHECK-NEXT: ret + %r = call reassoc bfloat @llvm.vp.reduce.fadd.nxv64bf16(bfloat %s, %v, %m, i32 %evl) + ret bfloat %r +} + +define bfloat @vpreduce_ord_fadd_nxv64bf16(bfloat %s, %v, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_ord_fadd_nxv64bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: csrr a3, vlenb +; CHECK-NEXT: srli a1, a3, 1 +; 
CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma +; CHECK-NEXT: vslidedown.vx v7, v0, a1 +; CHECK-NEXT: slli a5, a3, 2 +; CHECK-NEXT: sub a1, a0, a5 +; CHECK-NEXT: sltu a2, a0, a1 +; CHECK-NEXT: addi a2, a2, -1 +; CHECK-NEXT: and a1, a2, a1 +; CHECK-NEXT: slli a4, a3, 1 +; CHECK-NEXT: sub a2, a1, a4 +; CHECK-NEXT: sltu a6, a1, a2 +; CHECK-NEXT: bltu a1, a4, .LBB7_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a1, a4 +; CHECK-NEXT: .LBB7_2: +; CHECK-NEXT: addi a6, a6, -1 +; CHECK-NEXT: bltu a0, a5, .LBB7_4 +; CHECK-NEXT: # %bb.3: +; CHECK-NEXT: mv a0, a5 +; CHECK-NEXT: .LBB7_4: +; CHECK-NEXT: and a2, a6, a2 +; CHECK-NEXT: sub a5, a0, a4 +; CHECK-NEXT: sltu a6, a0, a5 +; CHECK-NEXT: addi a6, a6, -1 +; CHECK-NEXT: and a5, a6, a5 +; CHECK-NEXT: srli a3, a3, 2 +; CHECK-NEXT: vsetvli a6, zero, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v6, v0, a3 +; CHECK-NEXT: bltu a0, a4, .LBB7_6 +; CHECK-NEXT: # %bb.5: +; CHECK-NEXT: mv a0, a4 +; CHECK-NEXT: .LBB7_6: +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8 +; CHECK-NEXT: fcvt.s.bf16 fa5, fa0 +; CHECK-NEXT: vsetivli zero, 1, e32, m8, ta, ma +; CHECK-NEXT: vfmv.s.f v8, fa5 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vfredosum.vs v8, v24, v8, v0.t +; CHECK-NEXT: vfmv.f.s fa5, v8 +; CHECK-NEXT: fcvt.bf16.s fa5, fa5 +; CHECK-NEXT: fcvt.s.bf16 fa5, fa5 +; CHECK-NEXT: vsetivli zero, 1, e32, m8, ta, ma +; CHECK-NEXT: vfmv.s.f v8, fa5 +; CHECK-NEXT: vsetvli zero, a5, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: vmv1r.v v0, v6 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfredosum.vs v8, v24, v8, v0.t +; CHECK-NEXT: vfmv.f.s fa5, v8 +; CHECK-NEXT: fcvt.bf16.s fa5, fa5 +; CHECK-NEXT: fcvt.s.bf16 fa5, fa5 +; CHECK-NEXT: vsetivli zero, 1, e32, m8, ta, ma +; CHECK-NEXT: vfmv.s.f v8, fa5 +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16 +; CHECK-NEXT: vmv1r.v v0, v7 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfredosum.vs v8, v24, v8, v0.t +; CHECK-NEXT: vfmv.f.s fa5, v8 +; CHECK-NEXT: fcvt.bf16.s fa5, fa5 +; CHECK-NEXT: fcvt.s.bf16 fa5, fa5 +; CHECK-NEXT: vsetivli zero, 1, e32, m8, ta, ma +; CHECK-NEXT: vfmv.s.f v8, fa5 +; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v0, v7, a3 +; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfredosum.vs v8, v24, v8, v0.t +; CHECK-NEXT: vfmv.f.s fa5, v8 +; CHECK-NEXT: fcvt.bf16.s fa0, fa5 +; CHECK-NEXT: ret + %r = call bfloat @llvm.vp.reduce.fadd.nxv64bf16(bfloat %s, %v, %m, i32 %evl) + ret bfloat %r +} declare half @llvm.vp.reduce.fadd.nxv1f16(half, , , i32) @@ -184,10 +460,10 @@ define half @vpreduce_fadd_nxv64f16(half %s, %v, %v, %v, %v, %v, %v, %val, ; CHECK-NEXT: feq.s a1, fa0, fa0 ; CHECK-NEXT: xori a1, a1, 1 ; CHECK-NEXT: or a0, a0, a1 -; CHECK-NEXT: beqz a0, .LBB22_2 +; CHECK-NEXT: beqz a0, .LBB30_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: lui a0, 523264 ; CHECK-NEXT: fmv.w.x fa0, a0 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB22_2: +; CHECK-NEXT: .LBB30_2: ; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %s = call float @llvm.vp.reduce.fminimum.nxv4f32(float %start, %val, %m, i32 %evl) @@ -616,12 +892,12 @@ define float @vreduce_fmaximum_nxv4f32(float %start, %val, ; CHECK-NEXT: feq.s a1, fa0, fa0 ; CHECK-NEXT: xori a1, a1, 1 ; CHECK-NEXT: or a0, a0, a1 -; CHECK-NEXT: beqz a0, .LBB23_2 +; CHECK-NEXT: beqz a0, .LBB31_2 ; CHECK-NEXT: # 
%bb.1: ; CHECK-NEXT: lui a0, 523264 ; CHECK-NEXT: fmv.w.x fa0, a0 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB23_2: +; CHECK-NEXT: .LBB31_2: ; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %s = call float @llvm.vp.reduce.fmaximum.nxv4f32(float %start, %val, %m, i32 %evl) @@ -666,12 +942,12 @@ define float @vreduce_fminimum_v4f32(float %start, <4 x float> %val, <4 x i1> %m ; CHECK-NEXT: feq.s a1, fa0, fa0 ; CHECK-NEXT: xori a1, a1, 1 ; CHECK-NEXT: or a0, a0, a1 -; CHECK-NEXT: beqz a0, .LBB26_2 +; CHECK-NEXT: beqz a0, .LBB34_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: lui a0, 523264 ; CHECK-NEXT: fmv.w.x fa0, a0 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB26_2: +; CHECK-NEXT: .LBB34_2: ; CHECK-NEXT: vfmv.f.s fa0, v9 ; CHECK-NEXT: ret %s = call float @llvm.vp.reduce.fminimum.v4f32(float %start, <4 x float> %val, <4 x i1> %m, i32 %evl) @@ -690,12 +966,12 @@ define float @vreduce_fmaximum_v4f32(float %start, <4 x float> %val, <4 x i1> %m ; CHECK-NEXT: feq.s a1, fa0, fa0 ; CHECK-NEXT: xori a1, a1, 1 ; CHECK-NEXT: or a0, a0, a1 -; CHECK-NEXT: beqz a0, .LBB27_2 +; CHECK-NEXT: beqz a0, .LBB35_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: lui a0, 523264 ; CHECK-NEXT: fmv.w.x fa0, a0 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB27_2: +; CHECK-NEXT: .LBB35_2: ; CHECK-NEXT: vfmv.f.s fa0, v9 ; CHECK-NEXT: ret %s = call float @llvm.vp.reduce.fmaximum.v4f32(float %start, <4 x float> %val, <4 x i1> %m, i32 %evl) From 41d5fed09e7d31922e7869c72116a4c3adc11a4a Mon Sep 17 00:00:00 2001 From: Tom Eccles Date: Wed, 18 Sep 2024 10:40:41 +0100 Subject: [PATCH 030/321] [flang][Semantics] set scope even for module subroutines outside modules (#109009) The missing scope information led to a crash in OpenMP semantic checks run before printing the error that was already discovered in the code. The following block has to be skipped for this invalid code so that we don't emit a second spurious error. 
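A minimal reproducer (the same shape as the test added below) is a subroutine carrying the MODULE prefix outside of any module or submodule:

  impure elemental module subroutine e()
  end subroutine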
Fixes #82913 --- flang/lib/Semantics/resolve-names.cpp | 7 +++++-- flang/test/Semantics/OpenMP/bad_module_subroutine.f90 | 6 ++++++ 2 files changed, 11 insertions(+), 2 deletions(-) create mode 100644 flang/test/Semantics/OpenMP/bad_module_subroutine.f90 diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp index b99f308e1c7fab..7c692440d24730 100644 --- a/flang/lib/Semantics/resolve-names.cpp +++ b/flang/lib/Semantics/resolve-names.cpp @@ -4351,15 +4351,18 @@ bool SubprogramVisitor::BeginSubprogram(const parser::Name &name, Symbol::Flag subpFlag, bool hasModulePrefix, const parser::LanguageBindingSpec *bindingSpec, const ProgramTree::EntryStmtList *entryStmts) { + bool isValid{true}; if (hasModulePrefix && !currScope().IsModule() && !currScope().IsSubmodule()) { // C1547 Say(name, "'%s' is a MODULE procedure which must be declared within a " "MODULE or SUBMODULE"_err_en_US); - return false; + // Don't return here because it can be useful to have the scope set for + // other semantic checks run before we print the errors + isValid = false; } Symbol *moduleInterface{nullptr}; - if (hasModulePrefix && !inInterfaceBlock()) { + if (isValid && hasModulePrefix && !inInterfaceBlock()) { moduleInterface = FindSeparateModuleProcedureInterface(name); if (moduleInterface && &moduleInterface->owner() == &currScope()) { // Subprogram is MODULE FUNCTION or MODULE SUBROUTINE with an interface diff --git a/flang/test/Semantics/OpenMP/bad_module_subroutine.f90 b/flang/test/Semantics/OpenMP/bad_module_subroutine.f90 new file mode 100644 index 00000000000000..339d6bf27e7dfd --- /dev/null +++ b/flang/test/Semantics/OpenMP/bad_module_subroutine.f90 @@ -0,0 +1,6 @@ +! RUN: %python %S/../test_errors.py %s %flang_fc1 -fopenmp +! Test that we don't crash on this code inside of openmp semantics checks + +!ERROR: 'e' is a MODULE procedure which must be declared within a MODULE or SUBMODULE +impure elemental module subroutine e() +end subroutine From 707169acb5520149cd5f96cc8f381ca51107d356 Mon Sep 17 00:00:00 2001 From: Balazs Benics Date: Wed, 18 Sep 2024 11:43:56 +0200 Subject: [PATCH 031/321] [clang][NFC] Remove trailing spaces from Sema diag messages (#109098) --- clang/include/clang/Basic/DiagnosticSemaKinds.td | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index bfda5b521c8fd2..ba813af960af6f 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -3290,7 +3290,7 @@ def err_attribute_unsupported_m_profile def err_duplicate_target_attribute : Error<"%select{unsupported|duplicate|unknown}0%select{| CPU|" " tune CPU}1 '%2' in the '%select{target|target_clones|target_version}3' " - "attribute string; ">; + "attribute string;">; // The err_*_attribute_argument_not_int are separate because they're used by // VerifyIntegerConstantExpression. 
def err_aligned_attribute_argument_not_int : Error< @@ -9915,7 +9915,7 @@ def err_defaulted_comparison_constexpr_mismatch : Error< "three-way comparison operator}0 cannot be " "declared %select{constexpr|consteval}2 because " "%select{it|for which the corresponding implicit 'operator==' }0 " - "invokes a non-constexpr comparison function ">; + "invokes a non-constexpr comparison function">; def note_defaulted_comparison_not_constexpr : Note< "non-constexpr comparison function would be used to compare " "%select{|member %1|base class %1}0">; @@ -11559,7 +11559,7 @@ def err_omp_wrong_device_function_call : Error< "function with 'device_type(%0)' is not available on %select{device|host}1">; def note_omp_marked_device_type_here : Note<"marked as 'device_type(%0)' here">; def err_omp_declare_target_has_local_vars : Error< - "local variable '%0' should not be used in 'declare target' directive; ">; + "local variable '%0' should not be used in 'declare target' directive;">; def warn_omp_declare_target_after_first_use : Warning< "declaration marked as declare target after first use, it may lead to incorrect results">, InGroup; From f5ad9e1ca582216010d07e7b0ca2f3e77f71c859 Mon Sep 17 00:00:00 2001 From: Mahesh-Attarde <145317060+mahesh-attarde@users.noreply.github.com> Date: Wed, 18 Sep 2024 02:55:29 -0700 Subject: [PATCH 032/321] [X86][AVX10.2] Support AVX10.2-COMEF new instructions. (#108063) Ref.: https://cdrdv2.intel.com/v1/dl/getContent/828965 Chapter 8 AVX10 COMPARE SCALAR FP WITH ENHANCED EFLAGS INSTRUCTIONS --------- Co-authored-by: mattarde --- llvm/lib/Target/X86/X86ISelLowering.cpp | 31 ++- llvm/lib/Target/X86/X86ISelLowering.h | 4 + llvm/lib/Target/X86/X86InstrAVX10.td | 46 ++++ llvm/lib/Target/X86/X86InstrFragmentsSIMD.td | 3 +- llvm/test/CodeGen/X86/comi-flags.ll | 237 ++++++++++++------ .../MC/Disassembler/X86/avx10.2-com-ef-32.txt | 195 ++++++++++++++ .../MC/Disassembler/X86/avx10.2-com-ef-64.txt | 195 ++++++++++++++ llvm/test/MC/X86/avx10.2-com-ef-32-att.s | 194 ++++++++++++++ llvm/test/MC/X86/avx10.2-com-ef-32-intel.s | 194 ++++++++++++++ llvm/test/MC/X86/avx10.2-com-ef-64-att.s | 194 ++++++++++++++ llvm/test/MC/X86/avx10.2-com-ef-64-intel.s | 194 ++++++++++++++ llvm/test/TableGen/x86-fold-tables.inc | 6 + 12 files changed, 1405 insertions(+), 88 deletions(-) create mode 100644 llvm/test/MC/Disassembler/X86/avx10.2-com-ef-32.txt create mode 100644 llvm/test/MC/Disassembler/X86/avx10.2-com-ef-64.txt create mode 100644 llvm/test/MC/X86/avx10.2-com-ef-32-att.s create mode 100644 llvm/test/MC/X86/avx10.2-com-ef-32-intel.s create mode 100644 llvm/test/MC/X86/avx10.2-com-ef-64-att.s create mode 100644 llvm/test/MC/X86/avx10.2-com-ef-64-intel.s diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index f6d42ade600885..9bc5f2c9399574 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -26159,22 +26159,43 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, if (CC == ISD::SETLT || CC == ISD::SETLE) std::swap(LHS, RHS); - SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS); + // For AVX10.2, Support EQ and NE. + bool HasAVX10_2_COMX = + Subtarget.hasAVX10_2() && (CC == ISD::SETEQ || CC == ISD::SETNE); + + // AVX10.2 COMPARE supports only v2f64, v4f32 or v8f16. + // For BF type we need to fall back. 
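+  // (v8bf16 therefore keeps the plain COMI/UCOMI lowering, including the
+  //  extra PF checks in the switch below.)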
+ bool HasAVX10_2_COMX_Ty = (LHS.getSimpleValueType() != MVT::v8bf16); + + auto ComiOpCode = IntrData->Opc0; + auto IsUnordered = (ComiOpCode == X86ISD::UCOMI); + + if (HasAVX10_2_COMX && HasAVX10_2_COMX_Ty) + ComiOpCode = IsUnordered ? X86ISD::UCOMX : X86ISD::COMX; + + SDValue Comi = DAG.getNode(ComiOpCode, dl, MVT::i32, LHS, RHS); + + SDValue SetCC; switch (CC) { - case ISD::SETEQ: { // (ZF = 0 and PF = 0) + case ISD::SETEQ: { SetCC = getSETCC(X86::COND_E, Comi, dl, DAG); + if (HasAVX10_2_COMX && HasAVX10_2_COMX_Ty) // ZF == 1 + break; + // (ZF = 1 and PF = 0) SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG); SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP); break; } - case ISD::SETNE: { // (ZF = 1 or PF = 1) + case ISD::SETNE: { SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG); + if (HasAVX10_2_COMX && HasAVX10_2_COMX_Ty) // ZF == 0 + break; + // (ZF = 0 or PF = 1) SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG); SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP); break; } - case ISD::SETGT: // (CF = 0 and ZF = 0) + case ISD::SETGT: // (CF = 0 and ZF = 0) case ISD::SETLT: { // Condition opposite to GT. Operands swapped above. SetCC = getSETCC(X86::COND_A, Comi, dl, DAG); break; @@ -34083,6 +34104,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(STRICT_FCMPS) NODE_NAME_CASE(COMI) NODE_NAME_CASE(UCOMI) + NODE_NAME_CASE(COMX) + NODE_NAME_CASE(UCOMX) NODE_NAME_CASE(CMPM) NODE_NAME_CASE(CMPMM) NODE_NAME_CASE(STRICT_CMPM) diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index 5fb58867568012..ae7da8efb5f91a 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -87,6 +87,10 @@ namespace llvm { COMI, UCOMI, + // X86 compare nodes for intrinsics, similar to COMI. + COMX, + UCOMX, + /// X86 bit-test instructions.
BT, diff --git a/llvm/lib/Target/X86/X86InstrAVX10.td b/llvm/lib/Target/X86/X86InstrAVX10.td index ada2bbaffd6645..f0334109a32b68 100644 --- a/llvm/lib/Target/X86/X86InstrAVX10.td +++ b/llvm/lib/Target/X86/X86InstrAVX10.td @@ -1537,3 +1537,49 @@ defm VFNMADD132NEPBF16 : avx10_fma3p_132_bf16<0x9C, "vfnmadd132nepbf16", X86any_ defm VFNMSUB132NEPBF16 : avx10_fma3p_132_bf16<0x9E, "vfnmsub132nepbf16", X86any_Fnmsub, X86Fnmsub, SchedWriteFMA>; } + +//------------------------------------------------- +// AVX10 COMEF instructions +//------------------------------------------------- +multiclass avx10_com_ef_int Opc, X86VectorVTInfo _, SDNode OpNode, + string OpcodeStr, + Domain d, + X86FoldableSchedWrite sched = WriteFComX> { + let ExeDomain = d, mayRaiseFPException = 1 in { + def rr_Int : AVX512, + EVEX, EVEX_V128, Sched<[sched]>, SIMD_EXC; + let mayLoad = 1 in { + def rm_Int : AVX512, + EVEX, EVEX_V128, Sched<[sched]>, SIMD_EXC; + } + def rrb_Int : AVX512, + EVEX, EVEX_V128, EVEX_B, Sched<[sched]>, SIMD_EXC; + } +} + +let Defs = [EFLAGS], Uses = [MXCSR], Predicates = [HasAVX10_2] in { + defm VCOMXSDZ : avx10_com_ef_int<0x2f, v2f64x_info, X86comi512, + "vcomxsd", SSEPackedDouble>, + TB, XS, VEX_LIG, REX_W, EVEX_CD8<64, CD8VT1>; + defm VCOMXSHZ : avx10_com_ef_int<0x2f, v8f16x_info, X86comi512, + "vcomxsh", SSEPackedSingle>, + T_MAP5, XD, EVEX_CD8<16, CD8VT1>; + defm VCOMXSSZ : avx10_com_ef_int<0x2f, v4f32x_info, X86comi512, + "vcomxss", SSEPackedSingle>, + TB, XD, VEX_LIG, EVEX_CD8<32, CD8VT1>; + defm VUCOMXSDZ : avx10_com_ef_int<0x2e, v2f64x_info, X86ucomi512, + "vucomxsd", SSEPackedDouble>, + TB, XS, VEX_LIG, REX_W, EVEX_CD8<64, CD8VT1>; + defm VUCOMXSHZ : avx10_com_ef_int<0x2e, v8f16x_info, X86ucomi512, + "vucomxsh", SSEPackedSingle>, + T_MAP5, XD, EVEX_CD8<16, CD8VT1>; + defm VUCOMXSSZ : avx10_com_ef_int<0x2e, v4f32x_info, X86ucomi512, + "vucomxss", SSEPackedSingle>, + TB, XD, VEX_LIG, EVEX_CD8<32, CD8VT1>; +} diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td index af39b1ab82d6ea..ed1bff05b7316c 100644 --- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -61,7 +61,8 @@ def X86hadd : SDNode<"X86ISD::HADD", SDTIntBinOp>; def X86hsub : SDNode<"X86ISD::HSUB", SDTIntBinOp>; def X86comi : SDNode<"X86ISD::COMI", SDTX86FCmp>; def X86ucomi : SDNode<"X86ISD::UCOMI", SDTX86FCmp>; - +def X86comi512 : SDNode<"X86ISD::COMX", SDTX86FCmp>; +def X86ucomi512 : SDNode<"X86ISD::UCOMX", SDTX86FCmp>; def SDTX86Cmps : SDTypeProfile<1, 3, [SDTCisFP<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>; def X86cmps : SDNode<"X86ISD::FSETCC", SDTX86Cmps>; diff --git a/llvm/test/CodeGen/X86/comi-flags.ll b/llvm/test/CodeGen/X86/comi-flags.ll index 8b7a089f0ce872..6f520aa57dcd09 100644 --- a/llvm/test/CodeGen/X86/comi-flags.ll +++ b/llvm/test/CodeGen/X86/comi-flags.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefix=SSE -; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefix=AVX +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX,NO-AVX10_2 +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx10.2-256 | FileCheck %s --check-prefixes=AVX,AVX10_2 ; ; SSE @@ -17,15 +18,22 @@ define i32 @test_x86_sse_comieq_ss(<4 x float> %a0, <4 x float> %a1, i32 %a2, i3 ; SSE-NEXT: cmovnel %esi, %eax ; SSE-NEXT: retq ; -; AVX-LABEL: test_x86_sse_comieq_ss: -; AVX: 
# %bb.0: -; AVX-NEXT: movl %edi, %eax -; AVX-NEXT: vcomiss %xmm1, %xmm0 -; AVX-NEXT: setnp %cl -; AVX-NEXT: sete %dl -; AVX-NEXT: testb %cl, %dl -; AVX-NEXT: cmovnel %esi, %eax -; AVX-NEXT: retq +; NO-AVX10_2-LABEL: test_x86_sse_comieq_ss: +; NO-AVX10_2: # %bb.0: +; NO-AVX10_2-NEXT: movl %edi, %eax +; NO-AVX10_2-NEXT: vcomiss %xmm1, %xmm0 +; NO-AVX10_2-NEXT: setnp %cl +; NO-AVX10_2-NEXT: sete %dl +; NO-AVX10_2-NEXT: testb %cl, %dl +; NO-AVX10_2-NEXT: cmovnel %esi, %eax +; NO-AVX10_2-NEXT: retq +; +; AVX10_2-LABEL: test_x86_sse_comieq_ss: +; AVX10_2: # %bb.0: +; AVX10_2-NEXT: movl %edi, %eax +; AVX10_2-NEXT: vcomxss %xmm1, %xmm0 +; AVX10_2-NEXT: cmovel %esi, %eax +; AVX10_2-NEXT: retq %call = call i32 @llvm.x86.sse.comieq.ss(<4 x float> %a0, <4 x float> %a1) %cmp = icmp eq i32 %call, 0 %res = select i1 %cmp, i32 %a2, i32 %a3 @@ -126,13 +134,20 @@ define i32 @test_x86_sse_comineq_ss(<4 x float> %a0, <4 x float> %a1, i32 %a2, i ; SSE-NEXT: cmovpl %edi, %eax ; SSE-NEXT: retq ; -; AVX-LABEL: test_x86_sse_comineq_ss: -; AVX: # %bb.0: -; AVX-NEXT: movl %esi, %eax -; AVX-NEXT: vcomiss %xmm1, %xmm0 -; AVX-NEXT: cmovnel %edi, %eax -; AVX-NEXT: cmovpl %edi, %eax -; AVX-NEXT: retq +; NO-AVX10_2-LABEL: test_x86_sse_comineq_ss: +; NO-AVX10_2: # %bb.0: +; NO-AVX10_2-NEXT: movl %esi, %eax +; NO-AVX10_2-NEXT: vcomiss %xmm1, %xmm0 +; NO-AVX10_2-NEXT: cmovnel %edi, %eax +; NO-AVX10_2-NEXT: cmovpl %edi, %eax +; NO-AVX10_2-NEXT: retq +; +; AVX10_2-LABEL: test_x86_sse_comineq_ss: +; AVX10_2: # %bb.0: +; AVX10_2-NEXT: movl %edi, %eax +; AVX10_2-NEXT: vcomxss %xmm1, %xmm0 +; AVX10_2-NEXT: cmovel %esi, %eax +; AVX10_2-NEXT: retq %call = call i32 @llvm.x86.sse.comineq.ss(<4 x float> %a0, <4 x float> %a1) %cmp = icmp ne i32 %call, 0 %res = select i1 %cmp, i32 %a2, i32 %a3 @@ -151,15 +166,22 @@ define i32 @test_x86_sse_ucomieq_ss(<4 x float> %a0, <4 x float> %a1, i32 %a2, i ; SSE-NEXT: cmovnel %esi, %eax ; SSE-NEXT: retq ; -; AVX-LABEL: test_x86_sse_ucomieq_ss: -; AVX: # %bb.0: -; AVX-NEXT: movl %edi, %eax -; AVX-NEXT: vucomiss %xmm1, %xmm0 -; AVX-NEXT: setnp %cl -; AVX-NEXT: sete %dl -; AVX-NEXT: testb %cl, %dl -; AVX-NEXT: cmovnel %esi, %eax -; AVX-NEXT: retq +; NO-AVX10_2-LABEL: test_x86_sse_ucomieq_ss: +; NO-AVX10_2: # %bb.0: +; NO-AVX10_2-NEXT: movl %edi, %eax +; NO-AVX10_2-NEXT: vucomiss %xmm1, %xmm0 +; NO-AVX10_2-NEXT: setnp %cl +; NO-AVX10_2-NEXT: sete %dl +; NO-AVX10_2-NEXT: testb %cl, %dl +; NO-AVX10_2-NEXT: cmovnel %esi, %eax +; NO-AVX10_2-NEXT: retq +; +; AVX10_2-LABEL: test_x86_sse_ucomieq_ss: +; AVX10_2: # %bb.0: +; AVX10_2-NEXT: movl %edi, %eax +; AVX10_2-NEXT: vucomxss %xmm1, %xmm0 +; AVX10_2-NEXT: cmovel %esi, %eax +; AVX10_2-NEXT: retq %call = call i32 @llvm.x86.sse.ucomieq.ss(<4 x float> %a0, <4 x float> %a1) %cmp = icmp eq i32 %call, 0 %res = select i1 %cmp, i32 %a2, i32 %a3 @@ -260,13 +282,20 @@ define i32 @test_x86_sse_ucomineq_ss(<4 x float> %a0, <4 x float> %a1, i32 %a2, ; SSE-NEXT: cmovpl %edi, %eax ; SSE-NEXT: retq ; -; AVX-LABEL: test_x86_sse_ucomineq_ss: -; AVX: # %bb.0: -; AVX-NEXT: movl %esi, %eax -; AVX-NEXT: vucomiss %xmm1, %xmm0 -; AVX-NEXT: cmovnel %edi, %eax -; AVX-NEXT: cmovpl %edi, %eax -; AVX-NEXT: retq +; NO-AVX10_2-LABEL: test_x86_sse_ucomineq_ss: +; NO-AVX10_2: # %bb.0: +; NO-AVX10_2-NEXT: movl %esi, %eax +; NO-AVX10_2-NEXT: vucomiss %xmm1, %xmm0 +; NO-AVX10_2-NEXT: cmovnel %edi, %eax +; NO-AVX10_2-NEXT: cmovpl %edi, %eax +; NO-AVX10_2-NEXT: retq +; +; AVX10_2-LABEL: test_x86_sse_ucomineq_ss: +; AVX10_2: # %bb.0: +; AVX10_2-NEXT: movl %edi, %eax +; AVX10_2-NEXT: vucomxss 
%xmm1, %xmm0 +; AVX10_2-NEXT: cmovel %esi, %eax +; AVX10_2-NEXT: retq %call = call i32 @llvm.x86.sse.ucomineq.ss(<4 x float> %a0, <4 x float> %a1) %cmp = icmp ne i32 %call, 0 %res = select i1 %cmp, i32 %a2, i32 %a3 @@ -289,15 +318,22 @@ define i32 @test_x86_sse2_comieq_sd(<2 x double> %a0, <2 x double> %a1, i32 %a2, ; SSE-NEXT: cmovnel %esi, %eax ; SSE-NEXT: retq ; -; AVX-LABEL: test_x86_sse2_comieq_sd: -; AVX: # %bb.0: -; AVX-NEXT: movl %edi, %eax -; AVX-NEXT: vcomisd %xmm1, %xmm0 -; AVX-NEXT: setnp %cl -; AVX-NEXT: sete %dl -; AVX-NEXT: testb %cl, %dl -; AVX-NEXT: cmovnel %esi, %eax -; AVX-NEXT: retq +; NO-AVX10_2-LABEL: test_x86_sse2_comieq_sd: +; NO-AVX10_2: # %bb.0: +; NO-AVX10_2-NEXT: movl %edi, %eax +; NO-AVX10_2-NEXT: vcomisd %xmm1, %xmm0 +; NO-AVX10_2-NEXT: setnp %cl +; NO-AVX10_2-NEXT: sete %dl +; NO-AVX10_2-NEXT: testb %cl, %dl +; NO-AVX10_2-NEXT: cmovnel %esi, %eax +; NO-AVX10_2-NEXT: retq +; +; AVX10_2-LABEL: test_x86_sse2_comieq_sd: +; AVX10_2: # %bb.0: +; AVX10_2-NEXT: movl %edi, %eax +; AVX10_2-NEXT: vcomxsd %xmm1, %xmm0 +; AVX10_2-NEXT: cmovel %esi, %eax +; AVX10_2-NEXT: retq %call = call i32 @llvm.x86.sse2.comieq.sd(<2 x double> %a0, <2 x double> %a1) ; [#uses=1] %cmp = icmp eq i32 %call, 0 %res = select i1 %cmp, i32 %a2, i32 %a3 @@ -398,13 +434,20 @@ define i32 @test_x86_sse2_comineq_sd(<2 x double> %a0, <2 x double> %a1, i32 %a2 ; SSE-NEXT: cmovpl %edi, %eax ; SSE-NEXT: retq ; -; AVX-LABEL: test_x86_sse2_comineq_sd: -; AVX: # %bb.0: -; AVX-NEXT: movl %esi, %eax -; AVX-NEXT: vcomisd %xmm1, %xmm0 -; AVX-NEXT: cmovnel %edi, %eax -; AVX-NEXT: cmovpl %edi, %eax -; AVX-NEXT: retq +; NO-AVX10_2-LABEL: test_x86_sse2_comineq_sd: +; NO-AVX10_2: # %bb.0: +; NO-AVX10_2-NEXT: movl %esi, %eax +; NO-AVX10_2-NEXT: vcomisd %xmm1, %xmm0 +; NO-AVX10_2-NEXT: cmovnel %edi, %eax +; NO-AVX10_2-NEXT: cmovpl %edi, %eax +; NO-AVX10_2-NEXT: retq +; +; AVX10_2-LABEL: test_x86_sse2_comineq_sd: +; AVX10_2: # %bb.0: +; AVX10_2-NEXT: movl %edi, %eax +; AVX10_2-NEXT: vcomxsd %xmm1, %xmm0 +; AVX10_2-NEXT: cmovel %esi, %eax +; AVX10_2-NEXT: retq %call = call i32 @llvm.x86.sse2.comineq.sd(<2 x double> %a0, <2 x double> %a1) ; [#uses=1] %cmp = icmp ne i32 %call, 0 %res = select i1 %cmp, i32 %a2, i32 %a3 @@ -423,15 +466,22 @@ define i32 @test_x86_sse2_ucomieq_sd(<2 x double> %a0, <2 x double> %a1, i32 %a2 ; SSE-NEXT: cmovnel %esi, %eax ; SSE-NEXT: retq ; -; AVX-LABEL: test_x86_sse2_ucomieq_sd: -; AVX: # %bb.0: -; AVX-NEXT: movl %edi, %eax -; AVX-NEXT: vucomisd %xmm1, %xmm0 -; AVX-NEXT: setnp %cl -; AVX-NEXT: sete %dl -; AVX-NEXT: testb %cl, %dl -; AVX-NEXT: cmovnel %esi, %eax -; AVX-NEXT: retq +; NO-AVX10_2-LABEL: test_x86_sse2_ucomieq_sd: +; NO-AVX10_2: # %bb.0: +; NO-AVX10_2-NEXT: movl %edi, %eax +; NO-AVX10_2-NEXT: vucomisd %xmm1, %xmm0 +; NO-AVX10_2-NEXT: setnp %cl +; NO-AVX10_2-NEXT: sete %dl +; NO-AVX10_2-NEXT: testb %cl, %dl +; NO-AVX10_2-NEXT: cmovnel %esi, %eax +; NO-AVX10_2-NEXT: retq +; +; AVX10_2-LABEL: test_x86_sse2_ucomieq_sd: +; AVX10_2: # %bb.0: +; AVX10_2-NEXT: movl %edi, %eax +; AVX10_2-NEXT: vucomxsd %xmm1, %xmm0 +; AVX10_2-NEXT: cmovel %esi, %eax +; AVX10_2-NEXT: retq %call = call i32 @llvm.x86.sse2.ucomieq.sd(<2 x double> %a0, <2 x double> %a1) ; [#uses=1] %cmp = icmp eq i32 %call, 0 %res = select i1 %cmp, i32 %a2, i32 %a3 @@ -532,13 +582,20 @@ define i32 @test_x86_sse2_ucomineq_sd(<2 x double> %a0, <2 x double> %a1, i32 %a ; SSE-NEXT: cmovpl %edi, %eax ; SSE-NEXT: retq ; -; AVX-LABEL: test_x86_sse2_ucomineq_sd: -; AVX: # %bb.0: -; AVX-NEXT: movl %esi, %eax -; AVX-NEXT: vucomisd %xmm1, 
%xmm0 -; AVX-NEXT: cmovnel %edi, %eax -; AVX-NEXT: cmovpl %edi, %eax -; AVX-NEXT: retq +; NO-AVX10_2-LABEL: test_x86_sse2_ucomineq_sd: +; NO-AVX10_2: # %bb.0: +; NO-AVX10_2-NEXT: movl %esi, %eax +; NO-AVX10_2-NEXT: vucomisd %xmm1, %xmm0 +; NO-AVX10_2-NEXT: cmovnel %edi, %eax +; NO-AVX10_2-NEXT: cmovpl %edi, %eax +; NO-AVX10_2-NEXT: retq +; +; AVX10_2-LABEL: test_x86_sse2_ucomineq_sd: +; AVX10_2: # %bb.0: +; AVX10_2-NEXT: movl %edi, %eax +; AVX10_2-NEXT: vucomxsd %xmm1, %xmm0 +; AVX10_2-NEXT: cmovel %esi, %eax +; AVX10_2-NEXT: retq %call = call i32 @llvm.x86.sse2.ucomineq.sd(<2 x double> %a0, <2 x double> %a1) ; [#uses=1] %cmp = icmp ne i32 %call, 0 %res = select i1 %cmp, i32 %a2, i32 %a3 @@ -557,15 +614,22 @@ define void @PR38960_eq(<4 x float> %A, <4 x float> %B) { ; SSE-NEXT: # %bb.1: # %if.end ; SSE-NEXT: retq ; -; AVX-LABEL: PR38960_eq: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vcomiss %xmm1, %xmm0 -; AVX-NEXT: setnp %al -; AVX-NEXT: sete %cl -; AVX-NEXT: testb %al, %cl -; AVX-NEXT: jne foo@PLT # TAILCALL -; AVX-NEXT: # %bb.1: # %if.end -; AVX-NEXT: retq +; NO-AVX10_2-LABEL: PR38960_eq: +; NO-AVX10_2: # %bb.0: # %entry +; NO-AVX10_2-NEXT: vcomiss %xmm1, %xmm0 +; NO-AVX10_2-NEXT: setnp %al +; NO-AVX10_2-NEXT: sete %cl +; NO-AVX10_2-NEXT: testb %al, %cl +; NO-AVX10_2-NEXT: jne foo@PLT # TAILCALL +; NO-AVX10_2-NEXT: # %bb.1: # %if.end +; NO-AVX10_2-NEXT: retq +; +; AVX10_2-LABEL: PR38960_eq: +; AVX10_2: # %bb.0: # %entry +; AVX10_2-NEXT: vcomxss %xmm1, %xmm0 +; AVX10_2-NEXT: je foo@PLT # TAILCALL +; AVX10_2-NEXT: # %bb.1: # %if.end +; AVX10_2-NEXT: retq entry: %call = tail call i32 @llvm.x86.sse.comieq.ss(<4 x float> %A, <4 x float> %B) #3 %cmp = icmp eq i32 %call, 0 @@ -590,15 +654,22 @@ define void @PR38960_neq(<4 x float> %A, <4 x float> %B) { ; SSE-NEXT: # %bb.1: # %if.end ; SSE-NEXT: retq ; -; AVX-LABEL: PR38960_neq: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vcomiss %xmm1, %xmm0 -; AVX-NEXT: setp %al -; AVX-NEXT: setne %cl -; AVX-NEXT: orb %al, %cl -; AVX-NEXT: jne foo@PLT # TAILCALL -; AVX-NEXT: # %bb.1: # %if.end -; AVX-NEXT: retq +; NO-AVX10_2-LABEL: PR38960_neq: +; NO-AVX10_2: # %bb.0: # %entry +; NO-AVX10_2-NEXT: vcomiss %xmm1, %xmm0 +; NO-AVX10_2-NEXT: setp %al +; NO-AVX10_2-NEXT: setne %cl +; NO-AVX10_2-NEXT: orb %al, %cl +; NO-AVX10_2-NEXT: jne foo@PLT # TAILCALL +; NO-AVX10_2-NEXT: # %bb.1: # %if.end +; NO-AVX10_2-NEXT: retq +; +; AVX10_2-LABEL: PR38960_neq: +; AVX10_2: # %bb.0: # %entry +; AVX10_2-NEXT: vcomxss %xmm1, %xmm0 +; AVX10_2-NEXT: jne foo@PLT # TAILCALL +; AVX10_2-NEXT: # %bb.1: # %if.end +; AVX10_2-NEXT: retq entry: %call = tail call i32 @llvm.x86.sse.comineq.ss(<4 x float> %A, <4 x float> %B) #3 %cmp = icmp eq i32 %call, 0 diff --git a/llvm/test/MC/Disassembler/X86/avx10.2-com-ef-32.txt b/llvm/test/MC/Disassembler/X86/avx10.2-com-ef-32.txt new file mode 100644 index 00000000000000..e7adacbbf88c88 --- /dev/null +++ b/llvm/test/MC/Disassembler/X86/avx10.2-com-ef-32.txt @@ -0,0 +1,195 @@ +# RUN: llvm-mc --disassemble %s -triple=i386 | FileCheck %s --check-prefixes=ATT +# RUN: llvm-mc --disassemble %s -triple=i386 --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL + +# ATT: vcomxsd %xmm3, %xmm2 +# INTEL: vcomxsd xmm2, xmm3 +0x62,0xf1,0xfe,0x08,0x2f,0xd3 + +# ATT: vcomxsd {sae}, %xmm3, %xmm2 +# INTEL: vcomxsd xmm2, xmm3, {sae} +0x62,0xf1,0xfe,0x18,0x2f,0xd3 + +# ATT: vcomxsd 268435456(%esp,%esi,8), %xmm2 +# INTEL: vcomxsd xmm2, qword ptr [esp + 8*esi + 268435456] +0x62,0xf1,0xfe,0x08,0x2f,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vcomxsd 291(%edi,%eax,4), %xmm2 +# 
INTEL: vcomxsd xmm2, qword ptr [edi + 4*eax + 291] +0x62,0xf1,0xfe,0x08,0x2f,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vcomxsd (%eax), %xmm2 +# INTEL: vcomxsd xmm2, qword ptr [eax] +0x62,0xf1,0xfe,0x08,0x2f,0x10 + +# ATT: vcomxsd -256(,%ebp,2), %xmm2 +# INTEL: vcomxsd xmm2, qword ptr [2*ebp - 256] +0x62,0xf1,0xfe,0x08,0x2f,0x14,0x6d,0x00,0xff,0xff,0xff + +# ATT: vcomxsd 1016(%ecx), %xmm2 +# INTEL: vcomxsd xmm2, qword ptr [ecx + 1016] +0x62,0xf1,0xfe,0x08,0x2f,0x51,0x7f + +# ATT: vcomxsd -1024(%edx), %xmm2 +# INTEL: vcomxsd xmm2, qword ptr [edx - 1024] +0x62,0xf1,0xfe,0x08,0x2f,0x52,0x80 + +# ATT: vcomxsh %xmm3, %xmm2 +# INTEL: vcomxsh xmm2, xmm3 +0x62,0xf5,0x7f,0x08,0x2f,0xd3 + +# ATT: vcomxsh {sae}, %xmm3, %xmm2 +# INTEL: vcomxsh xmm2, xmm3, {sae} +0x62,0xf5,0x7f,0x18,0x2f,0xd3 + +# ATT: vcomxsh 268435456(%esp,%esi,8), %xmm2 +# INTEL: vcomxsh xmm2, word ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x7f,0x08,0x2f,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vcomxsh 291(%edi,%eax,4), %xmm2 +# INTEL: vcomxsh xmm2, word ptr [edi + 4*eax + 291] +0x62,0xf5,0x7f,0x08,0x2f,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vcomxsh (%eax), %xmm2 +# INTEL: vcomxsh xmm2, word ptr [eax] +0x62,0xf5,0x7f,0x08,0x2f,0x10 + +# ATT: vcomxsh -64(,%ebp,2), %xmm2 +# INTEL: vcomxsh xmm2, word ptr [2*ebp - 64] +0x62,0xf5,0x7f,0x08,0x2f,0x14,0x6d,0xc0,0xff,0xff,0xff + +# ATT: vcomxsh 254(%ecx), %xmm2 +# INTEL: vcomxsh xmm2, word ptr [ecx + 254] +0x62,0xf5,0x7f,0x08,0x2f,0x51,0x7f + +# ATT: vcomxsh -256(%edx), %xmm2 +# INTEL: vcomxsh xmm2, word ptr [edx - 256] +0x62,0xf5,0x7f,0x08,0x2f,0x52,0x80 + +# ATT: vcomxss %xmm3, %xmm2 +# INTEL: vcomxss xmm2, xmm3 +0x62,0xf1,0x7f,0x08,0x2f,0xd3 + +# ATT: vcomxss {sae}, %xmm3, %xmm2 +# INTEL: vcomxss xmm2, xmm3, {sae} +0x62,0xf1,0x7f,0x18,0x2f,0xd3 + +# ATT: vcomxss 268435456(%esp,%esi,8), %xmm2 +# INTEL: vcomxss xmm2, dword ptr [esp + 8*esi + 268435456] +0x62,0xf1,0x7f,0x08,0x2f,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vcomxss 291(%edi,%eax,4), %xmm2 +# INTEL: vcomxss xmm2, dword ptr [edi + 4*eax + 291] +0x62,0xf1,0x7f,0x08,0x2f,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vcomxss (%eax), %xmm2 +# INTEL: vcomxss xmm2, dword ptr [eax] +0x62,0xf1,0x7f,0x08,0x2f,0x10 + +# ATT: vcomxss -128(,%ebp,2), %xmm2 +# INTEL: vcomxss xmm2, dword ptr [2*ebp - 128] +0x62,0xf1,0x7f,0x08,0x2f,0x14,0x6d,0x80,0xff,0xff,0xff + +# ATT: vcomxss 508(%ecx), %xmm2 +# INTEL: vcomxss xmm2, dword ptr [ecx + 508] +0x62,0xf1,0x7f,0x08,0x2f,0x51,0x7f + +# ATT: vcomxss -512(%edx), %xmm2 +# INTEL: vcomxss xmm2, dword ptr [edx - 512] +0x62,0xf1,0x7f,0x08,0x2f,0x52,0x80 + +# ATT: vucomxsd %xmm3, %xmm2 +# INTEL: vucomxsd xmm2, xmm3 +0x62,0xf1,0xfe,0x08,0x2e,0xd3 + +# ATT: vucomxsd {sae}, %xmm3, %xmm2 +# INTEL: vucomxsd xmm2, xmm3, {sae} +0x62,0xf1,0xfe,0x18,0x2e,0xd3 + +# ATT: vucomxsd 268435456(%esp,%esi,8), %xmm2 +# INTEL: vucomxsd xmm2, qword ptr [esp + 8*esi + 268435456] +0x62,0xf1,0xfe,0x08,0x2e,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vucomxsd 291(%edi,%eax,4), %xmm2 +# INTEL: vucomxsd xmm2, qword ptr [edi + 4*eax + 291] +0x62,0xf1,0xfe,0x08,0x2e,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vucomxsd (%eax), %xmm2 +# INTEL: vucomxsd xmm2, qword ptr [eax] +0x62,0xf1,0xfe,0x08,0x2e,0x10 + +# ATT: vucomxsd -256(,%ebp,2), %xmm2 +# INTEL: vucomxsd xmm2, qword ptr [2*ebp - 256] +0x62,0xf1,0xfe,0x08,0x2e,0x14,0x6d,0x00,0xff,0xff,0xff + +# ATT: vucomxsd 1016(%ecx), %xmm2 +# INTEL: vucomxsd xmm2, qword ptr [ecx + 1016] +0x62,0xf1,0xfe,0x08,0x2e,0x51,0x7f + +# ATT: vucomxsd -1024(%edx), %xmm2 +# INTEL: vucomxsd xmm2, qword ptr [edx - 1024] 
+0x62,0xf1,0xfe,0x08,0x2e,0x52,0x80 + +# ATT: vucomxsh %xmm3, %xmm2 +# INTEL: vucomxsh xmm2, xmm3 +0x62,0xf5,0x7f,0x08,0x2e,0xd3 + +# ATT: vucomxsh {sae}, %xmm3, %xmm2 +# INTEL: vucomxsh xmm2, xmm3, {sae} +0x62,0xf5,0x7f,0x18,0x2e,0xd3 + +# ATT: vucomxsh 268435456(%esp,%esi,8), %xmm2 +# INTEL: vucomxsh xmm2, word ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x7f,0x08,0x2e,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vucomxsh 291(%edi,%eax,4), %xmm2 +# INTEL: vucomxsh xmm2, word ptr [edi + 4*eax + 291] +0x62,0xf5,0x7f,0x08,0x2e,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vucomxsh (%eax), %xmm2 +# INTEL: vucomxsh xmm2, word ptr [eax] +0x62,0xf5,0x7f,0x08,0x2e,0x10 + +# ATT: vucomxsh -64(,%ebp,2), %xmm2 +# INTEL: vucomxsh xmm2, word ptr [2*ebp - 64] +0x62,0xf5,0x7f,0x08,0x2e,0x14,0x6d,0xc0,0xff,0xff,0xff + +# ATT: vucomxsh 254(%ecx), %xmm2 +# INTEL: vucomxsh xmm2, word ptr [ecx + 254] +0x62,0xf5,0x7f,0x08,0x2e,0x51,0x7f + +# ATT: vucomxsh -256(%edx), %xmm2 +# INTEL: vucomxsh xmm2, word ptr [edx - 256] +0x62,0xf5,0x7f,0x08,0x2e,0x52,0x80 + +# ATT: vucomxss %xmm3, %xmm2 +# INTEL: vucomxss xmm2, xmm3 +0x62,0xf1,0x7f,0x08,0x2e,0xd3 + +# ATT: vucomxss {sae}, %xmm3, %xmm2 +# INTEL: vucomxss xmm2, xmm3, {sae} +0x62,0xf1,0x7f,0x18,0x2e,0xd3 + +# ATT: vucomxss 268435456(%esp,%esi,8), %xmm2 +# INTEL: vucomxss xmm2, dword ptr [esp + 8*esi + 268435456] +0x62,0xf1,0x7f,0x08,0x2e,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vucomxss 291(%edi,%eax,4), %xmm2 +# INTEL: vucomxss xmm2, dword ptr [edi + 4*eax + 291] +0x62,0xf1,0x7f,0x08,0x2e,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vucomxss (%eax), %xmm2 +# INTEL: vucomxss xmm2, dword ptr [eax] +0x62,0xf1,0x7f,0x08,0x2e,0x10 + +# ATT: vucomxss -128(,%ebp,2), %xmm2 +# INTEL: vucomxss xmm2, dword ptr [2*ebp - 128] +0x62,0xf1,0x7f,0x08,0x2e,0x14,0x6d,0x80,0xff,0xff,0xff + +# ATT: vucomxss 508(%ecx), %xmm2 +# INTEL: vucomxss xmm2, dword ptr [ecx + 508] +0x62,0xf1,0x7f,0x08,0x2e,0x51,0x7f + +# ATT: vucomxss -512(%edx), %xmm2 +# INTEL: vucomxss xmm2, dword ptr [edx - 512] +0x62,0xf1,0x7f,0x08,0x2e,0x52,0x80 + diff --git a/llvm/test/MC/Disassembler/X86/avx10.2-com-ef-64.txt b/llvm/test/MC/Disassembler/X86/avx10.2-com-ef-64.txt new file mode 100644 index 00000000000000..ea580fe8d50836 --- /dev/null +++ b/llvm/test/MC/Disassembler/X86/avx10.2-com-ef-64.txt @@ -0,0 +1,195 @@ +# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT +# RUN: llvm-mc --disassemble %s -triple=x86_64 --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL + +# ATT: vcomxsd %xmm23, %xmm22 +# INTEL: vcomxsd xmm22, xmm23 +0x62,0xa1,0xfe,0x08,0x2f,0xf7 + +# ATT: vcomxsd {sae}, %xmm23, %xmm22 +# INTEL: vcomxsd xmm22, xmm23, {sae} +0x62,0xa1,0xfe,0x18,0x2f,0xf7 + +# ATT: vcomxsd 268435456(%rbp,%r14,8), %xmm22 +# INTEL: vcomxsd xmm22, qword ptr [rbp + 8*r14 + 268435456] +0x62,0xa1,0xfe,0x08,0x2f,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vcomxsd 291(%r8,%rax,4), %xmm22 +# INTEL: vcomxsd xmm22, qword ptr [r8 + 4*rax + 291] +0x62,0xc1,0xfe,0x08,0x2f,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vcomxsd (%rip), %xmm22 +# INTEL: vcomxsd xmm22, qword ptr [rip] +0x62,0xe1,0xfe,0x08,0x2f,0x35,0x00,0x00,0x00,0x00 + +# ATT: vcomxsd -256(,%rbp,2), %xmm22 +# INTEL: vcomxsd xmm22, qword ptr [2*rbp - 256] +0x62,0xe1,0xfe,0x08,0x2f,0x34,0x6d,0x00,0xff,0xff,0xff + +# ATT: vcomxsd 1016(%rcx), %xmm22 +# INTEL: vcomxsd xmm22, qword ptr [rcx + 1016] +0x62,0xe1,0xfe,0x08,0x2f,0x71,0x7f + +# ATT: vcomxsd -1024(%rdx), %xmm22 +# INTEL: vcomxsd xmm22, qword ptr [rdx - 1024] +0x62,0xe1,0xfe,0x08,0x2f,0x72,0x80 + +# ATT: 
vcomxsh %xmm23, %xmm22 +# INTEL: vcomxsh xmm22, xmm23 +0x62,0xa5,0x7f,0x08,0x2f,0xf7 + +# ATT: vcomxsh {sae}, %xmm23, %xmm22 +# INTEL: vcomxsh xmm22, xmm23, {sae} +0x62,0xa5,0x7f,0x18,0x2f,0xf7 + +# ATT: vcomxsh 268435456(%rbp,%r14,8), %xmm22 +# INTEL: vcomxsh xmm22, word ptr [rbp + 8*r14 + 268435456] +0x62,0xa5,0x7f,0x08,0x2f,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vcomxsh 291(%r8,%rax,4), %xmm22 +# INTEL: vcomxsh xmm22, word ptr [r8 + 4*rax + 291] +0x62,0xc5,0x7f,0x08,0x2f,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vcomxsh (%rip), %xmm22 +# INTEL: vcomxsh xmm22, word ptr [rip] +0x62,0xe5,0x7f,0x08,0x2f,0x35,0x00,0x00,0x00,0x00 + +# ATT: vcomxsh -64(,%rbp,2), %xmm22 +# INTEL: vcomxsh xmm22, word ptr [2*rbp - 64] +0x62,0xe5,0x7f,0x08,0x2f,0x34,0x6d,0xc0,0xff,0xff,0xff + +# ATT: vcomxsh 254(%rcx), %xmm22 +# INTEL: vcomxsh xmm22, word ptr [rcx + 254] +0x62,0xe5,0x7f,0x08,0x2f,0x71,0x7f + +# ATT: vcomxsh -256(%rdx), %xmm22 +# INTEL: vcomxsh xmm22, word ptr [rdx - 256] +0x62,0xe5,0x7f,0x08,0x2f,0x72,0x80 + +# ATT: vcomxss %xmm23, %xmm22 +# INTEL: vcomxss xmm22, xmm23 +0x62,0xa1,0x7f,0x08,0x2f,0xf7 + +# ATT: vcomxss {sae}, %xmm23, %xmm22 +# INTEL: vcomxss xmm22, xmm23, {sae} +0x62,0xa1,0x7f,0x18,0x2f,0xf7 + +# ATT: vcomxss 268435456(%rbp,%r14,8), %xmm22 +# INTEL: vcomxss xmm22, dword ptr [rbp + 8*r14 + 268435456] +0x62,0xa1,0x7f,0x08,0x2f,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vcomxss 291(%r8,%rax,4), %xmm22 +# INTEL: vcomxss xmm22, dword ptr [r8 + 4*rax + 291] +0x62,0xc1,0x7f,0x08,0x2f,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vcomxss (%rip), %xmm22 +# INTEL: vcomxss xmm22, dword ptr [rip] +0x62,0xe1,0x7f,0x08,0x2f,0x35,0x00,0x00,0x00,0x00 + +# ATT: vcomxss -128(,%rbp,2), %xmm22 +# INTEL: vcomxss xmm22, dword ptr [2*rbp - 128] +0x62,0xe1,0x7f,0x08,0x2f,0x34,0x6d,0x80,0xff,0xff,0xff + +# ATT: vcomxss 508(%rcx), %xmm22 +# INTEL: vcomxss xmm22, dword ptr [rcx + 508] +0x62,0xe1,0x7f,0x08,0x2f,0x71,0x7f + +# ATT: vcomxss -512(%rdx), %xmm22 +# INTEL: vcomxss xmm22, dword ptr [rdx - 512] +0x62,0xe1,0x7f,0x08,0x2f,0x72,0x80 + +# ATT: vucomxsd %xmm23, %xmm22 +# INTEL: vucomxsd xmm22, xmm23 +0x62,0xa1,0xfe,0x08,0x2e,0xf7 + +# ATT: vucomxsd {sae}, %xmm23, %xmm22 +# INTEL: vucomxsd xmm22, xmm23, {sae} +0x62,0xa1,0xfe,0x18,0x2e,0xf7 + +# ATT: vucomxsd 268435456(%rbp,%r14,8), %xmm22 +# INTEL: vucomxsd xmm22, qword ptr [rbp + 8*r14 + 268435456] +0x62,0xa1,0xfe,0x08,0x2e,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vucomxsd 291(%r8,%rax,4), %xmm22 +# INTEL: vucomxsd xmm22, qword ptr [r8 + 4*rax + 291] +0x62,0xc1,0xfe,0x08,0x2e,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vucomxsd (%rip), %xmm22 +# INTEL: vucomxsd xmm22, qword ptr [rip] +0x62,0xe1,0xfe,0x08,0x2e,0x35,0x00,0x00,0x00,0x00 + +# ATT: vucomxsd -256(,%rbp,2), %xmm22 +# INTEL: vucomxsd xmm22, qword ptr [2*rbp - 256] +0x62,0xe1,0xfe,0x08,0x2e,0x34,0x6d,0x00,0xff,0xff,0xff + +# ATT: vucomxsd 1016(%rcx), %xmm22 +# INTEL: vucomxsd xmm22, qword ptr [rcx + 1016] +0x62,0xe1,0xfe,0x08,0x2e,0x71,0x7f + +# ATT: vucomxsd -1024(%rdx), %xmm22 +# INTEL: vucomxsd xmm22, qword ptr [rdx - 1024] +0x62,0xe1,0xfe,0x08,0x2e,0x72,0x80 + +# ATT: vucomxsh %xmm23, %xmm22 +# INTEL: vucomxsh xmm22, xmm23 +0x62,0xa5,0x7f,0x08,0x2e,0xf7 + +# ATT: vucomxsh {sae}, %xmm23, %xmm22 +# INTEL: vucomxsh xmm22, xmm23, {sae} +0x62,0xa5,0x7f,0x18,0x2e,0xf7 + +# ATT: vucomxsh 268435456(%rbp,%r14,8), %xmm22 +# INTEL: vucomxsh xmm22, word ptr [rbp + 8*r14 + 268435456] +0x62,0xa5,0x7f,0x08,0x2e,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vucomxsh 291(%r8,%rax,4), %xmm22 +# INTEL: vucomxsh xmm22, word ptr [r8 + 
4*rax + 291] +0x62,0xc5,0x7f,0x08,0x2e,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vucomxsh (%rip), %xmm22 +# INTEL: vucomxsh xmm22, word ptr [rip] +0x62,0xe5,0x7f,0x08,0x2e,0x35,0x00,0x00,0x00,0x00 + +# ATT: vucomxsh -64(,%rbp,2), %xmm22 +# INTEL: vucomxsh xmm22, word ptr [2*rbp - 64] +0x62,0xe5,0x7f,0x08,0x2e,0x34,0x6d,0xc0,0xff,0xff,0xff + +# ATT: vucomxsh 254(%rcx), %xmm22 +# INTEL: vucomxsh xmm22, word ptr [rcx + 254] +0x62,0xe5,0x7f,0x08,0x2e,0x71,0x7f + +# ATT: vucomxsh -256(%rdx), %xmm22 +# INTEL: vucomxsh xmm22, word ptr [rdx - 256] +0x62,0xe5,0x7f,0x08,0x2e,0x72,0x80 + +# ATT: vucomxss %xmm23, %xmm22 +# INTEL: vucomxss xmm22, xmm23 +0x62,0xa1,0x7f,0x08,0x2e,0xf7 + +# ATT: vucomxss {sae}, %xmm23, %xmm22 +# INTEL: vucomxss xmm22, xmm23, {sae} +0x62,0xa1,0x7f,0x18,0x2e,0xf7 + +# ATT: vucomxss 268435456(%rbp,%r14,8), %xmm22 +# INTEL: vucomxss xmm22, dword ptr [rbp + 8*r14 + 268435456] +0x62,0xa1,0x7f,0x08,0x2e,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vucomxss 291(%r8,%rax,4), %xmm22 +# INTEL: vucomxss xmm22, dword ptr [r8 + 4*rax + 291] +0x62,0xc1,0x7f,0x08,0x2e,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vucomxss (%rip), %xmm22 +# INTEL: vucomxss xmm22, dword ptr [rip] +0x62,0xe1,0x7f,0x08,0x2e,0x35,0x00,0x00,0x00,0x00 + +# ATT: vucomxss -128(,%rbp,2), %xmm22 +# INTEL: vucomxss xmm22, dword ptr [2*rbp - 128] +0x62,0xe1,0x7f,0x08,0x2e,0x34,0x6d,0x80,0xff,0xff,0xff + +# ATT: vucomxss 508(%rcx), %xmm22 +# INTEL: vucomxss xmm22, dword ptr [rcx + 508] +0x62,0xe1,0x7f,0x08,0x2e,0x71,0x7f + +# ATT: vucomxss -512(%rdx), %xmm22 +# INTEL: vucomxss xmm22, dword ptr [rdx - 512] +0x62,0xe1,0x7f,0x08,0x2e,0x72,0x80 + diff --git a/llvm/test/MC/X86/avx10.2-com-ef-32-att.s b/llvm/test/MC/X86/avx10.2-com-ef-32-att.s new file mode 100644 index 00000000000000..8883bb3d6775a6 --- /dev/null +++ b/llvm/test/MC/X86/avx10.2-com-ef-32-att.s @@ -0,0 +1,194 @@ +// RUN: llvm-mc -triple i386 --show-encoding %s | FileCheck %s + +// CHECK: vcomxsd %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf1,0xfe,0x08,0x2f,0xd3] + vcomxsd %xmm3, %xmm2 + +// CHECK: vcomxsd {sae}, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf1,0xfe,0x18,0x2f,0xd3] + vcomxsd {sae}, %xmm3, %xmm2 + +// CHECK: vcomxsd 268435456(%esp,%esi,8), %xmm2 +// CHECK: encoding: [0x62,0xf1,0xfe,0x08,0x2f,0x94,0xf4,0x00,0x00,0x00,0x10] + vcomxsd 268435456(%esp,%esi,8), %xmm2 + +// CHECK: vcomxsd 291(%edi,%eax,4), %xmm2 +// CHECK: encoding: [0x62,0xf1,0xfe,0x08,0x2f,0x94,0x87,0x23,0x01,0x00,0x00] + vcomxsd 291(%edi,%eax,4), %xmm2 + +// CHECK: vcomxsd (%eax), %xmm2 +// CHECK: encoding: [0x62,0xf1,0xfe,0x08,0x2f,0x10] + vcomxsd (%eax), %xmm2 + +// CHECK: vcomxsd -256(,%ebp,2), %xmm2 +// CHECK: encoding: [0x62,0xf1,0xfe,0x08,0x2f,0x14,0x6d,0x00,0xff,0xff,0xff] + vcomxsd -256(,%ebp,2), %xmm2 + +// CHECK: vcomxsd 1016(%ecx), %xmm2 +// CHECK: encoding: [0x62,0xf1,0xfe,0x08,0x2f,0x51,0x7f] + vcomxsd 1016(%ecx), %xmm2 + +// CHECK: vcomxsd -1024(%edx), %xmm2 +// CHECK: encoding: [0x62,0xf1,0xfe,0x08,0x2f,0x52,0x80] + vcomxsd -1024(%edx), %xmm2 + +// CHECK: vcomxsh %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x7f,0x08,0x2f,0xd3] + vcomxsh %xmm3, %xmm2 + +// CHECK: vcomxsh {sae}, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x7f,0x18,0x2f,0xd3] + vcomxsh {sae}, %xmm3, %xmm2 + +// CHECK: vcomxsh 268435456(%esp,%esi,8), %xmm2 +// CHECK: encoding: [0x62,0xf5,0x7f,0x08,0x2f,0x94,0xf4,0x00,0x00,0x00,0x10] + vcomxsh 268435456(%esp,%esi,8), %xmm2 + +// CHECK: vcomxsh 291(%edi,%eax,4), %xmm2 +// CHECK: encoding: [0x62,0xf5,0x7f,0x08,0x2f,0x94,0x87,0x23,0x01,0x00,0x00] + vcomxsh 
291(%edi,%eax,4), %xmm2 + +// CHECK: vcomxsh (%eax), %xmm2 +// CHECK: encoding: [0x62,0xf5,0x7f,0x08,0x2f,0x10] + vcomxsh (%eax), %xmm2 + +// CHECK: vcomxsh -64(,%ebp,2), %xmm2 +// CHECK: encoding: [0x62,0xf5,0x7f,0x08,0x2f,0x14,0x6d,0xc0,0xff,0xff,0xff] + vcomxsh -64(,%ebp,2), %xmm2 + +// CHECK: vcomxsh 254(%ecx), %xmm2 +// CHECK: encoding: [0x62,0xf5,0x7f,0x08,0x2f,0x51,0x7f] + vcomxsh 254(%ecx), %xmm2 + +// CHECK: vcomxsh -256(%edx), %xmm2 +// CHECK: encoding: [0x62,0xf5,0x7f,0x08,0x2f,0x52,0x80] + vcomxsh -256(%edx), %xmm2 + +// CHECK: vcomxss %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf1,0x7f,0x08,0x2f,0xd3] + vcomxss %xmm3, %xmm2 + +// CHECK: vcomxss {sae}, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf1,0x7f,0x18,0x2f,0xd3] + vcomxss {sae}, %xmm3, %xmm2 + +// CHECK: vcomxss 268435456(%esp,%esi,8), %xmm2 +// CHECK: encoding: [0x62,0xf1,0x7f,0x08,0x2f,0x94,0xf4,0x00,0x00,0x00,0x10] + vcomxss 268435456(%esp,%esi,8), %xmm2 + +// CHECK: vcomxss 291(%edi,%eax,4), %xmm2 +// CHECK: encoding: [0x62,0xf1,0x7f,0x08,0x2f,0x94,0x87,0x23,0x01,0x00,0x00] + vcomxss 291(%edi,%eax,4), %xmm2 + +// CHECK: vcomxss (%eax), %xmm2 +// CHECK: encoding: [0x62,0xf1,0x7f,0x08,0x2f,0x10] + vcomxss (%eax), %xmm2 + +// CHECK: vcomxss -128(,%ebp,2), %xmm2 +// CHECK: encoding: [0x62,0xf1,0x7f,0x08,0x2f,0x14,0x6d,0x80,0xff,0xff,0xff] + vcomxss -128(,%ebp,2), %xmm2 + +// CHECK: vcomxss 508(%ecx), %xmm2 +// CHECK: encoding: [0x62,0xf1,0x7f,0x08,0x2f,0x51,0x7f] + vcomxss 508(%ecx), %xmm2 + +// CHECK: vcomxss -512(%edx), %xmm2 +// CHECK: encoding: [0x62,0xf1,0x7f,0x08,0x2f,0x52,0x80] + vcomxss -512(%edx), %xmm2 + +// CHECK: vucomxsd %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf1,0xfe,0x08,0x2e,0xd3] + vucomxsd %xmm3, %xmm2 + +// CHECK: vucomxsd {sae}, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf1,0xfe,0x18,0x2e,0xd3] + vucomxsd {sae}, %xmm3, %xmm2 + +// CHECK: vucomxsd 268435456(%esp,%esi,8), %xmm2 +// CHECK: encoding: [0x62,0xf1,0xfe,0x08,0x2e,0x94,0xf4,0x00,0x00,0x00,0x10] + vucomxsd 268435456(%esp,%esi,8), %xmm2 + +// CHECK: vucomxsd 291(%edi,%eax,4), %xmm2 +// CHECK: encoding: [0x62,0xf1,0xfe,0x08,0x2e,0x94,0x87,0x23,0x01,0x00,0x00] + vucomxsd 291(%edi,%eax,4), %xmm2 + +// CHECK: vucomxsd (%eax), %xmm2 +// CHECK: encoding: [0x62,0xf1,0xfe,0x08,0x2e,0x10] + vucomxsd (%eax), %xmm2 + +// CHECK: vucomxsd -256(,%ebp,2), %xmm2 +// CHECK: encoding: [0x62,0xf1,0xfe,0x08,0x2e,0x14,0x6d,0x00,0xff,0xff,0xff] + vucomxsd -256(,%ebp,2), %xmm2 + +// CHECK: vucomxsd 1016(%ecx), %xmm2 +// CHECK: encoding: [0x62,0xf1,0xfe,0x08,0x2e,0x51,0x7f] + vucomxsd 1016(%ecx), %xmm2 + +// CHECK: vucomxsd -1024(%edx), %xmm2 +// CHECK: encoding: [0x62,0xf1,0xfe,0x08,0x2e,0x52,0x80] + vucomxsd -1024(%edx), %xmm2 + +// CHECK: vucomxsh %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x7f,0x08,0x2e,0xd3] + vucomxsh %xmm3, %xmm2 + +// CHECK: vucomxsh {sae}, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x7f,0x18,0x2e,0xd3] + vucomxsh {sae}, %xmm3, %xmm2 + +// CHECK: vucomxsh 268435456(%esp,%esi,8), %xmm2 +// CHECK: encoding: [0x62,0xf5,0x7f,0x08,0x2e,0x94,0xf4,0x00,0x00,0x00,0x10] + vucomxsh 268435456(%esp,%esi,8), %xmm2 + +// CHECK: vucomxsh 291(%edi,%eax,4), %xmm2 +// CHECK: encoding: [0x62,0xf5,0x7f,0x08,0x2e,0x94,0x87,0x23,0x01,0x00,0x00] + vucomxsh 291(%edi,%eax,4), %xmm2 + +// CHECK: vucomxsh (%eax), %xmm2 +// CHECK: encoding: [0x62,0xf5,0x7f,0x08,0x2e,0x10] + vucomxsh (%eax), %xmm2 + +// CHECK: vucomxsh -64(,%ebp,2), %xmm2 +// CHECK: encoding: [0x62,0xf5,0x7f,0x08,0x2e,0x14,0x6d,0xc0,0xff,0xff,0xff] + vucomxsh -64(,%ebp,2), %xmm2 + +// CHECK: vucomxsh 
254(%ecx), %xmm2 +// CHECK: encoding: [0x62,0xf5,0x7f,0x08,0x2e,0x51,0x7f] + vucomxsh 254(%ecx), %xmm2 + +// CHECK: vucomxsh -256(%edx), %xmm2 +// CHECK: encoding: [0x62,0xf5,0x7f,0x08,0x2e,0x52,0x80] + vucomxsh -256(%edx), %xmm2 + +// CHECK: vucomxss %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf1,0x7f,0x08,0x2e,0xd3] + vucomxss %xmm3, %xmm2 + +// CHECK: vucomxss {sae}, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf1,0x7f,0x18,0x2e,0xd3] + vucomxss {sae}, %xmm3, %xmm2 + +// CHECK: vucomxss 268435456(%esp,%esi,8), %xmm2 +// CHECK: encoding: [0x62,0xf1,0x7f,0x08,0x2e,0x94,0xf4,0x00,0x00,0x00,0x10] + vucomxss 268435456(%esp,%esi,8), %xmm2 + +// CHECK: vucomxss 291(%edi,%eax,4), %xmm2 +// CHECK: encoding: [0x62,0xf1,0x7f,0x08,0x2e,0x94,0x87,0x23,0x01,0x00,0x00] + vucomxss 291(%edi,%eax,4), %xmm2 + +// CHECK: vucomxss (%eax), %xmm2 +// CHECK: encoding: [0x62,0xf1,0x7f,0x08,0x2e,0x10] + vucomxss (%eax), %xmm2 + +// CHECK: vucomxss -128(,%ebp,2), %xmm2 +// CHECK: encoding: [0x62,0xf1,0x7f,0x08,0x2e,0x14,0x6d,0x80,0xff,0xff,0xff] + vucomxss -128(,%ebp,2), %xmm2 + +// CHECK: vucomxss 508(%ecx), %xmm2 +// CHECK: encoding: [0x62,0xf1,0x7f,0x08,0x2e,0x51,0x7f] + vucomxss 508(%ecx), %xmm2 + +// CHECK: vucomxss -512(%edx), %xmm2 +// CHECK: encoding: [0x62,0xf1,0x7f,0x08,0x2e,0x52,0x80] + vucomxss -512(%edx), %xmm2 + diff --git a/llvm/test/MC/X86/avx10.2-com-ef-32-intel.s b/llvm/test/MC/X86/avx10.2-com-ef-32-intel.s new file mode 100644 index 00000000000000..9ff0484db133cd --- /dev/null +++ b/llvm/test/MC/X86/avx10.2-com-ef-32-intel.s @@ -0,0 +1,194 @@ +// RUN: llvm-mc -triple i386 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s + +// CHECK: vcomxsd xmm2, xmm3 +// CHECK: encoding: [0x62,0xf1,0xfe,0x08,0x2f,0xd3] + vcomxsd xmm2, xmm3 + +// CHECK: vcomxsd xmm2, xmm3, {sae} +// CHECK: encoding: [0x62,0xf1,0xfe,0x18,0x2f,0xd3] + vcomxsd xmm2, xmm3, {sae} + +// CHECK: vcomxsd xmm2, qword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf1,0xfe,0x08,0x2f,0x94,0xf4,0x00,0x00,0x00,0x10] + vcomxsd xmm2, qword ptr [esp + 8*esi + 268435456] + +// CHECK: vcomxsd xmm2, qword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf1,0xfe,0x08,0x2f,0x94,0x87,0x23,0x01,0x00,0x00] + vcomxsd xmm2, qword ptr [edi + 4*eax + 291] + +// CHECK: vcomxsd xmm2, qword ptr [eax] +// CHECK: encoding: [0x62,0xf1,0xfe,0x08,0x2f,0x10] + vcomxsd xmm2, qword ptr [eax] + +// CHECK: vcomxsd xmm2, qword ptr [2*ebp - 256] +// CHECK: encoding: [0x62,0xf1,0xfe,0x08,0x2f,0x14,0x6d,0x00,0xff,0xff,0xff] + vcomxsd xmm2, qword ptr [2*ebp - 256] + +// CHECK: vcomxsd xmm2, qword ptr [ecx + 1016] +// CHECK: encoding: [0x62,0xf1,0xfe,0x08,0x2f,0x51,0x7f] + vcomxsd xmm2, qword ptr [ecx + 1016] + +// CHECK: vcomxsd xmm2, qword ptr [edx - 1024] +// CHECK: encoding: [0x62,0xf1,0xfe,0x08,0x2f,0x52,0x80] + vcomxsd xmm2, qword ptr [edx - 1024] + +// CHECK: vcomxsh xmm2, xmm3 +// CHECK: encoding: [0x62,0xf5,0x7f,0x08,0x2f,0xd3] + vcomxsh xmm2, xmm3 + +// CHECK: vcomxsh xmm2, xmm3, {sae} +// CHECK: encoding: [0x62,0xf5,0x7f,0x18,0x2f,0xd3] + vcomxsh xmm2, xmm3, {sae} + +// CHECK: vcomxsh xmm2, word ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x7f,0x08,0x2f,0x94,0xf4,0x00,0x00,0x00,0x10] + vcomxsh xmm2, word ptr [esp + 8*esi + 268435456] + +// CHECK: vcomxsh xmm2, word ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf5,0x7f,0x08,0x2f,0x94,0x87,0x23,0x01,0x00,0x00] + vcomxsh xmm2, word ptr [edi + 4*eax + 291] + +// CHECK: vcomxsh xmm2, word ptr [eax] +// CHECK: encoding: [0x62,0xf5,0x7f,0x08,0x2f,0x10] + 
vcomxsh xmm2, word ptr [eax] + +// CHECK: vcomxsh xmm2, word ptr [2*ebp - 64] +// CHECK: encoding: [0x62,0xf5,0x7f,0x08,0x2f,0x14,0x6d,0xc0,0xff,0xff,0xff] + vcomxsh xmm2, word ptr [2*ebp - 64] + +// CHECK: vcomxsh xmm2, word ptr [ecx + 254] +// CHECK: encoding: [0x62,0xf5,0x7f,0x08,0x2f,0x51,0x7f] + vcomxsh xmm2, word ptr [ecx + 254] + +// CHECK: vcomxsh xmm2, word ptr [edx - 256] +// CHECK: encoding: [0x62,0xf5,0x7f,0x08,0x2f,0x52,0x80] + vcomxsh xmm2, word ptr [edx - 256] + +// CHECK: vcomxss xmm2, xmm3 +// CHECK: encoding: [0x62,0xf1,0x7f,0x08,0x2f,0xd3] + vcomxss xmm2, xmm3 + +// CHECK: vcomxss xmm2, xmm3, {sae} +// CHECK: encoding: [0x62,0xf1,0x7f,0x18,0x2f,0xd3] + vcomxss xmm2, xmm3, {sae} + +// CHECK: vcomxss xmm2, dword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf1,0x7f,0x08,0x2f,0x94,0xf4,0x00,0x00,0x00,0x10] + vcomxss xmm2, dword ptr [esp + 8*esi + 268435456] + +// CHECK: vcomxss xmm2, dword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf1,0x7f,0x08,0x2f,0x94,0x87,0x23,0x01,0x00,0x00] + vcomxss xmm2, dword ptr [edi + 4*eax + 291] + +// CHECK: vcomxss xmm2, dword ptr [eax] +// CHECK: encoding: [0x62,0xf1,0x7f,0x08,0x2f,0x10] + vcomxss xmm2, dword ptr [eax] + +// CHECK: vcomxss xmm2, dword ptr [2*ebp - 128] +// CHECK: encoding: [0x62,0xf1,0x7f,0x08,0x2f,0x14,0x6d,0x80,0xff,0xff,0xff] + vcomxss xmm2, dword ptr [2*ebp - 128] + +// CHECK: vcomxss xmm2, dword ptr [ecx + 508] +// CHECK: encoding: [0x62,0xf1,0x7f,0x08,0x2f,0x51,0x7f] + vcomxss xmm2, dword ptr [ecx + 508] + +// CHECK: vcomxss xmm2, dword ptr [edx - 512] +// CHECK: encoding: [0x62,0xf1,0x7f,0x08,0x2f,0x52,0x80] + vcomxss xmm2, dword ptr [edx - 512] + +// CHECK: vucomxsd xmm2, xmm3 +// CHECK: encoding: [0x62,0xf1,0xfe,0x08,0x2e,0xd3] + vucomxsd xmm2, xmm3 + +// CHECK: vucomxsd xmm2, xmm3, {sae} +// CHECK: encoding: [0x62,0xf1,0xfe,0x18,0x2e,0xd3] + vucomxsd xmm2, xmm3, {sae} + +// CHECK: vucomxsd xmm2, qword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf1,0xfe,0x08,0x2e,0x94,0xf4,0x00,0x00,0x00,0x10] + vucomxsd xmm2, qword ptr [esp + 8*esi + 268435456] + +// CHECK: vucomxsd xmm2, qword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf1,0xfe,0x08,0x2e,0x94,0x87,0x23,0x01,0x00,0x00] + vucomxsd xmm2, qword ptr [edi + 4*eax + 291] + +// CHECK: vucomxsd xmm2, qword ptr [eax] +// CHECK: encoding: [0x62,0xf1,0xfe,0x08,0x2e,0x10] + vucomxsd xmm2, qword ptr [eax] + +// CHECK: vucomxsd xmm2, qword ptr [2*ebp - 256] +// CHECK: encoding: [0x62,0xf1,0xfe,0x08,0x2e,0x14,0x6d,0x00,0xff,0xff,0xff] + vucomxsd xmm2, qword ptr [2*ebp - 256] + +// CHECK: vucomxsd xmm2, qword ptr [ecx + 1016] +// CHECK: encoding: [0x62,0xf1,0xfe,0x08,0x2e,0x51,0x7f] + vucomxsd xmm2, qword ptr [ecx + 1016] + +// CHECK: vucomxsd xmm2, qword ptr [edx - 1024] +// CHECK: encoding: [0x62,0xf1,0xfe,0x08,0x2e,0x52,0x80] + vucomxsd xmm2, qword ptr [edx - 1024] + +// CHECK: vucomxsh xmm2, xmm3 +// CHECK: encoding: [0x62,0xf5,0x7f,0x08,0x2e,0xd3] + vucomxsh xmm2, xmm3 + +// CHECK: vucomxsh xmm2, xmm3, {sae} +// CHECK: encoding: [0x62,0xf5,0x7f,0x18,0x2e,0xd3] + vucomxsh xmm2, xmm3, {sae} + +// CHECK: vucomxsh xmm2, word ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x7f,0x08,0x2e,0x94,0xf4,0x00,0x00,0x00,0x10] + vucomxsh xmm2, word ptr [esp + 8*esi + 268435456] + +// CHECK: vucomxsh xmm2, word ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf5,0x7f,0x08,0x2e,0x94,0x87,0x23,0x01,0x00,0x00] + vucomxsh xmm2, word ptr [edi + 4*eax + 291] + +// CHECK: vucomxsh xmm2, word ptr [eax] +// CHECK: encoding: 
[0x62,0xf5,0x7f,0x08,0x2e,0x10] + vucomxsh xmm2, word ptr [eax] + +// CHECK: vucomxsh xmm2, word ptr [2*ebp - 64] +// CHECK: encoding: [0x62,0xf5,0x7f,0x08,0x2e,0x14,0x6d,0xc0,0xff,0xff,0xff] + vucomxsh xmm2, word ptr [2*ebp - 64] + +// CHECK: vucomxsh xmm2, word ptr [ecx + 254] +// CHECK: encoding: [0x62,0xf5,0x7f,0x08,0x2e,0x51,0x7f] + vucomxsh xmm2, word ptr [ecx + 254] + +// CHECK: vucomxsh xmm2, word ptr [edx - 256] +// CHECK: encoding: [0x62,0xf5,0x7f,0x08,0x2e,0x52,0x80] + vucomxsh xmm2, word ptr [edx - 256] + +// CHECK: vucomxss xmm2, xmm3 +// CHECK: encoding: [0x62,0xf1,0x7f,0x08,0x2e,0xd3] + vucomxss xmm2, xmm3 + +// CHECK: vucomxss xmm2, xmm3, {sae} +// CHECK: encoding: [0x62,0xf1,0x7f,0x18,0x2e,0xd3] + vucomxss xmm2, xmm3, {sae} + +// CHECK: vucomxss xmm2, dword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf1,0x7f,0x08,0x2e,0x94,0xf4,0x00,0x00,0x00,0x10] + vucomxss xmm2, dword ptr [esp + 8*esi + 268435456] + +// CHECK: vucomxss xmm2, dword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf1,0x7f,0x08,0x2e,0x94,0x87,0x23,0x01,0x00,0x00] + vucomxss xmm2, dword ptr [edi + 4*eax + 291] + +// CHECK: vucomxss xmm2, dword ptr [eax] +// CHECK: encoding: [0x62,0xf1,0x7f,0x08,0x2e,0x10] + vucomxss xmm2, dword ptr [eax] + +// CHECK: vucomxss xmm2, dword ptr [2*ebp - 128] +// CHECK: encoding: [0x62,0xf1,0x7f,0x08,0x2e,0x14,0x6d,0x80,0xff,0xff,0xff] + vucomxss xmm2, dword ptr [2*ebp - 128] + +// CHECK: vucomxss xmm2, dword ptr [ecx + 508] +// CHECK: encoding: [0x62,0xf1,0x7f,0x08,0x2e,0x51,0x7f] + vucomxss xmm2, dword ptr [ecx + 508] + +// CHECK: vucomxss xmm2, dword ptr [edx - 512] +// CHECK: encoding: [0x62,0xf1,0x7f,0x08,0x2e,0x52,0x80] + vucomxss xmm2, dword ptr [edx - 512] + diff --git a/llvm/test/MC/X86/avx10.2-com-ef-64-att.s b/llvm/test/MC/X86/avx10.2-com-ef-64-att.s new file mode 100644 index 00000000000000..2f3690537334ad --- /dev/null +++ b/llvm/test/MC/X86/avx10.2-com-ef-64-att.s @@ -0,0 +1,194 @@ +// RUN: llvm-mc -triple x86_64 --show-encoding %s | FileCheck %s + +// CHECK: vcomxsd %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa1,0xfe,0x08,0x2f,0xf7] + vcomxsd %xmm23, %xmm22 + +// CHECK: vcomxsd {sae}, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa1,0xfe,0x18,0x2f,0xf7] + vcomxsd {sae}, %xmm23, %xmm22 + +// CHECK: vcomxsd 268435456(%rbp,%r14,8), %xmm22 +// CHECK: encoding: [0x62,0xa1,0xfe,0x08,0x2f,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcomxsd 268435456(%rbp,%r14,8), %xmm22 + +// CHECK: vcomxsd 291(%r8,%rax,4), %xmm22 +// CHECK: encoding: [0x62,0xc1,0xfe,0x08,0x2f,0xb4,0x80,0x23,0x01,0x00,0x00] + vcomxsd 291(%r8,%rax,4), %xmm22 + +// CHECK: vcomxsd (%rip), %xmm22 +// CHECK: encoding: [0x62,0xe1,0xfe,0x08,0x2f,0x35,0x00,0x00,0x00,0x00] + vcomxsd (%rip), %xmm22 + +// CHECK: vcomxsd -256(,%rbp,2), %xmm22 +// CHECK: encoding: [0x62,0xe1,0xfe,0x08,0x2f,0x34,0x6d,0x00,0xff,0xff,0xff] + vcomxsd -256(,%rbp,2), %xmm22 + +// CHECK: vcomxsd 1016(%rcx), %xmm22 +// CHECK: encoding: [0x62,0xe1,0xfe,0x08,0x2f,0x71,0x7f] + vcomxsd 1016(%rcx), %xmm22 + +// CHECK: vcomxsd -1024(%rdx), %xmm22 +// CHECK: encoding: [0x62,0xe1,0xfe,0x08,0x2f,0x72,0x80] + vcomxsd -1024(%rdx), %xmm22 + +// CHECK: vcomxsh %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa5,0x7f,0x08,0x2f,0xf7] + vcomxsh %xmm23, %xmm22 + +// CHECK: vcomxsh {sae}, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa5,0x7f,0x18,0x2f,0xf7] + vcomxsh {sae}, %xmm23, %xmm22 + +// CHECK: vcomxsh 268435456(%rbp,%r14,8), %xmm22 +// CHECK: encoding: [0x62,0xa5,0x7f,0x08,0x2f,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcomxsh 268435456(%rbp,%r14,8), %xmm22 + 
+// CHECK: vcomxsh 291(%r8,%rax,4), %xmm22 +// CHECK: encoding: [0x62,0xc5,0x7f,0x08,0x2f,0xb4,0x80,0x23,0x01,0x00,0x00] + vcomxsh 291(%r8,%rax,4), %xmm22 + +// CHECK: vcomxsh (%rip), %xmm22 +// CHECK: encoding: [0x62,0xe5,0x7f,0x08,0x2f,0x35,0x00,0x00,0x00,0x00] + vcomxsh (%rip), %xmm22 + +// CHECK: vcomxsh -64(,%rbp,2), %xmm22 +// CHECK: encoding: [0x62,0xe5,0x7f,0x08,0x2f,0x34,0x6d,0xc0,0xff,0xff,0xff] + vcomxsh -64(,%rbp,2), %xmm22 + +// CHECK: vcomxsh 254(%rcx), %xmm22 +// CHECK: encoding: [0x62,0xe5,0x7f,0x08,0x2f,0x71,0x7f] + vcomxsh 254(%rcx), %xmm22 + +// CHECK: vcomxsh -256(%rdx), %xmm22 +// CHECK: encoding: [0x62,0xe5,0x7f,0x08,0x2f,0x72,0x80] + vcomxsh -256(%rdx), %xmm22 + +// CHECK: vcomxss %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa1,0x7f,0x08,0x2f,0xf7] + vcomxss %xmm23, %xmm22 + +// CHECK: vcomxss {sae}, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa1,0x7f,0x18,0x2f,0xf7] + vcomxss {sae}, %xmm23, %xmm22 + +// CHECK: vcomxss 268435456(%rbp,%r14,8), %xmm22 +// CHECK: encoding: [0x62,0xa1,0x7f,0x08,0x2f,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcomxss 268435456(%rbp,%r14,8), %xmm22 + +// CHECK: vcomxss 291(%r8,%rax,4), %xmm22 +// CHECK: encoding: [0x62,0xc1,0x7f,0x08,0x2f,0xb4,0x80,0x23,0x01,0x00,0x00] + vcomxss 291(%r8,%rax,4), %xmm22 + +// CHECK: vcomxss (%rip), %xmm22 +// CHECK: encoding: [0x62,0xe1,0x7f,0x08,0x2f,0x35,0x00,0x00,0x00,0x00] + vcomxss (%rip), %xmm22 + +// CHECK: vcomxss -128(,%rbp,2), %xmm22 +// CHECK: encoding: [0x62,0xe1,0x7f,0x08,0x2f,0x34,0x6d,0x80,0xff,0xff,0xff] + vcomxss -128(,%rbp,2), %xmm22 + +// CHECK: vcomxss 508(%rcx), %xmm22 +// CHECK: encoding: [0x62,0xe1,0x7f,0x08,0x2f,0x71,0x7f] + vcomxss 508(%rcx), %xmm22 + +// CHECK: vcomxss -512(%rdx), %xmm22 +// CHECK: encoding: [0x62,0xe1,0x7f,0x08,0x2f,0x72,0x80] + vcomxss -512(%rdx), %xmm22 + +// CHECK: vucomxsd %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa1,0xfe,0x08,0x2e,0xf7] + vucomxsd %xmm23, %xmm22 + +// CHECK: vucomxsd {sae}, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa1,0xfe,0x18,0x2e,0xf7] + vucomxsd {sae}, %xmm23, %xmm22 + +// CHECK: vucomxsd 268435456(%rbp,%r14,8), %xmm22 +// CHECK: encoding: [0x62,0xa1,0xfe,0x08,0x2e,0xb4,0xf5,0x00,0x00,0x00,0x10] + vucomxsd 268435456(%rbp,%r14,8), %xmm22 + +// CHECK: vucomxsd 291(%r8,%rax,4), %xmm22 +// CHECK: encoding: [0x62,0xc1,0xfe,0x08,0x2e,0xb4,0x80,0x23,0x01,0x00,0x00] + vucomxsd 291(%r8,%rax,4), %xmm22 + +// CHECK: vucomxsd (%rip), %xmm22 +// CHECK: encoding: [0x62,0xe1,0xfe,0x08,0x2e,0x35,0x00,0x00,0x00,0x00] + vucomxsd (%rip), %xmm22 + +// CHECK: vucomxsd -256(,%rbp,2), %xmm22 +// CHECK: encoding: [0x62,0xe1,0xfe,0x08,0x2e,0x34,0x6d,0x00,0xff,0xff,0xff] + vucomxsd -256(,%rbp,2), %xmm22 + +// CHECK: vucomxsd 1016(%rcx), %xmm22 +// CHECK: encoding: [0x62,0xe1,0xfe,0x08,0x2e,0x71,0x7f] + vucomxsd 1016(%rcx), %xmm22 + +// CHECK: vucomxsd -1024(%rdx), %xmm22 +// CHECK: encoding: [0x62,0xe1,0xfe,0x08,0x2e,0x72,0x80] + vucomxsd -1024(%rdx), %xmm22 + +// CHECK: vucomxsh %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa5,0x7f,0x08,0x2e,0xf7] + vucomxsh %xmm23, %xmm22 + +// CHECK: vucomxsh {sae}, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa5,0x7f,0x18,0x2e,0xf7] + vucomxsh {sae}, %xmm23, %xmm22 + +// CHECK: vucomxsh 268435456(%rbp,%r14,8), %xmm22 +// CHECK: encoding: [0x62,0xa5,0x7f,0x08,0x2e,0xb4,0xf5,0x00,0x00,0x00,0x10] + vucomxsh 268435456(%rbp,%r14,8), %xmm22 + +// CHECK: vucomxsh 291(%r8,%rax,4), %xmm22 +// CHECK: encoding: [0x62,0xc5,0x7f,0x08,0x2e,0xb4,0x80,0x23,0x01,0x00,0x00] + vucomxsh 291(%r8,%rax,4), %xmm22 + +// CHECK: vucomxsh (%rip), %xmm22 +// 
CHECK: encoding: [0x62,0xe5,0x7f,0x08,0x2e,0x35,0x00,0x00,0x00,0x00] + vucomxsh (%rip), %xmm22 + +// CHECK: vucomxsh -64(,%rbp,2), %xmm22 +// CHECK: encoding: [0x62,0xe5,0x7f,0x08,0x2e,0x34,0x6d,0xc0,0xff,0xff,0xff] + vucomxsh -64(,%rbp,2), %xmm22 + +// CHECK: vucomxsh 254(%rcx), %xmm22 +// CHECK: encoding: [0x62,0xe5,0x7f,0x08,0x2e,0x71,0x7f] + vucomxsh 254(%rcx), %xmm22 + +// CHECK: vucomxsh -256(%rdx), %xmm22 +// CHECK: encoding: [0x62,0xe5,0x7f,0x08,0x2e,0x72,0x80] + vucomxsh -256(%rdx), %xmm22 + +// CHECK: vucomxss %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa1,0x7f,0x08,0x2e,0xf7] + vucomxss %xmm23, %xmm22 + +// CHECK: vucomxss {sae}, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa1,0x7f,0x18,0x2e,0xf7] + vucomxss {sae}, %xmm23, %xmm22 + +// CHECK: vucomxss 268435456(%rbp,%r14,8), %xmm22 +// CHECK: encoding: [0x62,0xa1,0x7f,0x08,0x2e,0xb4,0xf5,0x00,0x00,0x00,0x10] + vucomxss 268435456(%rbp,%r14,8), %xmm22 + +// CHECK: vucomxss 291(%r8,%rax,4), %xmm22 +// CHECK: encoding: [0x62,0xc1,0x7f,0x08,0x2e,0xb4,0x80,0x23,0x01,0x00,0x00] + vucomxss 291(%r8,%rax,4), %xmm22 + +// CHECK: vucomxss (%rip), %xmm22 +// CHECK: encoding: [0x62,0xe1,0x7f,0x08,0x2e,0x35,0x00,0x00,0x00,0x00] + vucomxss (%rip), %xmm22 + +// CHECK: vucomxss -128(,%rbp,2), %xmm22 +// CHECK: encoding: [0x62,0xe1,0x7f,0x08,0x2e,0x34,0x6d,0x80,0xff,0xff,0xff] + vucomxss -128(,%rbp,2), %xmm22 + +// CHECK: vucomxss 508(%rcx), %xmm22 +// CHECK: encoding: [0x62,0xe1,0x7f,0x08,0x2e,0x71,0x7f] + vucomxss 508(%rcx), %xmm22 + +// CHECK: vucomxss -512(%rdx), %xmm22 +// CHECK: encoding: [0x62,0xe1,0x7f,0x08,0x2e,0x72,0x80] + vucomxss -512(%rdx), %xmm22 + diff --git a/llvm/test/MC/X86/avx10.2-com-ef-64-intel.s b/llvm/test/MC/X86/avx10.2-com-ef-64-intel.s new file mode 100644 index 00000000000000..41aaf99270b886 --- /dev/null +++ b/llvm/test/MC/X86/avx10.2-com-ef-64-intel.s @@ -0,0 +1,194 @@ +// RUN: llvm-mc -triple x86_64 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s + +// CHECK: vcomxsd xmm22, xmm23 +// CHECK: encoding: [0x62,0xa1,0xfe,0x08,0x2f,0xf7] + vcomxsd xmm22, xmm23 + +// CHECK: vcomxsd xmm22, xmm23, {sae} +// CHECK: encoding: [0x62,0xa1,0xfe,0x18,0x2f,0xf7] + vcomxsd xmm22, xmm23, {sae} + +// CHECK: vcomxsd xmm22, qword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa1,0xfe,0x08,0x2f,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcomxsd xmm22, qword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vcomxsd xmm22, qword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc1,0xfe,0x08,0x2f,0xb4,0x80,0x23,0x01,0x00,0x00] + vcomxsd xmm22, qword ptr [r8 + 4*rax + 291] + +// CHECK: vcomxsd xmm22, qword ptr [rip] +// CHECK: encoding: [0x62,0xe1,0xfe,0x08,0x2f,0x35,0x00,0x00,0x00,0x00] + vcomxsd xmm22, qword ptr [rip] + +// CHECK: vcomxsd xmm22, qword ptr [2*rbp - 256] +// CHECK: encoding: [0x62,0xe1,0xfe,0x08,0x2f,0x34,0x6d,0x00,0xff,0xff,0xff] + vcomxsd xmm22, qword ptr [2*rbp - 256] + +// CHECK: vcomxsd xmm22, qword ptr [rcx + 1016] +// CHECK: encoding: [0x62,0xe1,0xfe,0x08,0x2f,0x71,0x7f] + vcomxsd xmm22, qword ptr [rcx + 1016] + +// CHECK: vcomxsd xmm22, qword ptr [rdx - 1024] +// CHECK: encoding: [0x62,0xe1,0xfe,0x08,0x2f,0x72,0x80] + vcomxsd xmm22, qword ptr [rdx - 1024] + +// CHECK: vcomxsh xmm22, xmm23 +// CHECK: encoding: [0x62,0xa5,0x7f,0x08,0x2f,0xf7] + vcomxsh xmm22, xmm23 + +// CHECK: vcomxsh xmm22, xmm23, {sae} +// CHECK: encoding: [0x62,0xa5,0x7f,0x18,0x2f,0xf7] + vcomxsh xmm22, xmm23, {sae} + +// CHECK: vcomxsh xmm22, word ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: 
[0x62,0xa5,0x7f,0x08,0x2f,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcomxsh xmm22, word ptr [rbp + 8*r14 + 268435456] + +// CHECK: vcomxsh xmm22, word ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc5,0x7f,0x08,0x2f,0xb4,0x80,0x23,0x01,0x00,0x00] + vcomxsh xmm22, word ptr [r8 + 4*rax + 291] + +// CHECK: vcomxsh xmm22, word ptr [rip] +// CHECK: encoding: [0x62,0xe5,0x7f,0x08,0x2f,0x35,0x00,0x00,0x00,0x00] + vcomxsh xmm22, word ptr [rip] + +// CHECK: vcomxsh xmm22, word ptr [2*rbp - 64] +// CHECK: encoding: [0x62,0xe5,0x7f,0x08,0x2f,0x34,0x6d,0xc0,0xff,0xff,0xff] + vcomxsh xmm22, word ptr [2*rbp - 64] + +// CHECK: vcomxsh xmm22, word ptr [rcx + 254] +// CHECK: encoding: [0x62,0xe5,0x7f,0x08,0x2f,0x71,0x7f] + vcomxsh xmm22, word ptr [rcx + 254] + +// CHECK: vcomxsh xmm22, word ptr [rdx - 256] +// CHECK: encoding: [0x62,0xe5,0x7f,0x08,0x2f,0x72,0x80] + vcomxsh xmm22, word ptr [rdx - 256] + +// CHECK: vcomxss xmm22, xmm23 +// CHECK: encoding: [0x62,0xa1,0x7f,0x08,0x2f,0xf7] + vcomxss xmm22, xmm23 + +// CHECK: vcomxss xmm22, xmm23, {sae} +// CHECK: encoding: [0x62,0xa1,0x7f,0x18,0x2f,0xf7] + vcomxss xmm22, xmm23, {sae} + +// CHECK: vcomxss xmm22, dword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa1,0x7f,0x08,0x2f,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcomxss xmm22, dword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vcomxss xmm22, dword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc1,0x7f,0x08,0x2f,0xb4,0x80,0x23,0x01,0x00,0x00] + vcomxss xmm22, dword ptr [r8 + 4*rax + 291] + +// CHECK: vcomxss xmm22, dword ptr [rip] +// CHECK: encoding: [0x62,0xe1,0x7f,0x08,0x2f,0x35,0x00,0x00,0x00,0x00] + vcomxss xmm22, dword ptr [rip] + +// CHECK: vcomxss xmm22, dword ptr [2*rbp - 128] +// CHECK: encoding: [0x62,0xe1,0x7f,0x08,0x2f,0x34,0x6d,0x80,0xff,0xff,0xff] + vcomxss xmm22, dword ptr [2*rbp - 128] + +// CHECK: vcomxss xmm22, dword ptr [rcx + 508] +// CHECK: encoding: [0x62,0xe1,0x7f,0x08,0x2f,0x71,0x7f] + vcomxss xmm22, dword ptr [rcx + 508] + +// CHECK: vcomxss xmm22, dword ptr [rdx - 512] +// CHECK: encoding: [0x62,0xe1,0x7f,0x08,0x2f,0x72,0x80] + vcomxss xmm22, dword ptr [rdx - 512] + +// CHECK: vucomxsd xmm22, xmm23 +// CHECK: encoding: [0x62,0xa1,0xfe,0x08,0x2e,0xf7] + vucomxsd xmm22, xmm23 + +// CHECK: vucomxsd xmm22, xmm23, {sae} +// CHECK: encoding: [0x62,0xa1,0xfe,0x18,0x2e,0xf7] + vucomxsd xmm22, xmm23, {sae} + +// CHECK: vucomxsd xmm22, qword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa1,0xfe,0x08,0x2e,0xb4,0xf5,0x00,0x00,0x00,0x10] + vucomxsd xmm22, qword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vucomxsd xmm22, qword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc1,0xfe,0x08,0x2e,0xb4,0x80,0x23,0x01,0x00,0x00] + vucomxsd xmm22, qword ptr [r8 + 4*rax + 291] + +// CHECK: vucomxsd xmm22, qword ptr [rip] +// CHECK: encoding: [0x62,0xe1,0xfe,0x08,0x2e,0x35,0x00,0x00,0x00,0x00] + vucomxsd xmm22, qword ptr [rip] + +// CHECK: vucomxsd xmm22, qword ptr [2*rbp - 256] +// CHECK: encoding: [0x62,0xe1,0xfe,0x08,0x2e,0x34,0x6d,0x00,0xff,0xff,0xff] + vucomxsd xmm22, qword ptr [2*rbp - 256] + +// CHECK: vucomxsd xmm22, qword ptr [rcx + 1016] +// CHECK: encoding: [0x62,0xe1,0xfe,0x08,0x2e,0x71,0x7f] + vucomxsd xmm22, qword ptr [rcx + 1016] + +// CHECK: vucomxsd xmm22, qword ptr [rdx - 1024] +// CHECK: encoding: [0x62,0xe1,0xfe,0x08,0x2e,0x72,0x80] + vucomxsd xmm22, qword ptr [rdx - 1024] + +// CHECK: vucomxsh xmm22, xmm23 +// CHECK: encoding: [0x62,0xa5,0x7f,0x08,0x2e,0xf7] + vucomxsh xmm22, xmm23 + +// CHECK: vucomxsh xmm22, xmm23, {sae} +// CHECK: encoding: 
[0x62,0xa5,0x7f,0x18,0x2e,0xf7] + vucomxsh xmm22, xmm23, {sae} + +// CHECK: vucomxsh xmm22, word ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa5,0x7f,0x08,0x2e,0xb4,0xf5,0x00,0x00,0x00,0x10] + vucomxsh xmm22, word ptr [rbp + 8*r14 + 268435456] + +// CHECK: vucomxsh xmm22, word ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc5,0x7f,0x08,0x2e,0xb4,0x80,0x23,0x01,0x00,0x00] + vucomxsh xmm22, word ptr [r8 + 4*rax + 291] + +// CHECK: vucomxsh xmm22, word ptr [rip] +// CHECK: encoding: [0x62,0xe5,0x7f,0x08,0x2e,0x35,0x00,0x00,0x00,0x00] + vucomxsh xmm22, word ptr [rip] + +// CHECK: vucomxsh xmm22, word ptr [2*rbp - 64] +// CHECK: encoding: [0x62,0xe5,0x7f,0x08,0x2e,0x34,0x6d,0xc0,0xff,0xff,0xff] + vucomxsh xmm22, word ptr [2*rbp - 64] + +// CHECK: vucomxsh xmm22, word ptr [rcx + 254] +// CHECK: encoding: [0x62,0xe5,0x7f,0x08,0x2e,0x71,0x7f] + vucomxsh xmm22, word ptr [rcx + 254] + +// CHECK: vucomxsh xmm22, word ptr [rdx - 256] +// CHECK: encoding: [0x62,0xe5,0x7f,0x08,0x2e,0x72,0x80] + vucomxsh xmm22, word ptr [rdx - 256] + +// CHECK: vucomxss xmm22, xmm23 +// CHECK: encoding: [0x62,0xa1,0x7f,0x08,0x2e,0xf7] + vucomxss xmm22, xmm23 + +// CHECK: vucomxss xmm22, xmm23, {sae} +// CHECK: encoding: [0x62,0xa1,0x7f,0x18,0x2e,0xf7] + vucomxss xmm22, xmm23, {sae} + +// CHECK: vucomxss xmm22, dword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa1,0x7f,0x08,0x2e,0xb4,0xf5,0x00,0x00,0x00,0x10] + vucomxss xmm22, dword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vucomxss xmm22, dword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc1,0x7f,0x08,0x2e,0xb4,0x80,0x23,0x01,0x00,0x00] + vucomxss xmm22, dword ptr [r8 + 4*rax + 291] + +// CHECK: vucomxss xmm22, dword ptr [rip] +// CHECK: encoding: [0x62,0xe1,0x7f,0x08,0x2e,0x35,0x00,0x00,0x00,0x00] + vucomxss xmm22, dword ptr [rip] + +// CHECK: vucomxss xmm22, dword ptr [2*rbp - 128] +// CHECK: encoding: [0x62,0xe1,0x7f,0x08,0x2e,0x34,0x6d,0x80,0xff,0xff,0xff] + vucomxss xmm22, dword ptr [2*rbp - 128] + +// CHECK: vucomxss xmm22, dword ptr [rcx + 508] +// CHECK: encoding: [0x62,0xe1,0x7f,0x08,0x2e,0x71,0x7f] + vucomxss xmm22, dword ptr [rcx + 508] + +// CHECK: vucomxss xmm22, dword ptr [rdx - 512] +// CHECK: encoding: [0x62,0xe1,0x7f,0x08,0x2e,0x72,0x80] + vucomxss xmm22, dword ptr [rdx - 512] + diff --git a/llvm/test/TableGen/x86-fold-tables.inc b/llvm/test/TableGen/x86-fold-tables.inc index e85708ac1cc458..94347839d281f9 100644 --- a/llvm/test/TableGen/x86-fold-tables.inc +++ b/llvm/test/TableGen/x86-fold-tables.inc @@ -1178,6 +1178,9 @@ static const X86FoldTableEntry Table1[] = { {X86::VCOMISSrr_Int, X86::VCOMISSrm_Int, TB_NO_REVERSE}, {X86::VCOMSBF16Zrr, X86::VCOMSBF16Zrm, 0}, {X86::VCOMSBF16Zrr_Int, X86::VCOMSBF16Zrm_Int, TB_NO_REVERSE}, + {X86::VCOMXSDZrr_Int, X86::VCOMXSDZrm_Int, TB_NO_REVERSE}, + {X86::VCOMXSHZrr_Int, X86::VCOMXSHZrm_Int, TB_NO_REVERSE}, + {X86::VCOMXSSZrr_Int, X86::VCOMXSSZrm_Int, TB_NO_REVERSE}, {X86::VCVTDQ2PDYrr, X86::VCVTDQ2PDYrm, 0}, {X86::VCVTDQ2PDZ128rr, X86::VCVTDQ2PDZ128rm, TB_NO_REVERSE}, {X86::VCVTDQ2PDZ256rr, X86::VCVTDQ2PDZ256rm, 0}, @@ -1954,6 +1957,9 @@ static const X86FoldTableEntry Table1[] = { {X86::VUCOMISSZrr_Int, X86::VUCOMISSZrm_Int, TB_NO_REVERSE}, {X86::VUCOMISSrr, X86::VUCOMISSrm, 0}, {X86::VUCOMISSrr_Int, X86::VUCOMISSrm_Int, TB_NO_REVERSE}, + {X86::VUCOMXSDZrr_Int, X86::VUCOMXSDZrm_Int, TB_NO_REVERSE}, + {X86::VUCOMXSHZrr_Int, X86::VUCOMXSHZrm_Int, TB_NO_REVERSE}, + {X86::VUCOMXSSZrr_Int, X86::VUCOMXSSZrm_Int, TB_NO_REVERSE}, {X86::XOR16ri8_ND, X86::XOR16mi8_ND, 0}, 
{X86::XOR16ri8_NF_ND, X86::XOR16mi8_NF_ND, 0}, {X86::XOR16ri_ND, X86::XOR16mi_ND, 0}, From 737f56fdf7d8df4f1349085fe7256e27778e4a51 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Wed, 18 Sep 2024 18:06:42 +0800 Subject: [PATCH 033/321] [RISCV] Deduplicate zvfhmin and zvfbfmin operation actions. NFC After #108937 fp16 w/o zvfh and bf16 are now in sync and should have the same lowering. --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 111 ++++++++------------ 1 file changed, 41 insertions(+), 70 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 3b7e24414c490c..0f76ad6c5e9288 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -1063,6 +1063,45 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, } }; + // Sets common actions for f16 and bf16 for when there's only + // zvfhmin/zvfbfmin and we need to promote to f32 for most operations. + const auto SetCommonPromoteToF32Actions = [&](MVT VT) { + setOperationAction({ISD::FP_ROUND, ISD::FP_EXTEND}, VT, Custom); + setOperationAction({ISD::STRICT_FP_ROUND, ISD::STRICT_FP_EXTEND}, VT, + Custom); + setOperationAction({ISD::VP_FP_ROUND, ISD::VP_FP_EXTEND}, VT, Custom); + setOperationAction({ISD::VP_MERGE, ISD::VP_SELECT, ISD::SELECT}, VT, + Custom); + setOperationAction(ISD::SELECT_CC, VT, Expand); + setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::VP_SINT_TO_FP, + ISD::VP_UINT_TO_FP}, + VT, Custom); + setOperationAction({ISD::CONCAT_VECTORS, ISD::INSERT_SUBVECTOR, + ISD::EXTRACT_SUBVECTOR, ISD::VECTOR_INTERLEAVE, + ISD::VECTOR_DEINTERLEAVE}, + VT, Custom); + MVT EltVT = VT.getVectorElementType(); + if (isTypeLegal(EltVT)) + setOperationAction(ISD::SPLAT_VECTOR, VT, Custom); + else + setOperationAction(ISD::SPLAT_VECTOR, EltVT, Custom); + setOperationAction({ISD::LOAD, ISD::STORE}, VT, Custom); + + setOperationAction(ISD::FNEG, VT, Expand); + setOperationAction(ISD::FABS, VT, Expand); + setOperationAction(ISD::FCOPYSIGN, VT, Expand); + + // Custom split nxv32[b]f16 since nxv32[b]f32 is not legal. 
+ if (getLMUL(VT) == RISCVII::VLMUL::LMUL_8) { + setOperationAction(ZvfhminZvfbfminPromoteOps, VT, Custom); + setOperationAction(ZvfhminZvfbfminPromoteVPOps, VT, Custom); + } else { + MVT F32VecVT = MVT::getVectorVT(MVT::f32, VT.getVectorElementCount()); + setOperationPromotedToType(ZvfhminZvfbfminPromoteOps, VT, F32VecVT); + setOperationPromotedToType(ZvfhminZvfbfminPromoteVPOps, VT, F32VecVT); + } + }; + if (Subtarget.hasVInstructionsF16()) { for (MVT VT : F16VecVTs) { if (!isTypeLegal(VT)) @@ -1073,83 +1112,15 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, for (MVT VT : F16VecVTs) { if (!isTypeLegal(VT)) continue; - setOperationAction({ISD::FP_ROUND, ISD::FP_EXTEND}, VT, Custom); - setOperationAction({ISD::STRICT_FP_ROUND, ISD::STRICT_FP_EXTEND}, VT, - Custom); - setOperationAction({ISD::VP_FP_ROUND, ISD::VP_FP_EXTEND}, VT, Custom); - setOperationAction({ISD::VP_MERGE, ISD::VP_SELECT, ISD::SELECT}, VT, - Custom); - setOperationAction(ISD::SELECT_CC, VT, Expand); - setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP, - ISD::VP_SINT_TO_FP, ISD::VP_UINT_TO_FP}, - VT, Custom); - setOperationAction({ISD::CONCAT_VECTORS, ISD::INSERT_SUBVECTOR, - ISD::EXTRACT_SUBVECTOR, ISD::VECTOR_INTERLEAVE, - ISD::VECTOR_DEINTERLEAVE}, - VT, Custom); - if (Subtarget.hasStdExtZfhmin()) - setOperationAction(ISD::SPLAT_VECTOR, VT, Custom); - else - setOperationAction(ISD::SPLAT_VECTOR, MVT::f16, Custom); - // load/store - setOperationAction({ISD::LOAD, ISD::STORE}, VT, Custom); - - setOperationAction(ISD::FNEG, VT, Expand); - setOperationAction(ISD::FABS, VT, Expand); - setOperationAction(ISD::FCOPYSIGN, VT, Expand); - - // Custom split nxv32f16 since nxv32f32 is not legal. - if (VT == MVT::nxv32f16) { - setOperationAction(ZvfhminZvfbfminPromoteOps, VT, Custom); - setOperationAction(ZvfhminZvfbfminPromoteVPOps, VT, Custom); - continue; - } - // Add more promote ops. - MVT F32VecVT = MVT::getVectorVT(MVT::f32, VT.getVectorElementCount()); - setOperationPromotedToType(ZvfhminZvfbfminPromoteOps, VT, F32VecVT); - setOperationPromotedToType(ZvfhminZvfbfminPromoteVPOps, VT, F32VecVT); + SetCommonPromoteToF32Actions(VT); } } - // TODO: merge with zvfhmin if (Subtarget.hasVInstructionsBF16Minimal()) { for (MVT VT : BF16VecVTs) { if (!isTypeLegal(VT)) continue; - setOperationAction({ISD::FP_ROUND, ISD::FP_EXTEND}, VT, Custom); - setOperationAction({ISD::STRICT_FP_ROUND, ISD::STRICT_FP_EXTEND}, VT, - Custom); - setOperationAction({ISD::VP_FP_ROUND, ISD::VP_FP_EXTEND}, VT, Custom); - setOperationAction({ISD::VP_MERGE, ISD::VP_SELECT, ISD::SELECT}, VT, - Custom); - setOperationAction(ISD::SELECT_CC, VT, Expand); - setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP, - ISD::VP_SINT_TO_FP, ISD::VP_UINT_TO_FP}, - VT, Custom); - setOperationAction({ISD::CONCAT_VECTORS, ISD::INSERT_SUBVECTOR, - ISD::EXTRACT_SUBVECTOR, ISD::VECTOR_INTERLEAVE, - ISD::VECTOR_DEINTERLEAVE}, - VT, Custom); - if (Subtarget.hasStdExtZfbfmin()) - setOperationAction(ISD::SPLAT_VECTOR, VT, Custom); - else - setOperationAction(ISD::SPLAT_VECTOR, MVT::bf16, Custom); - setOperationAction({ISD::LOAD, ISD::STORE}, VT, Custom); - - setOperationAction(ISD::FNEG, VT, Expand); - setOperationAction(ISD::FABS, VT, Expand); - setOperationAction(ISD::FCOPYSIGN, VT, Expand); - - // Custom split nxv32f16 since nxv32f32 is not legal. - if (VT == MVT::nxv32bf16) { - setOperationAction(ZvfhminZvfbfminPromoteOps, VT, Custom); - setOperationAction(ZvfhminZvfbfminPromoteVPOps, VT, Custom); - continue; - } - // Add more promote ops. 
-      MVT F32VecVT = MVT::getVectorVT(MVT::f32, VT.getVectorElementCount());
-      setOperationPromotedToType(ZvfhminZvfbfminPromoteOps, VT, F32VecVT);
-      setOperationPromotedToType(ZvfhminZvfbfminPromoteVPOps, VT, F32VecVT);
+      SetCommonPromoteToF32Actions(VT);
     }
   }
 
From 2e3c7dbbcbfa37ae83251bb3da388df772680689 Mon Sep 17 00:00:00 2001
From: Balazs Benics
Date: Wed, 18 Sep 2024 12:22:02 +0200
Subject: [PATCH 034/321] [analyzer] Note last "fclose" call from
 "ensureStreamOpened" (#109112)

Patch by Arseniy Zaostrovnykh!

---
 .../StaticAnalyzer/Checkers/StreamChecker.cpp | 50 +++++++++++++++++--
 clang/test/Analysis/stream-error.c            | 22 ++++----
 clang/test/Analysis/stream-note.c             |  9 ++++
 clang/test/Analysis/stream.c                  | 10 ++--
 4 files changed, 70 insertions(+), 21 deletions(-)

diff --git a/clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp
index 8bb7880a3cc283..0a823a1126ce3f 100644
--- a/clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp
@@ -1835,6 +1835,46 @@ StreamChecker::ensureStreamNonNull(SVal StreamVal, const Expr *StreamE,
   return StateNotNull;
 }
 
+namespace {
+class StreamClosedVisitor final : public BugReporterVisitor {
+  const SymbolRef StreamSym;
+  bool Satisfied = false;
+
+public:
+  explicit StreamClosedVisitor(SymbolRef StreamSym) : StreamSym(StreamSym) {}
+
+  static void *getTag() {
+    static int Tag = 0;
+    return &Tag;
+  }
+
+  void Profile(llvm::FoldingSetNodeID &ID) const override {
+    ID.AddPointer(getTag());
+    ID.AddPointer(StreamSym);
+  }
+
+  PathDiagnosticPieceRef VisitNode(const ExplodedNode *N,
+                                   BugReporterContext &BRC,
+                                   PathSensitiveBugReport &BR) override {
+    if (Satisfied)
+      return nullptr;
+    const StreamState *PredSS =
+        N->getFirstPred()->getState()->get<StreamMap>(StreamSym);
+    if (PredSS && PredSS->isClosed())
+      return nullptr;
+
+    const Stmt *S = N->getStmtForDiagnostics();
+    if (!S)
+      return nullptr;
+    Satisfied = true;
+    PathDiagnosticLocation Pos(S, BRC.getSourceManager(),
+                               N->getLocationContext());
+    llvm::StringLiteral Msg = "Stream is closed here";
+    return std::make_shared<PathDiagnosticEventPiece>(Pos, Msg);
+  }
+};
+} // namespace
+
 ProgramStateRef StreamChecker::ensureStreamOpened(SVal StreamVal,
                                                   CheckerContext &C,
                                                   ProgramStateRef State) const {
@@ -1849,11 +1889,11 @@ ProgramStateRef StreamChecker::ensureStreamOpened(SVal StreamVal,
   if (SS->isClosed()) {
     // Using a stream pointer after 'fclose' causes undefined behavior
     // according to cppreference.com .
-    ExplodedNode *N = C.generateErrorNode();
-    if (N) {
-      C.emitReport(std::make_unique<PathSensitiveBugReport>(
-          BT_UseAfterClose,
-          "Stream might be already closed. Causes undefined behaviour.", N));
+    if (ExplodedNode *N = C.generateErrorNode()) {
+      auto R = std::make_unique<PathSensitiveBugReport>(
+          BT_UseAfterClose, "Use of a stream that might be already closed", N);
+      R->addVisitor<StreamClosedVisitor>(Sym);
+      C.emitReport(std::move(R));
 
       return nullptr;
     }
diff --git a/clang/test/Analysis/stream-error.c b/clang/test/Analysis/stream-error.c
index 3f791d13346419..9de56c082e8258 100644
--- a/clang/test/Analysis/stream-error.c
+++ b/clang/test/Analysis/stream-error.c
@@ -96,7 +96,7 @@ void error_fread(void) {
     }
   }
   fclose(F);
-  Ret = fread(Buf, 1, 10, F); // expected-warning {{Stream might be already closed}}
+  Ret = fread(Buf, 1, 10, F); // expected-warning {{Use of a stream that might be already closed}}
 }
 
 void error_fwrite(void) {
@@ -113,7 +113,7 @@ void error_fwrite(void) {
     fwrite(0, 1, 10, F); // expected-warning {{might be 'indeterminate'}}
   }
   fclose(F);
-  Ret = fwrite(0, 1, 10, F); // expected-warning {{Stream might be already closed}}
+  Ret = fwrite(0, 1, 10, F); // expected-warning {{Use of a stream that might be already closed}}
 }
 
 void error_fgetc(void) {
@@ -135,7 +135,7 @@ void error_fgetc(void) {
     }
   }
   fclose(F);
-  fgetc(F); // expected-warning {{Stream might be already closed}}
+  fgetc(F); // expected-warning {{Use of a stream that might be already closed}}
 }
 
 void error_fgets(void) {
@@ -158,7 +158,7 @@ void error_fgets(void) {
     }
   }
   fclose(F);
-  fgets(Buf, sizeof(Buf), F); // expected-warning {{Stream might be already closed}}
+  fgets(Buf, sizeof(Buf), F); // expected-warning {{Use of a stream that might be already closed}}
 }
 
 void error_fputc(int fd) {
@@ -176,7 +176,7 @@ void error_fputc(int fd) {
     fputc('Y', F); // no-warning
   }
   fclose(F);
-  fputc('A', F); // expected-warning {{Stream might be already closed}}
+  fputc('A', F); // expected-warning {{Use of a stream that might be already closed}}
 }
 
 void error_fputs(void) {
@@ -194,7 +194,7 @@ void error_fputs(void) {
     fputs("QWD", F); // expected-warning {{might be 'indeterminate'}}
   }
   fclose(F);
-  fputs("ABC", F); // expected-warning {{Stream might be already closed}}
+  fputs("ABC", F); // expected-warning {{Use of a stream that might be already closed}}
 }
 
 void error_fprintf(void) {
@@ -211,7 +211,7 @@ void error_fprintf(void) {
     fprintf(F, "bbb"); // expected-warning {{might be 'indeterminate'}}
   }
   fclose(F);
-  fprintf(F, "ccc"); // expected-warning {{Stream might be already closed}}
+  fprintf(F, "ccc"); // expected-warning {{Use of a stream that might be already closed}}
 }
 
 void error_fscanf(int *A) {
@@ -236,7 +236,7 @@ void error_fscanf(int *A) {
     }
   }
   fclose(F);
-  fscanf(F, "ccc"); // expected-warning {{Stream might be already closed}}
+  fscanf(F, "ccc"); // expected-warning {{Use of a stream that might be already closed}}
 }
 
 void error_ungetc(int TestIndeterminate) {
@@ -256,7 +256,7 @@ void error_ungetc(int TestIndeterminate) {
     ungetc('X', F); // expected-warning {{might be 'indeterminate'}}
   }
   fclose(F);
-  ungetc('A', F); // expected-warning {{Stream might be already closed}}
+  ungetc('A', F); // expected-warning {{Use of a stream that might be already closed}}
 }
 
 void error_getdelim(char *P, size_t Sz) {
@@ -278,7 +278,7 @@ void error_getdelim(char *P, size_t Sz) {
     }
   }
   fclose(F);
-  getdelim(&P, &Sz, '\n', F); // expected-warning {{Stream might be already closed}}
+  getdelim(&P, &Sz, '\n', F); // expected-warning {{Use of a stream that might be already closed}}
 }
 
 void error_getline(char *P, size_t Sz) {
@@ -300,7 +300,7 @@ void error_getline(char *P, size_t Sz) {
     }
  }
   fclose(F);
-  getline(&P, &Sz, F); // expected-warning {{Stream might be
already closed}} + getline(&P, &Sz, F); // expected-warning {{Use of a stream that might be already closed}} } void write_after_eof_is_allowed(void) { diff --git a/clang/test/Analysis/stream-note.c b/clang/test/Analysis/stream-note.c index 3aef707d50056e..2b5d1edb2814f0 100644 --- a/clang/test/Analysis/stream-note.c +++ b/clang/test/Analysis/stream-note.c @@ -264,3 +264,12 @@ void error_fseek_read_eof(void) { fgetc(F); // no warning fclose(F); } + +void check_note_at_use_after_close(void) { + FILE *F = tmpfile(); + if (!F) // expected-note {{'F' is non-null}} expected-note {{Taking false branch}} + return; + fclose(F); // expected-note {{Stream is closed here}} + rewind(F); // expected-warning {{Use of a stream that might be already closed}} + // expected-note@-1 {{Use of a stream that might be already closed}} +} diff --git a/clang/test/Analysis/stream.c b/clang/test/Analysis/stream.c index b9a5b1ba8cd494..758b40cca49319 100644 --- a/clang/test/Analysis/stream.c +++ b/clang/test/Analysis/stream.c @@ -185,7 +185,7 @@ void f_double_close(void) { if (!p) return; fclose(p); - fclose(p); // expected-warning {{Stream might be already closed}} + fclose(p); // expected-warning {{Use of a stream that might be already closed}} } void f_double_close_alias(void) { @@ -194,7 +194,7 @@ void f_double_close_alias(void) { return; FILE *p2 = p1; fclose(p1); - fclose(p2); // expected-warning {{Stream might be already closed}} + fclose(p2); // expected-warning {{Use of a stream that might be already closed}} } void f_use_after_close(void) { @@ -202,7 +202,7 @@ void f_use_after_close(void) { if (!p) return; fclose(p); - clearerr(p); // expected-warning {{Stream might be already closed}} + clearerr(p); // expected-warning {{Use of a stream that might be already closed}} } void f_open_after_close(void) { @@ -266,7 +266,7 @@ void check_freopen_2(void) { if (f2) { // Check if f1 and f2 point to the same stream. fclose(f1); - fclose(f2); // expected-warning {{Stream might be already closed.}} + fclose(f2); // expected-warning {{Use of a stream that might be already closed}} } else { // Reopen failed. // f1 is non-NULL but points to a possibly invalid stream. @@ -370,7 +370,7 @@ void fflush_after_fclose(void) { if ((Ret = fflush(F)) != 0) clang_analyzer_eval(Ret == EOF); // expected-warning {{TRUE}} fclose(F); - fflush(F); // expected-warning {{Stream might be already closed}} + fflush(F); // expected-warning {{Use of a stream that might be already closed}} } void fflush_on_open_failed_stream(void) { From adf02ae41fe0e345b00a428f4b9f438b96ead11d Mon Sep 17 00:00:00 2001 From: Piotr Sobczak Date: Wed, 18 Sep 2024 12:58:16 +0200 Subject: [PATCH 035/321] [AMDGPU] Simplify lowerBUILD_VECTOR (#109094) Simplify `lowerBUILD_VECTOR` by commoning up the way the vectors are split. Also reorder the checks to avoid a long condition inside `if`. 
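As a rough illustration of the common scheme (this sketch is not part of the
patch, and all names in it are invented): each pair of 16-bit elements is
packed into one 32-bit lane, modeled below on plain integers.

  // Minimal model of the 2-element chunking in lowerBUILD_VECTOR: an
  // <N x 16-bit> build_vector becomes N/2 packed 32-bit lanes, which are
  // then blended back into a single value and bitcast to the result type.
  #include <cstdint>
  #include <vector>
  std::vector<uint32_t> packPairs(const std::vector<uint16_t> &Elts) {
    std::vector<uint32_t> Lanes;
    for (size_t P = 0; P < Elts.size() / 2; ++P) // one lane per element pair
      Lanes.push_back(uint32_t(Elts[2 * P]) |
                      (uint32_t(Elts[2 * P + 1]) << 16));
    return Lanes;
  }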
--- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 111 +++++------------- .../AMDGPU/insert_vector_elt.v2bf16.ll | 10 +- 2 files changed, 32 insertions(+), 89 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 4a861f0c03a0c5..10108866a7005a 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -7443,98 +7443,49 @@ SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op, SDLoc SL(Op); EVT VT = Op.getValueType(); - if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 || - VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) { - EVT HalfVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), - VT.getVectorNumElements() / 2); - MVT HalfIntVT = MVT::getIntegerVT(HalfVT.getSizeInBits()); + if (VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16) { + assert(!Subtarget->hasVOP3PInsts() && "this should be legal"); - // Turn into pair of packed build_vectors. - // TODO: Special case for constants that can be materialized with s_mov_b64. - SmallVector LoOps, HiOps; - for (unsigned I = 0, E = VT.getVectorNumElements() / 2; I != E; ++I) { - LoOps.push_back(Op.getOperand(I)); - HiOps.push_back(Op.getOperand(I + E)); - } - SDValue Lo = DAG.getBuildVector(HalfVT, SL, LoOps); - SDValue Hi = DAG.getBuildVector(HalfVT, SL, HiOps); - - SDValue CastLo = DAG.getNode(ISD::BITCAST, SL, HalfIntVT, Lo); - SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, HalfIntVT, Hi); - - SDValue Blend = DAG.getBuildVector(MVT::getVectorVT(HalfIntVT, 2), SL, - { CastLo, CastHi }); - return DAG.getNode(ISD::BITCAST, SL, VT, Blend); - } + SDValue Lo = Op.getOperand(0); + SDValue Hi = Op.getOperand(1); - if (VT == MVT::v16i16 || VT == MVT::v16f16 || VT == MVT::v16bf16) { - EVT QuarterVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), - VT.getVectorNumElements() / 4); - MVT QuarterIntVT = MVT::getIntegerVT(QuarterVT.getSizeInBits()); - - SmallVector Parts[4]; - for (unsigned I = 0, E = VT.getVectorNumElements() / 4; I != E; ++I) { - for (unsigned P = 0; P < 4; ++P) - Parts[P].push_back(Op.getOperand(I + P * E)); - } - SDValue Casts[4]; - for (unsigned P = 0; P < 4; ++P) { - SDValue Vec = DAG.getBuildVector(QuarterVT, SL, Parts[P]); - Casts[P] = DAG.getNode(ISD::BITCAST, SL, QuarterIntVT, Vec); + // Avoid adding defined bits with the zero_extend. 
+ if (Hi.isUndef()) { + Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo); + SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo); + return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo); } - SDValue Blend = - DAG.getBuildVector(MVT::getVectorVT(QuarterIntVT, 4), SL, Casts); - return DAG.getNode(ISD::BITCAST, SL, VT, Blend); - } + Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi); + Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi); - if (VT == MVT::v32i16 || VT == MVT::v32f16 || VT == MVT::v32bf16) { - EVT QuarterVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), - VT.getVectorNumElements() / 8); - MVT QuarterIntVT = MVT::getIntegerVT(QuarterVT.getSizeInBits()); + SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi, + DAG.getConstant(16, SL, MVT::i32)); + if (Lo.isUndef()) + return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi); - SmallVector Parts[8]; - for (unsigned I = 0, E = VT.getVectorNumElements() / 8; I != E; ++I) { - for (unsigned P = 0; P < 8; ++P) - Parts[P].push_back(Op.getOperand(I + P * E)); - } - SDValue Casts[8]; - for (unsigned P = 0; P < 8; ++P) { - SDValue Vec = DAG.getBuildVector(QuarterVT, SL, Parts[P]); - Casts[P] = DAG.getNode(ISD::BITCAST, SL, QuarterIntVT, Vec); - } + Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo); + Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo); - SDValue Blend = - DAG.getBuildVector(MVT::getVectorVT(QuarterIntVT, 8), SL, Casts); - return DAG.getNode(ISD::BITCAST, SL, VT, Blend); + SDValue Or = DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi); + return DAG.getNode(ISD::BITCAST, SL, VT, Or); } - assert(VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16); - assert(!Subtarget->hasVOP3PInsts() && "this should be legal"); + // Split into 2-element chunks. + const unsigned NumParts = VT.getVectorNumElements() / 2; + EVT PartVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2); + MVT PartIntVT = MVT::getIntegerVT(PartVT.getSizeInBits()); - SDValue Lo = Op.getOperand(0); - SDValue Hi = Op.getOperand(1); - - // Avoid adding defined bits with the zero_extend. 
-  if (Hi.isUndef()) {
-    Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
-    SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
-    return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
+  SmallVector<SDValue> Casts;
+  for (unsigned P = 0; P < NumParts; ++P) {
+    SDValue Vec = DAG.getBuildVector(
+        PartVT, SL, {Op.getOperand(P * 2), Op.getOperand(P * 2 + 1)});
+    Casts.push_back(DAG.getNode(ISD::BITCAST, SL, PartIntVT, Vec));
   }
 
-  Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
-  Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);
-
-  SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
-                              DAG.getConstant(16, SL, MVT::i32));
-  if (Lo.isUndef())
-    return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);
-
-  Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
-  Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
-
-  SDValue Or = DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi);
-  return DAG.getNode(ISD::BITCAST, SL, VT, Or);
+  SDValue Blend =
+      DAG.getBuildVector(MVT::getVectorVT(PartIntVT, NumParts), SL, Casts);
+  return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
 }
 
 bool
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll
index 3135addec16183..c68138acc9b2bf 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll
@@ -965,11 +965,7 @@ define amdgpu_kernel void @v_insertelement_v8bf16_3(ptr addrspace(1) %out, ptr a
 ; GFX900-NEXT:    v_mov_b32_e32 v5, 0x5040100
 ; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-NEXT:    global_load_dwordx4 v[0:3], v4, s[2:3]
-; GFX900-NEXT:    s_mov_b32 s2, 0xffff
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
-; GFX900-NEXT:    v_bfi_b32 v3, s2, v3, v3
-; GFX900-NEXT:    v_bfi_b32 v2, s2, v2, v2
-; GFX900-NEXT:    v_bfi_b32 v0, s2, v0, v0
 ; GFX900-NEXT:    v_perm_b32 v1, s4, v1, v5
 ; GFX900-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX900-NEXT:    s_endpgm
@@ -980,14 +976,10 @@ define amdgpu_kernel void @v_insertelement_v8bf16_3(ptr addrspace(1) %out, ptr a
 ; GFX940-NEXT:    s_load_dword s0, s[2:3], 0x10
 ; GFX940-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
-; GFX940-NEXT:    s_mov_b32 s1, 0xffff
+; GFX940-NEXT:    v_mov_b32_e32 v5, 0x5040100
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dwordx4 v[0:3], v4, s[6:7]
-; GFX940-NEXT:    v_mov_b32_e32 v5, 0x5040100
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    v_bfi_b32 v3, s1, v3, v3
-; GFX940-NEXT:    v_bfi_b32 v2, s1, v2, v2
-; GFX940-NEXT:    v_bfi_b32 v0, s1, v0, v0
 ; GFX940-NEXT:    v_perm_b32 v1, s0, v1, v5
 ; GFX940-NEXT:    global_store_dwordx4 v4, v[0:3], s[4:5] sc0 sc1
 ; GFX940-NEXT:    s_endpgm

From 4b529f840c7a28245f4462d9fde34f1686e96351 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Wed, 18 Sep 2024 10:53:08 +0100
Subject: [PATCH 036/321] [X86] Fold
 extractsubvector(permv3(src0,mask,src1),c) ->
 extractsubvector(permv3(src0,widensubvector(extractsubvector(mask,c)),src1),0)
 iff c != 0

For cross-lane shuffles, extract the mask operand (upper) subvector directly,
and make use of the free implicit extraction of the lowest subvector of the
result.
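A standalone model of why the fold is sound (illustrative only; plain arrays
stand in for vectors, and every name below is invented):

  #include <array>
  #include <cassert>
  int main() {
    std::array<int, 8> Src{10, 11, 12, 13, 14, 15, 16, 17}; // src0:src1 pool
    std::array<int, 8> Mask{7, 6, 5, 4, 3, 2, 1, 0};        // cross-lane map
    const int SubLen = 4, Idx = 4; // extract the upper subvector (c != 0)
    std::array<int, 8> WideMask{}; // upper mask chunk widened into lane 0
    for (int i = 0; i < SubLen; ++i)
      WideMask[i] = Mask[Idx + i];
    // The lowest subvector of the narrowed permute equals the upper
    // subvector of the original permute, so its extraction is free.
    for (int i = 0; i < SubLen; ++i)
      assert(Src[WideMask[i]] == Src[Mask[Idx + i]]);
    return 0;
  }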
--- llvm/lib/Target/X86/X86ISelLowering.cpp | 13 ++++++ .../vector-interleaved-load-i32-stride-7.ll | 40 ++++++------------- .../vector-interleaved-load-i32-stride-8.ll | 16 ++------ 3 files changed, 29 insertions(+), 40 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 9bc5f2c9399574..182f6c08366a99 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -57791,6 +57791,19 @@ static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG, DAG.getTargetConstant(M, DL, MVT::i8)); } break; + case X86ISD::VPERMV3: + if (IdxVal != 0) { + SDValue Src0 = InVec.getOperand(0); + SDValue Mask = InVec.getOperand(1); + SDValue Src1 = InVec.getOperand(2); + Mask = extractSubVector(Mask, IdxVal, DAG, DL, SizeInBits); + Mask = widenSubVector(Mask, /*ZeroNewElements=*/false, Subtarget, DAG, + DL, InSizeInBits); + SDValue Shuffle = + DAG.getNode(InOpcode, DL, InVecVT, Src0, Mask, Src1); + return extractSubVector(Shuffle, 0, DAG, DL, SizeInBits); + } + break; } } } diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll index ed316990e48666..f616eafc24272e 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll @@ -240,21 +240,17 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm6 ; AVX512-FCP-NEXT: vpermi2d %ymm1, %ymm6, %ymm0 -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [13,4,6,7,13,4,6,7] -; AVX512-FCP-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [13,4,6,7] ; AVX512-FCP-NEXT: vpermi2d %ymm6, %ymm1, %ymm7 -; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7 -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [6,13,6,7,6,13,6,7] -; AVX512-FCP-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,13,6,7] ; AVX512-FCP-NEXT: vpermi2d %ymm1, %ymm6, %ymm8 -; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm1 ; AVX512-FCP-NEXT: vmovq %xmm2, (%rsi) ; AVX512-FCP-NEXT: vmovq %xmm3, (%rdx) ; AVX512-FCP-NEXT: vmovq %xmm4, (%rcx) ; AVX512-FCP-NEXT: vmovq %xmm5, (%r8) ; AVX512-FCP-NEXT: vmovq %xmm0, (%r9) ; AVX512-FCP-NEXT: vmovq %xmm7, (%r10) -; AVX512-FCP-NEXT: vmovq %xmm1, (%rax) +; AVX512-FCP-NEXT: vmovq %xmm8, (%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -309,21 +305,17 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm6 ; AVX512DQ-FCP-NEXT: vpermi2d %ymm1, %ymm6, %ymm0 -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [13,4,6,7,13,4,6,7] -; AVX512DQ-FCP-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [13,4,6,7] ; AVX512DQ-FCP-NEXT: vpermi2d %ymm6, %ymm1, %ymm7 -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7 -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [6,13,6,7,6,13,6,7] -; AVX512DQ-FCP-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,13,6,7] ; AVX512DQ-FCP-NEXT: vpermi2d %ymm1, %ymm6, %ymm8 -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm1 ; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rsi) ; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rdx) ; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%rcx) ; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%r8) ; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%r9) ; 
AVX512DQ-FCP-NEXT: vmovq %xmm7, (%r10) -; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%rax) +; AVX512DQ-FCP-NEXT: vmovq %xmm8, (%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; @@ -378,21 +370,17 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm6 ; AVX512BW-FCP-NEXT: vpermi2d %ymm1, %ymm6, %ymm0 -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [13,4,6,7,13,4,6,7] -; AVX512BW-FCP-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [13,4,6,7] ; AVX512BW-FCP-NEXT: vpermi2d %ymm6, %ymm1, %ymm7 -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7 -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [6,13,6,7,6,13,6,7] -; AVX512BW-FCP-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,13,6,7] ; AVX512BW-FCP-NEXT: vpermi2d %ymm1, %ymm6, %ymm8 -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm8, %xmm1 ; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rsi) ; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rdx) ; AVX512BW-FCP-NEXT: vmovq %xmm4, (%rcx) ; AVX512BW-FCP-NEXT: vmovq %xmm5, (%r8) ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r9) ; AVX512BW-FCP-NEXT: vmovq %xmm7, (%r10) -; AVX512BW-FCP-NEXT: vmovq %xmm1, (%rax) +; AVX512BW-FCP-NEXT: vmovq %xmm8, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; @@ -447,21 +435,17 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm6 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm1, %ymm6, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [13,4,6,7,13,4,6,7] -; AVX512DQ-BW-FCP-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [13,4,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm6, %ymm1, %ymm7 -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [6,13,6,7,6,13,6,7] -; AVX512DQ-BW-FCP-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,13,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm1, %ymm6, %ymm8 -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm8, %xmm1 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rdx) ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%rcx) ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%r8) ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r9) ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm7, (%r10) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%rax) +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm8, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <14 x i32>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll index 2fd173c729170b..872a8d00cc2343 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll @@ -226,10 +226,8 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm4 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [5,13,5,5,5,13,5,5] -; AVX512-FCP-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,13,5,5] ; AVX512-FCP-NEXT: vpermi2d %ymm1, %ymm4, %ymm6 -; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, 
%xmm6 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm4 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] @@ -293,10 +291,8 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm4 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [5,13,5,5,5,13,5,5] -; AVX512DQ-FCP-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,13,5,5] ; AVX512DQ-FCP-NEXT: vpermi2d %ymm1, %ymm4, %ymm6 -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm4 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] @@ -360,10 +356,8 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm4 ; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5] ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [5,13,5,5,5,13,5,5] -; AVX512BW-FCP-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,13,5,5] ; AVX512BW-FCP-NEXT: vpermi2d %ymm1, %ymm4, %ymm6 -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6 ; AVX512BW-FCP-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7] ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm4 ; AVX512BW-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] @@ -427,10 +421,8 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm4 ; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5] ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [5,13,5,5,5,13,5,5] -; AVX512DQ-BW-FCP-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,13,5,5] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm1, %ymm4, %ymm6 -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6 ; AVX512DQ-BW-FCP-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7] ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm4 ; AVX512DQ-BW-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] From 872932b7a9539b0f0b62805f339bef62c94fd52d Mon Sep 17 00:00:00 2001 From: Yingwei Zheng Date: Wed, 18 Sep 2024 19:10:41 +0800 Subject: [PATCH 037/321] [InstCombine] Generalize `icmp (shl nuw C2, Y), C -> icmp Y, C3` (#104696) The motivation of this patch is to fold more generalized patterns like `icmp ult (shl nuw 16, X), 64 -> icmp ult X, 2`. 
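With `nuw`, `shl nuw 16, X` is exactly `16 * 2^X`, so `(16 << X) u< 64` holds
iff `2^X u< 4`, i.e. iff `X u< 2`; the code derives this from
`udivrem(64, 16) = (4, 0)`, where the quotient is a power of two. A small
standalone check of that arithmetic (an illustrative sketch, not part of the
patch):

  #include <cassert>
  #include <cstdint>
  int main() {
    for (uint32_t X = 0; X <= 27; ++X) { // 16 << X cannot wrap for X <= 27
      bool Orig = (16u << X) < 64u;      // icmp ult (shl nuw 16, X), 64
      bool Folded = X < 2u;              // icmp ult X, 2
      assert(Orig == Folded);
    }
    return 0;
  }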
Alive2: https://alive2.llvm.org/ce/z/gyqjQH --- .../InstCombine/InstCombineCompares.cpp | 22 ++-- .../Transforms/InstCombine/icmp-shl-nuw.ll | 106 ++++++++++++++++++ 2 files changed, 120 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index 5cdfeada7f0aa2..80d6ceca094d88 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -2227,18 +2227,24 @@ Instruction *InstCombinerImpl::foldICmpMulConstant(ICmpInst &Cmp, return NewC ? new ICmpInst(Pred, X, NewC) : nullptr; } -/// Fold icmp (shl 1, Y), C. -static Instruction *foldICmpShlOne(ICmpInst &Cmp, Instruction *Shl, - const APInt &C) { +/// Fold icmp (shl nuw C2, Y), C. +static Instruction *foldICmpShlLHSC(ICmpInst &Cmp, Instruction *Shl, + const APInt &C) { Value *Y; - if (!match(Shl, m_Shl(m_One(), m_Value(Y)))) + const APInt *C2; + if (!match(Shl, m_NUWShl(m_APInt(C2), m_Value(Y)))) return nullptr; Type *ShiftType = Shl->getType(); unsigned TypeBits = C.getBitWidth(); - bool CIsPowerOf2 = C.isPowerOf2(); ICmpInst::Predicate Pred = Cmp.getPredicate(); if (Cmp.isUnsigned()) { + if (C2->isZero() || C2->ugt(C)) + return nullptr; + APInt Div, Rem; + APInt::udivrem(C, *C2, Div, Rem); + bool CIsPowerOf2 = Rem.isZero() && Div.isPowerOf2(); + // (1 << Y) pred C -> Y pred Log2(C) if (!CIsPowerOf2) { // (1 << Y) < 30 -> Y <= 4 @@ -2251,9 +2257,9 @@ static Instruction *foldICmpShlOne(ICmpInst &Cmp, Instruction *Shl, Pred = ICmpInst::ICMP_UGT; } - unsigned CLog2 = C.logBase2(); + unsigned CLog2 = Div.logBase2(); return new ICmpInst(Pred, Y, ConstantInt::get(ShiftType, CLog2)); - } else if (Cmp.isSigned()) { + } else if (Cmp.isSigned() && C2->isOne()) { Constant *BitWidthMinusOne = ConstantInt::get(ShiftType, TypeBits - 1); // (1 << Y) > 0 -> Y != 31 // (1 << Y) > C -> Y != 31 if C is negative. @@ -2307,7 +2313,7 @@ Instruction *InstCombinerImpl::foldICmpShlConstant(ICmpInst &Cmp, const APInt *ShiftAmt; if (!match(Shl->getOperand(1), m_APInt(ShiftAmt))) - return foldICmpShlOne(Cmp, Shl, C); + return foldICmpShlLHSC(Cmp, Shl, C); // Check that the shift amount is in range. If not, don't perform undefined // shifts. When the shift is visited, it will be simplified. 
diff --git a/llvm/test/Transforms/InstCombine/icmp-shl-nuw.ll b/llvm/test/Transforms/InstCombine/icmp-shl-nuw.ll index 57c3abc7b9841f..9f50265004f01b 100644 --- a/llvm/test/Transforms/InstCombine/icmp-shl-nuw.ll +++ b/llvm/test/Transforms/InstCombine/icmp-shl-nuw.ll @@ -90,3 +90,109 @@ define <2 x i1> @icmp_ugt_16x2(<2 x i32>) { %d = icmp ugt <2 x i32> %c, ret <2 x i1> %d } + +define i1 @fold_icmp_shl_nuw_c1(i32 %x) { +; CHECK-LABEL: @fold_icmp_shl_nuw_c1( +; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[X:%.*]], 61440 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP1]], 0 +; CHECK-NEXT: ret i1 [[CMP]] +; + %lshr = lshr i32 %x, 12 + %and = and i32 %lshr, 15 + %shl = shl nuw i32 2, %and + %cmp = icmp ult i32 %shl, 4 + ret i1 %cmp +} + +define i1 @fold_icmp_shl_nuw_c2(i32 %x) { +; CHECK-LABEL: @fold_icmp_shl_nuw_c2( +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[X:%.*]], 2 +; CHECK-NEXT: ret i1 [[CMP]] +; + %shl = shl nuw i32 16, %x + %cmp = icmp ult i32 %shl, 64 + ret i1 %cmp +} + +define i1 @fold_icmp_shl_nuw_c2_non_pow2(i32 %x) { +; CHECK-LABEL: @fold_icmp_shl_nuw_c2_non_pow2( +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[X:%.*]], 2 +; CHECK-NEXT: ret i1 [[CMP]] +; + %shl = shl nuw i32 48, %x + %cmp = icmp ult i32 %shl, 192 + ret i1 %cmp +} + +define i1 @fold_icmp_shl_nuw_c2_div_non_pow2(i32 %x) { +; CHECK-LABEL: @fold_icmp_shl_nuw_c2_div_non_pow2( +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[X:%.*]], 5 +; CHECK-NEXT: ret i1 [[CMP]] +; + %shl = shl nuw i32 2, %x + %cmp = icmp ult i32 %shl, 60 + ret i1 %cmp +} + +define i1 @fold_icmp_shl_nuw_c3(i32 %x) { +; CHECK-LABEL: @fold_icmp_shl_nuw_c3( +; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i32 [[X:%.*]], 1 +; CHECK-NEXT: ret i1 [[CMP]] +; + %shl = shl nuw i32 48, %x + %cmp = icmp uge i32 %shl, 144 + ret i1 %cmp +} + +define i1 @fold_icmp_shl_nuw_c2_indivisible(i32 %x) { +; CHECK-LABEL: @fold_icmp_shl_nuw_c2_indivisible( +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[X:%.*]], 2 +; CHECK-NEXT: ret i1 [[CMP]] +; + %shl = shl nuw i32 16, %x + %cmp = icmp ult i32 %shl, 63 + ret i1 %cmp +} + +; Negative tests + +define i1 @fold_icmp_shl_c2_without_nuw(i32 %x) { +; CHECK-LABEL: @fold_icmp_shl_c2_without_nuw( +; CHECK-NEXT: [[SHL:%.*]] = shl i32 16, [[X:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[SHL]], 64 +; CHECK-NEXT: ret i1 [[CMP]] +; + %shl = shl i32 16, %x + %cmp = icmp ult i32 %shl, 64 + ret i1 %cmp +} + +; Make sure this trivial case is folded by InstSimplify. +define i1 @fold_icmp_shl_nuw_c2_precondition1(i32 %x) { +; CHECK-LABEL: @fold_icmp_shl_nuw_c2_precondition1( +; CHECK-NEXT: ret i1 true +; + %shl = shl nuw i32 0, %x + %cmp = icmp ult i32 %shl, 63 + ret i1 %cmp +} + +; Make sure this trivial case is folded by InstSimplify. +define i1 @fold_icmp_shl_nuw_c2_precondition2(i32 %x) { +; CHECK-LABEL: @fold_icmp_shl_nuw_c2_precondition2( +; CHECK-NEXT: ret i1 false +; + %shl = shl nuw i32 127, %x + %cmp = icmp ult i32 %shl, 63 + ret i1 %cmp +} + +; Make sure we don't crash on this case. +define i1 @fold_icmp_shl_nuw_c2_precondition3(i32 %x) { +; CHECK-LABEL: @fold_icmp_shl_nuw_c2_precondition3( +; CHECK-NEXT: ret i1 false +; + %shl = shl nuw i32 1, %x + %cmp = icmp ult i32 %shl, 1 + ret i1 %cmp +} From ffcff2f465ee8a7f0e0c7676c3e5c1ab889e0ce4 Mon Sep 17 00:00:00 2001 From: Shih-Po Hung Date: Wed, 18 Sep 2024 19:22:36 +0800 Subject: [PATCH 038/321] [VPlan][NFC] Fix the value name of VECTOR_GEP (#107544) This patch passes the string `"vector.gep"` to CreateGEP instead of CreateMul. 
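A minimal sketch of the mechanic (assumes the LLVM C++ API; the helper and
its arguments here are invented for illustration): the `Twine` passed to an
`IRBuilder` `Create*` call names the instruction created by that call, so the
GEP, rather than the multiply, now carries the `vector.gep` name.

  #include "llvm/IR/IRBuilder.h"
  using namespace llvm;
  Value *emitVectorGEP(IRBuilder<> &B, Value *PointerPhi, Value *Steps,
                       Value *Stride) {
    Value *Offsets = B.CreateMul(Steps, Stride); // no longer named
    return B.CreateGEP(B.getInt8Ty(), PointerPhi, Offsets, "vector.gep");
  }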
--- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 7 +- .../LoopVectorize/AArch64/sve-widen-gep.ll | 72 +++++++++---------- .../LoopVectorize/AArch64/sve-widen-phi.ll | 16 ++--- .../LoopVectorize/RISCV/strided-accesses.ll | 20 +++--- .../LoopVectorize/pointer-induction.ll | 32 ++++----- .../LoopVectorize/scev-predicate-reasoning.ll | 6 +- 6 files changed, 76 insertions(+), 77 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index ecdf0b526f608d..c077e2b4eac5f1 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -2941,10 +2941,9 @@ void VPWidenPointerInductionRecipe::execute(VPTransformState &State) { "scalar step must be the same across all parts"); Value *GEP = State.Builder.CreateGEP( State.Builder.getInt8Ty(), NewPointerPhi, - State.Builder.CreateMul( - StartOffset, - State.Builder.CreateVectorSplat(State.VF, ScalarStepValue), - "vector.gep")); + State.Builder.CreateMul(StartOffset, State.Builder.CreateVectorSplat( + State.VF, ScalarStepValue)), + "vector.gep"); State.set(this, GEP, Part); } } diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll index a1f6ba487e84e0..6ec9eb849dd52a 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll @@ -55,8 +55,8 @@ define void @pointer_induction_used_as_vector(ptr noalias %start.1, ptr noalias ; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[N_VEC]], 8 ; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START_1:%.*]], i64 [[TMP4]] ; CHECK-NEXT: [[IND_END2:%.*]] = getelementptr i8, ptr [[START_2:%.*]], i64 [[N_VEC]] -; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 2 +; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[START_2]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ] @@ -70,23 +70,23 @@ define void @pointer_induction_used_as_vector(ptr noalias %start.1, ptr noalias ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: [[TMP12:%.*]] = call @llvm.stepvector.nxv2i64() ; CHECK-NEXT: [[TMP13:%.*]] = add [[DOTSPLAT]], [[TMP12]] -; CHECK-NEXT: [[VECTOR_GEP:%.*]] = mul [[TMP13]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], [[VECTOR_GEP]] +; CHECK-NEXT: [[TMP14:%.*]] = mul [[TMP13]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], [[TMP14]] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8 -; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 0 -; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START_1]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, [[TMP14]], i64 1 -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr ptr, ptr [[NEXT_GEP]], i32 0 -; CHECK-NEXT: store [[TMP15]], ptr [[TMP16]], align 8 -; CHECK-NEXT: [[TMP17:%.*]] = extractelement [[TMP14]], i32 0 -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP17]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP18]], align 1 -; CHECK-NEXT: [[TMP19:%.*]] = add 
[[WIDE_LOAD]], shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: store [[TMP19]], ptr [[TMP18]], align 1 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP21]] +; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START_1]], i64 [[TMP15]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, [[VECTOR_GEP]], i64 1 +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr ptr, ptr [[NEXT_GEP]], i32 0 +; CHECK-NEXT: store [[TMP16]], ptr [[TMP17]], align 8 +; CHECK-NEXT: [[TMP18:%.*]] = extractelement [[VECTOR_GEP]], i32 0 +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[TMP18]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP19]], align 1 +; CHECK-NEXT: [[TMP20:%.*]] = add [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: store [[TMP20]], ptr [[TMP19]], align 1 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] ; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] @@ -148,30 +148,30 @@ define void @pointer_induction(ptr noalias %start, i64 %N) { ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP4]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] ; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START:%.*]], i64 [[N_VEC]] -; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 2 +; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[START]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[INDEX2:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2 -; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 1 -; CHECK-NEXT: [[TMP8:%.*]] = mul i64 1, [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP6]], 0 -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP9]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2 +; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 1 +; CHECK-NEXT: [[TMP10:%.*]] = mul i64 1, [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP8]], 0 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP11]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = call @llvm.stepvector.nxv2i64() -; CHECK-NEXT: [[TMP11:%.*]] = add [[DOTSPLAT]], [[TMP10]] -; CHECK-NEXT: [[VECTOR_GEP:%.*]] = mul [[TMP11]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], [[VECTOR_GEP]] -; CHECK-NEXT: [[TMP13:%.*]] = extractelement 
[[TMP12]], i32 0 -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP13]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP14]], align 1 -; CHECK-NEXT: [[TMP15:%.*]] = add [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: store [[TMP15]], ptr [[TMP14]], align 1 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX2]], [[TMP17]] -; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP12:%.*]] = call @llvm.stepvector.nxv2i64() +; CHECK-NEXT: [[TMP13:%.*]] = add [[DOTSPLAT]], [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = mul [[TMP13]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], [[TMP14]] +; CHECK-NEXT: [[TMP15:%.*]] = extractelement [[VECTOR_GEP]], i32 0 +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[TMP15]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP16]], align 1 +; CHECK-NEXT: [[TMP17:%.*]] = add [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: store [[TMP17]], ptr [[TMP16]], align 1 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX2]], [[TMP6]] +; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP10]] ; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll index 123b3cf3df14d5..bfb5cf8d666272 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll @@ -243,14 +243,14 @@ define i32 @pointer_iv_mixed(ptr noalias %a, ptr noalias %b, i64 %n) #0 { ; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP8:%.*]] = shl nuw nsw i64 [[TMP7]], 3 ; CHECK-NEXT: [[TMP9:%.*]] = call @llvm.stepvector.nxv2i64() -; CHECK-NEXT: [[VECTOR_GEP:%.*]] = shl [[TMP9]], shufflevector ( insertelement ( poison, i64 2, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], [[VECTOR_GEP]] +; CHECK-NEXT: [[TMP10:%.*]] = shl [[TMP9]], shufflevector ( insertelement ( poison, i64 2, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], [[TMP10]] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 3 ; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[B]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[TMP11:%.*]] = extractelement [[TMP10]], i64 0 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement [[VECTOR_GEP]], i64 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP11]], align 8 ; CHECK-NEXT: [[TMP12]] = add [[WIDE_LOAD]], [[VEC_PHI]] -; CHECK-NEXT: store [[TMP10]], ptr [[NEXT_GEP]], align 8 +; CHECK-NEXT: store [[VECTOR_GEP]], ptr [[NEXT_GEP]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] ; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -318,10 +318,10 @@ define void @phi_used_in_vector_compare_and_scalar_indvar_update_and_store(ptr % ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 2 ; CHECK-NEXT: [[TMP4:%.*]] = call 
@llvm.stepvector.nxv2i64() -; CHECK-NEXT: [[VECTOR_GEP:%.*]] = shl [[TMP4]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], [[VECTOR_GEP]] -; CHECK-NEXT: [[TMP6:%.*]] = icmp ne [[TMP5]], zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = extractelement [[TMP5]], i64 0 +; CHECK-NEXT: [[TMP5:%.*]] = shl [[TMP4]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], [[TMP5]] +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne [[VECTOR_GEP]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = extractelement [[VECTOR_GEP]], i64 0 ; CHECK-NEXT: call void @llvm.masked.store.nxv2i16.p0( zeroinitializer, ptr [[TMP7]], i32 2, [[TMP6]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]] ; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP3]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll b/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll index 837e39d12359af..99b8cb7ae94b95 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll @@ -181,15 +181,15 @@ define void @single_constant_stride_ptr_iv(ptr %p) { ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: [[TMP14:%.*]] = call @llvm.stepvector.nxv4i64() ; CHECK-NEXT: [[TMP15:%.*]] = add [[DOTSPLAT]], [[TMP14]] -; CHECK-NEXT: [[VECTOR_GEP:%.*]] = mul [[TMP15]], shufflevector ( insertelement ( poison, i64 8, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], [[VECTOR_GEP]] -; CHECK-NEXT: [[TMP17:%.*]] = extractelement [[TMP16]], i32 0 +; CHECK-NEXT: [[TMP16:%.*]] = mul [[TMP15]], shufflevector ( insertelement ( poison, i64 8, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], [[TMP16]] +; CHECK-NEXT: [[TMP17:%.*]] = extractelement [[VECTOR_GEP]], i32 0 ; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i32, ptr [[TMP17]], i32 0 ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP18]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) ; CHECK-NEXT: [[TMP19:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 ; CHECK-NEXT: [[TMP20:%.*]] = add [[TMP19]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[TMP20]], [[TMP16]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[TMP20]], [[VECTOR_GEP]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]] ; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP12]] ; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -757,8 +757,8 @@ define void @double_stride_ptr_iv(ptr %p, ptr %p2, i64 %stride) { ; STRIDED-NEXT: [[TMP20:%.*]] = add [[DOTSPLAT]], [[TMP19]] ; STRIDED-NEXT: [[DOTSPLATINSERT9:%.*]] = insertelement poison, i64 [[STRIDE]], i64 0 ; STRIDED-NEXT: [[DOTSPLAT10:%.*]] = shufflevector [[DOTSPLATINSERT9]], poison, zeroinitializer -; STRIDED-NEXT: [[VECTOR_GEP:%.*]] = mul [[TMP20]], [[DOTSPLAT10]] -; STRIDED-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr 
[[POINTER_PHI]], [[VECTOR_GEP]] +; STRIDED-NEXT: [[TMP21:%.*]] = mul [[TMP20]], [[DOTSPLAT10]] +; STRIDED-NEXT: [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], [[TMP21]] ; STRIDED-NEXT: [[TMP22:%.*]] = call i64 @llvm.vscale.i64() ; STRIDED-NEXT: [[TMP23:%.*]] = mul i64 [[TMP22]], 4 ; STRIDED-NEXT: [[TMP24:%.*]] = mul i64 [[TMP23]], 1 @@ -768,11 +768,11 @@ define void @double_stride_ptr_iv(ptr %p, ptr %p2, i64 %stride) { ; STRIDED-NEXT: [[DOTSPLAT14:%.*]] = shufflevector [[DOTSPLATINSERT13]], poison, zeroinitializer ; STRIDED-NEXT: [[TMP27:%.*]] = call @llvm.stepvector.nxv4i64() ; STRIDED-NEXT: [[TMP28:%.*]] = add [[DOTSPLAT14]], [[TMP27]] -; STRIDED-NEXT: [[VECTOR_GEP17:%.*]] = mul [[TMP28]], [[DOTSPLAT10]] -; STRIDED-NEXT: [[TMP29:%.*]] = getelementptr i8, ptr [[POINTER_PHI11]], [[VECTOR_GEP17]] -; STRIDED-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP21]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison), !alias.scope [[META15:![0-9]+]] +; STRIDED-NEXT: [[TMP29:%.*]] = mul [[TMP28]], [[DOTSPLAT10]] +; STRIDED-NEXT: [[VECTOR_GEP17:%.*]] = getelementptr i8, ptr [[POINTER_PHI11]], [[TMP29]] +; STRIDED-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[VECTOR_GEP]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison), !alias.scope [[META15:![0-9]+]] ; STRIDED-NEXT: [[TMP30:%.*]] = add [[WIDE_MASKED_GATHER]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) -; STRIDED-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[TMP30]], [[TMP29]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)), !alias.scope [[META18:![0-9]+]], !noalias [[META15]] +; STRIDED-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[TMP30]], [[VECTOR_GEP17]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)), !alias.scope [[META18:![0-9]+]], !noalias [[META15]] ; STRIDED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP13]] ; STRIDED-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP17]] ; STRIDED-NEXT: [[PTR_IND12]] = getelementptr i8, ptr [[POINTER_PHI11]], i64 [[TMP25]] diff --git a/llvm/test/Transforms/LoopVectorize/pointer-induction.ll b/llvm/test/Transforms/LoopVectorize/pointer-induction.ll index 35037968160c61..41d9c4d84202c6 100644 --- a/llvm/test/Transforms/LoopVectorize/pointer-induction.ll +++ b/llvm/test/Transforms/LoopVectorize/pointer-induction.ll @@ -142,22 +142,22 @@ define void @pointer_induction_used_as_vector(ptr noalias %start.1, ptr noalias ; CHECK: vector.body: ; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[START_2]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i64> +; CHECK-NEXT: [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i64> ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8 -; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 0 -; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START_1]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, <4 x ptr> [[TMP1]], i64 1 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr ptr, ptr [[NEXT_GEP]], i32 0 -; CHECK-NEXT: store <4 x ptr> [[TMP3]], ptr [[TMP4]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x ptr> [[TMP1]], 
i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[TMP5]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP6]], align 1 -; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i8> [[WIDE_LOAD]], -; CHECK-NEXT: store <4 x i8> [[TMP7]], ptr [[TMP6]], align 1 +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START_1]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, <4 x ptr> [[VECTOR_GEP]], i64 1 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr ptr, ptr [[NEXT_GEP]], i32 0 +; CHECK-NEXT: store <4 x ptr> [[TMP2]], ptr [[TMP3]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x ptr> [[VECTOR_GEP]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP5]], align 1 +; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i8> [[WIDE_LOAD]], +; CHECK-NEXT: store <4 x i8> [[TMP6]], ptr [[TMP5]], align 1 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 4 -; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] @@ -237,13 +237,13 @@ define void @non_constant_vector_expansion(i32 %0, ptr %call) { ; STRIDED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP1]], 4 ; STRIDED-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP1]], i64 0 ; STRIDED-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer -; STRIDED-NEXT: [[VECTOR_GEP:%.*]] = mul <4 x i64> , [[DOTSPLAT]] -; STRIDED-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i64> [[VECTOR_GEP]] +; STRIDED-NEXT: [[TMP4:%.*]] = mul <4 x i64> , [[DOTSPLAT]] +; STRIDED-NEXT: [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i64> [[TMP4]] ; STRIDED-NEXT: [[OFFSET_IDX:%.*]] = trunc i64 [[INDEX]] to i32 ; STRIDED-NEXT: [[TMP5:%.*]] = add i32 [[OFFSET_IDX]], 0 ; STRIDED-NEXT: [[TMP6:%.*]] = getelementptr ptr, ptr [[CALL:%.*]], i32 [[TMP5]] ; STRIDED-NEXT: [[TMP7:%.*]] = getelementptr ptr, ptr [[TMP6]], i32 0 -; STRIDED-NEXT: store <4 x ptr> [[TMP4]], ptr [[TMP7]], align 4 +; STRIDED-NEXT: store <4 x ptr> [[VECTOR_GEP]], ptr [[TMP7]], align 4 ; STRIDED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; STRIDED-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP3]] ; STRIDED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 diff --git a/llvm/test/Transforms/LoopVectorize/scev-predicate-reasoning.ll b/llvm/test/Transforms/LoopVectorize/scev-predicate-reasoning.ll index e58c99dc4bc58b..43eeefb7744900 100644 --- a/llvm/test/Transforms/LoopVectorize/scev-predicate-reasoning.ll +++ b/llvm/test/Transforms/LoopVectorize/scev-predicate-reasoning.ll @@ -102,14 +102,14 @@ define void @integer_induction_wraps_scev_predicate_known(i32 %x, ptr %call, ptr ; CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP0]], 4 ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP0]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT]], <4 x i64> poison, <4 
x i32> zeroinitializer -; CHECK-NEXT: [[VECTOR_GEP:%.*]] = mul <4 x i64> , [[DOTSPLAT]] -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i64> [[VECTOR_GEP]] +; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i64> , [[DOTSPLAT]] +; CHECK-NEXT: [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i64> [[TMP3]] ; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i64 [[INDEX]] to i32 ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i32 30, [[DOTCAST]] ; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[OFFSET_IDX]], 0 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr ptr, ptr [[CALL]], i32 [[TMP4]] ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr ptr, ptr [[TMP5]], i32 0 -; CHECK-NEXT: store <4 x ptr> [[TMP3]], ptr [[TMP6]], align 4 +; CHECK-NEXT: store <4 x ptr> [[VECTOR_GEP]], ptr [[TMP6]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP2]] ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4294967264 From 403897484f939cffd9b813eb0b759d7113f5295b Mon Sep 17 00:00:00 2001 From: David Green Date: Wed, 18 Sep 2024 12:32:47 +0100 Subject: [PATCH 039/321] [InstCombine] Return FRem, as opposed to substituteInParent. This attempts to fix the ASan buildbot, which is detecting that CI is used after it is removed in substituteInParent. The idea was to make sure it was removed even if it had side-effects writing errno, but that appears to happen if we return FRem directly as usual. --- llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp index 4933b5bf60eea8..6799d333fb2844 100644 --- a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -2820,7 +2820,7 @@ Value *LibCallSimplifier::optimizeFMod(CallInst *CI, IRBuilderBase &B) { Value *FRem = B.CreateFRemFMF(CI->getOperand(0), CI->getOperand(1), CI); if (auto *FRemI = dyn_cast(FRem)) FRemI->setHasNoNaNs(true); - substituteInParent(CI, FRem); + return FRem; } return nullptr; } From 5e23b66699d1066ce10b14a74d6137303517b2f3 Mon Sep 17 00:00:00 2001 From: Mike Hommey Date: Wed, 18 Sep 2024 20:42:42 +0900 Subject: [PATCH 040/321] [LLD][COFF] Handle imported weak aliases consistently (#109105) symTab being a DenseMap, the order in which a symbol and its corresponding import symbol are processed is not guaranteed, and when the latter comes first, it is left undefined. --- lld/COFF/SymbolTable.cpp | 8 ++++++++ lld/test/COFF/import_weak_alias.test | 20 ++++++++++++++++++++ 2 files changed, 28 insertions(+) create mode 100644 lld/test/COFF/import_weak_alias.test diff --git a/lld/COFF/SymbolTable.cpp b/lld/COFF/SymbolTable.cpp index efea16ccbbfea0..1488ad95d0da62 100644 --- a/lld/COFF/SymbolTable.cpp +++ b/lld/COFF/SymbolTable.cpp @@ -502,6 +502,14 @@ void SymbolTable::resolveRemainingUndefines() { // This odd rule is for compatibility with MSVC linker. if (name.starts_with("__imp_")) { Symbol *imp = find(name.substr(strlen("__imp_"))); + if (imp) { + // The unprefixed symbol might come later in symMap, so handle it now + // so that the condition below can be appropriately applied. 
+ auto *undef = dyn_cast(imp); + if (undef) { + undef->resolveWeakAlias(); + } + } if (imp && isa(imp)) { auto *d = cast(imp); replaceSymbol(sym, ctx, name, d); diff --git a/lld/test/COFF/import_weak_alias.test b/lld/test/COFF/import_weak_alias.test new file mode 100644 index 00000000000000..ae1817c67a20ac --- /dev/null +++ b/lld/test/COFF/import_weak_alias.test @@ -0,0 +1,20 @@ +# REQUIRES: x86 + +# RUN: split-file %s %t.dir +# RUN: llvm-mc --filetype=obj -triple=x86_64-windows-msvc %t.dir/foo.s -o %t.foo.obj +# RUN: llvm-mc --filetype=obj -triple=x86_64-windows-msvc %t.dir/qux.s -o %t.qux.obj +# RUN: lld-link %t.qux.obj %t.foo.obj -out:%t.dll -dll +# +#--- foo.s +.text +bar: + ret + +.weak foo +.set foo, bar +#--- qux.s +.text +.global _DllMainCRTStartup +_DllMainCRTStartup: + call *__imp_foo(%rip) + ret From c29dfb334643073607ac94e78a339a87508217d1 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Wed, 18 Sep 2024 04:45:49 -0700 Subject: [PATCH 041/321] [LLVM][TableGen] Change CodeGenSchedule to use const Record pointers (#108782) Change CodeGenSchedule to use const Record pointers. This is a part of effort to have better const correctness in TableGen backends: https://discourse.llvm.org/t/psa-planned-changes-to-tablegen-getallderiveddefinitions-api-potential-downstream-breakages/81089 --- .../utils/TableGen/Common/CodeGenSchedule.cpp | 116 +++++++++--------- llvm/utils/TableGen/Common/CodeGenSchedule.h | 35 +++--- llvm/utils/TableGen/DFAPacketizerEmitter.cpp | 17 +-- llvm/utils/TableGen/InstrInfoEmitter.cpp | 3 +- llvm/utils/TableGen/SubtargetEmitter.cpp | 34 ++--- 5 files changed, 107 insertions(+), 98 deletions(-) diff --git a/llvm/utils/TableGen/Common/CodeGenSchedule.cpp b/llvm/utils/TableGen/Common/CodeGenSchedule.cpp index 33d1da2f848ba9..de2cb67b1f1d53 100644 --- a/llvm/utils/TableGen/Common/CodeGenSchedule.cpp +++ b/llvm/utils/TableGen/Common/CodeGenSchedule.cpp @@ -271,8 +271,7 @@ void CodeGenSchedModels::checkSTIPredicates() const { // Disallow InstructionEquivalenceClasses with an empty instruction list. for (const Record *R : Records.getAllDerivedDefinitions("InstructionEquivalenceClass")) { - RecVec Opcodes = R->getValueAsListOfDefs("Opcodes"); - if (Opcodes.empty()) { + if (R->getValueAsListOfDefs("Opcodes").empty()) { PrintFatalError(R->getLoc(), "Invalid InstructionEquivalenceClass " "defined with an empty opcode list."); } @@ -311,13 +310,13 @@ static void processSTIPredicate(STIPredicateFunction &Fn, // definitions. Each unique opcode will be associated with an OpcodeInfo // object. for (const Record *Def : Fn.getDefinitions()) { - RecVec Classes = Def->getValueAsListOfDefs("Classes"); + ConstRecVec Classes = Def->getValueAsListOfConstDefs("Classes"); for (const Record *EC : Classes) { const Record *Pred = EC->getValueAsDef("Predicate"); if (!Predicate2Index.contains(Pred)) Predicate2Index[Pred] = NumUniquePredicates++; - RecVec Opcodes = EC->getValueAsListOfDefs("Opcodes"); + ConstRecVec Opcodes = EC->getValueAsListOfConstDefs("Opcodes"); for (const Record *Opcode : Opcodes) { if (!Opcode2Index.contains(Opcode)) { Opcode2Index[Opcode] = OpcodeMappings.size(); @@ -342,14 +341,14 @@ static void processSTIPredicate(STIPredicateFunction &Fn, // Construct a OpcodeInfo object for every unique opcode declared by an // InstructionEquivalenceClass definition. 
for (const Record *Def : Fn.getDefinitions()) { - RecVec Classes = Def->getValueAsListOfDefs("Classes"); + ConstRecVec Classes = Def->getValueAsListOfConstDefs("Classes"); const Record *SchedModel = Def->getValueAsDef("SchedModel"); unsigned ProcIndex = ProcModelMap.find(SchedModel)->second; APInt ProcMask(ProcModelMap.size(), 0); ProcMask.setBit(ProcIndex); for (const Record *EC : Classes) { - RecVec Opcodes = EC->getValueAsListOfDefs("Opcodes"); + ConstRecVec Opcodes = EC->getValueAsListOfConstDefs("Opcodes"); std::vector OpIndices = EC->getValueAsListOfInts("OperandIndices"); @@ -579,8 +578,7 @@ static void scanSchedRW(const Record *RWDef, ConstRecVec &RWDefs, // Visit each variant (guarded by a different predicate). for (const Record *Variant : RWDef->getValueAsListOfDefs("Variants")) { // Visit each RW in the sequence selected by the current variant. - RecVec Selected = Variant->getValueAsListOfDefs("Selected"); - for (Record *SelDef : Selected) + for (const Record *SelDef : Variant->getValueAsListOfDefs("Selected")) scanSchedRW(SelDef, RWDefs, RWSet); } } @@ -601,8 +599,7 @@ void CodeGenSchedModels::collectSchedRW() { const Record *SchedDef = Inst->TheDef; if (SchedDef->isValueUnset("SchedRW")) continue; - RecVec RWs = SchedDef->getValueAsListOfDefs("SchedRW"); - for (Record *RW : RWs) { + for (const Record *RW : SchedDef->getValueAsListOfDefs("SchedRW")) { if (RW->isSubClassOf("SchedWrite")) scanSchedRW(RW, SWDefs, RWSet); else { @@ -614,8 +611,8 @@ void CodeGenSchedModels::collectSchedRW() { // Find all ReadWrites referenced by InstRW. for (const Record *InstRWDef : Records.getAllDerivedDefinitions("InstRW")) { // For all OperandReadWrites. - RecVec RWDefs = InstRWDef->getValueAsListOfDefs("OperandReadWrites"); - for (Record *RWDef : RWDefs) { + for (const Record *RWDef : + InstRWDef->getValueAsListOfDefs("OperandReadWrites")) { if (RWDef->isSubClassOf("SchedWrite")) scanSchedRW(RWDef, SWDefs, RWSet); else { @@ -627,8 +624,8 @@ void CodeGenSchedModels::collectSchedRW() { // Find all ReadWrites referenced by ItinRW. for (const Record *ItinRWDef : Records.getAllDerivedDefinitions("ItinRW")) { // For all OperandReadWrites. - RecVec RWDefs = ItinRWDef->getValueAsListOfDefs("OperandReadWrites"); - for (Record *RWDef : RWDefs) { + for (const Record *RWDef : + ItinRWDef->getValueAsListOfDefs("OperandReadWrites")) { if (RWDef->isSubClassOf("SchedWrite")) scanSchedRW(RWDef, SWDefs, RWSet); else { @@ -672,7 +669,7 @@ void CodeGenSchedModels::collectSchedRW() { for (CodeGenSchedRW &CGRW : SchedWrites) { if (!CGRW.IsSequence) continue; - findRWs(CGRW.TheDef->getValueAsListOfDefs("Writes"), CGRW.Sequence, + findRWs(CGRW.TheDef->getValueAsListOfConstDefs("Writes"), CGRW.Sequence, /*IsRead=*/false); } // Initialize Aliases vectors. @@ -726,9 +723,10 @@ unsigned CodeGenSchedModels::getSchedRWIdx(const Record *Def, return I == RWVec.end() ? 0 : std::distance(RWVec.begin(), I); } -static void splitSchedReadWrites(const RecVec &RWDefs, RecVec &WriteDefs, - RecVec &ReadDefs) { - for (Record *RWDef : RWDefs) { +static void splitSchedReadWrites(const ConstRecVec &RWDefs, + ConstRecVec &WriteDefs, + ConstRecVec &ReadDefs) { + for (const Record *RWDef : RWDefs) { if (RWDef->isSubClassOf("SchedWrite")) WriteDefs.push_back(RWDef); else { @@ -739,19 +737,19 @@ static void splitSchedReadWrites(const RecVec &RWDefs, RecVec &WriteDefs, } // Split the SchedReadWrites defs and call findRWs for each list. 
-void CodeGenSchedModels::findRWs(const RecVec &RWDefs, IdxVec &Writes, +void CodeGenSchedModels::findRWs(const ConstRecVec &RWDefs, IdxVec &Writes, IdxVec &Reads) const { - RecVec WriteDefs; - RecVec ReadDefs; + ConstRecVec WriteDefs; + ConstRecVec ReadDefs; splitSchedReadWrites(RWDefs, WriteDefs, ReadDefs); findRWs(WriteDefs, Writes, false); findRWs(ReadDefs, Reads, true); } // Call getSchedRWIdx for all elements in a sequence of SchedRW defs. -void CodeGenSchedModels::findRWs(const RecVec &RWDefs, IdxVec &RWs, +void CodeGenSchedModels::findRWs(const ConstRecVec &RWDefs, IdxVec &RWs, bool IsRead) const { - for (Record *RWDef : RWDefs) { + for (const Record *RWDef : RWDefs) { unsigned Idx = getSchedRWIdx(RWDef, IsRead); assert(Idx && "failed to collect SchedReadWrite"); RWs.push_back(Idx); @@ -859,7 +857,8 @@ void CodeGenSchedModels::collectSchedClasses() { Record *ItinDef = Inst->TheDef->getValueAsDef("Itinerary"); IdxVec Writes, Reads; if (!Inst->TheDef->isValueUnset("SchedRW")) - findRWs(Inst->TheDef->getValueAsListOfDefs("SchedRW"), Writes, Reads); + findRWs(Inst->TheDef->getValueAsListOfConstDefs("SchedRW"), Writes, + Reads); // ProcIdx == 0 indicates the class applies to all processors. unsigned SCIdx = addSchedClass(ItinDef, Writes, Reads, /*ProcIndices*/ {0}); @@ -921,7 +920,8 @@ void CodeGenSchedModels::collectSchedClasses() { << InstName); IdxVec Writes; IdxVec Reads; - findRWs(RWDef->getValueAsListOfDefs("OperandReadWrites"), Writes, Reads); + findRWs(RWDef->getValueAsListOfConstDefs("OperandReadWrites"), Writes, + Reads); LLVM_DEBUG({ for (unsigned WIdx : Writes) dbgs() << " " << SchedWrites[WIdx].Name; @@ -950,10 +950,9 @@ CodeGenSchedModels::getSchedClassIdx(const CodeGenInstruction &Inst) const { } std::string -CodeGenSchedModels::createSchedClassName(Record *ItinClassDef, +CodeGenSchedModels::createSchedClassName(const Record *ItinClassDef, ArrayRef OperWrites, ArrayRef OperReads) { - std::string Name; if (ItinClassDef && ItinClassDef->getName() != "NoItinerary") Name = std::string(ItinClassDef->getName()); @@ -983,7 +982,7 @@ CodeGenSchedModels::createSchedClassName(const ConstRecVec &InstDefs) { /// Add an inferred sched class from an itinerary class and per-operand list of /// SchedWrites and SchedReads. ProcIndices contains the set of IDs of /// processors that may utilize this class. -unsigned CodeGenSchedModels::addSchedClass(Record *ItinClassDef, +unsigned CodeGenSchedModels::addSchedClass(const Record *ItinClassDef, ArrayRef OperWrites, ArrayRef OperReads, ArrayRef ProcIndices) { @@ -1131,7 +1130,8 @@ void CodeGenSchedModels::collectProcItins() { if (!ProcModel.hasItineraries()) continue; - RecVec ItinRecords = ProcModel.ItinsDef->getValueAsListOfDefs("IID"); + ConstRecVec ItinRecords = + ProcModel.ItinsDef->getValueAsListOfConstDefs("IID"); assert(!ItinRecords.empty() && "ProcModel.hasItineraries is incorrect"); // Populate ItinDefList with Itinerary records. @@ -1139,7 +1139,7 @@ void CodeGenSchedModels::collectProcItins() { // Insert each itinerary data record in the correct position within // the processor model's ItinDefList. - for (Record *ItinData : ItinRecords) { + for (const Record *ItinData : ItinRecords) { const Record *ItinDef = ItinData->getValueAsDef("TheClass"); bool FoundClass = false; @@ -1217,14 +1217,15 @@ void CodeGenSchedModels::inferSchedClasses() { } /// Infer classes from per-processor itinerary resources. 
-void CodeGenSchedModels::inferFromItinClass(Record *ItinClassDef, +void CodeGenSchedModels::inferFromItinClass(const Record *ItinClassDef, unsigned FromClassIdx) { for (unsigned PIdx = 0, PEnd = ProcModels.size(); PIdx != PEnd; ++PIdx) { const CodeGenProcModel &PM = ProcModels[PIdx]; // For all ItinRW entries. bool HasMatch = false; for (const Record *Rec : PM.ItinRWDefs) { - RecVec Matched = Rec->getValueAsListOfDefs("MatchedItinClasses"); + ConstRecVec Matched = + Rec->getValueAsListOfConstDefs("MatchedItinClasses"); if (!llvm::is_contained(Matched, ItinClassDef)) continue; if (HasMatch) @@ -1233,7 +1234,8 @@ void CodeGenSchedModels::inferFromItinClass(Record *ItinClassDef, " in ItinResources for " + PM.ModelName); HasMatch = true; IdxVec Writes, Reads; - findRWs(Rec->getValueAsListOfDefs("OperandReadWrites"), Writes, Reads); + findRWs(Rec->getValueAsListOfConstDefs("OperandReadWrites"), Writes, + Reads); inferFromRW(Writes, Reads, FromClassIdx, PIdx); } } @@ -1255,7 +1257,7 @@ void CodeGenSchedModels::inferFromInstRWs(unsigned SCIdx) { if (II == IE) continue; IdxVec Writes, Reads; - findRWs(Rec->getValueAsListOfDefs("OperandReadWrites"), Writes, Reads); + findRWs(Rec->getValueAsListOfConstDefs("OperandReadWrites"), Writes, Reads); unsigned PIdx = getProcModel(Rec->getValueAsDef("SchedModel")).Index; inferFromRW(Writes, Reads, SCIdx, PIdx); // May mutate SchedClasses. SchedClasses[SCIdx].InstRWProcIndices.insert(PIdx); @@ -1348,7 +1350,8 @@ bool PredTransitions::mutuallyExclusive(Record *PredDef, const CodeGenSchedRW &SchedRW = SchedModels.getSchedRW(PC.RWIdx, PC.IsRead); assert(SchedRW.HasVariants && "PredCheck must refer to a SchedVariant"); - RecVec Variants = SchedRW.TheDef->getValueAsListOfDefs("Variants"); + ConstRecVec Variants = + SchedRW.TheDef->getValueAsListOfConstDefs("Variants"); if (any_of(Variants, [PredDef](const Record *R) { return R->getValueAsDef("Predicate") == PredDef; })) { @@ -1414,8 +1417,8 @@ void PredTransitions::getIntersectingVariants( } if (VarProcIdx == 0 || VarProcIdx == TransVec[TransIdx].ProcIndex) { // Push each variant. Assign TransVecIdx later. 
- const RecVec VarDefs = SchedRW.TheDef->getValueAsListOfDefs("Variants"); - for (Record *VarDef : VarDefs) + for (const Record *VarDef : + SchedRW.TheDef->getValueAsListOfDefs("Variants")) Variants.emplace_back(VarDef, SchedRW.Index, VarProcIdx, 0); if (VarProcIdx == 0) GenericRW = true; @@ -1446,8 +1449,7 @@ void PredTransitions::getIntersectingVariants( SchedModels.getSchedRW((*AI)->getValueAsDef("AliasRW")); if (AliasRW.HasVariants) { - const RecVec VarDefs = AliasRW.TheDef->getValueAsListOfDefs("Variants"); - for (Record *VD : VarDefs) + for (const Record *VD : AliasRW.TheDef->getValueAsListOfDefs("Variants")) Variants.emplace_back(VD, AliasRW.Index, AliasProcIdx, 0); } if (AliasRW.IsSequence) @@ -1495,7 +1497,8 @@ void PredTransitions::pushVariant(const TransVariant &VInfo, bool IsRead) { if (VInfo.VarOrSeqDef->isSubClassOf("SchedVar")) { Record *PredDef = VInfo.VarOrSeqDef->getValueAsDef("Predicate"); Trans.PredTerm.emplace_back(IsRead, VInfo.RWIdx, PredDef); - RecVec SelectedDefs = VInfo.VarOrSeqDef->getValueAsListOfDefs("Selected"); + ConstRecVec SelectedDefs = + VInfo.VarOrSeqDef->getValueAsListOfConstDefs("Selected"); SchedModels.findRWs(SelectedDefs, SelectedRWs, IsRead); } else { assert(VInfo.VarOrSeqDef->isSubClassOf("WriteSequence") && @@ -1761,12 +1764,14 @@ void CodeGenSchedModels::inferFromRW(ArrayRef OperWrites, // Check if any processor resource group contains all resource records in // SubUnits. -bool CodeGenSchedModels::hasSuperGroup(RecVec &SubUnits, CodeGenProcModel &PM) { +bool CodeGenSchedModels::hasSuperGroup(ConstRecVec &SubUnits, + CodeGenProcModel &PM) { for (const Record *ProcResourceDef : PM.ProcResourceDefs) { if (!ProcResourceDef->isSubClassOf("ProcResGroup")) continue; - RecVec SuperUnits = ProcResourceDef->getValueAsListOfDefs("Resources"); - RecIter RI = SubUnits.begin(), RE = SubUnits.end(); + ConstRecVec SuperUnits = + ProcResourceDef->getValueAsListOfConstDefs("Resources"); + auto RI = SubUnits.begin(), RE = SubUnits.end(); for (; RI != RE; ++RI) { if (!is_contained(SuperUnits, *RI)) { break; @@ -1783,13 +1788,13 @@ void CodeGenSchedModels::verifyProcResourceGroups(CodeGenProcModel &PM) { for (unsigned i = 0, e = PM.ProcResourceDefs.size(); i < e; ++i) { if (!PM.ProcResourceDefs[i]->isSubClassOf("ProcResGroup")) continue; - RecVec CheckUnits = - PM.ProcResourceDefs[i]->getValueAsListOfDefs("Resources"); + ConstRecVec CheckUnits = + PM.ProcResourceDefs[i]->getValueAsListOfConstDefs("Resources"); for (unsigned j = i + 1; j < e; ++j) { if (!PM.ProcResourceDefs[j]->isSubClassOf("ProcResGroup")) continue; - RecVec OtherUnits = - PM.ProcResourceDefs[j]->getValueAsListOfDefs("Resources"); + ConstRecVec OtherUnits = + PM.ProcResourceDefs[j]->getValueAsListOfConstDefs("Resources"); if (std::find_first_of(CheckUnits.begin(), CheckUnits.end(), OtherUnits.begin(), OtherUnits.end()) != CheckUnits.end()) { @@ -1828,7 +1833,7 @@ void CodeGenSchedModels::collectRegisterFiles() { "Invalid RegisterFile with zero physical registers"); } - RecVec RegisterClasses = RF->getValueAsListOfDefs("RegClasses"); + ConstRecVec RegisterClasses = RF->getValueAsListOfConstDefs("RegClasses"); std::vector RegisterCosts = RF->getValueAsListOfInts("RegCosts"); ListInit *MoveElimInfo = RF->getValueAsListInit("AllowMoveElimination"); for (unsigned I = 0, E = RegisterClasses.size(); I < E; ++I) { @@ -1866,7 +1871,8 @@ void CodeGenSchedModels::collectProcResources() { Record *RWModelDef = RW->getValueAsDef("SchedModel"); unsigned PIdx = getProcModel(RWModelDef).Index; IdxVec Writes, Reads; 
- findRWs(RW->getValueAsListOfDefs("OperandReadWrites"), Writes, Reads); + findRWs(RW->getValueAsListOfConstDefs("OperandReadWrites"), Writes, + Reads); collectRWResources(Writes, Reads, PIdx); } @@ -2004,13 +2010,13 @@ void CodeGenSchedModels::checkCompleteness() { } // Collect itinerary class resources for each processor. -void CodeGenSchedModels::collectItinProcResources(Record *ItinClassDef) { +void CodeGenSchedModels::collectItinProcResources(const Record *ItinClassDef) { for (unsigned PIdx = 0, PEnd = ProcModels.size(); PIdx != PEnd; ++PIdx) { const CodeGenProcModel &PM = ProcModels[PIdx]; // For all ItinRW entries. bool HasMatch = false; for (const Record *R : PM.ItinRWDefs) { - RecVec Matched = R->getValueAsListOfDefs("MatchedItinClasses"); + ConstRecVec Matched = R->getValueAsListOfConstDefs("MatchedItinClasses"); if (!llvm::is_contained(Matched, ItinClassDef)) continue; if (HasMatch) @@ -2019,7 +2025,7 @@ void CodeGenSchedModels::collectItinProcResources(Record *ItinClassDef) { " in ItinResources for " + PM.ModelName); HasMatch = true; IdxVec Writes, Reads; - findRWs(R->getValueAsListOfDefs("OperandReadWrites"), Writes, Reads); + findRWs(R->getValueAsListOfConstDefs("OperandReadWrites"), Writes, Reads); collectRWResources(Writes, Reads, PIdx); } } @@ -2139,8 +2145,8 @@ void CodeGenSchedModels::addWriteRes(const Record *ProcWriteResDef, WRDefs.push_back(ProcWriteResDef); // Visit ProcResourceKinds referenced by the newly discovered WriteRes. - RecVec ProcResDefs = ProcWriteResDef->getValueAsListOfDefs("ProcResources"); - for (const Record *ProcResDef : ProcResDefs) { + for (const Record *ProcResDef : + ProcWriteResDef->getValueAsListOfDefs("ProcResources")) { addProcResource(ProcResDef, ProcModels[PIdx], ProcWriteResDef->getLoc()); } } @@ -2186,7 +2192,7 @@ bool CodeGenProcModel::isUnsupported(const CodeGenInstruction &Inst) const { bool CodeGenProcModel::hasReadOfWrite(const Record *WriteDef) const { for (auto &RADef : ReadAdvanceDefs) { - RecVec ValidWrites = RADef->getValueAsListOfDefs("ValidWrites"); + ConstRecVec ValidWrites = RADef->getValueAsListOfConstDefs("ValidWrites"); if (is_contained(ValidWrites, WriteDef)) return true; } diff --git a/llvm/utils/TableGen/Common/CodeGenSchedule.h b/llvm/utils/TableGen/Common/CodeGenSchedule.h index ff85ac3968593b..f43c856b274ce8 100644 --- a/llvm/utils/TableGen/Common/CodeGenSchedule.h +++ b/llvm/utils/TableGen/Common/CodeGenSchedule.h @@ -33,8 +33,6 @@ class CodeGenTarget; class CodeGenSchedModels; class CodeGenInstruction; -using RecVec = std::vector; -using RecIter = RecVec::const_iterator; using ConstRecVec = std::vector; using ConstRecIter = ConstRecVec::const_iterator; @@ -132,7 +130,7 @@ struct CodeGenSchedTransition { struct CodeGenSchedClass { unsigned Index; std::string Name; - Record *ItinClassDef; + const Record *ItinClassDef; IdxVec Writes; IdxVec Reads; @@ -149,10 +147,11 @@ struct CodeGenSchedClass { // InstRWs processor indices. Filled in inferFromInstRWs DenseSet InstRWProcIndices; - CodeGenSchedClass(unsigned Index, std::string Name, Record *ItinClassDef) + CodeGenSchedClass(unsigned Index, std::string Name, + const Record *ItinClassDef) : Index(Index), Name(std::move(Name)), ItinClassDef(ItinClassDef) {} - bool isKeyEqual(Record *IC, ArrayRef W, + bool isKeyEqual(const Record *IC, ArrayRef W, ArrayRef R) const { return ItinClassDef == IC && ArrayRef(Writes) == W && ArrayRef(Reads) == R; } @@ -172,10 +171,10 @@ struct CodeGenSchedClass { /// registers used by the register renamer. 
Register costs are defined at /// register class granularity. struct CodeGenRegisterCost { - Record *RCDef; + const Record *RCDef; unsigned Cost; bool AllowMoveElimination; - CodeGenRegisterCost(Record *RC, unsigned RegisterCost, + CodeGenRegisterCost(const Record *RC, unsigned RegisterCost, bool AllowMoveElim = false) : RCDef(RC), Cost(RegisterCost), AllowMoveElimination(AllowMoveElim) {} CodeGenRegisterCost(const CodeGenRegisterCost &) = default; @@ -231,7 +230,7 @@ struct CodeGenProcModel { // Array of InstrItinData records indexed by a CodeGenSchedClass index. // This list is empty if the Processor has no value for Itineraries. // Initialized by collectProcItins(). - RecVec ItinDefList; + ConstRecVec ItinDefList; // Map itinerary classes to per-operand resources. // This list is empty if no ItinRW refers to this Processor. @@ -239,7 +238,7 @@ struct CodeGenProcModel { // List of unsupported feature. // This list is empty if the Processor has no UnsupportedFeatures. - RecVec UnsupportedFeaturesDefs; + ConstRecVec UnsupportedFeaturesDefs; // All read/write resources associated with this processor. ConstRecVec WriteResDefs; @@ -530,13 +529,13 @@ class CodeGenSchedModels { const CodeGenSchedRW &getSchedRW(unsigned Idx, bool IsRead) const { return IsRead ? getSchedRead(Idx) : getSchedWrite(Idx); } - CodeGenSchedRW &getSchedRW(Record *Def) { + CodeGenSchedRW &getSchedRW(const Record *Def) { bool IsRead = Def->isSubClassOf("SchedRead"); unsigned Idx = getSchedRWIdx(Def, IsRead); return const_cast(IsRead ? getSchedRead(Idx) : getSchedWrite(Idx)); } - const CodeGenSchedRW &getSchedRW(Record *Def) const { + const CodeGenSchedRW &getSchedRW(const Record *Def) const { return const_cast(*this).getSchedRW(Def); } @@ -564,13 +563,13 @@ class CodeGenSchedModels { unsigned numInstrSchedClasses() const { return NumInstrSchedClasses; } - void findRWs(const RecVec &RWDefs, IdxVec &Writes, IdxVec &Reads) const; - void findRWs(const RecVec &RWDefs, IdxVec &RWs, bool IsRead) const; + void findRWs(const ConstRecVec &RWDefs, IdxVec &Writes, IdxVec &Reads) const; + void findRWs(const ConstRecVec &RWDefs, IdxVec &RWs, bool IsRead) const; void expandRWSequence(unsigned RWIdx, IdxVec &RWSeq, bool IsRead) const; void expandRWSeqForProc(unsigned RWIdx, IdxVec &RWSeq, bool IsRead, const CodeGenProcModel &ProcModel) const; - unsigned addSchedClass(Record *ItinDef, ArrayRef OperWrites, + unsigned addSchedClass(const Record *ItinDef, ArrayRef OperWrites, ArrayRef OperReads, ArrayRef ProcIndices); @@ -603,7 +602,7 @@ class CodeGenSchedModels { void collectOptionalProcessorInfo(); - std::string createSchedClassName(Record *ItinClassDef, + std::string createSchedClassName(const Record *ItinClassDef, ArrayRef OperWrites, ArrayRef OperReads); std::string createSchedClassName(const ConstRecVec &InstDefs); @@ -629,15 +628,15 @@ class CodeGenSchedModels { void inferFromRW(ArrayRef OperWrites, ArrayRef OperReads, unsigned FromClassIdx, ArrayRef ProcIndices); - void inferFromItinClass(Record *ItinClassDef, unsigned FromClassIdx); + void inferFromItinClass(const Record *ItinClassDef, unsigned FromClassIdx); void inferFromInstRWs(unsigned SCIdx); - bool hasSuperGroup(RecVec &SubUnits, CodeGenProcModel &PM); + bool hasSuperGroup(ConstRecVec &SubUnits, CodeGenProcModel &PM); void verifyProcResourceGroups(CodeGenProcModel &PM); void collectProcResources(); - void collectItinProcResources(Record *ItinClassDef); + void collectItinProcResources(const Record *ItinClassDef); void collectRWResources(unsigned RWIdx, bool IsRead, ArrayRef 
ProcIndices); diff --git a/llvm/utils/TableGen/DFAPacketizerEmitter.cpp b/llvm/utils/TableGen/DFAPacketizerEmitter.cpp index 55cb39c9de5faa..4070bafded9cc2 100644 --- a/llvm/utils/TableGen/DFAPacketizerEmitter.cpp +++ b/llvm/utils/TableGen/DFAPacketizerEmitter.cpp @@ -78,8 +78,9 @@ class DFAPacketizerEmitter { // functional units. int collectAllComboFuncs(ArrayRef ComboFuncList); - ResourceVector getResourcesForItinerary(Record *Itinerary); - void createScheduleClasses(unsigned ItineraryIdx, const RecVec &Itineraries); + ResourceVector getResourcesForItinerary(const Record *Itinerary); + void createScheduleClasses(unsigned ItineraryIdx, + const ConstRecVec &Itineraries); // Emit code for a subset of itineraries. void emitForItineraries(raw_ostream &OS, @@ -174,12 +175,12 @@ int DFAPacketizerEmitter::collectAllComboFuncs( } ResourceVector -DFAPacketizerEmitter::getResourcesForItinerary(Record *Itinerary) { +DFAPacketizerEmitter::getResourcesForItinerary(const Record *Itinerary) { ResourceVector Resources; assert(Itinerary); - for (Record *StageDef : Itinerary->getValueAsListOfDefs("Stages")) { + for (const Record *StageDef : Itinerary->getValueAsListOfDefs("Stages")) { uint64_t StageResources = 0; - for (Record *Unit : StageDef->getValueAsListOfDefs("Units")) { + for (const Record *Unit : StageDef->getValueAsListOfDefs("Units")) { StageResources |= FUNameToBitsMap[std::string(Unit->getName())]; } if (StageResources != 0) @@ -188,10 +189,10 @@ DFAPacketizerEmitter::getResourcesForItinerary(Record *Itinerary) { return Resources; } -void DFAPacketizerEmitter::createScheduleClasses(unsigned ItineraryIdx, - const RecVec &Itineraries) { +void DFAPacketizerEmitter::createScheduleClasses( + unsigned ItineraryIdx, const ConstRecVec &Itineraries) { unsigned Idx = 0; - for (Record *Itinerary : Itineraries) { + for (const Record *Itinerary : Itineraries) { if (!Itinerary) { ScheduleClasses.push_back({ItineraryIdx, Idx++, 0, ResourceVector{}}); continue; diff --git a/llvm/utils/TableGen/InstrInfoEmitter.cpp b/llvm/utils/TableGen/InstrInfoEmitter.cpp index 4e2138d15fde50..5830cdae709629 100644 --- a/llvm/utils/TableGen/InstrInfoEmitter.cpp +++ b/llvm/utils/TableGen/InstrInfoEmitter.cpp @@ -672,7 +672,8 @@ void InstrInfoEmitter::emitLogicalOperandTypeMappings( void InstrInfoEmitter::emitMCIIHelperMethods(raw_ostream &OS, StringRef TargetName) { - RecVec TIIPredicates = Records.getAllDerivedDefinitions("TIIPredicate"); + ArrayRef TIIPredicates = + Records.getAllDerivedDefinitions("TIIPredicate"); OS << "#ifdef GET_INSTRINFO_MC_HELPER_DECLS\n"; OS << "#undef GET_INSTRINFO_MC_HELPER_DECLS\n\n"; diff --git a/llvm/utils/TableGen/SubtargetEmitter.cpp b/llvm/utils/TableGen/SubtargetEmitter.cpp index 394e2eb42c15d2..c568f6747f4f9a 100644 --- a/llvm/utils/TableGen/SubtargetEmitter.cpp +++ b/llvm/utils/TableGen/SubtargetEmitter.cpp @@ -284,7 +284,7 @@ unsigned SubtargetEmitter::FeatureKeyValues(raw_ostream &OS, << "\"" << CommandLineName << "\", " << "\"" << Desc << "\", " << Target << "::" << Name << ", "; - RecVec ImpliesList = Feature->getValueAsListOfDefs("Implies"); + ConstRecVec ImpliesList = Feature->getValueAsListOfConstDefs("Implies"); printFeatureMask(OS, ImpliesList, FeatureMap); @@ -320,8 +320,9 @@ unsigned SubtargetEmitter::CPUKeyValues(raw_ostream &OS, for (const Record *Processor : ProcessorList) { StringRef Name = Processor->getValueAsString("Name"); - RecVec FeatureList = Processor->getValueAsListOfDefs("Features"); - RecVec TuneFeatureList = Processor->getValueAsListOfDefs("TuneFeatures"); + 
ConstRecVec FeatureList = Processor->getValueAsListOfConstDefs("Features"); + ConstRecVec TuneFeatureList = + Processor->getValueAsListOfConstDefs("TuneFeatures"); // Emit as "{ "cpu", "description", 0, { f1 , f2 , ... fn } },". OS << " { " @@ -366,7 +367,7 @@ void SubtargetEmitter::FormItineraryStageString(const std::string &Name, ItinString += " { " + itostr(Cycles) + ", "; // Get unit list - RecVec UnitList = Stage->getValueAsListOfDefs("Units"); + ConstRecVec UnitList = Stage->getValueAsListOfConstDefs("Units"); // For each unit for (unsigned j = 0, M = UnitList.size(); j < M;) { @@ -444,7 +445,7 @@ void SubtargetEmitter::EmitStageAndOperandCycleData( if (!ItinsDefSet.insert(ProcModel.ItinsDef).second) continue; - RecVec FUs = ProcModel.ItinsDef->getValueAsListOfDefs("FU"); + ConstRecVec FUs = ProcModel.ItinsDef->getValueAsListOfConstDefs("FU"); if (FUs.empty()) continue; @@ -458,7 +459,7 @@ void SubtargetEmitter::EmitStageAndOperandCycleData( OS << "} // end namespace " << Name << "FU\n"; - RecVec BPs = ProcModel.ItinsDef->getValueAsListOfDefs("BP"); + ConstRecVec BPs = ProcModel.ItinsDef->getValueAsListOfConstDefs("BP"); if (!BPs.empty()) { OS << "\n// Pipeline forwarding paths for itineraries \"" << Name << "\"\n" @@ -682,8 +683,7 @@ void SubtargetEmitter::EmitProcessorResourceSubUnits( const Record *PRDef = ProcModel.ProcResourceDefs[i]; if (!PRDef->isSubClassOf("ProcResGroup")) continue; - RecVec ResUnits = PRDef->getValueAsListOfDefs("Resources"); - for (const Record *RUDef : ResUnits) { + for (const Record *RUDef : PRDef->getValueAsListOfDefs("Resources")) { const Record *RU = SchedModels.findProcResUnits(RUDef, ProcModel, PRDef->getLoc()); for (unsigned J = 0; J < RU->getValueAsInt("NumUnits"); ++J) { @@ -842,8 +842,7 @@ void SubtargetEmitter::EmitProcessorResources(const CodeGenProcModel &ProcModel, const unsigned SubUnitsBeginOffset = SubUnitsOffset; int BufferSize = PRDef->getValueAsInt("BufferSize"); if (PRDef->isSubClassOf("ProcResGroup")) { - RecVec ResUnits = PRDef->getValueAsListOfDefs("Resources"); - for (const Record *RU : ResUnits) { + for (const Record *RU : PRDef->getValueAsListOfDefs("Resources")) { NumUnits += RU->getValueAsInt("NumUnits"); SubUnitsOffset += RU->getValueAsInt("NumUnits"); } @@ -1028,7 +1027,7 @@ void SubtargetEmitter::ExpandProcResources( for (const Record *PR : PM.ProcResourceDefs) { if (PR == PRDef || !PR->isSubClassOf("ProcResGroup")) continue; - RecVec SuperResources = PR->getValueAsListOfDefs("Resources"); + ConstRecVec SuperResources = PR->getValueAsListOfConstDefs("Resources"); ConstRecIter SubI = SubResources.begin(), SubE = SubResources.end(); for (; SubI != SubE; ++SubI) { if (!is_contained(SuperResources, *SubI)) { @@ -1105,16 +1104,18 @@ void SubtargetEmitter::GenSchedClassTables(const CodeGenProcModel &ProcModel, if (RWDef) { Writes.clear(); Reads.clear(); - SchedModels.findRWs(RWDef->getValueAsListOfDefs("OperandReadWrites"), - Writes, Reads); + SchedModels.findRWs( + RWDef->getValueAsListOfConstDefs("OperandReadWrites"), Writes, + Reads); } } if (Writes.empty()) { // Check this processor's itinerary class resources. 
        for (const Record *I : ProcModel.ItinRWDefs) {
-          RecVec Matched = I->getValueAsListOfDefs("MatchedItinClasses");
+          ConstRecVec Matched =
+              I->getValueAsListOfConstDefs("MatchedItinClasses");
           if (is_contained(Matched, SC.ItinClassDef)) {
-            SchedModels.findRWs(I->getValueAsListOfDefs("OperandReadWrites"),
+            SchedModels.findRWs(I->getValueAsListOfConstDefs("OperandReadWrites"),
                                 Writes, Reads);
             break;
           }
@@ -1274,7 +1275,8 @@ void SubtargetEmitter::GenSchedClassTables(const CodeGenProcModel &ProcModel,
           SCDesc.NumMicroOps = MCSchedClassDesc::InvalidNumMicroOps;
           break;
         }
-        RecVec ValidWrites = ReadAdvance->getValueAsListOfDefs("ValidWrites");
+        ConstRecVec ValidWrites =
+            ReadAdvance->getValueAsListOfConstDefs("ValidWrites");
         IdxVec WriteIDs;
         if (ValidWrites.empty())
           WriteIDs.push_back(0);

From cb47b45d782fcca46acb614c720233c6b8706e58 Mon Sep 17 00:00:00 2001
From: Chris Apple
Date: Wed, 18 Sep 2024 05:48:24 -0600
Subject: [PATCH 042/321] [rtsan] Fix RTTI issue, make a better c test
 (#108720)

Later in a development branch, our C tests were failing; this was due
to the lack of RTTI.

This follows very similar patterns found in the other sanitizers.
---
 compiler-rt/lib/rtsan/CMakeLists.txt         |  2 ++
 compiler-rt/test/rtsan/basic.cpp             |  1 -
 compiler-rt/test/rtsan/sanity_check_pure_c.c | 28 ++++++++++++++++++++
 3 files changed, 30 insertions(+), 1 deletion(-)
 create mode 100644 compiler-rt/test/rtsan/sanity_check_pure_c.c

diff --git a/compiler-rt/lib/rtsan/CMakeLists.txt b/compiler-rt/lib/rtsan/CMakeLists.txt
index 3f146a757a97eb..07a21b49eb45aa 100644
--- a/compiler-rt/lib/rtsan/CMakeLists.txt
+++ b/compiler-rt/lib/rtsan/CMakeLists.txt
@@ -29,6 +29,8 @@ set(RTSAN_LINK_LIBS
   ${COMPILER_RT_UNWINDER_LINK_LIBS}
   ${COMPILER_RT_CXX_LINK_LIBS})

+append_rtti_flag(OFF RTSAN_CFLAGS)
+
 if(APPLE)
   add_compiler_rt_object_libraries(RTRtsan
     OS ${SANITIZER_COMMON_SUPPORTED_OS}
diff --git a/compiler-rt/test/rtsan/basic.cpp b/compiler-rt/test/rtsan/basic.cpp
index 607db90213a30d..4edf32336720f8 100644
--- a/compiler-rt/test/rtsan/basic.cpp
+++ b/compiler-rt/test/rtsan/basic.cpp
@@ -1,5 +1,4 @@
 // RUN: %clangxx -fsanitize=realtime %s -o %t
-// RUN: %clang -fsanitize=realtime %s -o %t
 // RUN: not %run %t 2>&1 | FileCheck %s
 // UNSUPPORTED: ios

diff --git a/compiler-rt/test/rtsan/sanity_check_pure_c.c b/compiler-rt/test/rtsan/sanity_check_pure_c.c
new file mode 100644
index 00000000000000..bdca6039d9324d
--- /dev/null
+++ b/compiler-rt/test/rtsan/sanity_check_pure_c.c
@@ -0,0 +1,28 @@
+// RUN: %clang -fsanitize=realtime %s -o %t
+// RUN: not %run %t 2>&1 | FileCheck %s
+// RUN: %clang %s -o %t
+// RUN: %run %t 2>&1 | FileCheck %s --check-prefix=CHECK-NO-SANITIZE
+#ifdef __cplusplus
+# error "This test must be built in C mode"
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+
+// Check that we can build and run C code.
+
+void nonblocking_function(void) __attribute__((nonblocking));
+
+void nonblocking_function(void) __attribute__((nonblocking)) {
+  void *ptr = malloc(2);
+  printf("ptr: %p\n", ptr); // ensure we don't optimize out the malloc
+}
+
+int main() {
+  nonblocking_function();
+  printf("Done\n");
+  return 0;
+}
+
+// CHECK: ==ERROR: RealtimeSanitizer
+// CHECK-NO-SANITIZE: Done

From cb5f81dc94269d357c9f07892e80e42e93f66624 Mon Sep 17 00:00:00 2001
From: Rahul Joshi
Date: Wed, 18 Sep 2024 04:57:15 -0700
Subject: [PATCH 043/321] [LLVM][TableGen] Use range for loops in
 AsmMatcherEmitter (#108914)

Use range for loops in AsmMatcherEmitter. Convert some Record pointers
to const.
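
As an illustrative sketch only (not code from this patch), the loop idioms
the rewrite converges on look like the following. "Operand", "sameKinds",
and "dumpKinds" are stand-in names; the helpers come from
llvm/ADT/STLExtras.h:

  #include "llvm/ADT/STLExtras.h"
  #include <cstdio>
  #include <vector>

  struct Operand { int Kind; };

  // zip_equal pairs up two ranges and asserts their lengths match,
  // replacing "for (unsigned i = 0, e = A.size(); i != e; ++i)".
  bool sameKinds(const std::vector<Operand> &A,
                 const std::vector<Operand> &B) {
    for (const auto &[L, R] : llvm::zip_equal(A, B))
      if (L.Kind != R.Kind)
        return false;
    return true;
  }

  // enumerate yields (index, element) pairs where the index is still needed.
  void dumpKinds(const std::vector<Operand> &Ops) {
    for (const auto &[Idx, Op] : llvm::enumerate(Ops))
      std::printf("op[%zu] kind=%d\n", Idx, Op.Kind);
  }

Compared with the index-based form, these helpers make the length-equality
assumption explicit and remove the manual index bookkeeping.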
--- llvm/utils/TableGen/AsmMatcherEmitter.cpp | 147 ++++++++++------------ 1 file changed, 64 insertions(+), 83 deletions(-) diff --git a/llvm/utils/TableGen/AsmMatcherEmitter.cpp b/llvm/utils/TableGen/AsmMatcherEmitter.cpp index 2a94b77af66c2d..0c03440903fc1d 100644 --- a/llvm/utils/TableGen/AsmMatcherEmitter.cpp +++ b/llvm/utils/TableGen/AsmMatcherEmitter.cpp @@ -634,10 +634,10 @@ struct MatchableInfo { // Compare lexicographically by operand. The matcher validates that other // orderings wouldn't be ambiguous using \see couldMatchAmbiguouslyWith(). - for (unsigned i = 0, e = AsmOperands.size(); i != e; ++i) { - if (*AsmOperands[i].Class < *RHS.AsmOperands[i].Class) + for (const auto &[LHSOp, RHSOp] : zip_equal(AsmOperands, RHS.AsmOperands)) { + if (*LHSOp.Class < *RHSOp.Class) return true; - if (*RHS.AsmOperands[i].Class < *AsmOperands[i].Class) + if (*RHSOp.Class < *LHSOp.Class) return false; } @@ -692,21 +692,21 @@ struct MatchableInfo { // Tokens and operand kinds are unambiguous (assuming a correct target // specific parser). - for (unsigned i = 0, e = AsmOperands.size(); i != e; ++i) - if (AsmOperands[i].Class->Kind != RHS.AsmOperands[i].Class->Kind || - AsmOperands[i].Class->Kind == ClassInfo::Token) - if (*AsmOperands[i].Class < *RHS.AsmOperands[i].Class || - *RHS.AsmOperands[i].Class < *AsmOperands[i].Class) + for (const auto &[LHSOp, RHSOp] : zip_equal(AsmOperands, RHS.AsmOperands)) { + if (LHSOp.Class->Kind != RHSOp.Class->Kind || + LHSOp.Class->Kind == ClassInfo::Token) + if (*LHSOp.Class < *RHSOp.Class || *RHSOp.Class < *LHSOp.Class) return false; + } // Otherwise, this operand could commute if all operands are equivalent, or // there is a pair of operands that compare less than and a pair that // compare greater than. bool HasLT = false, HasGT = false; - for (unsigned i = 0, e = AsmOperands.size(); i != e; ++i) { - if (*AsmOperands[i].Class < *RHS.AsmOperands[i].Class) + for (const auto &[LHSOp, RHSOp] : zip_equal(AsmOperands, RHS.AsmOperands)) { + if (*LHSOp.Class < *RHSOp.Class) HasLT = true; - if (*RHS.AsmOperands[i].Class < *AsmOperands[i].Class) + if (*RHSOp.Class < *LHSOp.Class) HasGT = true; } @@ -810,7 +810,7 @@ class AsmMatcherInfo { /// getSubtargetFeature - Lookup or create the subtarget feature info for the /// given operand. - const SubtargetFeatureInfo *getSubtargetFeature(Record *Def) const { + const SubtargetFeatureInfo *getSubtargetFeature(const Record *Def) const { assert(Def->isSubClassOf("Predicate") && "Invalid predicate type!"); const auto &I = SubtargetFeatures.find(Def); return I == SubtargetFeatures.end() ? nullptr : &I->second; @@ -833,9 +833,8 @@ LLVM_DUMP_METHOD void MatchableInfo::dump() const { errs() << " variant: " << AsmVariantID << "\n"; - for (unsigned i = 0, e = AsmOperands.size(); i != e; ++i) { - const AsmOperand &Op = AsmOperands[i]; - errs() << " op[" << i << "] = " << Op.Class->ClassName << " - "; + for (const auto &[Idx, Op] : enumerate(AsmOperands)) { + errs() << " op[" << Idx << "] = " << Op.Class->ClassName << " - "; errs() << '\"' << Op.Token << "\"\n"; } } @@ -1490,21 +1489,18 @@ void AsmMatcherInfo::buildOperandMatchInfo() { // Keep track of all operands of this instructions which belong to the // same class. 
unsigned NumOptionalOps = 0; - for (unsigned i = 0, e = MI->AsmOperands.size(); i != e; ++i) { - const MatchableInfo::AsmOperand &Op = MI->AsmOperands[i]; + for (const auto &[Idx, Op] : enumerate(MI->AsmOperands)) { if (CallCustomParserForAllOperands || !Op.Class->ParserMethod.empty()) { unsigned &OperandMask = OpClassMask[Op.Class]; OperandMask |= maskTrailingOnes(NumOptionalOps + 1) - << (i - NumOptionalOps); + << (Idx - NumOptionalOps); } if (Op.Class->IsOptional) ++NumOptionalOps; } // Generate operand match info for each mnemonic/operand class pair. - for (const auto &OCM : OpClassMask) { - unsigned OpMask = OCM.second; - ClassInfo *CI = OCM.first; + for (const auto [CI, OpMask] : OpClassMask) { OperandMatchInfo.push_back( OperandMatchEntry::create(MI.get(), CI, OpMask)); } @@ -1613,11 +1609,11 @@ void AsmMatcherInfo::buildInfo() { for (auto &II : Matchables) { // Parse the tokens after the mnemonic. // Note: buildInstructionOperandReference may insert new AsmOperands, so - // don't precompute the loop bound. - for (unsigned i = 0; i != II->AsmOperands.size(); ++i) { - MatchableInfo::AsmOperand &Op = II->AsmOperands[i]; + // don't precompute the loop bound, i.e., cannot use range based for loop + // here. + for (size_t Idx = 0; Idx < II->AsmOperands.size(); ++Idx) { + MatchableInfo::AsmOperand &Op = II->AsmOperands[Idx]; StringRef Token = Op.Token; - // Check for singleton registers. if (const Record *RegRecord = Op.SingletonReg) { Op.Class = RegisterClasses[RegRecord]; @@ -1645,7 +1641,7 @@ void AsmMatcherInfo::buildInfo() { OperandName = Token.substr(1); if (isa(II->DefRec)) - buildInstructionOperandReference(II.get(), OperandName, i); + buildInstructionOperandReference(II.get(), OperandName, Idx); else buildAliasOperandReference(II.get(), OperandName, Op); } @@ -1779,21 +1775,21 @@ void AsmMatcherInfo::buildAliasOperandReference(MatchableInfo *II, const CodeGenInstAlias &CGA = *cast(II->DefRec); // Set up the operand class. - for (unsigned i = 0, e = CGA.ResultOperands.size(); i != e; ++i) - if (CGA.ResultOperands[i].isRecord() && - CGA.ResultOperands[i].getName() == OperandName) { + for (const auto &[ResultOp, SubOpIdx] : + zip_equal(CGA.ResultOperands, CGA.ResultInstOperandIndex)) { + if (ResultOp.isRecord() && ResultOp.getName() == OperandName) { // It's safe to go with the first one we find, because CodeGenInstAlias // validates that all operands with the same name have the same record. - Op.SubOpIdx = CGA.ResultInstOperandIndex[i].second; + Op.SubOpIdx = SubOpIdx.second; // Use the match class from the Alias definition, not the // destination instruction, as we may have an immediate that's // being munged by the match class. - Op.Class = - getOperandClass(CGA.ResultOperands[i].getRecord(), Op.SubOpIdx); + Op.Class = getOperandClass(ResultOp.getRecord(), Op.SubOpIdx); Op.SrcOpName = OperandName; Op.OrigSrcOpName = OperandName; return; } + } PrintFatalError(II->TheDef->getLoc(), "error: unable to find operand: '" + OperandName + "'"); @@ -1862,13 +1858,11 @@ void MatchableInfo::buildAliasResultOperands(bool AliasConstraintsAreChecked) { // populate them. unsigned AliasOpNo = 0; unsigned LastOpNo = CGA.ResultInstOperandIndex.size(); - for (unsigned i = 0, e = ResultInst->Operands.size(); i != e; ++i) { - const CGIOperandList::OperandInfo *OpInfo = &ResultInst->Operands[i]; - + for (const auto &[Idx, OpInfo] : enumerate(ResultInst->Operands)) { // If this is a tied operand, just copy from the previously handled operand. 
int TiedOp = -1; - if (OpInfo->MINumOperands == 1) - TiedOp = OpInfo->getTiedRegister(); + if (OpInfo.MINumOperands == 1) + TiedOp = OpInfo.getTiedRegister(); if (TiedOp != -1) { unsigned SrcOp1 = 0; unsigned SrcOp2 = 0; @@ -1898,7 +1892,7 @@ void MatchableInfo::buildAliasResultOperands(bool AliasConstraintsAreChecked) { // to benefit from the tied-operands check and just match the operand // as a normal, but not copy the original (TiedOp) to the result // instruction. We do this by passing -1 as the tied operand to copy. - if (ResultInst->Operands[i].Rec->getName() != + if (OpInfo.Rec->getName() != ResultInst->Operands[TiedOp].Rec->getName()) { SrcOp1 = ResOperands[TiedOp].AsmOperandNum; int SubIdx = CGA.ResultInstOperandIndex[AliasOpNo].second; @@ -1913,9 +1907,9 @@ void MatchableInfo::buildAliasResultOperands(bool AliasConstraintsAreChecked) { } // Handle all the suboperands for this operand. - const std::string &OpName = OpInfo->Name; + const std::string &OpName = OpInfo.Name; for (; AliasOpNo < LastOpNo && - CGA.ResultInstOperandIndex[AliasOpNo].first == i; + CGA.ResultInstOperandIndex[AliasOpNo].first == Idx; ++AliasOpNo) { int SubIdx = CGA.ResultInstOperandIndex[AliasOpNo].second; @@ -1935,7 +1929,7 @@ void MatchableInfo::buildAliasResultOperands(bool AliasConstraintsAreChecked) { // record won't be updated and it will fail later on. OperandRefs.try_emplace(Name, SrcOperand); - unsigned NumOperands = (SubIdx == -1 ? OpInfo->MINumOperands : 1); + unsigned NumOperands = (SubIdx == -1 ? OpInfo.MINumOperands : 1); ResOperands.push_back( ResOperand::getRenderedOp(SrcOperand, NumOperands)); break; @@ -2110,9 +2104,7 @@ emitConvertFuncs(CodeGenTarget &Target, StringRef ClassName, // Compute the convert enum and the case body. MaxRowLength = std::max(MaxRowLength, II->ResOperands.size() * 2 + 1); - for (unsigned i = 0, e = II->ResOperands.size(); i != e; ++i) { - const MatchableInfo::ResOperand &OpInfo = II->ResOperands[i]; - + for (const auto &[Idx, OpInfo] : enumerate(II->ResOperands)) { // Generate code to populate each result operand. 
switch (OpInfo.Kind) { case MatchableInfo::ResOperand::RenderAsmOperand: { @@ -2194,7 +2186,7 @@ emitConvertFuncs(CodeGenTarget &Target, StringRef ClassName, uint8_t TiedOp = OpInfo.TiedOperands.ResOpnd; uint8_t SrcOp1 = OpInfo.TiedOperands.SrcOpnd1Idx + HasMnemonicFirst; uint8_t SrcOp2 = OpInfo.TiedOperands.SrcOpnd2Idx + HasMnemonicFirst; - assert((i > TiedOp || TiedOp == (uint8_t)-1) && + assert((Idx > TiedOp || TiedOp == (uint8_t)-1) && "Tied operand precedes its target!"); auto TiedTupleName = std::string("Tie") + utostr(TiedOp) + '_' + utostr(SrcOp1) + '_' + utostr(SrcOp2); @@ -2730,26 +2722,21 @@ static void emitGetSubtargetFeatureName(AsmMatcherInfo &Info, raw_ostream &OS) { OS << "}\n\n"; } -static std::string GetAliasRequiredFeatures(Record *R, +static std::string GetAliasRequiredFeatures(const Record *R, const AsmMatcherInfo &Info) { - std::vector ReqFeatures = R->getValueAsListOfDefs("Predicates"); std::string Result; - if (ReqFeatures.empty()) - return Result; - - for (unsigned i = 0, e = ReqFeatures.size(); i != e; ++i) { - const SubtargetFeatureInfo *F = Info.getSubtargetFeature(ReqFeatures[i]); - + bool First = true; + for (const Record *RF : R->getValueAsListOfDefs("Predicates")) { + const SubtargetFeatureInfo *F = Info.getSubtargetFeature(RF); if (!F) PrintFatalError(R->getLoc(), - "Predicate '" + ReqFeatures[i]->getName() + + "Predicate '" + RF->getName() + "' is not marked as an AssemblerPredicate!"); - - if (i) + if (!First) Result += " && "; - Result += "Features.test(" + F->getEnumBitName() + ')'; + First = false; } return Result; @@ -2778,16 +2765,14 @@ emitMnemonicAliasVariant(raw_ostream &OS, const AsmMatcherInfo &Info, // by the string remapper. std::vector Cases; for (const auto &AliasEntry : AliasesFromMnemonic) { - const std::vector &ToVec = AliasEntry.second; - // Loop through each alias and emit code that handles each case. If there // are two instructions without predicates, emit an error. If there is one, // emit it last. 
std::string MatchCode; int AliasWithNoPredicate = -1; - for (unsigned i = 0, e = ToVec.size(); i != e; ++i) { - Record *R = ToVec[i]; + ArrayRef ToVec = AliasEntry.second; + for (const auto &[Idx, R] : enumerate(ToVec)) { std::string FeatureMask = GetAliasRequiredFeatures(R, Info); // If this unconditionally matches, remember it for later and diagnose @@ -2804,7 +2789,7 @@ emitMnemonicAliasVariant(raw_ostream &OS, const AsmMatcherInfo &Info, PrintFatalError(R->getLoc(), "this is the other MnemonicAlias."); } - AliasWithNoPredicate = i; + AliasWithNoPredicate = Idx; continue; } if (R->getValueAsString("ToMnemonic") == AliasEntry.first) @@ -2819,7 +2804,7 @@ emitMnemonicAliasVariant(raw_ostream &OS, const AsmMatcherInfo &Info, } if (AliasWithNoPredicate != -1) { - Record *R = ToVec[AliasWithNoPredicate]; + const Record *R = ToVec[AliasWithNoPredicate]; if (!MatchCode.empty()) MatchCode += "else\n "; MatchCode += "Mnemonic = \""; @@ -2955,8 +2940,8 @@ emitCustomOperandParsing(raw_ostream &OS, CodeGenTarget &Target, if (II.RequiredFeatures.empty()) OS << "_None"; else - for (unsigned i = 0, e = II.RequiredFeatures.size(); i != e; ++i) - OS << '_' << II.RequiredFeatures[i]->TheDef->getName(); + for (const auto &F : II.RequiredFeatures) + OS << '_' << F->TheDef->getName(); OS << " },\n"; } @@ -3467,24 +3452,20 @@ void AsmMatcherEmitter::run(raw_ostream &OS) { if (MI->RequiredFeatures.empty()) continue; FeatureBitsets.emplace_back(); - for (unsigned I = 0, E = MI->RequiredFeatures.size(); I != E; ++I) - FeatureBitsets.back().push_back(MI->RequiredFeatures[I]->TheDef); + for (const auto *F : MI->RequiredFeatures) + FeatureBitsets.back().push_back(F->TheDef); } - llvm::sort(FeatureBitsets, [&](const std::vector &A, - const std::vector &B) { - if (A.size() < B.size()) - return true; - if (A.size() > B.size()) - return false; - for (auto Pair : zip(A, B)) { - if (std::get<0>(Pair)->getName() < std::get<1>(Pair)->getName()) - return true; - if (std::get<0>(Pair)->getName() > std::get<1>(Pair)->getName()) - return false; - } - return false; - }); + llvm::sort(FeatureBitsets, + [&](ArrayRef A, ArrayRef B) { + if (A.size() != B.size()) + return A.size() < B.size(); + for (const auto [ARec, BRec] : zip_equal(A, B)) { + if (ARec->getName() != BRec->getName()) + return ARec->getName() < BRec->getName(); + } + return false; + }); FeatureBitsets.erase(llvm::unique(FeatureBitsets), FeatureBitsets.end()); OS << "// Feature bitsets.\n" << "enum : " << getMinimalTypeForRange(FeatureBitsets.size()) << " {\n" @@ -3577,8 +3558,8 @@ void AsmMatcherEmitter::run(raw_ostream &OS) { if (MI->RequiredFeatures.empty()) OS << "_None"; else - for (unsigned i = 0, e = MI->RequiredFeatures.size(); i != e; ++i) - OS << '_' << MI->RequiredFeatures[i]->TheDef->getName(); + for (const auto &F : MI->RequiredFeatures) + OS << '_' << F->TheDef->getName(); OS << ", { "; ListSeparator LS; From 8fc3ac4cbd5c838dafd1fc9077cfe07eee69ccce Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Wed, 18 Sep 2024 04:58:55 -0700 Subject: [PATCH 044/321] [LLVM][TableGen] Change AsmWriterEmitter to const RecordKeeper (#108918) Change AsmWriterEmitter to const RecordKeeper. 
This is part of an effort to have better const correctness in TableGen
backends:

https://discourse.llvm.org/t/psa-planned-changes-to-tablegen-getallderiveddefinitions-api-potential-downstream-breakages/81089
---
 llvm/utils/TableGen/AsmWriterEmitter.cpp | 41 +++++++++++-------------
 1 file changed, 19 insertions(+), 22 deletions(-)

diff --git a/llvm/utils/TableGen/AsmWriterEmitter.cpp b/llvm/utils/TableGen/AsmWriterEmitter.cpp
index cbf3a380b442d9..3f1f937e5fd8a2 100644
--- a/llvm/utils/TableGen/AsmWriterEmitter.cpp
+++ b/llvm/utils/TableGen/AsmWriterEmitter.cpp
@@ -55,13 +55,13 @@ using namespace llvm;
 namespace {
 
 class AsmWriterEmitter {
-  RecordKeeper &Records;
+  const RecordKeeper &Records;
   CodeGenTarget Target;
   ArrayRef<const CodeGenInstruction *> NumberedInstructions;
   std::vector<AsmWriterInst> Instructions;
 
 public:
-  AsmWriterEmitter(RecordKeeper &R);
+  AsmWriterEmitter(const RecordKeeper &R);
 
   void run(raw_ostream &o);
 
@@ -326,7 +326,7 @@ void AsmWriterEmitter::EmitGetMnemonic(
     raw_ostream &O,
     std::vector<std::vector<std::string>> &TableDrivenOperandPrinters,
     unsigned &BitsLeft, unsigned &AsmStrBits) {
-  Record *AsmWriter = Target.getAsmWriter();
+  const Record *AsmWriter = Target.getAsmWriter();
   StringRef ClassName = AsmWriter->getValueAsString("AsmWriterClassName");
   bool PassSubtarget = AsmWriter->getValueAsInt("PassSubtarget");
 
@@ -486,7 +486,7 @@ void AsmWriterEmitter::EmitPrintInstruction(
     std::vector<std::vector<std::string>> &TableDrivenOperandPrinters,
     unsigned &BitsLeft, unsigned &AsmStrBits) {
   const unsigned OpcodeInfoBits = 64;
-  Record *AsmWriter = Target.getAsmWriter();
+  const Record *AsmWriter = Target.getAsmWriter();
   StringRef ClassName = AsmWriter->getValueAsString("AsmWriterClassName");
   bool PassSubtarget = AsmWriter->getValueAsInt("PassSubtarget");
 
@@ -596,8 +596,8 @@ emitRegisterNameString(raw_ostream &O, StringRef AltName,
       AsmName = std::string(Reg.getName());
     } else {
       // Make sure the register has an alternate name for this index.
-      std::vector<Record *> AltNameList =
-          Reg.TheDef->getValueAsListOfDefs("RegAltNameIndices");
+      std::vector<const Record *> AltNameList =
+          Reg.TheDef->getValueAsListOfConstDefs("RegAltNameIndices");
       unsigned Idx = 0, e;
       for (e = AltNameList.size();
            Idx < e && (AltNameList[Idx]->getName() != AltName); ++Idx)
@@ -633,7 +633,7 @@ emitRegisterNameString(raw_ostream &O, StringRef AltName,
 }
 
 void AsmWriterEmitter::EmitGetRegisterName(raw_ostream &O) {
-  Record *AsmWriter = Target.getAsmWriter();
+  const Record *AsmWriter = Target.getAsmWriter();
   StringRef ClassName = AsmWriter->getValueAsString("AsmWriterClassName");
   const auto &Registers = Target.getRegBank().getRegisters();
   ArrayRef<const Record *> AltNameIndices = Target.getRegAltNameIndices();
@@ -829,7 +829,7 @@ struct AliasPriorityComparator {
 } // end anonymous namespace
 
 void AsmWriterEmitter::EmitPrintAliasInstruction(raw_ostream &O) {
-  Record *AsmWriter = Target.getAsmWriter();
+  const Record *AsmWriter = Target.getAsmWriter();
 
   O << "\n#ifdef PRINT_ALIAS_INSTR\n";
   O << "#undef PRINT_ALIAS_INSTR\n\n";
@@ -843,14 +843,11 @@ void AsmWriterEmitter::EmitPrintAliasInstruction(raw_ostream &O) {
   unsigned Variant = AsmWriter->getValueAsInt("Variant");
   bool PassSubtarget = AsmWriter->getValueAsInt("PassSubtarget");
 
-  std::vector<Record *> AllInstAliases =
-      Records.getAllDerivedDefinitions("InstAlias");
-
   // Create a map from the qualified name to a list of potential matches.
   typedef std::set<std::pair<CodeGenInstAlias, int>, AliasPriorityComparator>
       AliasWithPriority;
   std::map<std::string, AliasWithPriority> AliasMap;
-  for (Record *R : AllInstAliases) {
+  for (const Record *R : Records.getAllDerivedDefinitions("InstAlias")) {
     int Priority = R->getValueAsInt("EmitPriority");
     if (Priority < 1)
       continue; // Aliases with priority 0 are never emitted.
@@ -1011,17 +1008,17 @@ void AsmWriterEmitter::EmitPrintAliasInstruction(raw_ostream &O) {
         MIOpNum += RO.getMINumOperands();
       }
 
-      std::vector<Record *> ReqFeatures;
+      std::vector<const Record *> ReqFeatures;
       if (PassSubtarget) {
         // We only consider ReqFeatures predicates if PassSubtarget
-        std::vector<Record *> RF =
-            CGA.TheDef->getValueAsListOfDefs("Predicates");
-        copy_if(RF, std::back_inserter(ReqFeatures), [](Record *R) {
+        std::vector<const Record *> RF =
+            CGA.TheDef->getValueAsListOfConstDefs("Predicates");
+        copy_if(RF, std::back_inserter(ReqFeatures), [](const Record *R) {
           return R->getValueAsBit("AssemblerMatcherPredicate");
         });
       }
 
-      for (Record *const R : ReqFeatures) {
+      for (const Record *R : ReqFeatures) {
         const DagInit *D = R->getValueAsDag("AssemblerCondDag");
         auto *Op = dyn_cast<DefInit>(D->getOperator());
         if (!Op)
@@ -1315,17 +1312,17 @@ void AsmWriterEmitter::EmitPrintAliasInstruction(raw_ostream &O) {
   O << "#endif // PRINT_ALIAS_INSTR\n";
 }
 
-AsmWriterEmitter::AsmWriterEmitter(RecordKeeper &R) : Records(R), Target(R) {
-  Record *AsmWriter = Target.getAsmWriter();
+AsmWriterEmitter::AsmWriterEmitter(const RecordKeeper &R)
+    : Records(R), Target(R) {
+  const Record *AsmWriter = Target.getAsmWriter();
   unsigned Variant = AsmWriter->getValueAsInt("Variant");
 
   // Get the instruction numbering.
   NumberedInstructions = Target.getInstructionsByEnumValue();
 
-  for (unsigned i = 0, e = NumberedInstructions.size(); i != e; ++i) {
-    const CodeGenInstruction *I = NumberedInstructions[i];
+  for (const auto &[Idx, I] : enumerate(NumberedInstructions)) {
     if (!I->AsmString.empty() && I->TheDef->getName() != "PHI")
-      Instructions.emplace_back(*I, i, Variant);
+      Instructions.emplace_back(*I, Idx, Variant);
   }
 }

From 6b6e2106f974286715bda1abf95d4ab08ac9946f Mon Sep 17 00:00:00 2001
From: Rahul Joshi
Date: Wed, 18 Sep 2024 05:08:15 -0700
Subject: [PATCH 045/321] [LLVM][TableGen] Change CodeGenMapTable to use const
 RecordKeeper (#109034)

Change CodeGenMapTable to use const RecordKeeper.
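The same recipe recurs for list-valued record fields throughout this series.
A minimal before/after sketch, where the helper function and the `Rec`
parameter are hypothetical; only the two accessors and the field names are
taken from the diffs earlier in the series:

  #include "llvm/TableGen/Record.h"
  using namespace llvm;

  // Hypothetical helper contrasting the old and new list-of-defs accessors.
  static void collectPredicates(const Record *Rec) {
    // Before the migration, mutable pointers:
    //   std::vector<Record *> RF = Rec->getValueAsListOfDefs("Predicates");
    // After, const pointers via the const-aware accessor:
    std::vector<const Record *> RF =
        Rec->getValueAsListOfConstDefs("Predicates");
    for (const Record *R : RF)
      (void)R->getValueAsBit("AssemblerMatcherPredicate"); // read-only query
  }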
This is part of an effort to have better const correctness in TableGen
backends:

https://discourse.llvm.org/t/psa-planned-changes-to-tablegen-getallderiveddefinitions-api-potential-downstream-breakages/81089
---
 llvm/utils/TableGen/CodeGenMapTable.cpp | 179 +++++++++++-------------
 llvm/utils/TableGen/TableGenBackends.h  |   2 +-
 2 files changed, 84 insertions(+), 97 deletions(-)

diff --git a/llvm/utils/TableGen/CodeGenMapTable.cpp b/llvm/utils/TableGen/CodeGenMapTable.cpp
index 46aad7f7f8bdd9..b599ee149bcd2b 100644
--- a/llvm/utils/TableGen/CodeGenMapTable.cpp
+++ b/llvm/utils/TableGen/CodeGenMapTable.cpp
@@ -77,12 +77,15 @@
 #include "Common/CodeGenInstruction.h"
 #include "Common/CodeGenTarget.h"
+#include "TableGenBackends.h"
+#include "llvm/ADT/StringExtras.h"
 #include "llvm/TableGen/Error.h"
 #include "llvm/TableGen/Record.h"
-using namespace llvm;
-typedef std::map<std::string, std::vector<Record *>> InstrRelMapTy;
-typedef std::map<std::vector<Init *>, std::vector<Record *>> RowInstrMapTy;
+using namespace llvm;
+typedef std::map<std::string, std::vector<const Record *>> InstrRelMapTy;
+typedef std::map<std::vector<const Init *>, std::vector<const Record *>>
+    RowInstrMapTy;
 
 namespace {
@@ -92,13 +95,13 @@ class InstrMap {
 private:
   std::string Name;
   std::string FilterClass;
-  ListInit *RowFields;
-  ListInit *ColFields;
-  ListInit *KeyCol;
-  std::vector<ListInit *> ValueCols;
+  const ListInit *RowFields;
+  const ListInit *ColFields;
+  const ListInit *KeyCol;
+  std::vector<const ListInit *> ValueCols;
 
 public:
-  InstrMap(Record *MapRec) {
+  InstrMap(const Record *MapRec) {
     Name = std::string(MapRec->getName());
 
     // FilterClass - It's used to reduce the search space only to the
@@ -133,8 +136,8 @@ class InstrMap {
                       MapRec->getName() + "' has empty " + "`ValueCols' field!");
 
-    for (Init *I : ColValList->getValues()) {
-      auto *ColI = cast<ListInit>(I);
+    for (const Init *I : ColValList->getValues()) {
+      const auto *ColI = cast<ListInit>(I);
 
       // Make sure that all the sub-lists in 'ValueCols' have same number of
       // elements as the fields in 'ColFields'.
@@ -148,18 +151,12 @@ class InstrMap {
   }
 
   const std::string &getName() const { return Name; }
-
   const std::string &getFilterClass() const { return FilterClass; }
-
-  ListInit *getRowFields() const { return RowFields; }
-
-  ListInit *getColFields() const { return ColFields; }
-
-  ListInit *getKeyCol() const { return KeyCol; }
-
-  const std::vector<ListInit *> &getValueCols() const { return ValueCols; }
+  const ListInit *getRowFields() const { return RowFields; }
+  const ListInit *getColFields() const { return ColFields; }
+  const ListInit *getKeyCol() const { return KeyCol; }
+  ArrayRef<const ListInit *> getValueCols() const { return ValueCols; }
 };
-} // end anonymous namespace
 
 //===----------------------------------------------------------------------===//
 // class MapTableEmitter : It builds the instruction relation maps using
@@ -167,7 +164,6 @@ class InstrMap {
 // relationship maps as tables into XXXGenInstrInfo.inc file along with the
 // functions to query them.
 
-namespace {
 class MapTableEmitter {
 private:
   //  std::string TargetName;
@@ -177,18 +173,19 @@ class MapTableEmitter {
 
   // InstrDefs - list of instructions filtered using FilterClass defined
   // in InstrMapDesc.
-  std::vector<Record *> InstrDefs;
+  ArrayRef<const Record *> InstrDefs;
 
   // RowInstrMap - maps RowFields values to the instructions. It's keyed by the
   // values of the row fields and contains vector of records as values.
   RowInstrMapTy RowInstrMap;
 
   // KeyInstrVec - list of key instructions.
-  std::vector<Record *> KeyInstrVec;
-  DenseMap<Record *, std::vector<Record *>> MapTable;
+  std::vector<const Record *> KeyInstrVec;
+  DenseMap<const Record *, std::vector<const Record *>> MapTable;
 
 public:
-  MapTableEmitter(CodeGenTarget &Target, RecordKeeper &Records, Record *IMRec)
+  MapTableEmitter(const CodeGenTarget &Target, const RecordKeeper &Records,
+                  const Record *IMRec)
       : Target(Target), InstrMapDesc(IMRec) {
     const std::string &FilterClass = InstrMapDesc.getFilterClass();
     InstrDefs = Records.getAllDerivedDefinitions(FilterClass);
@@ -198,11 +195,12 @@ class MapTableEmitter {
 
   // Returns true if an instruction is a key instruction, i.e., its ColFields
   // have same values as KeyCol.
-  bool isKeyColInstr(Record *CurInstr);
+  bool isKeyColInstr(const Record *CurInstr);
 
   // Find column instruction corresponding to a key instruction based on the
   // constraints for that column.
-  Record *getInstrForColumn(Record *KeyInstr, ListInit *CurValueCol);
+  const Record *getInstrForColumn(const Record *KeyInstr,
+                                  const ListInit *CurValueCol);
 
   // Find column instructions for each key instruction based
   // on ValueCols and store them into MapTable.
@@ -226,17 +224,17 @@ class MapTableEmitter {
 //===----------------------------------------------------------------------===//
 
 void MapTableEmitter::buildRowInstrMap() {
-  for (Record *CurInstr : InstrDefs) {
-    std::vector<Init *> KeyValue;
-    ListInit *RowFields = InstrMapDesc.getRowFields();
-    for (Init *RowField : RowFields->getValues()) {
-      RecordVal *RecVal = CurInstr->getValue(RowField);
+  for (const Record *CurInstr : InstrDefs) {
+    std::vector<const Init *> KeyValue;
+    const ListInit *RowFields = InstrMapDesc.getRowFields();
+    for (const Init *RowField : RowFields->getValues()) {
+      const RecordVal *RecVal = CurInstr->getValue(RowField);
       if (RecVal == nullptr)
         PrintFatalError(CurInstr->getLoc(),
                         "No value " + RowField->getAsString() + " found in \"" +
                             CurInstr->getName() +
                             "\" instruction description.");
-      Init *CurInstrVal = RecVal->getValue();
+      const Init *CurInstrVal = RecVal->getValue();
       KeyValue.push_back(CurInstrVal);
     }
 
@@ -254,18 +252,19 @@ void MapTableEmitter::buildRowInstrMap() {
 // Return true if an instruction is a KeyCol instruction.
 //===----------------------------------------------------------------------===//
 
-bool MapTableEmitter::isKeyColInstr(Record *CurInstr) {
-  ListInit *ColFields = InstrMapDesc.getColFields();
-  ListInit *KeyCol = InstrMapDesc.getKeyCol();
+bool MapTableEmitter::isKeyColInstr(const Record *CurInstr) {
+  const ListInit *ColFields = InstrMapDesc.getColFields();
+  const ListInit *KeyCol = InstrMapDesc.getKeyCol();
 
   // Check if the instruction is a KeyCol instruction.
   bool MatchFound = true;
   for (unsigned j = 0, endCF = ColFields->size(); (j < endCF) && MatchFound;
        j++) {
-    RecordVal *ColFieldName = CurInstr->getValue(ColFields->getElement(j));
+    const RecordVal *ColFieldName =
+        CurInstr->getValue(ColFields->getElement(j));
     std::string CurInstrVal = ColFieldName->getValue()->getAsUnquotedString();
     std::string KeyColValue = KeyCol->getElement(j)->getAsUnquotedString();
-    MatchFound = (CurInstrVal == KeyColValue);
+    MatchFound = CurInstrVal == KeyColValue;
   }
   return MatchFound;
 }
@@ -278,15 +277,15 @@ bool MapTableEmitter::isKeyColInstr(const Record *CurInstr) {
 void MapTableEmitter::buildMapTable() {
   // Find column instructions for a given key based on the ColField
   // constraints.
-  const std::vector<ListInit *> &ValueCols = InstrMapDesc.getValueCols();
+  ArrayRef<const ListInit *> ValueCols = InstrMapDesc.getValueCols();
   unsigned NumOfCols = ValueCols.size();
-  for (Record *CurKeyInstr : KeyInstrVec) {
-    std::vector<Record *> ColInstrVec(NumOfCols);
+  for (const Record *CurKeyInstr : KeyInstrVec) {
+    std::vector<const Record *> ColInstrVec(NumOfCols);
 
     // Find the column instruction based on the constraints for the column.
     for (unsigned ColIdx = 0; ColIdx < NumOfCols; ColIdx++) {
-      ListInit *CurValueCol = ValueCols[ColIdx];
-      Record *ColInstr = getInstrForColumn(CurKeyInstr, CurValueCol);
+      const ListInit *CurValueCol = ValueCols[ColIdx];
+      const Record *ColInstr = getInstrForColumn(CurKeyInstr, CurValueCol);
       ColInstrVec[ColIdx] = ColInstr;
     }
     MapTable[CurKeyInstr] = ColInstrVec;
@@ -297,14 +296,14 @@ void MapTableEmitter::buildMapTable() {
 // Find column instruction based on the constraints for that column.
 //===----------------------------------------------------------------------===//
 
-Record *MapTableEmitter::getInstrForColumn(Record *KeyInstr,
-                                           ListInit *CurValueCol) {
-  ListInit *RowFields = InstrMapDesc.getRowFields();
-  std::vector<Init *> KeyValue;
+const Record *MapTableEmitter::getInstrForColumn(const Record *KeyInstr,
+                                                 const ListInit *CurValueCol) {
+  const ListInit *RowFields = InstrMapDesc.getRowFields();
+  std::vector<const Init *> KeyValue;
 
   // Construct KeyValue using KeyInstr's values for RowFields.
-  for (Init *RowField : RowFields->getValues()) {
-    Init *KeyInstrVal = KeyInstr->getValue(RowField)->getValue();
+  for (const Init *RowField : RowFields->getValues()) {
+    const Init *KeyInstrVal = KeyInstr->getValue(RowField)->getValue();
     KeyValue.push_back(KeyInstrVal);
   }
 
@@ -312,20 +311,20 @@ Record *MapTableEmitter::getInstrForColumn(Record *KeyInstr,
   // in RowInstrMap. We search through these instructions to find a match
   // for the current column, i.e., the instruction which has the same values
   // as CurValueCol for all the fields in ColFields.
-  const std::vector<Record *> &RelatedInstrVec = RowInstrMap[KeyValue];
+  ArrayRef<const Record *> RelatedInstrVec = RowInstrMap[KeyValue];
 
-  ListInit *ColFields = InstrMapDesc.getColFields();
-  Record *MatchInstr = nullptr;
+  const ListInit *ColFields = InstrMapDesc.getColFields();
+  const Record *MatchInstr = nullptr;
 
-  for (llvm::Record *CurInstr : RelatedInstrVec) {
+  for (const Record *CurInstr : RelatedInstrVec) {
     bool MatchFound = true;
     for (unsigned j = 0, endCF = ColFields->size(); (j < endCF) && MatchFound;
         j++) {
-      Init *ColFieldJ = ColFields->getElement(j);
-      Init *CurInstrInit = CurInstr->getValue(ColFieldJ)->getValue();
+      const Init *ColFieldJ = ColFields->getElement(j);
+      const Init *CurInstrInit = CurInstr->getValue(ColFieldJ)->getValue();
       std::string CurInstrVal = CurInstrInit->getAsUnquotedString();
-      Init *ColFieldJVallue = CurValueCol->getElement(j);
-      MatchFound = (CurInstrVal == ColFieldJVallue->getAsUnquotedString());
+      const Init *ColFieldJVallue = CurValueCol->getElement(j);
+      MatchFound = CurInstrVal == ColFieldJVallue->getAsUnquotedString();
     }
 
     if (MatchFound) {
@@ -333,7 +332,7 @@ Record *MapTableEmitter::getInstrForColumn(Record *KeyInstr,
       // Already had a match
       // Error if multiple matches are found for a column.
       std::string KeyValueStr;
-      for (Init *Value : KeyValue) {
+      for (const Init *Value : KeyValue) {
         if (!KeyValueStr.empty())
           KeyValueStr += ", ";
         KeyValueStr += Value->getAsString();
@@ -357,11 +356,10 @@ Record *MapTableEmitter::getInstrForColumn(Record *KeyInstr,
 //===----------------------------------------------------------------------===//
 
 unsigned MapTableEmitter::emitBinSearchTable(raw_ostream &OS) {
-
   ArrayRef<const CodeGenInstruction *> NumberedInstructions =
       Target.getInstructionsByEnumValue();
   StringRef Namespace = Target.getInstNamespace();
-  const std::vector<ListInit *> &ValueCols = InstrMapDesc.getValueCols();
+  ArrayRef<const ListInit *> ValueCols = InstrMapDesc.getValueCols();
   unsigned NumCol = ValueCols.size();
   unsigned TotalNumInstr = NumberedInstructions.size();
   unsigned TableSize = 0;
@@ -372,7 +370,7 @@ unsigned MapTableEmitter::emitBinSearchTable(raw_ostream &OS) {
   OS << "Table[][" << NumCol + 1 << "] = {\n";
   for (unsigned i = 0; i < TotalNumInstr; i++) {
     const Record *CurInstr = NumberedInstructions[i]->TheDef;
-    std::vector<Record *> ColInstrs = MapTable[CurInstr];
+    ArrayRef<const Record *> ColInstrs = MapTable[CurInstr];
     std::string OutStr;
     unsigned RelExists = 0;
     if (!ColInstrs.empty()) {
@@ -434,8 +432,8 @@ void MapTableEmitter::emitBinSearch(raw_ostream &OS, unsigned TableSize) {
 
 void MapTableEmitter::emitMapFuncBody(raw_ostream &OS,
                                       unsigned TableSize) {
-  ListInit *ColFields = InstrMapDesc.getColFields();
-  const std::vector<ListInit *> &ValueCols = InstrMapDesc.getValueCols();
+  const ListInit *ColFields = InstrMapDesc.getColFields();
+  ArrayRef<const ListInit *> ValueCols = InstrMapDesc.getValueCols();
 
   // Emit binary search algorithm to locate instructions in the
   // relation table. If found, return opcode value from the appropriate column
@@ -444,7 +442,7 @@ void MapTableEmitter::emitMapFuncBody(raw_ostream &OS, unsigned TableSize) {
 
   if (ValueCols.size() > 1) {
     for (unsigned i = 0, e = ValueCols.size(); i < e; i++) {
-      ListInit *ColumnI = ValueCols[i];
+      const ListInit *ColumnI = ValueCols[i];
       OS << "  if (";
       for (unsigned j = 0, ColSize = ColumnI->size(); j < ColSize; ++j) {
         std::string ColName = ColFields->getElement(j)->getAsUnquotedString();
@@ -476,8 +474,8 @@ void MapTableEmitter::emitTablesWithFunc(raw_ostream &OS) {
   // since first column is used for the key instructions), then we also need
   // to pass another input to indicate the column to be selected.
 
-  ListInit *ColFields = InstrMapDesc.getColFields();
-  const std::vector<ListInit *> &ValueCols = InstrMapDesc.getValueCols();
+  const ListInit *ColFields = InstrMapDesc.getColFields();
+  ArrayRef<const ListInit *> ValueCols = InstrMapDesc.getValueCols();
   OS << "// " << InstrMapDesc.getName() << "\nLLVM_READONLY\n";
   OS << "int " << InstrMapDesc.getName() << "(uint16_t Opcode";
   if (ValueCols.size() > 1) {
@@ -499,23 +497,20 @@ void MapTableEmitter::emitTablesWithFunc(raw_ostream &OS) {
 // Emit enums for the column fields across all the instruction maps.
 //===----------------------------------------------------------------------===//
 
-static void emitEnums(raw_ostream &OS, RecordKeeper &Records) {
-
-  std::vector<Record *> InstrMapVec;
-  InstrMapVec = Records.getAllDerivedDefinitions("InstrMapping");
-  std::map<std::string, std::vector<Init *>> ColFieldValueMap;
+static void emitEnums(raw_ostream &OS, const RecordKeeper &Records) {
+  std::map<std::string, std::vector<const Init *>> ColFieldValueMap;
 
   // Iterate over all InstrMapping records and create a map between column
   // fields and their possible values across all records.
-  for (Record *CurMap : InstrMapVec) {
-    ListInit *ColFields;
-    ColFields = CurMap->getValueAsListInit("ColFields");
-    ListInit *List = CurMap->getValueAsListInit("ValueCols");
-    std::vector<ListInit *> ValueCols;
+  for (const Record *CurMap :
+       Records.getAllDerivedDefinitions("InstrMapping")) {
+    const ListInit *ColFields = CurMap->getValueAsListInit("ColFields");
+    const ListInit *List = CurMap->getValueAsListInit("ValueCols");
+    std::vector<const ListInit *> ValueCols;
     unsigned ListSize = List->size();
 
     for (unsigned j = 0; j < ListSize; j++) {
-      auto *ListJ = cast<ListInit>(List->getElement(j));
+      const auto *ListJ = cast<ListInit>(List->getElement(j));
 
       if (ListJ->size() != ColFields->size())
         PrintFatalError("Record `" + CurMap->getName() +
@@ -533,12 +528,10 @@ static void emitEnums(raw_ostream &OS, RecordKeeper &Records) {
     }
   }
 
-  for (auto &Entry : ColFieldValueMap) {
-    std::vector<Init *> FieldValues = Entry.second;
-
+  for (auto &[EnumName, FieldValues] : ColFieldValueMap) {
     // Delete duplicate entries from ColFieldValueMap
     for (unsigned i = 0; i < FieldValues.size() - 1; i++) {
-      Init *CurVal = FieldValues[i];
+      const Init *CurVal = FieldValues[i];
       for (unsigned j = i + 1; j < FieldValues.size(); j++) {
         if (CurVal == FieldValues[j]) {
          FieldValues.erase(FieldValues.begin() + j);
@@ -548,28 +541,24 @@ static void emitEnums(raw_ostream &OS, RecordKeeper &Records) {
     }
 
     // Emit enumerated values for the column fields.
-    OS << "enum " << Entry.first << " {\n";
-    for (unsigned i = 0, endFV = FieldValues.size(); i < endFV; i++) {
-      OS << "\t" << Entry.first << "_" << FieldValues[i]->getAsUnquotedString();
-      if (i != endFV - 1)
-        OS << ",\n";
-      else
-        OS << "\n};\n\n";
-    }
+    OS << "enum " << EnumName << " {\n";
+    ListSeparator LS(",\n");
+    for (const Init *Field : FieldValues)
+      OS << LS << "\t" << EnumName << "_" << Field->getAsUnquotedString();
+    OS << "\n};\n\n";
   }
 }
 
-namespace llvm {
 //===----------------------------------------------------------------------===//
 // Parse 'InstrMapping' records and use the information to form relationship
 // between instructions. These relations are emitted as a tables along with the
 // functions to query them.
 //===----------------------------------------------------------------------===//
-void EmitMapTable(RecordKeeper &Records, raw_ostream &OS) {
+void llvm::EmitMapTable(const RecordKeeper &Records, raw_ostream &OS) {
   CodeGenTarget Target(Records);
   StringRef NameSpace = Target.getInstNamespace();
-  std::vector<Record *> InstrMapVec;
-  InstrMapVec = Records.getAllDerivedDefinitions("InstrMapping");
+  ArrayRef<const Record *> InstrMapVec =
      Records.getAllDerivedDefinitions("InstrMapping");
 
   if (InstrMapVec.empty())
     return;
@@ -585,7 +574,7 @@ void EmitMapTable(RecordKeeper &Records, raw_ostream &OS) {
   // Iterate over all instruction mapping records and construct relationship
   // maps based on the information specified there.
// - for (Record *CurMap : InstrMapVec) { + for (const Record *CurMap : InstrMapVec) { MapTableEmitter IMap(Target, Records, CurMap); // Build RowInstrMap to group instructions based on their values for @@ -604,5 +593,3 @@ void EmitMapTable(RecordKeeper &Records, raw_ostream &OS) { OS << "} // end namespace llvm\n"; OS << "#endif // GET_INSTRMAP_INFO\n\n"; } - -} // namespace llvm diff --git a/llvm/utils/TableGen/TableGenBackends.h b/llvm/utils/TableGen/TableGenBackends.h index e0d12abaaa0376..fc3b87370766a3 100644 --- a/llvm/utils/TableGen/TableGenBackends.h +++ b/llvm/utils/TableGen/TableGenBackends.h @@ -61,7 +61,7 @@ namespace llvm { class raw_ostream; class RecordKeeper; -void EmitMapTable(RecordKeeper &RK, raw_ostream &OS); +void EmitMapTable(const RecordKeeper &RK, raw_ostream &OS); // Defined in DecoderEmitter.cpp void EmitDecoder(RecordKeeper &RK, raw_ostream &OS, From 40c45b6b43180221acb49f387e7d3158adf49e3e Mon Sep 17 00:00:00 2001 From: Aaron Ballman Date: Wed, 18 Sep 2024 08:25:58 -0400 Subject: [PATCH 046/321] Remove clang-rename (#108988) clang-rename has largely been superseded by clangd and this project hasn't received much attention in many years. Further, our documentation on it still claims it's in very early stages of development despite being ~10 years old. One of the primary people driving the tool has mentioned that they don't believe there is a reason to continue to support it unless it's still being actively used (https://reviews.llvm.org/D148439#4303202) and I've found no evidence to suggest that is the case. Original RFC: https://discourse.llvm.org/t/rfc-time-to-deprecate-remove-clang-rename/70707 --- clang-tools-extra/CODE_OWNERS.TXT | 2 +- clang-tools-extra/docs/ReleaseNotes.rst | 3 - clang-tools-extra/docs/clang-rename.rst | 168 ---- clang-tools-extra/docs/index.rst | 1 - clang-tools-extra/test/CMakeLists.txt | 3 - .../ClangRenameClassReplacements.cpp | 11 - clang/docs/ClangFormattedStatus.rst | 5 - clang/docs/ReleaseNotes.rst | 2 + clang/docs/tools/clang-formatted-files.txt | 1 - clang/test/CMakeLists.txt | 1 - .../clang-rename/ClassAsTemplateArgument.cpp | 21 - clang/test/clang-rename/ClassFindByName.cpp | 10 - .../test/clang-rename/ClassSimpleRenaming.cpp | 14 - clang/test/clang-rename/ClassTestMulti.cpp | 11 - .../clang-rename/ClassTestMultiByName.cpp | 8 - .../clang-rename/ComplexFunctionOverride.cpp | 47 - .../clang-rename/ComplicatedClassType.cpp | 63 -- clang/test/clang-rename/Ctor.cpp | 14 - clang/test/clang-rename/CtorInitializer.cpp | 17 - clang/test/clang-rename/DeclRefExpr.cpp | 24 - clang/test/clang-rename/ForceMulti.cpp | 8 - clang/test/clang-rename/ForwardClassDecl.cpp | 4 - clang/test/clang-rename/FunctionMacro.cpp | 20 - clang/test/clang-rename/FunctionOverride.cpp | 13 - clang/test/clang-rename/FunctionTemplate.cpp | 19 - .../FunctionWithClassFindByName.cpp | 12 - .../clang-rename/IncludeHeaderWithSymbol.cpp | 10 - .../clang-rename/Inputs/HeaderWithSymbol.h | 1 - .../clang-rename/Inputs/OffsetToNewName.yaml | 6 - .../Inputs/QualifiedNameToNewName.yaml | 6 - clang/test/clang-rename/InvalidNewName.cpp | 2 - clang/test/clang-rename/InvalidOffset.cpp | 9 - .../clang-rename/InvalidQualifiedName.cpp | 4 - clang/test/clang-rename/MemberExprMacro.cpp | 22 - clang/test/clang-rename/Namespace.cpp | 13 - clang/test/clang-rename/NoNewName.cpp | 4 - clang/test/clang-rename/NonExistFile.cpp | 2 - .../TemplateClassInstantiation.cpp | 42 - clang/test/clang-rename/TemplateCtor.cpp | 10 - clang/test/clang-rename/TemplateTypename.cpp | 24 - 
.../clang-rename/TemplatedClassFunction.cpp | 27 - clang/test/clang-rename/Typedef.cpp | 8 - .../clang-rename/UserDefinedConversion.cpp | 26 - clang/test/clang-rename/Variable.cpp | 33 - clang/test/clang-rename/VariableMacro.cpp | 21 - clang/test/clang-rename/VariableTemplate.cpp | 32 - clang/test/clang-rename/YAMLInput.cpp | 10 - clang/tools/CMakeLists.txt | 1 - clang/tools/clang-rename/CMakeLists.txt | 26 - clang/tools/clang-rename/ClangRename.cpp | 242 ------ clang/tools/clang-rename/clang-rename.el | 80 -- clang/tools/clang-rename/clang-rename.py | 70 -- clang/unittests/CMakeLists.txt | 1 - clang/unittests/Rename/CMakeLists.txt | 29 - clang/unittests/Rename/ClangRenameTest.h | 116 --- clang/unittests/Rename/RenameAliasTest.cpp | 303 ------- clang/unittests/Rename/RenameClassTest.cpp | 820 ------------------ clang/unittests/Rename/RenameEnumTest.cpp | 189 ---- clang/unittests/Rename/RenameFunctionTest.cpp | 573 ------------ clang/unittests/Rename/RenameMemberTest.cpp | 228 ----- .../secondary/clang-tools-extra/test/BUILD.gn | 1 - llvm/utils/gn/secondary/clang/test/BUILD.gn | 1 - .../clang/tools/clang-rename/BUILD.gn | 14 - .../llvm-project-overlay/clang/BUILD.bazel | 15 - 64 files changed, 3 insertions(+), 3520 deletions(-) delete mode 100644 clang-tools-extra/docs/clang-rename.rst delete mode 100644 clang-tools-extra/test/clang-apply-replacements/ClangRenameClassReplacements.cpp delete mode 100644 clang/test/clang-rename/ClassAsTemplateArgument.cpp delete mode 100644 clang/test/clang-rename/ClassFindByName.cpp delete mode 100644 clang/test/clang-rename/ClassSimpleRenaming.cpp delete mode 100644 clang/test/clang-rename/ClassTestMulti.cpp delete mode 100644 clang/test/clang-rename/ClassTestMultiByName.cpp delete mode 100644 clang/test/clang-rename/ComplexFunctionOverride.cpp delete mode 100644 clang/test/clang-rename/ComplicatedClassType.cpp delete mode 100644 clang/test/clang-rename/Ctor.cpp delete mode 100644 clang/test/clang-rename/CtorInitializer.cpp delete mode 100644 clang/test/clang-rename/DeclRefExpr.cpp delete mode 100644 clang/test/clang-rename/ForceMulti.cpp delete mode 100644 clang/test/clang-rename/ForwardClassDecl.cpp delete mode 100644 clang/test/clang-rename/FunctionMacro.cpp delete mode 100644 clang/test/clang-rename/FunctionOverride.cpp delete mode 100644 clang/test/clang-rename/FunctionTemplate.cpp delete mode 100644 clang/test/clang-rename/FunctionWithClassFindByName.cpp delete mode 100644 clang/test/clang-rename/IncludeHeaderWithSymbol.cpp delete mode 100644 clang/test/clang-rename/Inputs/HeaderWithSymbol.h delete mode 100644 clang/test/clang-rename/Inputs/OffsetToNewName.yaml delete mode 100644 clang/test/clang-rename/Inputs/QualifiedNameToNewName.yaml delete mode 100644 clang/test/clang-rename/InvalidNewName.cpp delete mode 100644 clang/test/clang-rename/InvalidOffset.cpp delete mode 100644 clang/test/clang-rename/InvalidQualifiedName.cpp delete mode 100644 clang/test/clang-rename/MemberExprMacro.cpp delete mode 100644 clang/test/clang-rename/Namespace.cpp delete mode 100644 clang/test/clang-rename/NoNewName.cpp delete mode 100644 clang/test/clang-rename/NonExistFile.cpp delete mode 100644 clang/test/clang-rename/TemplateClassInstantiation.cpp delete mode 100644 clang/test/clang-rename/TemplateCtor.cpp delete mode 100644 clang/test/clang-rename/TemplateTypename.cpp delete mode 100644 clang/test/clang-rename/TemplatedClassFunction.cpp delete mode 100644 clang/test/clang-rename/Typedef.cpp delete mode 100644 clang/test/clang-rename/UserDefinedConversion.cpp delete 
mode 100644 clang/test/clang-rename/Variable.cpp
 delete mode 100644 clang/test/clang-rename/VariableMacro.cpp
 delete mode 100644 clang/test/clang-rename/VariableTemplate.cpp
 delete mode 100644 clang/test/clang-rename/YAMLInput.cpp
 delete mode 100644 clang/tools/clang-rename/CMakeLists.txt
 delete mode 100644 clang/tools/clang-rename/ClangRename.cpp
 delete mode 100644 clang/tools/clang-rename/clang-rename.el
 delete mode 100644 clang/tools/clang-rename/clang-rename.py
 delete mode 100644 clang/unittests/Rename/CMakeLists.txt
 delete mode 100644 clang/unittests/Rename/ClangRenameTest.h
 delete mode 100644 clang/unittests/Rename/RenameAliasTest.cpp
 delete mode 100644 clang/unittests/Rename/RenameClassTest.cpp
 delete mode 100644 clang/unittests/Rename/RenameEnumTest.cpp
 delete mode 100644 clang/unittests/Rename/RenameFunctionTest.cpp
 delete mode 100644 clang/unittests/Rename/RenameMemberTest.cpp
 delete mode 100644 llvm/utils/gn/secondary/clang/tools/clang-rename/BUILD.gn

diff --git a/clang-tools-extra/CODE_OWNERS.TXT b/clang-tools-extra/CODE_OWNERS.TXT
index 4cf80aa2b0b826..2831ec7e25f59f 100644
--- a/clang-tools-extra/CODE_OWNERS.TXT
+++ b/clang-tools-extra/CODE_OWNERS.TXT
@@ -23,7 +23,7 @@ D: clang-tidy
 
 N: Manuel Klimek
 E: klimek@google.com
-D: clang-rename, all parts of clang-tools-extra not covered by someone else
+D: all parts of clang-tools-extra not covered by someone else
 
 N: Sam McCall
 E: sammccall@google.com
diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst
index 2370b594d22269..d284bb62f7c7f4 100644
--- a/clang-tools-extra/docs/ReleaseNotes.rst
+++ b/clang-tools-extra/docs/ReleaseNotes.rst
@@ -87,9 +87,6 @@ Improvements to clang-doc
 Improvements to clang-query
 ---------------------------
 
-Improvements to clang-rename
-----------------------------
-
 The improvements are...
 
 Improvements to clang-tidy
diff --git a/clang-tools-extra/docs/clang-rename.rst b/clang-tools-extra/docs/clang-rename.rst
deleted file mode 100644
index e13d8c3ad25f98..00000000000000
--- a/clang-tools-extra/docs/clang-rename.rst
+++ /dev/null
@@ -1,168 +0,0 @@
-============
-Clang-Rename
-============
-
-.. contents::
-
-See also:
-
-.. toctree::
-   :maxdepth: 1
-
-
-:program:`clang-rename` is a C++ refactoring tool. Its purpose is to perform
-efficient renaming actions in large-scale projects such as renaming classes,
-functions, variables, arguments, namespaces etc.
-
-The tool is in a very early development stage, so you might encounter bugs and
-crashes. Submitting reports with information about how to reproduce the issue
-to `the LLVM bugtracker <https://llvm.org/bugs>`_ will definitely help the
-project. If you have any ideas or suggestions, you might want to put a feature
-request there.
-
-Using Clang-Rename
-==================
-
-:program:`clang-rename` is a `LibTooling
-<https://clang.llvm.org/docs/LibTooling.html>`_-based tool, and it's easier to
-work with if you set up a compile command database for your project (for an
-example of how to do this see `How To Setup Tooling For LLVM
-<https://clang.llvm.org/docs/HowToSetupToolingForLLVM.html>`_). You can also
-specify compilation options on the command line after `--`:
-
-.. code-block:: console
-
-   $ clang-rename -offset=42 -new-name=foo test.cpp -- -Imy_project/include -DMY_DEFINES ...
-
-
-To get an offset of a symbol in a file run
-
-.. code-block:: console
-
-   $ grep -FUbo 'foo' file.cpp
-
-
-The tool currently supports renaming actions inside a single translation unit
-only. It is planned to extend the tool's functionality to support multi-TU
-renaming actions in the future.
-
-:program:`clang-rename` also aims to be easily integrated into popular text
-editors, such as Vim and Emacs, and improve the workflow of users.
-
-Although a command line interface exists, it is highly recommended to use the
-text editor interface instead for better experience.
-
-You can also identify one or more symbols to be renamed by giving the fully
-qualified name:
-
-.. code-block:: console
-
-   $ clang-rename -qualified-name=foo -new-name=bar test.cpp
-
-Renaming multiple symbols at once is supported, too. However,
-:program:`clang-rename` doesn't accept both `-offset` and `-qualified-name` at
-the same time. So, you can either specify multiple `-offset` or
-`-qualified-name`.
-
-.. code-block:: console
-
-   $ clang-rename -offset=42 -new-name=bar1 -offset=150 -new-name=bar2 test.cpp
-
-or
-
-.. code-block:: console
-
-   $ clang-rename -qualified-name=foo1 -new-name=bar1 -qualified-name=foo2 -new-name=bar2 test.cpp
-
-
-Alternatively, {offset | qualified-name} / new-name pairs can be put into a YAML
-file:
-
-.. code-block:: yaml
-
-   ---
-   - Offset: 42
-     NewName: bar1
-   - Offset: 150
-     NewName: bar2
-   ...
-
-or
-
-.. code-block:: yaml
-
-   ---
-   - QualifiedName: foo1
-     NewName: bar1
-   - QualifiedName: foo2
-     NewName: bar2
-   ...
-
-That way you can avoid spelling out all the names as command line arguments:
-
-.. code-block:: console
-
-   $ clang-rename -input=test.yaml test.cpp
-
-:program:`clang-rename` offers the following options:
-
-.. code-block:: console
-
-   $ clang-rename --help
-   USAGE: clang-rename [subcommand] [options] <source0> [... <sourceN>]
-
-   OPTIONS:
-
-   Generic Options:
-
-     -help                      - Display available options (-help-hidden for more)
-     -help-list                 - Display list of available options (-help-list-hidden for more)
-     -version                   - Display the version of this program
-
-   clang-rename common options:
-
-     -export-fixes=<filename>   - YAML file to store suggested fixes in.
-     -extra-arg=<string>        - Additional argument to append to the compiler command line
-                                  Can be used several times.
-     -extra-arg-before=<string> - Additional argument to prepend to the compiler command line
-                                  Can be used several times.
-     -force                     - Ignore nonexistent qualified names.
-     -i                         - Overwrite edited <file>s.
-     -input=<string>            - YAML file to load oldname-newname pairs from.
-     -new-name=<string>         - The new name to change the symbol to.
-     -offset=<uint>             - Locates the symbol by offset as opposed to <line>:<column>.
-     -p <build-path>            - Build path
-     -pl                        - Print the locations affected by renaming to stderr.
-     -pn                        - Print the found symbol's name prior to renaming to stderr.
-     -qualified-name=<string>   - The fully qualified name of the symbol.
-
-Vim Integration
-===============
-
-You can call :program:`clang-rename` directly from Vim! To set up
-:program:`clang-rename` integration for Vim see
-`clang/tools/clang-rename/clang-rename.py
-<https://github.com/llvm/llvm-project/blob/main/clang/tools/clang-rename/clang-rename.py>`_.
-
-Please note that **you have to save all buffers, in which the replacement will
-happen before running the tool**.
-
-Once installed, you can point your cursor to symbols you want to rename, press
-`<leader>cr` and type new desired name. The `<leader> key
-<http://vim.wikia.com/wiki/Mapping_keys_in_Vim_-_Tutorial_(Part_3)#Leader_keys>`_
-is a reference to a specific key defined by the mapleader variable and is bound
-to backslash by default.
-
-Emacs Integration
-=================
-
-You can also use :program:`clang-rename` while using Emacs! To set up
-:program:`clang-rename` integration for Emacs see
-`clang-rename/tool/clang-rename.el
-<https://github.com/llvm/llvm-project/blob/main/clang/tools/clang-rename/clang-rename.el>`_.
-
-Once installed, you can point your cursor to symbols you want to rename, press
-`M-X`, type `clang-rename` and new desired name.
- -Please note that **you have to save all buffers, in which the replacement will -happen before running the tool**. diff --git a/clang-tools-extra/docs/index.rst b/clang-tools-extra/docs/index.rst index d5c00b89a1555e..9f7324fcf74197 100644 --- a/clang-tools-extra/docs/index.rst +++ b/clang-tools-extra/docs/index.rst @@ -19,7 +19,6 @@ Contents clang-include-fixer modularize pp-trace - clang-rename clangd clang-doc diff --git a/clang-tools-extra/test/CMakeLists.txt b/clang-tools-extra/test/CMakeLists.txt index 0953ff2531e1a1..d72a117166a08b 100644 --- a/clang-tools-extra/test/CMakeLists.txt +++ b/clang-tools-extra/test/CMakeLists.txt @@ -28,9 +28,6 @@ configure_lit_site_cfg( ) set(CLANG_TOOLS_TEST_DEPS - # For the clang-apply-replacements test that uses clang-rename. - clang-rename - # For the clang-doc tests that emit bitcode files. llvm-bcanalyzer diff --git a/clang-tools-extra/test/clang-apply-replacements/ClangRenameClassReplacements.cpp b/clang-tools-extra/test/clang-apply-replacements/ClangRenameClassReplacements.cpp deleted file mode 100644 index 2b478bbf900df8..00000000000000 --- a/clang-tools-extra/test/clang-apply-replacements/ClangRenameClassReplacements.cpp +++ /dev/null @@ -1,11 +0,0 @@ -// RUN: rm -rf %t -// RUN: mkdir -p %t/fixes -// RUN: cat %s > %t.cpp -// RUN: clang-rename -offset=254 -new-name=Bar -export-fixes=%t/fixes/clang-rename.yaml %t.cpp -- -// RUN: clang-apply-replacements %t -// RUN: sed 's,//.*,,' %t.cpp | FileCheck %s - -class Foo {}; // CHECK: class Bar {}; - -// Use grep -FUbo 'Foo' to get the correct offset of Cla when changing -// this file. diff --git a/clang/docs/ClangFormattedStatus.rst b/clang/docs/ClangFormattedStatus.rst index 0ee0782879ef6b..b917e077679b47 100644 --- a/clang/docs/ClangFormattedStatus.rst +++ b/clang/docs/ClangFormattedStatus.rst @@ -809,11 +809,6 @@ tree in terms of conformance to :doc:`ClangFormat` as of: March 06, 2022 17:32:2 - `4` - `0` - :good:`100%` - * - clang/tools/clang-rename - - `1` - - `1` - - `0` - - :good:`100%` * - clang/tools/clang-repl - `1` - `1` diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index dd004228b679e4..7b612e3c65f494 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -39,6 +39,8 @@ code bases. - The ``le32`` and ``le64`` targets have been removed. +- The ``clang-rename`` tool has been removed. 
+
 C/C++ Language Potentially Breaking Changes
 -------------------------------------------
 
diff --git a/clang/docs/tools/clang-formatted-files.txt b/clang/docs/tools/clang-formatted-files.txt
index 48ded9c7545547..fa40ea74fb7e7d 100644
--- a/clang/docs/tools/clang-formatted-files.txt
+++ b/clang/docs/tools/clang-formatted-files.txt
@@ -608,7 +608,6 @@ clang/tools/clang-refactor/ClangRefactor.cpp
 clang/tools/clang-refactor/TestSupport.cpp
 clang/tools/clang-refactor/TestSupport.h
 clang/tools/clang-refactor/ToolRefactoringResultConsumer.h
-clang/tools/clang-rename/ClangRename.cpp
 clang/tools/clang-repl/ClangRepl.cpp
 clang/tools/clang-scan-deps/ClangScanDeps.cpp
 clang/tools/clang-shlib/clang-shlib.cpp
diff --git a/clang/test/CMakeLists.txt b/clang/test/CMakeLists.txt
index 299a35723b59d5..2d84b0d73053f6 100644
--- a/clang/test/CMakeLists.txt
+++ b/clang/test/CMakeLists.txt
@@ -72,7 +72,6 @@ list(APPEND CLANG_TEST_DEPS
   clang-tblgen
   clang-offload-bundler
   clang-import-test
-  clang-rename
  clang-refactor
   clang-diff
   clang-installapi
diff --git a/clang/test/clang-rename/ClassAsTemplateArgument.cpp b/clang/test/clang-rename/ClassAsTemplateArgument.cpp
deleted file mode 100644
index 2e09a5b529e753..00000000000000
--- a/clang/test/clang-rename/ClassAsTemplateArgument.cpp
+++ /dev/null
@@ -1,21 +0,0 @@
-class Foo /* Test 1 */ {}; // CHECK: class Bar /* Test 1 */ {};
-
-template <typename T>
-void func() {}
-
-template <typename T>
-class Baz {};
-
-int main() {
-  func<Foo>(); // CHECK: func<Bar>();
-  Baz<Foo> /* Test 2 */ obj; // CHECK: Baz<Bar> /* Test 2 */ obj;
-  return 0;
-}
-
-// Test 1.
-// RUN: clang-rename -offset=7 -new-name=Bar %s -- | sed 's,//.*,,' | FileCheck %s
-// Test 2.
-// RUN: clang-rename -offset=215 -new-name=Bar %s -- | sed 's,//.*,,' | FileCheck %s
-
-// To find offsets after modifying the file, use:
-// grep -Ubo 'Foo.*'
diff --git a/clang/test/clang-rename/ClassFindByName.cpp b/clang/test/clang-rename/ClassFindByName.cpp
deleted file mode 100644
index 4430891ec4b1e0..00000000000000
--- a/clang/test/clang-rename/ClassFindByName.cpp
+++ /dev/null
@@ -1,10 +0,0 @@
-class Foo { // CHECK: class Bar {
-};
-
-int main() {
-  Foo *Pointer = 0; // CHECK: Bar *Pointer = 0;
-  return 0;
-}
-
-// Test 1.
-// RUN: clang-rename -qualified-name=Foo -new-name=Bar %s -- | sed 's,//.*,,' | FileCheck %s
diff --git a/clang/test/clang-rename/ClassSimpleRenaming.cpp b/clang/test/clang-rename/ClassSimpleRenaming.cpp
deleted file mode 100644
index 086f55736cb72f..00000000000000
--- a/clang/test/clang-rename/ClassSimpleRenaming.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-class Foo /* Test 1 */ { // CHECK: class Bar /* Test 1 */ {
-public:
-  void foo(int x);
-};
-
-void Foo::foo(int x) /* Test 2 */ {} // CHECK: void Bar::foo(int x) /* Test 2 */ {}
-
-// Test 1.
-// RUN: clang-rename -offset=6 -new-name=Bar %s -- | sed 's,//.*,,' | FileCheck %s
-// Test 2.
-// RUN: clang-rename -offset=109 -new-name=Bar %s -- | sed 's,//.*,,' | FileCheck %s
-
-// To find offsets after modifying the file, use:
-// grep -Ubo 'Foo.*'
diff --git a/clang/test/clang-rename/ClassTestMulti.cpp b/clang/test/clang-rename/ClassTestMulti.cpp
deleted file mode 100644
index 81e65c76065214..00000000000000
--- a/clang/test/clang-rename/ClassTestMulti.cpp
+++ /dev/null
@@ -1,11 +0,0 @@
-class Foo1 /* Offset 1 */ { // CHECK: class Bar1 /* Offset 1 */ {
-};
-
-class Foo2 /* Offset 2 */ { // CHECK: class Bar2 /* Offset 2 */ {
-};
-
-// Test 1.
-// RUN: clang-rename -offset=6 -new-name=Bar1 -offset=76 -new-name=Bar2 %s -- | sed 's,//.*,,' | FileCheck %s - -// To find offsets after modifying the file, use: -// grep -Ubo 'Foo.*' diff --git a/clang/test/clang-rename/ClassTestMultiByName.cpp b/clang/test/clang-rename/ClassTestMultiByName.cpp deleted file mode 100644 index 61b69a1bdf4cac..00000000000000 --- a/clang/test/clang-rename/ClassTestMultiByName.cpp +++ /dev/null @@ -1,8 +0,0 @@ -class Foo1 { // CHECK: class Bar1 -}; - -class Foo2 { // CHECK: class Bar2 -}; - -// Test 1. -// RUN: clang-rename -qualified-name=Foo1 -new-name=Bar1 -qualified-name=Foo2 -new-name=Bar2 %s -- | sed 's,//.*,,' | FileCheck %s diff --git a/clang/test/clang-rename/ComplexFunctionOverride.cpp b/clang/test/clang-rename/ComplexFunctionOverride.cpp deleted file mode 100644 index ccf3a20e540024..00000000000000 --- a/clang/test/clang-rename/ComplexFunctionOverride.cpp +++ /dev/null @@ -1,47 +0,0 @@ -struct A { - virtual void foo() {} /* Test 1 */ // CHECK: virtual void bar() {} -}; - -struct B : A { - void foo() override {} /* Test 2 */ // CHECK: void bar() override {} -}; - -struct C : B { - void foo() override {} /* Test 3 */ // CHECK: void bar() override {} -}; - -struct D : B { - void foo() override {} /* Test 4 */ // CHECK: void bar() override {} -}; - -struct E : D { - void foo() override {} /* Test 5 */ // CHECK: void bar() override {} -}; - -int main() { - A a; - a.foo(); // CHECK: a.bar(); - B b; - b.foo(); // CHECK: b.bar(); - C c; - c.foo(); // CHECK: c.bar(); - D d; - d.foo(); // CHECK: d.bar(); - E e; - e.foo(); // CHECK: e.bar(); - return 0; -} - -// Test 1. -// RUN: clang-rename -offset=26 -new-name=bar %s -- | sed 's,//.*,,' | FileCheck %s -// Test 2. -// RUN: clang-rename -offset=109 -new-name=bar %s -- | sed 's,//.*,,' | FileCheck %s -// Test 3. -// RUN: clang-rename -offset=201 -new-name=bar %s -- | sed 's,//.*,,' | FileCheck %s -// Test 4. -// RUN: clang-rename -offset=293 -new-name=bar %s -- | sed 's,//.*,,' | FileCheck %s -// Test 5. -// RUN: clang-rename -offset=385 -new-name=bar %s -- | sed 's,//.*,,' | FileCheck %s - -// To find offsets after modifying the file, use: -// grep -Ubo 'foo.*' diff --git a/clang/test/clang-rename/ComplicatedClassType.cpp b/clang/test/clang-rename/ComplicatedClassType.cpp deleted file mode 100644 index 8801953031273a..00000000000000 --- a/clang/test/clang-rename/ComplicatedClassType.cpp +++ /dev/null @@ -1,63 +0,0 @@ -// Forward declaration. 
-class Foo; /* Test 1 */ // CHECK: class Bar; /* Test 1 */
-
-class Baz {
-  virtual int getValue() const = 0;
-};
-
-class Foo : public Baz { /* Test 2 */// CHECK: class Bar : public Baz {
-public:
-  Foo(int value = 0) : x(value) {} // CHECK: Bar(int value = 0) : x(value) {}
-
-  Foo &operator++(int) { // CHECK: Bar &operator++(int) {
-    x++;
-    return *this;
-  }
-
-  bool operator<(Foo const &rhs) { // CHECK: bool operator<(Bar const &rhs) {
-    return this->x < rhs.x;
-  }
-
-  int getValue() const {
-    return 0;
-  }
-
-private:
-  int x;
-};
-
-int main() {
-  Foo *Pointer = 0; // CHECK: Bar *Pointer = 0;
-  Foo Variable = Foo(10); // CHECK: Bar Variable = Bar(10);
-  for (Foo it; it < Variable; it++) { // CHECK: for (Bar it; it < Variable; it++) {
-  }
-  const Foo *C = new Foo(); // CHECK: const Bar *C = new Bar();
-  const_cast<Foo *>(C)->getValue(); // CHECK: const_cast<Bar *>(C)->getValue();
-  Foo foo; // CHECK: Bar foo;
-  const Baz &BazReference = foo;
-  const Baz *BazPointer = &foo;
-  dynamic_cast<const Foo &>(BazReference).getValue(); /* Test 3 */ // CHECK: dynamic_cast<const Bar &>(BazReference).getValue();
-  dynamic_cast<const Foo *>(BazPointer)->getValue(); /* Test 4 */ // CHECK: dynamic_cast<const Bar *>(BazPointer)->getValue();
-  reinterpret_cast<const Foo *>(BazPointer)->getValue(); /* Test 5 */ // CHECK: reinterpret_cast<const Bar *>(BazPointer)->getValue();
-  static_cast<const Foo &>(BazReference).getValue(); /* Test 6 */ // CHECK: static_cast<const Bar &>(BazReference).getValue();
-  static_cast<const Foo *>(BazPointer)->getValue(); /* Test 7 */ // CHECK: static_cast<const Bar *>(BazPointer)->getValue();
-  return 0;
-}
-
-// Test 1.
-// RUN: clang-rename -offset=30 -new-name=Bar %s -- -frtti | sed 's,//.*,,' | FileCheck %s
-// Test 2.
-// RUN: clang-rename -offset=155 -new-name=Bar %s -- -frtti | sed 's,//.*,,' | FileCheck %s
-// Test 3.
-// RUN: clang-rename -offset=1133 -new-name=Bar %s -- -frtti | sed 's,//.*,,' | FileCheck %s
-// Test 4.
-// RUN: clang-rename -offset=1266 -new-name=Bar %s -- -frtti | sed 's,//.*,,' | FileCheck %s
-// Test 5.
-// RUN: clang-rename -offset=1402 -new-name=Bar %s -- -frtti | sed 's,//.*,,' | FileCheck %s
-// Test 6.
-// RUN: clang-rename -offset=1533 -new-name=Bar %s -- -frtti | sed 's,//.*,,' | FileCheck %s
-// Test 7.
-// RUN: clang-rename -offset=1665 -new-name=Bar %s -- -frtti | sed 's,//.*,,' | FileCheck %s
-
-// To find offsets after modifying the file, use:
-// grep -Ubo 'Foo.*'
diff --git a/clang/test/clang-rename/Ctor.cpp b/clang/test/clang-rename/Ctor.cpp
deleted file mode 100644
index 9908a4123ddfc0..00000000000000
--- a/clang/test/clang-rename/Ctor.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-class Foo { // CHECK: class Bar {
-public:
-  Foo(); /* Test 1 */ // CHECK: Bar();
-};
-
-Foo::Foo() /* Test 2 */ {} // CHECK: Bar::Bar() /* Test 2 */ {}
-
-// Test 1.
-// RUN: clang-rename -offset=62 -new-name=Bar %s -- | sed 's,//.*,,' | FileCheck %s
-// Test 2.
-// RUN: clang-rename -offset=116 -new-name=Bar %s -- | sed 's,//.*,,' | FileCheck %s
-
-// To find offsets after modifying the file, use:
-// grep -Ubo 'Foo.*'
diff --git a/clang/test/clang-rename/CtorInitializer.cpp b/clang/test/clang-rename/CtorInitializer.cpp
deleted file mode 100644
index fed4f5b06c2755..00000000000000
--- a/clang/test/clang-rename/CtorInitializer.cpp
+++ /dev/null
@@ -1,17 +0,0 @@
-class Baz {};
-
-class Qux {
-  Baz Foo; /* Test 1 */ // CHECK: Baz Bar;
-public:
-  Qux();
-};
-
-Qux::Qux() : Foo() /* Test 2 */ {} // CHECK: Qux::Qux() : Bar() /* Test 2 */ {}
-
-// Test 1.
-// RUN: clang-rename -offset=33 -new-name=Bar %s -- | sed 's,//.*,,' | FileCheck %s
-// Test 2.
-// RUN: clang-rename -offset=118 -new-name=Bar %s -- | sed 's,//.*,,' | FileCheck %s - -// To find offsets after modifying the file, use: -// grep -Ubo 'Foo.*' diff --git a/clang/test/clang-rename/DeclRefExpr.cpp b/clang/test/clang-rename/DeclRefExpr.cpp deleted file mode 100644 index 6462862d82adfa..00000000000000 --- a/clang/test/clang-rename/DeclRefExpr.cpp +++ /dev/null @@ -1,24 +0,0 @@ -class C { -public: - static int Foo; /* Test 1 */ // CHECK: static int Bar; -}; - -int foo(int x) { return 0; } -#define MACRO(a) foo(a) - -int main() { - C::Foo = 1; /* Test 2 */ // CHECK: C::Bar = 1; - MACRO(C::Foo); // CHECK: MACRO(C::Bar); - int y = C::Foo; /* Test 3 */ // CHECK: int y = C::Bar; - return 0; -} - -// Test 1. -// RUN: clang-rename -offset=31 -new-name=Bar %s -- | sed 's,//.*,,' | FileCheck %s -// Test 2. -// RUN: clang-rename -offset=152 -new-name=Bar %s -- | sed 's,//.*,,' | FileCheck %s -// Test 3. -// RUN: clang-rename -offset=271 -new-name=Bar %s -- | sed 's,//.*,,' | FileCheck %s - -// To find offsets after modifying the file, use: -// grep -Ubo 'Foo.*' diff --git a/clang/test/clang-rename/ForceMulti.cpp b/clang/test/clang-rename/ForceMulti.cpp deleted file mode 100644 index 41983ce260c826..00000000000000 --- a/clang/test/clang-rename/ForceMulti.cpp +++ /dev/null @@ -1,8 +0,0 @@ -class B /* Test 1 */ { // CHECK: class B2 /* Test 1 */ { -}; - -class D : public B /* Test 1 */ { // CHECK: class D : public B2 /* Test 1 */ { -}; - -// Test 1. -// RUN: clang-rename -force -qualified-name B -new-name B2 -qualified-name E -new-name E2 %s -- | sed 's,//.*,,' | FileCheck %s diff --git a/clang/test/clang-rename/ForwardClassDecl.cpp b/clang/test/clang-rename/ForwardClassDecl.cpp deleted file mode 100644 index ef731a16d6e06b..00000000000000 --- a/clang/test/clang-rename/ForwardClassDecl.cpp +++ /dev/null @@ -1,4 +0,0 @@ -class Foo; // CHECK: class Bar; -Foo *f(); // CHECK: Bar *f(); - -// RUN: clang-rename -offset=6 -new-name=Bar %s -- | sed 's,//.*,,' | FileCheck %s diff --git a/clang/test/clang-rename/FunctionMacro.cpp b/clang/test/clang-rename/FunctionMacro.cpp deleted file mode 100644 index 6e87026ec70690..00000000000000 --- a/clang/test/clang-rename/FunctionMacro.cpp +++ /dev/null @@ -1,20 +0,0 @@ -#define moo foo // CHECK: #define moo macro_function - -int foo() /* Test 1 */ { // CHECK: int macro_function() /* Test 1 */ { - return 42; -} - -void boo(int value) {} - -void qoo() { - foo(); // CHECK: macro_function(); - boo(foo()); // CHECK: boo(macro_function()); - moo(); - boo(moo()); -} - -// Test 1. -// RUN: clang-rename -offset=68 -new-name=macro_function %s -- | sed 's,//.*,,' | FileCheck %s - -// To find offsets after modifying the file, use: -// grep -Ubo 'foo.*' diff --git a/clang/test/clang-rename/FunctionOverride.cpp b/clang/test/clang-rename/FunctionOverride.cpp deleted file mode 100644 index adfeb739e66d16..00000000000000 --- a/clang/test/clang-rename/FunctionOverride.cpp +++ /dev/null @@ -1,13 +0,0 @@ -class A { virtual void foo(); /* Test 1 */ }; // CHECK: class A { virtual void bar(); -class B : public A { void foo(); /* Test 2 */ }; // CHECK: class B : public A { void bar(); -class C : public B { void foo(); /* Test 3 */ }; // CHECK: class C : public B { void bar(); - -// Test 1. -// RUN: clang-rename -offset=23 -new-name=bar %s -- | sed 's,//.*,,' | FileCheck %s -// Test 2. -// RUN: clang-rename -offset=116 -new-name=bar %s -- | sed 's,//.*,,' | FileCheck %s -// Test 3. 
-// RUN: clang-rename -offset=209 -new-name=bar %s -- | sed 's,//.*,,' | FileCheck %s
-
-// To find offsets after modifying the file, use:
-// grep -Ubo 'foo.*'
diff --git a/clang/test/clang-rename/FunctionTemplate.cpp b/clang/test/clang-rename/FunctionTemplate.cpp
deleted file mode 100644
index 51b2515b889421..00000000000000
--- a/clang/test/clang-rename/FunctionTemplate.cpp
+++ /dev/null
@@ -1,19 +0,0 @@
-template <typename T>
-void Foo(T t); // CHECK: void Bar(T t);
-
-template <>
-void Foo(int a); // CHECK: void Bar(int a);
-
-void test() {
-  Foo(1); // CHECK: Bar(1);
-}
-
-// Test 1.
-// RUN: clang-rename -offset=28 -new-name=Bar %s -- | sed 's,//.*,,' | FileCheck %s
-// Test 2.
-// RUN: clang-rename -offset=81 -new-name=Bar %s -- | sed 's,//.*,,' | FileCheck %s
-// Test 3.
-// RUN: clang-rename -offset=137 -new-name=Bar %s -- | sed 's,//.*,,' | FileCheck %s
-
-// To find offsets after modifying the file, use:
-// grep -Ubo 'Foo.*'
diff --git a/clang/test/clang-rename/FunctionWithClassFindByName.cpp b/clang/test/clang-rename/FunctionWithClassFindByName.cpp
deleted file mode 100644
index 2cae09a1c24482..00000000000000
--- a/clang/test/clang-rename/FunctionWithClassFindByName.cpp
+++ /dev/null
@@ -1,12 +0,0 @@
-void foo() {
-}
-
-class Foo { // CHECK: class Bar
-};
-
-int main() {
-  Foo *Pointer = 0; // CHECK: Bar *Pointer = 0;
-  return 0;
-}
-
-// RUN: clang-rename -qualified-name=Foo -new-name=Bar %s -- | sed 's,//.*,,' | FileCheck %s
diff --git a/clang/test/clang-rename/IncludeHeaderWithSymbol.cpp b/clang/test/clang-rename/IncludeHeaderWithSymbol.cpp
deleted file mode 100644
index cb2baee57b8932..00000000000000
--- a/clang/test/clang-rename/IncludeHeaderWithSymbol.cpp
+++ /dev/null
@@ -1,10 +0,0 @@
-#include "Inputs/HeaderWithSymbol.h"
-
-int main() {
-  return 0; // CHECK: {{^  return 0;}}
-}
-
-// Test 1.
-// The file IncludeHeaderWithSymbol.cpp doesn't contain the symbol Foo
-// and is expected to be written to stdout without modifications
-// RUN: clang-rename -qualified-name=Foo -new-name=Bar %s -- | FileCheck %s
diff --git a/clang/test/clang-rename/Inputs/HeaderWithSymbol.h b/clang/test/clang-rename/Inputs/HeaderWithSymbol.h
deleted file mode 100644
index 1fe02e89786cf5..00000000000000
--- a/clang/test/clang-rename/Inputs/HeaderWithSymbol.h
+++ /dev/null
@@ -1 +0,0 @@
-struct Foo {};
diff --git a/clang/test/clang-rename/Inputs/OffsetToNewName.yaml b/clang/test/clang-rename/Inputs/OffsetToNewName.yaml
deleted file mode 100644
index d8e972880f3618..00000000000000
--- a/clang/test/clang-rename/Inputs/OffsetToNewName.yaml
+++ /dev/null
@@ -1,6 +0,0 @@
----
-- Offset: 6
-  NewName: Bar1
-- Offset: 44
-  NewName: Bar2
-...
diff --git a/clang/test/clang-rename/Inputs/QualifiedNameToNewName.yaml b/clang/test/clang-rename/Inputs/QualifiedNameToNewName.yaml
deleted file mode 100644
index 6e3783671dfaf7..00000000000000
--- a/clang/test/clang-rename/Inputs/QualifiedNameToNewName.yaml
+++ /dev/null
@@ -1,6 +0,0 @@
----
-- QualifiedName: Foo1
-  NewName: Bar1
-- QualifiedName: Foo2
-  NewName: Bar2
-...
diff --git a/clang/test/clang-rename/InvalidNewName.cpp b/clang/test/clang-rename/InvalidNewName.cpp
deleted file mode 100644
index e6b38e59420a8f..00000000000000
--- a/clang/test/clang-rename/InvalidNewName.cpp
+++ /dev/null
@@ -1,2 +0,0 @@
-// RUN: not clang-rename -new-name=class -offset=133 %s 2>&1 | FileCheck %s
-// CHECK: ERROR: new name is not a valid identifier in C++17.
diff --git a/clang/test/clang-rename/InvalidOffset.cpp b/clang/test/clang-rename/InvalidOffset.cpp deleted file mode 100644 index 2ae04d01e4a7c3..00000000000000 --- a/clang/test/clang-rename/InvalidOffset.cpp +++ /dev/null @@ -1,9 +0,0 @@ -#include "Inputs/HeaderWithSymbol.h" -#define FOO int bar; -FOO - -int foo; - -// RUN: not clang-rename -new-name=qux -offset=259 %s -- 2>&1 | FileCheck %s -// CHECK-NOT: CHECK -// CHECK: error: SourceLocation in file {{.*}}InvalidOffset.cpp at offset 259 is invalid diff --git a/clang/test/clang-rename/InvalidQualifiedName.cpp b/clang/test/clang-rename/InvalidQualifiedName.cpp deleted file mode 100644 index 5280e3939ccde1..00000000000000 --- a/clang/test/clang-rename/InvalidQualifiedName.cpp +++ /dev/null @@ -1,4 +0,0 @@ -struct S { -}; - -// RUN: clang-rename -force -qualified-name S2 -new-name=T %s -- diff --git a/clang/test/clang-rename/MemberExprMacro.cpp b/clang/test/clang-rename/MemberExprMacro.cpp deleted file mode 100644 index 56cd8d95f6e882..00000000000000 --- a/clang/test/clang-rename/MemberExprMacro.cpp +++ /dev/null @@ -1,22 +0,0 @@ -class Baz { -public: - int Foo; /* Test 1 */ // CHECK: int Bar; -}; - -int qux(int x) { return 0; } -#define MACRO(a) qux(a) - -int main() { - Baz baz; - baz.Foo = 1; /* Test 2 */ // CHECK: baz.Bar = 1; - MACRO(baz.Foo); // CHECK: MACRO(baz.Bar); - int y = baz.Foo; // CHECK: int y = baz.Bar; -} - -// Test 1. -// RUN: clang-rename -offset=26 -new-name=Bar %s -- | sed 's,//.*,,' | FileCheck %s -// Test 2. -// RUN: clang-rename -offset=155 -new-name=Bar %s -- | sed 's,//.*,,' | FileCheck %s - -// To find offsets after modifying the file, use: -// grep -Ubo 'Foo.*' diff --git a/clang/test/clang-rename/Namespace.cpp b/clang/test/clang-rename/Namespace.cpp deleted file mode 100644 index ec9630fdedb6af..00000000000000 --- a/clang/test/clang-rename/Namespace.cpp +++ /dev/null @@ -1,13 +0,0 @@ -namespace gcc /* Test 1 */ { // CHECK: namespace clang /* Test 1 */ { - int x; -} - -void boo() { - gcc::x = 42; // CHECK: clang::x = 42; -} - -// Test 1. -// RUN: clang-rename -offset=10 -new-name=clang %s -- | sed 's,//.*,,' | FileCheck %s - -// To find offsets after modifying the file, use: -// grep -Ubo 'Foo.*' diff --git a/clang/test/clang-rename/NoNewName.cpp b/clang/test/clang-rename/NoNewName.cpp deleted file mode 100644 index 4f882d83b0c190..00000000000000 --- a/clang/test/clang-rename/NoNewName.cpp +++ /dev/null @@ -1,4 +0,0 @@ -// Check for an error while -new-name argument has not been passed to -// clang-rename. -// RUN: not clang-rename -offset=133 %s 2>&1 | FileCheck %s -// CHECK: clang-rename: -new-name must be specified. diff --git a/clang/test/clang-rename/NonExistFile.cpp b/clang/test/clang-rename/NonExistFile.cpp deleted file mode 100644 index f45839be804736..00000000000000 --- a/clang/test/clang-rename/NonExistFile.cpp +++ /dev/null @@ -1,2 +0,0 @@ -// RUN: not clang-rename -offset=0 -new-name=bar non-existing-file 2>&1 | FileCheck %s -// CHECK: clang-rename: non-existing-file does not exist. 
diff --git a/clang/test/clang-rename/TemplateClassInstantiation.cpp b/clang/test/clang-rename/TemplateClassInstantiation.cpp
deleted file mode 100644
index 493d0951df57b9..00000000000000
--- a/clang/test/clang-rename/TemplateClassInstantiation.cpp
+++ /dev/null
@@ -1,42 +0,0 @@
-template <typename T>
-class Foo { /* Test 1 */ // CHECK: class Bar { /* Test 1 */
-public:
-  T foo(T arg, T& ref, T* ptr) {
-    T value;
-    int number = 42;
-    value = (T)number;
-    value = static_cast<T>(number);
-    return value;
-  }
-  static void foo(T value) {}
-  T member;
-};
-
-template <typename T>
-void func() {
-  Foo<T> obj; /* Test 2 */ // CHECK: Bar<T> obj;
-  obj.member = T();
-  Foo<T>::foo(); // CHECK: Bar<T>::foo();
-}
-
-int main() {
-  Foo<int> i; /* Test 3 */ // CHECK: Bar<int> i;
-  i.member = 0;
-  Foo<int>::foo(0); // CHECK: Bar<int>::foo(0);
-
-  Foo<bool> b; // CHECK: Bar<bool> b;
-  b.member = false;
-  Foo<bool>::foo(false); // CHECK: Bar<bool>::foo(false);
-
-  return 0;
-}
-
-// Test 1.
-// RUN: clang-rename -offset=29 -new-name=Bar %s -- -fno-delayed-template-parsing | sed 's,//.*,,' | FileCheck %s
-// Test 2.
-// RUN: clang-rename -offset=324 -new-name=Bar %s -- -fno-delayed-template-parsing | sed 's,//.*,,' | FileCheck %s
-// Test 3.
-// RUN: clang-rename -offset=463 -new-name=Bar %s -- -fno-delayed-template-parsing | sed 's,//.*,,' | FileCheck %s
-
-// To find offsets after modifying the file, use:
-//   grep -Ubo 'Foo.*' <file>
diff --git a/clang/test/clang-rename/TemplateCtor.cpp b/clang/test/clang-rename/TemplateCtor.cpp
deleted file mode 100644
index 9a59194ac3f4d3..00000000000000
--- a/clang/test/clang-rename/TemplateCtor.cpp
+++ /dev/null
@@ -1,10 +0,0 @@
-class Foo { // CHECK: class Bar {
-public:
-  template <typename T>
-  Foo(); // CHECK: Bar();
-
-  template <typename T>
-  Foo(Foo &); // CHECK: Bar(Bar &);
-};
-
-// RUN: clang-rename -offset=6 -new-name=Bar %s -- | sed 's,//.*,,' | FileCheck %s
diff --git a/clang/test/clang-rename/TemplateTypename.cpp b/clang/test/clang-rename/TemplateTypename.cpp
deleted file mode 100644
index 559ec1f9ade75d..00000000000000
--- a/clang/test/clang-rename/TemplateTypename.cpp
+++ /dev/null
@@ -1,24 +0,0 @@
-template <typename T /* Test 1 */> // CHECK: template <typename U /* Test 1 */>
-class Foo {
-T foo(T arg, T& ref, T* /* Test 2 */ ptr) { // CHECK: U foo(U arg, U& ref, U* /* Test 2 */ ptr) {
-  T value; // CHECK: U value;
-  int number = 42;
-  value = (T)number; // CHECK: value = (U)number;
-  value = static_cast<T /* Test 3 */>(number); // CHECK: value = static_cast<U /* Test 3 */>(number);
-  return value;
-}
-
-static void foo(T value) {} // CHECK: static void foo(U value) {}
-
-T member; // CHECK: U member;
-};
-
-// Test 1.
-// RUN: clang-rename -offset=19 -new-name=U %s -- -fno-delayed-template-parsing | sed 's,//.*,,' | FileCheck %s
-// Test 2.
-// RUN: clang-rename -offset=126 -new-name=U %s -- -fno-delayed-template-parsing | sed 's,//.*,,' | FileCheck %s
-// Test 3.
-// RUN: clang-rename -offset=392 -new-name=U %s -- -fno-delayed-template-parsing | sed 's,//.*,,' | FileCheck %s
-
-// To find offsets after modifying the file, use:
-//   grep -Ubo 'T.*' <file>
diff --git a/clang/test/clang-rename/TemplatedClassFunction.cpp b/clang/test/clang-rename/TemplatedClassFunction.cpp
deleted file mode 100644
index d7f21e0847c97a..00000000000000
--- a/clang/test/clang-rename/TemplatedClassFunction.cpp
+++ /dev/null
@@ -1,27 +0,0 @@
-template <typename T>
-class A {
-public:
-  void foo() /* Test 1 */ {} // CHECK: void bar() /* Test 1 */ {}
-};
-
-int main(int argc, char **argv) {
-  A<int> a;
-  A<double> b;
-  A<float> c;
-  a.foo(); /* Test 2 */ // CHECK: a.bar(); /* Test 2 */
-  b.foo(); /* Test 3 */ // CHECK: b.bar(); /* Test 3 */
-  c.foo(); /* Test 4 */ // CHECK: c.bar(); /* Test 4 */
-  return 0;
-}
-
-// Test 1.
-// RUN: clang-rename -offset=48 -new-name=bar %s -- | sed 's,//.*,,' | FileCheck %s
-// Test 2.
-// RUN: clang-rename -offset=191 -new-name=bar %s -- | sed 's,//.*,,' | FileCheck %s
-// Test 3.
-// RUN: clang-rename -offset=255 -new-name=bar %s -- | sed 's,//.*,,' | FileCheck %s
-// Test 4.
-// RUN: clang-rename -offset=319 -new-name=bar %s -- | sed 's,//.*,,' | FileCheck %s
-
-// To find offsets after modifying the file, use:
-//   grep -Ubo 'foo.*' <file>
diff --git a/clang/test/clang-rename/Typedef.cpp b/clang/test/clang-rename/Typedef.cpp
deleted file mode 100644
index 64d337fae22c79..00000000000000
--- a/clang/test/clang-rename/Typedef.cpp
+++ /dev/null
@@ -1,8 +0,0 @@
-namespace std {
-class basic_string {};
-typedef basic_string string;
-} // namespace std
-
-std::string foo(); // // CHECK: std::new_string foo();
-
-// RUN: clang-rename -offset=93 -new-name=new_string %s -- | sed 's,//.*,,' | FileCheck %s
diff --git a/clang/test/clang-rename/UserDefinedConversion.cpp b/clang/test/clang-rename/UserDefinedConversion.cpp
deleted file mode 100644
index 60f251ab448358..00000000000000
--- a/clang/test/clang-rename/UserDefinedConversion.cpp
+++ /dev/null
@@ -1,26 +0,0 @@
-class Foo { /* Test 1 */ // CHECK: class Bar {
-public:
-  Foo() {} // CHECK: Bar() {}
-};
-
-class Baz {
-public:
-  operator Foo() /* Test 2 */ const { // CHECK: operator Bar() /* Test 2 */ const {
-    Foo foo; // CHECK: Bar foo;
-    return foo;
-  }
-};
-
-int main() {
-  Baz boo;
-  Foo foo = static_cast<Foo>(boo); // CHECK: Bar foo = static_cast<Bar>(boo);
-  return 0;
-}
-
-// Test 1.
-// RUN: clang-rename -offset=7 -new-name=Bar %s -- | sed 's,//.*,,' | FileCheck %s
-// Test 2.
-// RUN: clang-rename -offset=164 -new-name=Bar %s -- | sed 's,//.*,,' | FileCheck %s
-
-// To find offsets after modifying the file, use:
-//   grep -Ubo 'Foo.*' <file>
diff --git a/clang/test/clang-rename/Variable.cpp b/clang/test/clang-rename/Variable.cpp
deleted file mode 100644
index d7e670fb43eebb..00000000000000
--- a/clang/test/clang-rename/Variable.cpp
+++ /dev/null
@@ -1,33 +0,0 @@
-#define NAMESPACE namespace A
-NAMESPACE {
-int Foo; /* Test 1 */ // CHECK: int Bar;
-}
-int Foo; // CHECK: int Foo;
-int Qux = Foo; // CHECK: int Qux = Foo;
-int Baz = A::Foo; /* Test 2 */ // CHECK: Baz = A::Bar;
-void fun() {
-  struct {
-    int Foo; // CHECK: int Foo;
-  } b = {100};
-  int Foo = 100; // CHECK: int Foo = 100;
-  Baz = Foo; // CHECK: Baz = Foo;
-  {
-    extern int Foo; // CHECK: extern int Foo;
-    Baz = Foo; // CHECK: Baz = Foo;
-    Foo = A::Foo /* Test 3 */ + Baz; // CHECK: Foo = A::Bar /* Test 3 */ + Baz;
-    A::Foo /* Test 4 */ = b.Foo; // CHECK: A::Bar /* Test 4 */ = b.Foo;
-  }
-  Foo = b.Foo; // Foo = b.Foo;
-}
-
-// Test 1.
-// RUN: clang-rename -offset=46 -new-name=Bar %s -- | sed 's,//.*,,' | FileCheck %s
-// Test 2.
-// RUN: clang-rename -offset=234 -new-name=Bar %s -- | sed 's,//.*,,' | FileCheck %s
-// Test 3.
-// RUN: clang-rename -offset=641 -new-name=Bar %s -- | sed 's,//.*,,' | FileCheck %s
-// Test 4.
-// RUN: clang-rename -offset=716 -new-name=Bar %s -- | sed 's,//.*,,' | FileCheck %s
-
-// To find offsets after modifying the file, use:
-//   grep -Ubo 'Foo.*' <file>
diff --git a/clang/test/clang-rename/VariableMacro.cpp b/clang/test/clang-rename/VariableMacro.cpp
deleted file mode 100644
index 622e825d3e41a0..00000000000000
--- a/clang/test/clang-rename/VariableMacro.cpp
+++ /dev/null
@@ -1,21 +0,0 @@
-#define Baz Foo // CHECK: #define Baz Bar
-
-void foo(int value) {}
-
-void macro() {
-  int Foo; /* Test 1 */ // CHECK: int Bar;
-  Foo = 42; /* Test 2 */ // CHECK: Bar = 42;
-  Baz -= 0;
-  foo(Foo); /* Test 3 */ // CHECK: foo(Bar);
-  foo(Baz);
-}
-
-// Test 1.
-// RUN: clang-rename -offset=88 -new-name=Bar %s -- | sed 's,//.*,,' | FileCheck %s
-// Test 2.
-// RUN: clang-rename -offset=129 -new-name=Bar %s -- | sed 's,//.*,,' | FileCheck %s
-// Test 3.
-// RUN: clang-rename -offset=191 -new-name=Bar %s -- | sed 's,//.*,,' | FileCheck %s
-
-// To find offsets after modifying the file, use:
-//   grep -Ubo 'Foo.*' <file>
diff --git a/clang/test/clang-rename/VariableTemplate.cpp b/clang/test/clang-rename/VariableTemplate.cpp
deleted file mode 100644
index a345ede5a7f6ad..00000000000000
--- a/clang/test/clang-rename/VariableTemplate.cpp
+++ /dev/null
@@ -1,32 +0,0 @@
-template <typename T>
-bool Foo = true; // CHECK: bool Bar = true;
-
-// explicit template specialization
-template <>
-bool Foo<int> = false; // CHECK: bool Bar<int> = false;
-
-// partial template specialization
-template <typename T>
-bool Foo<T *> = false; // bool Bar<T *> = false;
-
-void k() {
-  // ref to the explicit template specialization
-  Foo<int>; // CHECK: Bar<int>;
-  // ref to the primary template.
-  Foo<double>; // CHECK: Bar<double>;
-}
-
-
-// Test 1.
-// RUN: clang-rename -offset=34 -new-name=Bar %s -- | sed 's,//.*,,' | FileCheck %s
-// Test 2.
-// RUN: clang-rename -offset=128 -new-name=Bar %s -- | sed 's,//.*,,' | FileCheck %s
-// Test 3.
-// RUN: clang-rename -offset=248 -new-name=Bar %s -- | sed 's,//.*,,' | FileCheck %s
-// Test 4.
-// RUN: clang-rename -offset=357 -new-name=Bar %s -- | sed 's,//.*,,' | FileCheck %s
-// Test 5.
-// RUN: clang-rename -offset=431 -new-name=Bar %s -- | sed 's,//.*,,' | FileCheck %s
-
-// To find offsets after modifying the file, use:
-//   grep -Ubo 'Foo.*' <file>
diff --git a/clang/test/clang-rename/YAMLInput.cpp b/clang/test/clang-rename/YAMLInput.cpp
deleted file mode 100644
index 55dbc6d66a5a6c..00000000000000
--- a/clang/test/clang-rename/YAMLInput.cpp
+++ /dev/null
@@ -1,10 +0,0 @@
-class Foo1 { // CHECK: class Bar1
-};
-
-class Foo2 { // CHECK: class Bar2
-};
-
-// Test 1.
-// RUN: clang-rename -input %S/Inputs/OffsetToNewName.yaml %s -- | sed 's,//.*,,' | FileCheck %s
-// Test 2.
-// RUN: clang-rename -input %S/Inputs/QualifiedNameToNewName.yaml %s -- | sed 's,//.*,,' | FileCheck %s
diff --git a/clang/tools/CMakeLists.txt b/clang/tools/CMakeLists.txt
index f588a3634ee6bc..9a3512712a28a4 100644
--- a/clang/tools/CMakeLists.txt
+++ b/clang/tools/CMakeLists.txt
@@ -19,7 +19,6 @@ endif()
 
 add_clang_subdirectory(c-index-test)
-add_clang_subdirectory(clang-rename)
 add_clang_subdirectory(clang-refactor)
 
 # For MinGW we only enable shared library if LLVM_LINK_LLVM_DYLIB=ON.
 # Without that option resulting library is too close to 2^16 DLL exports limit.
diff --git a/clang/tools/clang-rename/CMakeLists.txt b/clang/tools/clang-rename/CMakeLists.txt
deleted file mode 100644
index f4c4e520520d9e..00000000000000
--- a/clang/tools/clang-rename/CMakeLists.txt
+++ /dev/null
@@ -1,26 +0,0 @@
-set(LLVM_LINK_COMPONENTS
-  Option
-  Support
-  )
-
-add_clang_tool(clang-rename
-  ClangRename.cpp
-  )
-
-clang_target_link_libraries(clang-rename
-  PRIVATE
-  clangBasic
-  clangFrontend
-  clangRewrite
-  clangSerialization
-  clangTooling
-  clangToolingCore
-  clangToolingRefactoring
-  )
-
-install(FILES clang-rename.py
-  DESTINATION "${CMAKE_INSTALL_DATADIR}/clang"
-  COMPONENT clang-rename)
-install(FILES clang-rename.el
-  DESTINATION "${CMAKE_INSTALL_DATADIR}/clang"
-  COMPONENT clang-rename)
diff --git a/clang/tools/clang-rename/ClangRename.cpp b/clang/tools/clang-rename/ClangRename.cpp
deleted file mode 100644
index f2ac0c4360e0dc..00000000000000
--- a/clang/tools/clang-rename/ClangRename.cpp
+++ /dev/null
@@ -1,242 +0,0 @@
-//===--- tools/extra/clang-rename/ClangRename.cpp - Clang rename tool -----===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file implements a clang-rename tool that automatically finds and
-/// renames symbols in C++ code.
-///
-//===----------------------------------------------------------------------===//
-
-#include "clang/Basic/Diagnostic.h"
-#include "clang/Basic/DiagnosticOptions.h"
-#include "clang/Basic/FileManager.h"
-#include "clang/Basic/IdentifierTable.h"
-#include "clang/Basic/LangOptions.h"
-#include "clang/Basic/SourceManager.h"
-#include "clang/Basic/TokenKinds.h"
-#include "clang/Frontend/TextDiagnosticPrinter.h"
-#include "clang/Rewrite/Core/Rewriter.h"
-#include "clang/Tooling/CommonOptionsParser.h"
-#include "clang/Tooling/Refactoring.h"
-#include "clang/Tooling/Refactoring/Rename/RenamingAction.h"
-#include "clang/Tooling/Refactoring/Rename/USRFindingAction.h"
-#include "clang/Tooling/ReplacementsYaml.h"
-#include "clang/Tooling/Tooling.h"
-#include "llvm/ADT/IntrusiveRefCntPtr.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/FileSystem.h"
-#include "llvm/Support/YAMLTraits.h"
-#include "llvm/Support/raw_ostream.h"
-#include <string>
-#include <system_error>
-
-using namespace llvm;
-using namespace clang;
-
-/// An oldname -> newname rename.
-struct RenameAllInfo {
-  unsigned Offset = 0;
-  std::string QualifiedName;
-  std::string NewName;
-};
-
-LLVM_YAML_IS_SEQUENCE_VECTOR(RenameAllInfo)
-
-namespace llvm {
-namespace yaml {
-
-/// Specialized MappingTraits to describe how a RenameAllInfo is
-/// (de)serialized.
-template <> struct MappingTraits<RenameAllInfo> {
-  static void mapping(IO &IO, RenameAllInfo &Info) {
-    IO.mapOptional("Offset", Info.Offset);
-    IO.mapOptional("QualifiedName", Info.QualifiedName);
-    IO.mapRequired("NewName", Info.NewName);
-  }
-};
-
-} // end namespace yaml
-} // end namespace llvm
-
-static cl::OptionCategory ClangRenameOptions("clang-rename common options");
-
-static cl::list<unsigned> SymbolOffsets(
-    "offset",
-    cl::desc("Locates the symbol by offset as opposed to <line>:<column>."),
-    cl::cat(ClangRenameOptions));
-static cl::opt<bool> Inplace("i", cl::desc("Overwrite edited <file>s."),
-                             cl::cat(ClangRenameOptions));
-static cl::list<std::string>
-    QualifiedNames("qualified-name",
-                   cl::desc("The fully qualified name of the symbol."),
-                   cl::cat(ClangRenameOptions));
-
-static cl::list<std::string>
-    NewNames("new-name", cl::desc("The new name to change the symbol to."),
-             cl::cat(ClangRenameOptions));
-static cl::opt<bool> PrintName(
-    "pn",
-    cl::desc("Print the found symbol's name prior to renaming to stderr."),
-    cl::cat(ClangRenameOptions));
-static cl::opt<bool> PrintLocations(
-    "pl", cl::desc("Print the locations affected by renaming to stderr."),
-    cl::cat(ClangRenameOptions));
-static cl::opt<std::string>
-    ExportFixes("export-fixes",
-                cl::desc("YAML file to store suggested fixes in."),
-                cl::value_desc("filename"), cl::cat(ClangRenameOptions));
-static cl::opt<std::string>
-    Input("input", cl::desc("YAML file to load oldname-newname pairs from."),
-          cl::Optional, cl::cat(ClangRenameOptions));
-static cl::opt<bool> Force("force",
-                           cl::desc("Ignore nonexistent qualified names."),
-                           cl::cat(ClangRenameOptions));
-
-int main(int argc, const char **argv) {
-  auto ExpectedParser =
-      tooling::CommonOptionsParser::create(argc, argv, ClangRenameOptions);
-  if (!ExpectedParser) {
-    llvm::errs() << ExpectedParser.takeError();
-    return 1;
-  }
-  tooling::CommonOptionsParser &OP = ExpectedParser.get();
-
-  if (!Input.empty()) {
-    // Populate QualifiedNames and NewNames from a YAML file.
-    ErrorOr<std::unique_ptr<MemoryBuffer>> Buffer =
-        llvm::MemoryBuffer::getFile(Input);
-    if (!Buffer) {
-      errs() << "clang-rename: failed to read " << Input << ": "
-             << Buffer.getError().message() << "\n";
-      return 1;
-    }
-
-    std::vector<RenameAllInfo> Infos;
-    llvm::yaml::Input YAML(Buffer.get()->getBuffer());
-    YAML >> Infos;
-    for (const auto &Info : Infos) {
-      if (!Info.QualifiedName.empty())
-        QualifiedNames.push_back(Info.QualifiedName);
-      else
-        SymbolOffsets.push_back(Info.Offset);
-      NewNames.push_back(Info.NewName);
-    }
-  }
-
-  // Check the arguments for correctness.
-  if (NewNames.empty()) {
-    errs() << "clang-rename: -new-name must be specified.\n\n";
-    return 1;
-  }
-
-  if (SymbolOffsets.empty() == QualifiedNames.empty()) {
-    errs() << "clang-rename: -offset and -qualified-name can't be present at "
-              "the same time.\n";
-    return 1;
-  }
-
-  // Check if NewNames is a valid identifier in C++17.
-  LangOptions Options;
-  Options.CPlusPlus = true;
-  Options.CPlusPlus17 = true;
-  IdentifierTable Table(Options);
-  for (const auto &NewName : NewNames) {
-    auto NewNameTokKind = Table.get(NewName).getTokenID();
-    if (!tok::isAnyIdentifier(NewNameTokKind)) {
-      errs() << "ERROR: new name is not a valid identifier in C++17.\n\n";
-      return 1;
-    }
-  }
-
-  if (SymbolOffsets.size() + QualifiedNames.size() != NewNames.size()) {
-    errs() << "clang-rename: number of symbol offsets(" << SymbolOffsets.size()
-           << ") + number of qualified names (" << QualifiedNames.size()
-           << ") must be equal to number of new names(" << NewNames.size()
-           << ").\n\n";
-    cl::PrintHelpMessage();
-    return 1;
-  }
-
-  auto Files = OP.getSourcePathList();
-  tooling::RefactoringTool Tool(OP.getCompilations(), Files);
-  tooling::USRFindingAction FindingAction(SymbolOffsets, QualifiedNames, Force);
-  Tool.run(tooling::newFrontendActionFactory(&FindingAction).get());
-  const std::vector<std::vector<std::string>> &USRList =
-      FindingAction.getUSRList();
-  const std::vector<std::string> &PrevNames = FindingAction.getUSRSpellings();
-  if (PrintName) {
-    for (const auto &PrevName : PrevNames) {
-      outs() << "clang-rename found name: " << PrevName << '\n';
-    }
-  }
-
-  if (FindingAction.errorOccurred()) {
-    // Diagnostics are already issued at this point.
-    return 1;
-  }
-
-  // Perform the renaming.
-  tooling::RenamingAction RenameAction(NewNames, PrevNames, USRList,
-                                       Tool.getReplacements(), PrintLocations);
-  std::unique_ptr<tooling::FrontendActionFactory> Factory =
-      tooling::newFrontendActionFactory(&RenameAction);
-  int ExitCode;
-
-  if (Inplace) {
-    ExitCode = Tool.runAndSave(Factory.get());
-  } else {
-    ExitCode = Tool.run(Factory.get());
-
-    if (!ExportFixes.empty()) {
-      std::error_code EC;
-      llvm::raw_fd_ostream OS(ExportFixes, EC, llvm::sys::fs::OF_None);
-      if (EC) {
-        llvm::errs() << "Error opening output file: " << EC.message() << '\n';
-        return 1;
-      }
-
-      // Export replacements.
-      tooling::TranslationUnitReplacements TUR;
-      const auto &FileToReplacements = Tool.getReplacements();
-      for (const auto &Entry : FileToReplacements)
-        TUR.Replacements.insert(TUR.Replacements.end(), Entry.second.begin(),
-                                Entry.second.end());
-
-      yaml::Output YAML(OS);
-      YAML << TUR;
-      OS.close();
-      return 0;
-    }
-
-    // Write every file to stdout. Right now we just barf the files without any
-    // indication of which files start where, other than that we print the files
-    // in the same order we see them.
-    LangOptions DefaultLangOptions;
-    IntrusiveRefCntPtr<DiagnosticOptions> DiagOpts = new DiagnosticOptions();
-    TextDiagnosticPrinter DiagnosticPrinter(errs(), &*DiagOpts);
-    DiagnosticsEngine Diagnostics(
-        IntrusiveRefCntPtr<DiagnosticIDs>(new DiagnosticIDs()), &*DiagOpts,
-        &DiagnosticPrinter, false);
-    auto &FileMgr = Tool.getFiles();
-    SourceManager Sources(Diagnostics, FileMgr);
-    Rewriter Rewrite(Sources, DefaultLangOptions);
-
-    Tool.applyAllReplacements(Rewrite);
-    for (const auto &File : Files) {
-      auto Entry = FileMgr.getOptionalFileRef(File);
-      if (!Entry) {
-        errs() << "clang-rename: " << File << " does not exist.\n";
-        return 1;
-      }
-      const auto ID = Sources.getOrCreateFileID(*Entry, SrcMgr::C_User);
-      Rewrite.getEditBuffer(ID).write(outs());
-    }
-  }
-
-  return ExitCode;
-}
diff --git a/clang/tools/clang-rename/clang-rename.el b/clang/tools/clang-rename/clang-rename.el
deleted file mode 100644
index 3f47c11e2c752e..00000000000000
--- a/clang/tools/clang-rename/clang-rename.el
+++ /dev/null
@@ -1,80 +0,0 @@
-;;; clang-rename.el --- Renames every occurrence of a symbol found at <point>.  -*- lexical-binding: t; -*-
-
-;; Version: 0.1.0
-;; Keywords: tools, c
-
-;;; Commentary:
-
-;; To install clang-rename.el make sure the directory of this file is in your
-;; `load-path' and add
-;;
-;;   (require 'clang-rename)
-;;
-;; to your .emacs configuration.
-
-;;; Code:
-
-(defgroup clang-rename nil
-  "Integration with clang-rename"
-  :group 'c)
-
-(defcustom clang-rename-binary "clang-rename"
-  "Path to clang-rename executable."
-  :type '(file :must-match t)
-  :group 'clang-rename)
-
-;;;###autoload
-(defun clang-rename (new-name)
-  "Rename all instances of the symbol at point to NEW-NAME using clang-rename."
-  (interactive "sEnter a new name: ")
-  (save-some-buffers :all)
-  ;; clang-rename should not be combined with other operations when undoing.
-  (undo-boundary)
-  (let ((output-buffer (get-buffer-create "*clang-rename*")))
-    (with-current-buffer output-buffer (erase-buffer))
-    (let ((exit-code (call-process
-                      clang-rename-binary nil output-buffer nil
-                      (format "-offset=%d"
-                              ;; clang-rename wants file (byte) offsets, not
-                              ;; buffer (character) positions.
-                              (clang-rename--bufferpos-to-filepos
-                               ;; Emacs treats one character after a symbol as
-                               ;; part of the symbol, but clang-rename doesn’t.
-                               ;; Use the beginning of the current symbol, if
-                               ;; available, to resolve the inconsistency.
-                               (or (car (bounds-of-thing-at-point 'symbol))
-                                   (point))
-                               'exact))
-                      (format "-new-name=%s" new-name)
-                      "-i" (buffer-file-name))))
-      (if (and (integerp exit-code) (zerop exit-code))
-          ;; Success; revert current buffer so it gets the modifications.
-          (progn
-            (kill-buffer output-buffer)
-            (revert-buffer :ignore-auto :noconfirm :preserve-modes))
-        ;; Failure; append exit code to output buffer and display it.
-        (let ((message (clang-rename--format-message
-                        "clang-rename failed with %s %s"
-                        (if (integerp exit-code) "exit status" "signal")
-                        exit-code)))
-          (with-current-buffer output-buffer
-            (insert ?\n message ?\n))
-          (message "%s" message)
-          (display-buffer output-buffer))))))
-
-(defalias 'clang-rename--bufferpos-to-filepos
-  (if (fboundp 'bufferpos-to-filepos)
-      'bufferpos-to-filepos
-    ;; Emacs 24 doesn’t have ‘bufferpos-to-filepos’, simulate it using
-    ;; ‘position-bytes’.
-    (lambda (position &optional _quality _coding-system)
-      (1- (position-bytes position)))))
-
-;; ‘format-message’ is new in Emacs 25.1.  Provide a fallback for older
-;; versions.
-(defalias 'clang-rename--format-message
-  (if (fboundp 'format-message) 'format-message 'format))
-
-(provide 'clang-rename)
-
-;;; clang-rename.el ends here
diff --git a/clang/tools/clang-rename/clang-rename.py b/clang/tools/clang-rename/clang-rename.py
deleted file mode 100644
index 1cbabaf859a5e8..00000000000000
--- a/clang/tools/clang-rename/clang-rename.py
+++ /dev/null
@@ -1,70 +0,0 @@
-"""
-Minimal clang-rename integration with Vim.
-
-Before installing make sure one of the following is satisfied:
-
-* clang-rename is in your PATH
-* `g:clang_rename_path` in ~/.vimrc points to valid clang-rename executable
-* `binary` in clang-rename.py points to valid to clang-rename executable
-
-To install, simply put this into your ~/.vimrc for python2 support
-
-    noremap <leader>cr :pyf <path-to>/clang-rename.py<cr>
-
-For python3 use the following command (note the change from :pyf to :py3f)
-
-    noremap <leader>cr :py3f <path-to>/clang-rename.py<cr>
-
-IMPORTANT NOTE: Before running the tool, make sure you saved the file.
-
-All you have to do now is to place a cursor on a variable/function/class which
-you would like to rename and press '<leader>cr'. You will be prompted for a new
-name if the cursor points to a valid symbol.
-"""
-
-from __future__ import absolute_import, division, print_function
-import vim
-import subprocess
-import sys
-
-
-def main():
-    binary = "clang-rename"
-    if vim.eval('exists("g:clang_rename_path")') == "1":
-        binary = vim.eval("g:clang_rename_path")
-
-    # Get arguments for clang-rename binary.
-    offset = int(vim.eval('line2byte(line("."))+col(".")')) - 2
-    if offset < 0:
-        print(
-            "Couldn't determine cursor position. Is your file empty?", file=sys.stderr
-        )
-        return
-    filename = vim.current.buffer.name
-
-    new_name_request_message = "type new name:"
-    new_name = vim.eval("input('{}\n')".format(new_name_request_message))
-
-    # Call clang-rename.
-    command = [
-        binary,
-        filename,
-        "-i",
-        "-offset",
-        str(offset),
-        "-new-name",
-        str(new_name),
-    ]
-    # FIXME: make it possible to run the tool on unsaved file.
-    p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-    stdout, stderr = p.communicate()
-
-    if stderr:
-        print(stderr)
-
-    # Reload all buffers in Vim.
-    vim.command("checktime")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/clang/unittests/CMakeLists.txt b/clang/unittests/CMakeLists.txt
index e43ee7bfa88aaa..85d265426ec80b 100644
--- a/clang/unittests/CMakeLists.txt
+++ b/clang/unittests/CMakeLists.txt
@@ -48,7 +48,6 @@ if(NOT WIN32 AND CLANG_TOOL_LIBCLANG_BUILD)
   add_subdirectory(libclang)
 endif()
 add_subdirectory(DirectoryWatcher)
-add_subdirectory(Rename)
 add_subdirectory(Index)
 add_subdirectory(InstallAPI)
 add_subdirectory(Serialization)
diff --git a/clang/unittests/Rename/CMakeLists.txt b/clang/unittests/Rename/CMakeLists.txt
deleted file mode 100644
index 6ec0c521551c61..00000000000000
--- a/clang/unittests/Rename/CMakeLists.txt
+++ /dev/null
@@ -1,29 +0,0 @@
-set(LLVM_LINK_COMPONENTS
-  FrontendOpenMP
-  support
-  )
-
-# We'd like clang/unittests/Tooling/RewriterTestContext.h in the test.
-include_directories(${CLANG_SOURCE_DIR})
-
-add_clang_unittest(ClangRenameTests
-  RenameClassTest.cpp
-  RenameEnumTest.cpp
-  RenameAliasTest.cpp
-  RenameMemberTest.cpp
-  RenameFunctionTest.cpp
-  )
-
-clang_target_link_libraries(ClangRenameTests
-  PRIVATE
-  clangAST
-  clangASTMatchers
-  clangBasic
-  clangFormat
-  clangFrontend
-  clangRewrite
-  clangSerialization
-  clangTooling
-  clangToolingCore
-  clangToolingRefactoring
-  )
diff --git a/clang/unittests/Rename/ClangRenameTest.h b/clang/unittests/Rename/ClangRenameTest.h
deleted file mode 100644
index 64033657b57963..00000000000000
--- a/clang/unittests/Rename/ClangRenameTest.h
+++ /dev/null
@@ -1,116 +0,0 @@
-//===-- ClangRenameTests.cpp - clang-rename unit tests --------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_CLANG_UNITTESTS_RENAME_CLANGRENAMETEST_H
-#define LLVM_CLANG_UNITTESTS_RENAME_CLANGRENAMETEST_H
-
-#include "unittests/Tooling/RewriterTestContext.h"
-#include "clang/ASTMatchers/ASTMatchFinder.h"
-#include "clang/Basic/FileManager.h"
-#include "clang/Basic/FileSystemOptions.h"
-#include "clang/Format/Format.h"
-#include "clang/Frontend/CompilerInstance.h"
-#include "clang/Frontend/PCHContainerOperations.h"
-#include "clang/Tooling/Refactoring.h"
-#include "clang/Tooling/Refactoring/Rename/RenamingAction.h"
-#include "clang/Tooling/Refactoring/Rename/USRFindingAction.h"
-#include "clang/Tooling/Tooling.h"
-#include "llvm/ADT/IntrusiveRefCntPtr.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/Support/Format.h"
-#include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/VirtualFileSystem.h"
-#include "gtest/gtest.h"
-#include <map>
-#include <memory>
-#include <string>
-
-namespace clang {
-namespace clang_rename {
-namespace test {
-
-struct Case {
-  std::string Before;
-  std::string After;
-  std::string OldName;
-  std::string NewName;
-};
-
-class ClangRenameTest : public testing::Test,
-                        public testing::WithParamInterface<Case> {
-protected:
-  void AppendToHeader(StringRef Code) { HeaderContent += Code.str(); }
-
-  std::string runClangRenameOnCode(llvm::StringRef Code,
-                                   llvm::StringRef OldName,
-                                   llvm::StringRef NewName) {
-    std::string NewCode;
-    llvm::raw_string_ostream(NewCode) << llvm::format(
-        "#include \"%s\"\n%s", HeaderName.c_str(), Code.str().c_str());
-    tooling::FileContentMappings FileContents = {{HeaderName, HeaderContent},
-                                                 {CCName, NewCode}};
-    clang::RewriterTestContext Context;
-    Context.createInMemoryFile(HeaderName, HeaderContent);
-    clang::FileID InputFileID = Context.createInMemoryFile(CCName, NewCode);
-
-    tooling::USRFindingAction FindingAction({}, {std::string(OldName)}, false);
-    std::unique_ptr<tooling::FrontendActionFactory> USRFindingActionFactory =
-        tooling::newFrontendActionFactory(&FindingAction);
-
-    if (!tooling::runToolOnCodeWithArgs(
-            USRFindingActionFactory->create(), NewCode, {"-std=c++11"}, CCName,
-            "clang-rename", std::make_shared<PCHContainerOperations>(),
-            FileContents))
-      return "";
-
-    const std::vector<std::vector<std::string>> &USRList =
-        FindingAction.getUSRList();
-    std::vector<std::string> NewNames = {std::string(NewName)};
-    std::map<std::string, tooling::Replacements> FileToReplacements;
-    tooling::QualifiedRenamingAction RenameAction(NewNames, USRList,
-                                                  FileToReplacements);
-    auto RenameActionFactory = tooling::newFrontendActionFactory(&RenameAction);
-    if (!tooling::runToolOnCodeWithArgs(
-            RenameActionFactory->create(), NewCode, {"-std=c++11"}, CCName,
-            "clang-rename", std::make_shared<PCHContainerOperations>(),
-            FileContents))
-      return "";
-
-    formatAndApplyAllReplacements(FileToReplacements, Context.Rewrite, "llvm");
-    return Context.getRewrittenText(InputFileID);
-  }
-
-  void CompareSnippets(StringRef Expected, StringRef Actual) {
-    std::string ExpectedCode;
-    llvm::raw_string_ostream(ExpectedCode) << llvm::format(
-        "#include \"%s\"\n%s", HeaderName.c_str(), Expected.str().c_str());
-    EXPECT_EQ(format(ExpectedCode), format(Actual));
-  }
-
-  std::string format(llvm::StringRef Code) {
-    tooling::Replacements Replaces = format::reformat(
-        format::getLLVMStyle(), Code, {tooling::Range(0, Code.size())});
-    auto ChangedCode = tooling::applyAllReplacements(Code, Replaces);
-    EXPECT_TRUE(static_cast<bool>(ChangedCode));
-    if (!ChangedCode) {
-      llvm::errs() << llvm::toString(ChangedCode.takeError());
-      return "";
-    }
-    return *ChangedCode;
-  }
-
-  std::string HeaderContent;
-  std::string HeaderName = "header.h";
-  std::string CCName = "input.cc";
-};
-
-} // namespace test
-} // namespace clang_rename
-} // namespace clang
-
-#endif
diff --git a/clang/unittests/Rename/RenameAliasTest.cpp b/clang/unittests/Rename/RenameAliasTest.cpp
deleted file mode 100644
index 50fa2c104263fb..00000000000000
--- a/clang/unittests/Rename/RenameAliasTest.cpp
+++ /dev/null
@@ -1,303 +0,0 @@
-//===-- RenameAliasTest.cpp - unit tests for renaming alias ---------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "ClangRenameTest.h"
-
-namespace clang {
-namespace clang_rename {
-namespace test {
-namespace {
-
-class RenameAliasTest : public ClangRenameTest {
-public:
-  RenameAliasTest() {
-    AppendToHeader(R"(
-      #define MACRO(x) x
-      namespace some_ns {
-      class A {
-      public:
-        void foo() {}
-        struct Nested {
-          enum NestedEnum {
-            E1, E2,
-          };
-        };
-      };
-      } // namespace some_ns
-      namespace a {
-      typedef some_ns::A TA;
-      using UA = some_ns::A;
-      } // namespace a
-      namespace b {
-      typedef some_ns::A TA;
-      using UA = some_ns::A;
-      }
-      template <typename T> class ptr {};
-      template <typename T>
-
-      using TPtr = ptr<T>;
-    )");
-  }
-};
-
-INSTANTIATE_TEST_SUITE_P(
-    RenameAliasTests, RenameAliasTest,
-    testing::ValuesIn(std::vector<Case>({
-        // basic functions
-        {"void f(a::TA a1) {}", "void f(b::TB a1) {}", "a::TA", "b::TB"},
-        {"void f(a::UA a1) {}", "void f(b::UB a1) {}", "a::UA", "b::UB"},
-        {"void f(a::TA* a1) {}", "void f(b::TB* a1) {}", "a::TA", "b::TB"},
-        {"void f(a::TA** a1) {}", "void f(b::TB** a1) {}", "a::TA", "b::TB"},
-        {"a::TA f() { return a::TA(); }", "b::TB f() { return b::TB(); }",
-         "a::TA", "b::TB"},
-        {"a::TA f() { return a::UA(); }", "b::TB f() { return a::UA(); }",
-         "a::TA", "b::TB"},
-        {"a::TA f() { return a::UA(); }", "a::TA f() { return b::UB(); }",
-         "a::UA", "b::UB"},
-        {"void f() { a::TA a; }", "void f() { b::TB a; }", "a::TA", "b::TB"},
-        {"void f(const a::TA& a1) {}", "void f(const b::TB& a1) {}", "a::TA",
-         "b::TB"},
-        {"void f(const a::UA& a1) {}", "void f(const b::UB& a1) {}", "a::UA",
-         "b::UB"},
-        {"void f(const a::TA* a1) {}", "void f(const b::TB* a1) {}", "a::TA",
-         "b::TB"},
-        {"namespace a { void f(TA a1) {} }",
-         "namespace a { void f(b::TB a1) {} }", "a::TA", "b::TB"},
-        {"void f(MACRO(a::TA) a1) {}", "void f(MACRO(b::TB) a1) {}", "a::TA",
-         "b::TB"},
-        {"void f(MACRO(a::TA a1)) {}", "void f(MACRO(b::TB a1)) {}", "a::TA",
-         "b::TB"},
-
-        // shorten/add namespace.
-        {"namespace b { void f(a::UA a1) {} }",
-         "namespace b {void f(UB a1) {} }", "a::UA", "b::UB"},
-        {"namespace a { void f(UA a1) {} }",
-         "namespace a {void f(b::UB a1) {} }", "a::UA", "b::UB"},
-
-        // use namespace and typedefs
-        {"struct S { using T = a::TA; T a_; };",
-         "struct S { using T = b::TB; T a_; };", "a::TA", "b::TB"},
-        {"using T = a::TA; T gA;", "using T = b::TB; T gA;", "a::TA", "b::TB"},
-        {"using T = a::UA; T gA;", "using T = b::UB; T gA;", "a::UA", "b::UB"},
-        {"typedef a::TA T; T gA;", "typedef b::TB T; T gA;", "a::TA", "b::TB"},
-        {"typedef a::UA T; T gA;", "typedef b::UB T; T gA;", "a::UA", "b::UB"},
-        {"typedef MACRO(a::TA) T; T gA;", "typedef MACRO(b::TB) T; T gA;",
-         "a::TA", "b::TB"},
-
-        // types in using shadows.
- {"using a::TA; TA gA;", "using b::TB; b::TB gA;", "a::TA", "b::TB"}, - {"using a::UA; UA gA;", "using b::UB; b::UB gA;", "a::UA", "b::UB"}, - - // struct members and other oddities - {"struct S : public a::TA {};", "struct S : public b::TB {};", "a::TA", - "b::TB"}, - {"struct S : public a::UA {};", "struct S : public b::UB {};", "a::UA", - "b::UB"}, - {"struct F { void f(a::TA a1) {} };", - "struct F { void f(b::TB a1) {} };", "a::TA", "b::TB"}, - {"struct F { a::TA a_; };", "struct F { b::TB a_; };", "a::TA", - "b::TB"}, - {"struct F { ptr a_; };", "struct F { ptr a_; };", - "a::TA", "b::TB"}, - {"struct F { ptr a_; };", "struct F { ptr a_; };", - "a::UA", "b::UB"}, - - // types in nested name specifiers - {"void f() { a::TA::Nested ne; }", "void f() { b::TB::Nested ne; }", - "a::TA", "b::TB"}, - {"void f() { a::UA::Nested ne; }", "void f() { b::UB::Nested ne; }", - "a::UA", "b::UB"}, - {"void f() { a::TA::Nested::NestedEnum e; }", - "void f() { b::TB::Nested::NestedEnum e; }", "a::TA", "b::TB"}, - {"void f() { auto e = a::TA::Nested::NestedEnum::E1; }", - "void f() { auto e = b::TB::Nested::NestedEnum::E1; }", "a::TA", - "b::TB"}, - {"void f() { auto e = a::TA::Nested::E1; }", - "void f() { auto e = b::TB::Nested::E1; }", "a::TA", "b::TB"}, - - // templates - {"template struct Foo { T t; }; void f() { Foo " - "foo; }", - "template struct Foo { T t; }; void f() { Foo " - "foo; }", - "a::TA", "b::TB"}, - {"template struct Foo { a::TA a; };", - "template struct Foo { b::TB a; };", "a::TA", "b::TB"}, - {"template void f(T t) {} void g() { f(a::TA()); }", - "template void f(T t) {} void g() { f(b::TB()); }", - "a::TA", "b::TB"}, - {"template void f(T t) {} void g() { f(a::UA()); }", - "template void f(T t) {} void g() { f(b::UB()); }", - "a::UA", "b::UB"}, - {"template int f() { return 1; } template <> int " - "f() { return 2; } int g() { return f(); }", - "template int f() { return 1; } template <> int " - "f() { return 2; } int g() { return f(); }", - "a::TA", "b::TB"}, - {"struct Foo { template T foo(); }; void g() { Foo f; " - "auto a = f.template foo(); }", - "struct Foo { template T foo(); }; void g() { Foo f; " - "auto a = f.template foo(); }", - "a::TA", "b::TB"}, - {"struct Foo { template T foo(); }; void g() { Foo f; " - "auto a = f.template foo(); }", - "struct Foo { template T foo(); }; void g() { Foo f; " - "auto a = f.template foo(); }", - "a::UA", "b::UB"}, - - // The following two templates are distilled from regressions found in - // unique_ptr<> and type_traits.h - {"template struct outer { typedef T type; type Baz(); }; " - "outer g_A;", - "template struct outer { typedef T type; type Baz(); }; " - "outer g_A;", - "a::TA", "b::TB"}, - {"template struct nested { typedef T type; }; template " - " struct outer { typename nested::type Foo(); }; " - "outer g_A;", - "template struct nested { typedef T type; }; template " - " struct outer { typename nested::type Foo(); }; " - "outer g_A;", - "a::TA", "b::TB"}, - - // macros - {"#define FOO(T, t) T t\nvoid f() { FOO(a::TA, a1); FOO(a::TA, a2); }", - "#define FOO(T, t) T t\nvoid f() { FOO(b::TB, a1); FOO(b::TB, a2); }", - "a::TA", "b::TB"}, - {"#define FOO(n) a::TA n\nvoid f() { FOO(a1); FOO(a2); }", - "#define FOO(n) b::TB n\nvoid f() { FOO(a1); FOO(a2); }", "a::TA", - "b::TB"}, - {"#define FOO(n) a::UA n\nvoid f() { FOO(a1); FOO(a2); }", - "#define FOO(n) b::UB n\nvoid f() { FOO(a1); FOO(a2); }", "a::UA", - "b::UB"}, - - // Pointer to member functions - {"auto gA = &a::TA::foo;", "auto gA = &b::TB::foo;", "a::TA", "b::TB"}, - 
{"using a::TA; auto gA = &TA::foo;", - "using b::TB; auto gA = &b::TB::foo;", "a::TA", "b::TB"}, - {"typedef a::TA T; auto gA = &T::foo;", - "typedef b::TB T; auto gA = &T::foo;", "a::TA", "b::TB"}, - {"auto gA = &MACRO(a::TA)::foo;", "auto gA = &MACRO(b::TB)::foo;", - "a::TA", "b::TB"}, - - // templated using alias. - {"void f(TPtr p) {}", "void f(NewTPtr p) {}", "TPtr", - "NewTPtr"}, - {"void f(::TPtr p) {}", "void f(::NewTPtr p) {}", "TPtr", - "NewTPtr"}, - }))); - -TEST_P(RenameAliasTest, RenameAlias) { - auto Param = GetParam(); - assert(!Param.OldName.empty()); - assert(!Param.NewName.empty()); - std::string Actual = - runClangRenameOnCode(Param.Before, Param.OldName, Param.NewName); - CompareSnippets(Param.After, Actual); -} - -TEST_F(RenameAliasTest, RenameTypedefDefinitions) { - std::string Before = R"( - class X {}; - typedef X TOld; - )"; - std::string Expected = R"( - class X {}; - typedef X TNew; - )"; - std::string After = runClangRenameOnCode(Before, "TOld", "TNew"); - CompareSnippets(Expected, After); -} - -TEST_F(RenameAliasTest, RenameUsingAliasDefinitions) { - std::string Before = R"( - class X {}; - using UOld = X; - )"; - std::string Expected = R"( - class X {}; - using UNew = X; - )"; - std::string After = runClangRenameOnCode(Before, "UOld", "UNew"); - CompareSnippets(Expected, After); -} - -TEST_F(RenameAliasTest, RenameTemplatedAliasDefinitions) { - std::string Before = R"( - template - class X { T t; }; - - template - using Old = X; - )"; - std::string Expected = R"( - template - class X { T t; }; - - template - using New = X; - )"; - std::string After = runClangRenameOnCode(Before, "Old", "New"); - CompareSnippets(Expected, After); -} - -TEST_F(RenameAliasTest, RenameAliasesInNamespaces) { - std::string Before = R"( - namespace x { class X {}; } - namespace ns { - using UOld = x::X; - } - )"; - std::string Expected = R"( - namespace x { class X {}; } - namespace ns { - using UNew = x::X; - } - )"; - std::string After = runClangRenameOnCode(Before, "ns::UOld", "ns::UNew"); - CompareSnippets(Expected, After); -} - -TEST_F(RenameAliasTest, AliasesInMacros) { - std::string Before = R"( - namespace x { class Old {}; } - namespace ns { - #define REF(alias) alias alias_var; - - #define ALIAS(old) \ - using old##Alias = x::old; \ - REF(old##Alias); - - ALIAS(Old); - - OldAlias old_alias; - } - )"; - std::string Expected = R"( - namespace x { class Old {}; } - namespace ns { - #define REF(alias) alias alias_var; - - #define ALIAS(old) \ - using old##Alias = x::old; \ - REF(old##Alias); - - ALIAS(Old); - - NewAlias old_alias; - } - )"; - std::string After = - runClangRenameOnCode(Before, "ns::OldAlias", "ns::NewAlias"); - CompareSnippets(Expected, After); -} - -} // anonymous namespace -} // namespace test -} // namespace clang_rename -} // namesdpace clang diff --git a/clang/unittests/Rename/RenameClassTest.cpp b/clang/unittests/Rename/RenameClassTest.cpp deleted file mode 100644 index 24370b5795e942..00000000000000 --- a/clang/unittests/Rename/RenameClassTest.cpp +++ /dev/null @@ -1,820 +0,0 @@ -//===-- RenameClassTest.cpp - unit tests for renaming classes -------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "ClangRenameTest.h" - -namespace clang { -namespace clang_rename { -namespace test { -namespace { - -class RenameClassTest : public ClangRenameTest { -public: - RenameClassTest() { - AppendToHeader(R"( - namespace a { - class Foo { - public: - struct Nested { - enum NestedEnum {E1, E2}; - }; - void func() {} - static int Constant; - }; - class Goo { - public: - struct Nested { - enum NestedEnum {E1, E2}; - }; - }; - int Foo::Constant = 1; - } // namespace a - namespace b { - class Foo {}; - } // namespace b - - #define MACRO(x) x - - template class ptr {}; - )"); - } -}; - -INSTANTIATE_TEST_SUITE_P( - RenameClassTests, RenameClassTest, - testing::ValuesIn(std::vector({ - // basic classes - {"a::Foo f;", "b::Bar f;", "", ""}, - {"::a::Foo f;", "::b::Bar f;", "", ""}, - {"void f(a::Foo f) {}", "void f(b::Bar f) {}", "", ""}, - {"void f(a::Foo *f) {}", "void f(b::Bar *f) {}", "", ""}, - {"a::Foo f() { return a::Foo(); }", "b::Bar f() { return b::Bar(); }", - "", ""}, - {"namespace a {a::Foo f() { return Foo(); }}", - "namespace a {b::Bar f() { return b::Bar(); }}", "", ""}, - {"void f(const a::Foo& a1) {}", "void f(const b::Bar& a1) {}", "", ""}, - {"void f(const a::Foo* a1) {}", "void f(const b::Bar* a1) {}", "", ""}, - {"namespace a { void f(Foo a1) {} }", - "namespace a { void f(b::Bar a1) {} }", "", ""}, - {"void f(MACRO(a::Foo) a1) {}", "void f(MACRO(b::Bar) a1) {}", "", ""}, - {"void f(MACRO(a::Foo a1)) {}", "void f(MACRO(b::Bar a1)) {}", "", ""}, - {"a::Foo::Nested ns;", "b::Bar::Nested ns;", "", ""}, - {"auto t = a::Foo::Constant;", "auto t = b::Bar::Constant;", "", ""}, - {"a::Foo::Nested ns;", "a::Foo::Nested2 ns;", "a::Foo::Nested", - "a::Foo::Nested2"}, - - // use namespace and typedefs - {"using a::Foo; Foo gA;", "using b::Bar; b::Bar gA;", "", ""}, - {"using a::Foo; void f(Foo gA) {}", "using b::Bar; void f(Bar gA) {}", - "", ""}, - {"using a::Foo; namespace x { Foo gA; }", - "using b::Bar; namespace x { Bar gA; }", "", ""}, - {"struct S { using T = a::Foo; T a_; };", - "struct S { using T = b::Bar; T a_; };", "", ""}, - {"using T = a::Foo; T gA;", "using T = b::Bar; T gA;", "", ""}, - {"typedef a::Foo T; T gA;", "typedef b::Bar T; T gA;", "", ""}, - {"typedef MACRO(a::Foo) T; T gA;", "typedef MACRO(b::Bar) T; T gA;", "", - ""}, - - // struct members and other oddities - {"struct S : public a::Foo {};", "struct S : public b::Bar {};", "", - ""}, - {"struct F { void f(a::Foo a1) {} };", - "struct F { void f(b::Bar a1) {} };", "", ""}, - {"struct F { a::Foo a_; };", "struct F { b::Bar a_; };", "", ""}, - {"struct F { ptr a_; };", "struct F { ptr a_; };", "", - ""}, - - {"void f() { a::Foo::Nested ne; }", "void f() { b::Bar::Nested ne; }", - "", ""}, - {"void f() { a::Goo::Nested ne; }", "void f() { a::Goo::Nested ne; }", - "", ""}, - {"void f() { a::Foo::Nested::NestedEnum e; }", - "void f() { b::Bar::Nested::NestedEnum e; }", "", ""}, - {"void f() { auto e = a::Foo::Nested::NestedEnum::E1; }", - "void f() { auto e = b::Bar::Nested::NestedEnum::E1; }", "", ""}, - {"void f() { auto e = a::Foo::Nested::E1; }", - "void f() { auto e = b::Bar::Nested::E1; }", "", ""}, - - // templates - {"template struct Foo { T t; };\n" - "void f() { Foo foo; }", - "template struct Foo { T t; };\n" - "void f() { Foo foo; }", - "", ""}, - {"template struct Foo { a::Foo a; };", - "template struct Foo { b::Bar a; };", "", ""}, - {"template void f(T 
t) {}\n" - "void g() { f(a::Foo()); }", - "template void f(T t) {}\n" - "void g() { f(b::Bar()); }", - "", ""}, - {"template int f() { return 1; }\n" - "template <> int f() { return 2; }\n" - "int g() { return f(); }", - "template int f() { return 1; }\n" - "template <> int f() { return 2; }\n" - "int g() { return f(); }", - "", ""}, - {"struct Foo { template T foo(); };\n" - "void g() { Foo f; auto a = f.template foo(); }", - "struct Foo { template T foo(); };\n" - "void g() { Foo f; auto a = f.template foo(); }", - "", ""}, - - // The following two templates are distilled from regressions found in - // unique_ptr<> and type_traits.h - {"template struct outer {\n" - " typedef T type;\n" - " type Baz();\n" - " };\n" - " outer g_A;", - "template struct outer {\n" - " typedef T type;\n" - " type Baz();\n" - " };\n" - " outer g_A;", - "", ""}, - {"template struct nested { typedef T type; };\n" - "template struct outer { typename nested::type Foo(); " - "};\n" - "outer g_A;", - "template struct nested { typedef T type; };\n" - "template struct outer { typename nested::type Foo(); " - "};\n" - "outer g_A;", - "", ""}, - - // macros - {"#define FOO(T, t) T t\n" - "void f() { FOO(a::Foo, a1); FOO(a::Foo, a2); }", - "#define FOO(T, t) T t\n" - "void f() { FOO(b::Bar, a1); FOO(b::Bar, a2); }", - "", ""}, - {"#define FOO(n) a::Foo n\n" - " void f() { FOO(a1); FOO(a2); }", - "#define FOO(n) b::Bar n\n" - " void f() { FOO(a1); FOO(a2); }", - "", ""}, - - // Pointer to member functions - {"auto gA = &a::Foo::func;", "auto gA = &b::Bar::func;", "", ""}, - {"using a::Foo; auto gA = &Foo::func;", - "using b::Bar; auto gA = &b::Bar::func;", "", ""}, - {"using a::Foo; namespace x { auto gA = &Foo::func; }", - "using b::Bar; namespace x { auto gA = &Bar::func; }", "", ""}, - {"typedef a::Foo T; auto gA = &T::func;", - "typedef b::Bar T; auto gA = &T::func;", "", ""}, - {"auto gA = &MACRO(a::Foo)::func;", "auto gA = &MACRO(b::Bar)::func;", - "", ""}, - - // Short match inside a namespace - {"namespace a { void f(Foo a1) {} }", - "namespace a { void f(b::Bar a1) {} }", "", ""}, - - // Correct match. - {"using a::Foo; struct F { ptr a_; };", - "using b::Bar; struct F { ptr a_; };", "", ""}, - - // avoid false positives - {"void f(b::Foo a) {}", "void f(b::Foo a) {}", "", ""}, - {"namespace b { void f(Foo a) {} }", "namespace b { void f(Foo a) {} }", - "", ""}, - - // friends, everyone needs friends. - {"class Foo { int i; friend class a::Foo; };", - "class Foo { int i; friend class b::Bar; };", "", ""}, - })) ); - -TEST_P(RenameClassTest, RenameClasses) { - auto Param = GetParam(); - std::string OldName = Param.OldName.empty() ? "a::Foo" : Param.OldName; - std::string NewName = Param.NewName.empty() ? "b::Bar" : Param.NewName; - std::string Actual = runClangRenameOnCode(Param.Before, OldName, NewName); - CompareSnippets(Param.After, Actual); -} - -class NamespaceDetectionTest : public ClangRenameTest { -protected: - NamespaceDetectionTest() { - AppendToHeader(R"( - class Old {}; - namespace o1 { - class Old {}; - namespace o2 { - class Old {}; - namespace o3 { - class Old {}; - } // namespace o3 - } // namespace o2 - } // namespace o1 - )"); - } -}; - -INSTANTIATE_TEST_SUITE_P( - RenameClassTest, NamespaceDetectionTest, - ::testing::ValuesIn(std::vector({ - // Test old and new namespace overlap. 
- {"namespace o1 { namespace o2 { namespace o3 { Old moo; } } }", - "namespace o1 { namespace o2 { namespace o3 { New moo; } } }", - "o1::o2::o3::Old", "o1::o2::o3::New"}, - {"namespace o1 { namespace o2 { namespace o3 { Old moo; } } }", - "namespace o1 { namespace o2 { namespace o3 { n3::New moo; } } }", - "o1::o2::o3::Old", "o1::o2::n3::New"}, - {"namespace o1 { namespace o2 { namespace o3 { Old moo; } } }", - "namespace o1 { namespace o2 { namespace o3 { n2::n3::New moo; } } }", - "o1::o2::o3::Old", "o1::n2::n3::New"}, - {"namespace o1 { namespace o2 { Old moo; } }", - "namespace o1 { namespace o2 { New moo; } }", "::o1::o2::Old", - "::o1::o2::New"}, - {"namespace o1 { namespace o2 { Old moo; } }", - "namespace o1 { namespace o2 { n2::New moo; } }", "::o1::o2::Old", - "::o1::n2::New"}, - {"namespace o1 { namespace o2 { Old moo; } }", - "namespace o1 { namespace o2 { ::n1::n2::New moo; } }", - "::o1::o2::Old", "::n1::n2::New"}, - {"namespace o1 { namespace o2 { Old moo; } }", - "namespace o1 { namespace o2 { n1::n2::New moo; } }", "::o1::o2::Old", - "n1::n2::New"}, - - // Test old and new namespace with differing depths. - {"namespace o1 { namespace o2 { namespace o3 { Old moo; } } }", - "namespace o1 { namespace o2 { namespace o3 { New moo; } } }", - "o1::o2::o3::Old", "::o1::New"}, - {"namespace o1 { namespace o2 { namespace o3 { Old moo; } } }", - "namespace o1 { namespace o2 { namespace o3 { New moo; } } }", - "o1::o2::o3::Old", "::o1::o2::New"}, - {"namespace o1 { namespace o2 { namespace o3 { Old moo; } } }", - "namespace o1 { namespace o2 { namespace o3 { New moo; } } }", - "o1::o2::o3::Old", "o1::New"}, - {"namespace o1 { namespace o2 { namespace o3 { Old moo; } } }", - "namespace o1 { namespace o2 { namespace o3 { New moo; } } }", - "o1::o2::o3::Old", "o1::o2::New"}, - {"Old moo;", "o1::New moo;", "::Old", "o1::New"}, - {"Old moo;", "o1::New moo;", "Old", "o1::New"}, - {"namespace o1 { ::Old moo; }", "namespace o1 { New moo; }", "Old", - "o1::New"}, - {"namespace o1 { namespace o2 { Old moo; } }", - "namespace o1 { namespace o2 { ::New moo; } }", "::o1::o2::Old", - "::New"}, - {"namespace o1 { namespace o2 { Old moo; } }", - "namespace o1 { namespace o2 { New moo; } }", "::o1::o2::Old", "New"}, - - // Test moving into the new namespace at different levels. - {"namespace n1 { namespace n2 { o1::o2::Old moo; } }", - "namespace n1 { namespace n2 { New moo; } }", "::o1::o2::Old", - "::n1::n2::New"}, - {"namespace n1 { namespace n2 { o1::o2::Old moo; } }", - "namespace n1 { namespace n2 { New moo; } }", "::o1::o2::Old", - "n1::n2::New"}, - {"namespace n1 { namespace n2 { o1::o2::Old moo; } }", - "namespace n1 { namespace n2 { o2::New moo; } }", "::o1::o2::Old", - "::n1::o2::New"}, - {"namespace n1 { namespace n2 { o1::o2::Old moo; } }", - "namespace n1 { namespace n2 { o2::New moo; } }", "::o1::o2::Old", - "n1::o2::New"}, - {"namespace n1 { namespace n2 { o1::o2::Old moo; } }", - "namespace n1 { namespace n2 { ::o1::o2::New moo; } }", - "::o1::o2::Old", "::o1::o2::New"}, - {"namespace n1 { namespace n2 { o1::o2::Old moo; } }", - "namespace n1 { namespace n2 { o1::o2::New moo; } }", "::o1::o2::Old", - "o1::o2::New"}, - - // Test friends declarations. 
- {"class Foo { friend class o1::Old; };", - "class Foo { friend class o1::New; };", "o1::Old", "o1::New"}, - {"class Foo { int i; friend class o1::Old; };", - "class Foo { int i; friend class ::o1::New; };", "::o1::Old", - "::o1::New"}, - {"namespace o1 { class Foo { int i; friend class Old; }; }", - "namespace o1 { class Foo { int i; friend class New; }; }", "o1::Old", - "o1::New"}, - {"namespace o1 { class Foo { int i; friend class Old; }; }", - "namespace o1 { class Foo { int i; friend class New; }; }", - "::o1::Old", "::o1::New"}, - })) ); - -TEST_P(NamespaceDetectionTest, RenameClasses) { - auto Param = GetParam(); - std::string Actual = - runClangRenameOnCode(Param.Before, Param.OldName, Param.NewName); - CompareSnippets(Param.After, Actual); -} - -class TemplatedClassRenameTest : public ClangRenameTest { -protected: - TemplatedClassRenameTest() { - AppendToHeader(R"( - template struct Old { - T t_; - T f() { return T(); }; - static T s(T t) { return t; } - }; - namespace ns { - template struct Old { - T t_; - T f() { return T(); }; - static T s(T t) { return t; } - }; - } // namespace ns - - namespace o1 { - namespace o2 { - namespace o3 { - template struct Old { - T t_; - T f() { return T(); }; - static T s(T t) { return t; } - }; - } // namespace o3 - } // namespace o2 - } // namespace o1 - )"); - } -}; - -INSTANTIATE_TEST_SUITE_P( - RenameClassTests, TemplatedClassRenameTest, - ::testing::ValuesIn(std::vector({ - {"Old gI; Old gB;", "New gI; New gB;", "Old", - "New"}, - {"ns::Old gI; ns::Old gB;", - "ns::New gI; ns::New gB;", "ns::Old", "ns::New"}, - {"auto gI = &Old::f; auto gB = &Old::f;", - "auto gI = &New::f; auto gB = &New::f;", "Old", "New"}, - {"auto gI = &ns::Old::f;", "auto gI = &ns::New::f;", - "ns::Old", "ns::New"}, - - {"int gI = Old::s(0); bool gB = Old::s(false);", - "int gI = New::s(0); bool gB = New::s(false);", "Old", - "New"}, - {"int gI = ns::Old::s(0); bool gB = ns::Old::s(false);", - "int gI = ns::New::s(0); bool gB = ns::New::s(false);", - "ns::Old", "ns::New"}, - - {"struct S { Old o_; };", "struct S { New o_; };", "Old", - "New"}, - {"struct S { ns::Old o_; };", "struct S { ns::New o_; };", - "ns::Old", "ns::New"}, - - {"auto a = reinterpret_cast*>(new Old);", - "auto a = reinterpret_cast*>(new New);", "Old", "New"}, - {"auto a = reinterpret_cast*>(new ns::Old);", - "auto a = reinterpret_cast*>(new ns::New);", - "ns::Old", "ns::New"}, - {"auto a = reinterpret_cast*>(new Old);", - "auto a = reinterpret_cast*>(new New);", "Old", - "New"}, - {"auto a = reinterpret_cast*>(new ns::Old);", - "auto a = reinterpret_cast*>(new ns::New);", - "ns::Old", "ns::New"}, - - {"Old& foo();", "New& foo();", "Old", "New"}, - {"ns::Old& foo();", "ns::New& foo();", "ns::Old", - "ns::New"}, - {"o1::o2::o3::Old& foo();", "o1::o2::o3::New& foo();", - "o1::o2::o3::Old", "o1::o2::o3::New"}, - {"namespace ns { Old& foo(); }", - "namespace ns { New& foo(); }", "ns::Old", "ns::New"}, - {"const Old& foo();", "const New& foo();", "Old", "New"}, - {"const ns::Old& foo();", "const ns::New& foo();", - "ns::Old", "ns::New"}, - - // FIXME: figure out why this only works when Moo gets - // specialized at some point. 
- {"template struct Moo { Old o_; }; Moo m;", - "template struct Moo { New o_; }; Moo m;", "Old", - "New"}, - {"template struct Moo { ns::Old o_; }; Moo m;", - "template struct Moo { ns::New o_; }; Moo m;", - "ns::Old", "ns::New"}, - })) ); - -TEST_P(TemplatedClassRenameTest, RenameTemplateClasses) { - auto Param = GetParam(); - std::string Actual = - runClangRenameOnCode(Param.Before, Param.OldName, Param.NewName); - CompareSnippets(Param.After, Actual); -} - -TEST_F(ClangRenameTest, RenameClassWithOutOfLineMembers) { - std::string Before = R"( - class Old { - public: - Old(); - ~Old(); - - Old* next(); - - private: - Old* next_; - }; - - Old::Old() {} - Old::~Old() {} - Old* Old::next() { return next_; } - )"; - std::string Expected = R"( - class New { - public: - New(); - ~New(); - - New* next(); - - private: - New* next_; - }; - - New::New() {} - New::~New() {} - New* New::next() { return next_; } - )"; - std::string After = runClangRenameOnCode(Before, "Old", "New"); - CompareSnippets(Expected, After); -} - -TEST_F(ClangRenameTest, RenameClassWithInlineMembers) { - std::string Before = R"( - class Old { - public: - Old() {} - ~Old() {} - - Old* next() { return next_; } - - private: - Old* next_; - }; - )"; - std::string Expected = R"( - class New { - public: - New() {} - ~New() {} - - New* next() { return next_; } - - private: - New* next_; - }; - )"; - std::string After = runClangRenameOnCode(Before, "Old", "New"); - CompareSnippets(Expected, After); -} - -TEST_F(ClangRenameTest, RenameClassWithNamespaceWithInlineMembers) { - std::string Before = R"( - namespace ns { - class Old { - public: - Old() {} - ~Old() {} - - Old* next() { return next_; } - - private: - Old* next_; - }; - } // namespace ns - )"; - std::string Expected = R"( - namespace ns { - class New { - public: - New() {} - ~New() {} - - New* next() { return next_; } - - private: - New* next_; - }; - } // namespace ns - )"; - std::string After = runClangRenameOnCode(Before, "ns::Old", "ns::New"); - CompareSnippets(Expected, After); -} - -TEST_F(ClangRenameTest, RenameClassWithNamespaceWithOutOfInlineMembers) { - std::string Before = R"( - namespace ns { - class Old { - public: - Old(); - ~Old(); - - Old* next(); - - private: - Old* next_; - }; - - Old::Old() {} - Old::~Old() {} - Old* Old::next() { return next_; } - } // namespace ns - )"; - std::string Expected = R"( - namespace ns { - class New { - public: - New(); - ~New(); - - New* next(); - - private: - New* next_; - }; - - New::New() {} - New::~New() {} - New* New::next() { return next_; } - } // namespace ns - )"; - std::string After = runClangRenameOnCode(Before, "ns::Old", "ns::New"); - CompareSnippets(Expected, After); -} - -TEST_F(ClangRenameTest, RenameClassInInheritedConstructor) { - // `using Base::Base;` will generate an implicit constructor containing usage - // of `::ns::Old` which should not be matched. 
- std::string Before = R"( - namespace ns { - class Old; - class Old { - int x; - }; - class Base { - protected: - Old *moo_; - public: - Base(Old *moo) : moo_(moo) {} - }; - class Derived : public Base { - public: - using Base::Base; - }; - } // namespace ns - int main() { - ::ns::Old foo; - ::ns::Derived d(&foo); - return 0; - })"; - std::string Expected = R"( - namespace ns { - class New; - class New { - int x; - }; - class Base { - protected: - New *moo_; - public: - Base(New *moo) : moo_(moo) {} - }; - class Derived : public Base { - public: - using Base::Base; - }; - } // namespace ns - int main() { - ::ns::New foo; - ::ns::Derived d(&foo); - return 0; - })"; - std::string After = runClangRenameOnCode(Before, "ns::Old", "ns::New"); - CompareSnippets(Expected, After); -} - -TEST_F(ClangRenameTest, DontRenameReferencesInImplicitFunction) { - std::string Before = R"( - namespace ns { - class Old { - }; - } // namespace ns - struct S { - int y; - ns::Old old; - }; - void f() { - S s1, s2, s3; - // This causes an implicit assignment operator to be created. - s1 = s2 = s3; - } - )"; - std::string Expected = R"( - namespace ns { - class New { - }; - } // namespace ns - struct S { - int y; - ::new_ns::New old; - }; - void f() { - S s1, s2, s3; - // This causes an implicit assignment operator to be created. - s1 = s2 = s3; - } - )"; - std::string After = runClangRenameOnCode(Before, "ns::Old", "::new_ns::New"); - CompareSnippets(Expected, After); -} - -TEST_F(ClangRenameTest, ReferencesInLambdaFunctionParameters) { - std::string Before = R"( - template - class function; - template - class function { - public: - template - function(Functor f) {} - - function() {} - - R operator()(ArgTypes...) const {} - }; - - namespace ns { - class Old {}; - void f() { - function func; - } - } // namespace ns)"; - std::string Expected = R"( - template - class function; - template - class function { - public: - template - function(Functor f) {} - - function() {} - - R operator()(ArgTypes...) 
const {} - }; - - namespace ns { - class New {}; - void f() { - function func; - } - } // namespace ns)"; - std::string After = runClangRenameOnCode(Before, "ns::Old", "::new_ns::New"); - CompareSnippets(Expected, After); -} - -TEST_F(ClangRenameTest, DontChangeIfSameName) { - std::string Before = R"( - namespace foo { - class Old { - public: - static void foo() {} - }; - } - - void f(foo::Old * x) { - foo::Old::foo() ; - } - using foo::Old;)"; - std::string Expected = R"( - namespace foo { - class Old { - public: - static void foo() {} - }; - } - - void f(foo::Old * x) { - foo::Old::foo() ; - } - using foo::Old;)"; - std::string After = runClangRenameOnCode(Before, "foo::Old", "foo::Old"); - CompareSnippets(Expected, After); -} - -TEST_F(ClangRenameTest, ChangeIfNewNameWithLeadingDotDot) { - std::string Before = R"( - namespace foo { - class Old { - public: - static void foo() {} - }; - } - - void f(foo::Old * x) { - foo::Old::foo() ; - } - using foo::Old;)"; - std::string Expected = R"( - namespace foo { - class Old { - public: - static void foo() {} - }; - } - - void f(::foo::Old * x) { - ::foo::Old::foo() ; - } - using ::foo::Old;)"; - std::string After = runClangRenameOnCode(Before, "foo::Old", "::foo::Old"); - CompareSnippets(Expected, After); -} - -TEST_F(ClangRenameTest, ChangeIfSameNameWithLeadingDotDot) { - std::string Before = R"( - namespace foo { - class Old { - public: - static void foo() {} - }; - } - - void f(foo::Old * x) { - foo::Old::foo() ; - } - using foo::Old;)"; - std::string Expected = R"( - namespace foo { - class Old { - public: - static void foo() {} - }; - } - - void f(::foo::Old * x) { - ::foo::Old::foo() ; - } - using ::foo::Old;)"; - std::string After = runClangRenameOnCode(Before, "::foo::Old", "::foo::Old"); - CompareSnippets(Expected, After); -} - -TEST_F(RenameClassTest, UsingAlias) { - std::string Before = R"( - namespace a { struct A {}; } - - namespace foo { - using Alias = a::A; - Alias a; - })"; - std::string Expected = R"( - namespace a { struct B {}; } - - namespace foo { - using Alias = b::B; - Alias a; - })"; - std::string After = runClangRenameOnCode(Before, "a::A", "b::B"); - CompareSnippets(Expected, After); -} - -TEST_F(ClangRenameTest, FieldDesignatedInitializers) { - std::string Before = R"( - struct S { - int a; - }; - void foo() { - S s = { .a = 10 }; - s.a = 20; - })"; - std::string Expected = R"( - struct S { - int b; - }; - void foo() { - S s = { .b = 10 }; - s.b = 20; - })"; - std::string After = runClangRenameOnCode(Before, "S::a", "S::b"); - CompareSnippets(Expected, After); -} - -// FIXME: investigate why the test fails when adding a new USR to the USRSet. 
-TEST_F(ClangRenameTest, DISABLED_NestedTemplates) { - std::string Before = R"( - namespace a { template struct A {}; } - a::A> foo;)"; - std::string Expected = R"( - namespace a { template struct B {}; } - b::B> foo;)"; - std::string After = runClangRenameOnCode(Before, "a::A", "b::B"); - CompareSnippets(Expected, After); -} - - -} // anonymous namespace -} // namespace test -} // namespace clang_rename -} // namesdpace clang diff --git a/clang/unittests/Rename/RenameEnumTest.cpp b/clang/unittests/Rename/RenameEnumTest.cpp deleted file mode 100644 index dc3440047c4a46..00000000000000 --- a/clang/unittests/Rename/RenameEnumTest.cpp +++ /dev/null @@ -1,189 +0,0 @@ -#include "ClangRenameTest.h" - -namespace clang { -namespace clang_rename { -namespace test { -namespace { - -class RenameEnumTest : public ClangRenameTest { -public: - RenameEnumTest() { - AppendToHeader(R"( - #define MACRO(x) x - namespace a { - enum A1 { Red }; - enum class A2 { Blue }; - struct C { - enum NestedEnum { White }; - enum class NestedScopedEnum { Black }; - }; - namespace d { - enum A3 { Orange }; - } // namespace d - enum A4 { Pink }; - } // namespace a - enum A5 { Green };)"); - } -}; - -INSTANTIATE_TEST_SUITE_P( - RenameEnumTests, RenameEnumTest, - testing::ValuesIn(std::vector({ - {"void f(a::A2 arg) { a::A2 t = a::A2::Blue; }", - "void f(b::B2 arg) { b::B2 t = b::B2::Blue; }", "a::A2", "b::B2"}, - {"void f() { a::A1* t1; }", "void f() { b::B1* t1; }", "a::A1", - "b::B1"}, - {"void f() { a::A2* t1; }", "void f() { b::B2* t1; }", "a::A2", - "b::B2"}, - {"void f() { enum a::A2 t = a::A2::Blue; }", - "void f() { enum b::B2 t = b::B2::Blue; }", "a::A2", "b::B2"}, - {"void f() { enum a::A2 t = a::A2::Blue; }", - "void f() { enum b::B2 t = b::B2::Blue; }", "a::A2", "b::B2"}, - - {"void f() { a::A1 t = a::Red; }", "void f() { b::B1 t = b::B1::Red; }", - "a::A1", "b::B1"}, - {"void f() { a::A1 t = a::A1::Red; }", - "void f() { b::B1 t = b::B1::Red; }", "a::A1", "b::B1"}, - {"void f() { auto t = a::Red; }", "void f() { auto t = b::B1::Red; }", - "a::A1", "b::B1"}, - {"namespace b { void f() { a::A1 t = a::Red; } }", - "namespace b { void f() { B1 t = B1::Red; } }", "a::A1", "b::B1"}, - {"void f() { a::d::A3 t = a::d::Orange; }", - "void f() { a::b::B3 t = a::b::B3::Orange; }", "a::d::A3", "a::b::B3"}, - {"namespace a { void f() { a::d::A3 t = a::d::Orange; } }", - "namespace a { void f() { b::B3 t = b::B3::Orange; } }", "a::d::A3", - "a::b::B3"}, - {"void f() { A5 t = Green; }", "void f() { B5 t = Green; }", "A5", - "B5"}, - // FIXME: the new namespace qualifier should be added to the unscoped - // enum constant. 
- {"namespace a { void f() { auto t = Green; } }", - "namespace a { void f() { auto t = Green; } }", "a::A1", "b::B1"}, - - // namespace qualifiers - {"namespace a { void f(A1 a1) {} }", - "namespace a { void f(b::B1 a1) {} }", "a::A1", "b::B1"}, - {"namespace a { void f(A2 a2) {} }", - "namespace a { void f(b::B2 a2) {} }", "a::A2", "b::B2"}, - {"namespace b { void f(a::A1 a1) {} }", - "namespace b { void f(B1 a1) {} }", "a::A1", "b::B1"}, - {"namespace b { void f(a::A2 a2) {} }", - "namespace b { void f(B2 a2) {} }", "a::A2", "b::B2"}, - - // nested enums - {"void f() { a::C::NestedEnum t = a::C::White; }", - "void f() { a::C::NewNestedEnum t = a::C::NewNestedEnum::White; }", - "a::C::NestedEnum", "a::C::NewNestedEnum"}, - {"void f() { a::C::NestedScopedEnum t = a::C::NestedScopedEnum::Black; " - "}", - "void f() { a::C::NewNestedScopedEnum t = " - "a::C::NewNestedScopedEnum::Black; }", - "a::C::NestedScopedEnum", "a::C::NewNestedScopedEnum"}, - - // macros - {"void f(MACRO(a::A1) a1) {}", "void f(MACRO(b::B1) a1) {}", "a::A1", - "b::B1"}, - {"void f(MACRO(a::A2) a2) {}", "void f(MACRO(b::B2) a2) {}", "a::A2", - "b::B2"}, - {"#define FOO(T, t) T t\nvoid f() { FOO(a::A1, a1); }", - "#define FOO(T, t) T t\nvoid f() { FOO(b::B1, a1); }", "a::A1", - "b::B1"}, - {"#define FOO(T, t) T t\nvoid f() { FOO(a::A2, a2); }", - "#define FOO(T, t) T t\nvoid f() { FOO(b::B2, a2); }", "a::A2", - "b::B2"}, - {"#define FOO(n) a::A1 n\nvoid f() { FOO(a1); FOO(a2); }", - "#define FOO(n) b::B1 n\nvoid f() { FOO(a1); FOO(a2); }", "a::A1", - "b::B1"}, - - // using and type alias - {"using a::A1; A1 gA;", "using b::B1; b::B1 gA;", "a::A1", "b::B1"}, - {"using a::A2; A2 gA;", "using b::B2; b::B2 gA;", "a::A2", "b::B2"}, - {"struct S { using T = a::A1; T a_; };", - "struct S { using T = b::B1; T a_; };", "a::A1", "b::B1"}, - {"using T = a::A1; T gA;", "using T = b::B1; T gA;", "a::A1", "b::B1"}, - {"using T = a::A2; T gA;", "using T = b::B2; T gA;", "a::A2", "b::B2"}, - {"typedef a::A1 T; T gA;", "typedef b::B1 T; T gA;", "a::A1", "b::B1"}, - {"typedef a::A2 T; T gA;", "typedef b::B2 T; T gA;", "a::A2", "b::B2"}, - {"typedef MACRO(a::A1) T; T gA;", "typedef MACRO(b::B1) T; T gA;", - "a::A1", "b::B1"}, - - // templates - {"template struct Foo { T t; }; void f() { Foo " - "foo1; }", - "template struct Foo { T t; }; void f() { Foo " - "foo1; }", - "a::A1", "b::B1"}, - {"template struct Foo { T t; }; void f() { Foo " - "foo2; }", - "template struct Foo { T t; }; void f() { Foo " - "foo2; }", - "a::A2", "b::B2"}, - {"template struct Foo { a::A1 a1; };", - "template struct Foo { b::B1 a1; };", "a::A1", "b::B1"}, - {"template struct Foo { a::A2 a2; };", - "template struct Foo { b::B2 a2; };", "a::A2", "b::B2"}, - {"template int f() { return 1; } template<> int f() " - "{ return 2; } int g() { return f(); }", - "template int f() { return 1; } template<> int f() " - "{ return 2; } int g() { return f(); }", - "a::A1", "b::B1"}, - {"template int f() { return 1; } template<> int f() " - "{ return 2; } int g() { return f(); }", - "template int f() { return 1; } template<> int f() " - "{ return 2; } int g() { return f(); }", - "a::A2", "b::B2"}, - {"struct Foo { template T foo(); }; void g() { Foo f; " - "f.foo(); }", - "struct Foo { template T foo(); }; void g() { Foo f; " - "f.foo(); }", - "a::A1", "b::B1"}, - {"struct Foo { template T foo(); }; void g() { Foo f; " - "f.foo(); }", - "struct Foo { template T foo(); }; void g() { Foo f; " - "f.foo(); }", - "a::A2", "b::B2"}, - })) ); - -TEST_P(RenameEnumTest, RenameEnums) { - 
auto Param = GetParam(); - assert(!Param.OldName.empty()); - assert(!Param.NewName.empty()); - std::string Actual = - runClangRenameOnCode(Param.Before, Param.OldName, Param.NewName); - CompareSnippets(Param.After, Actual); -} - -TEST_F(RenameEnumTest, RenameEnumDecl) { - std::string Before = R"( - namespace ns { - enum Old1 { Blue }; - } - )"; - std::string Expected = R"( - namespace ns { - enum New1 { Blue }; - } - )"; - std::string After = runClangRenameOnCode(Before, "ns::Old1", "ns::New1"); - CompareSnippets(Expected, After); -} - -TEST_F(RenameEnumTest, RenameScopedEnumDecl) { - std::string Before = R"( - namespace ns { - enum class Old1 { Blue }; - } - )"; - std::string Expected = R"( - namespace ns { - enum class New1 { Blue }; - } - )"; - std::string After = runClangRenameOnCode(Before, "ns::Old1", "ns::New1"); - CompareSnippets(Expected, After); -} - -} // anonymous namespace -} // namespace test -} // namespace clang_rename -} // namesdpace clang diff --git a/clang/unittests/Rename/RenameFunctionTest.cpp b/clang/unittests/Rename/RenameFunctionTest.cpp deleted file mode 100644 index 1c9b112232ebc8..00000000000000 --- a/clang/unittests/Rename/RenameFunctionTest.cpp +++ /dev/null @@ -1,573 +0,0 @@ -//===-- RenameFunctionTest.cpp - unit tests for renaming functions --------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "ClangRenameTest.h" - -namespace clang { -namespace clang_rename { -namespace test { -namespace { - -class RenameFunctionTest : public ClangRenameTest { -public: - RenameFunctionTest() { - AppendToHeader(R"( - struct A { - static bool Foo(); - static bool Spam(); - }; - struct B { - static void Same(); - static bool Foo(); - static int Eric(int x); - }; - void Same(int x); - int Eric(int x); - namespace base { - void Same(); - void ToNanoSeconds(); - void ToInt64NanoSeconds(); - })"); - } -}; - -TEST_F(RenameFunctionTest, RefactorsAFoo) { - std::string Before = R"( - void f() { - A::Foo(); - ::A::Foo(); - })"; - std::string Expected = R"( - void f() { - A::Bar(); - ::A::Bar(); - })"; - - std::string After = runClangRenameOnCode(Before, "A::Foo", "A::Bar"); - CompareSnippets(Expected, After); -} - -TEST_F(RenameFunctionTest, RefactorsNonCallingAFoo) { - std::string Before = R"( - bool g(bool (*func)()) { - return func(); - } - void f() { - auto *ref1 = A::Foo; - auto *ref2 = ::A::Foo; - g(A::Foo); - })"; - std::string Expected = R"( - bool g(bool (*func)()) { - return func(); - } - void f() { - auto *ref1 = A::Bar; - auto *ref2 = ::A::Bar; - g(A::Bar); - })"; - std::string After = runClangRenameOnCode(Before, "A::Foo", "A::Bar"); - CompareSnippets(Expected, After); -} - -TEST_F(RenameFunctionTest, RefactorsEric) { - std::string Before = R"( - void f() { - if (Eric(3)==4) ::Eric(2); - })"; - std::string Expected = R"( - void f() { - if (Larry(3)==4) ::Larry(2); - })"; - std::string After = runClangRenameOnCode(Before, "Eric", "Larry"); - CompareSnippets(Expected, After); -} - -TEST_F(RenameFunctionTest, RefactorsNonCallingEric) { - std::string Before = R"( - int g(int (*func)(int)) { - return func(1); - } - void f() { - auto *ref = ::Eric; - g(Eric); - })"; - std::string Expected = R"( - int g(int (*func)(int)) { - return func(1); - } - void f() { - auto *ref = ::Larry; - g(Larry); - })"; - std::string After = 
runClangRenameOnCode(Before, "Eric", "Larry"); - CompareSnippets(Expected, After); -} - -TEST_F(RenameFunctionTest, DoesNotRefactorBFoo) { - std::string Before = R"( - void f() { - B::Foo(); - })"; - std::string After = runClangRenameOnCode(Before, "A::Foo", "A::Bar"); - CompareSnippets(Before, After); -} - -TEST_F(RenameFunctionTest, DoesNotRefactorBEric) { - std::string Before = R"( - void f() { - B::Eric(2); - })"; - std::string After = runClangRenameOnCode(Before, "Eric", "Larry"); - CompareSnippets(Before, After); -} - -TEST_F(RenameFunctionTest, DoesNotRefactorCEric) { - std::string Before = R"( - namespace C { int Eric(int x); } - void f() { - if (C::Eric(3)==4) ::C::Eric(2); - })"; - std::string Expected = R"( - namespace C { int Eric(int x); } - void f() { - if (C::Eric(3)==4) ::C::Eric(2); - })"; - std::string After = runClangRenameOnCode(Before, "Eric", "Larry"); - CompareSnippets(Expected, After); -} - -TEST_F(RenameFunctionTest, DoesNotRefactorEricInNamespaceC) { - std::string Before = R"( - namespace C { - int Eric(int x); - void f() { - if (Eric(3)==4) Eric(2); - } - } // namespace C)"; - std::string After = runClangRenameOnCode(Before, "Eric", "Larry"); - CompareSnippets(Before, After); -} - -TEST_F(RenameFunctionTest, NamespaceQualified) { - std::string Before = R"( - void f() { - base::ToNanoSeconds(); - ::base::ToNanoSeconds(); - } - void g() { - using base::ToNanoSeconds; - base::ToNanoSeconds(); - ::base::ToNanoSeconds(); - ToNanoSeconds(); - } - namespace foo { - namespace base { - void ToNanoSeconds(); - void f() { - base::ToNanoSeconds(); - } - } - void f() { - ::base::ToNanoSeconds(); - } - })"; - std::string Expected = R"( - void f() { - base::ToInt64NanoSeconds(); - ::base::ToInt64NanoSeconds(); - } - void g() { - using base::ToInt64NanoSeconds; - base::ToInt64NanoSeconds(); - ::base::ToInt64NanoSeconds(); - base::ToInt64NanoSeconds(); - } - namespace foo { - namespace base { - void ToNanoSeconds(); - void f() { - base::ToNanoSeconds(); - } - } - void f() { - ::base::ToInt64NanoSeconds(); - } - })"; - std::string After = runClangRenameOnCode(Before, "base::ToNanoSeconds", - "base::ToInt64NanoSeconds"); - CompareSnippets(Expected, After); -} - -TEST_F(RenameFunctionTest, RenameFunctionDecls) { - std::string Before = R"( - namespace na { - void X(); - void X() {} - })"; - std::string Expected = R"( - namespace na { - void Y(); - void Y() {} - })"; - std::string After = runClangRenameOnCode(Before, "na::X", "na::Y"); - CompareSnippets(Expected, After); -} - -TEST_F(RenameFunctionTest, RenameTemplateFunctions) { - std::string Before = R"( - namespace na { - template T X(); - } - namespace na { void f() { X(); } } - namespace nb { void g() { na::X (); } } - )"; - std::string Expected = R"( - namespace na { - template T Y(); - } - namespace na { void f() { nb::Y(); } } - namespace nb { void g() { Y(); } } - )"; - std::string After = runClangRenameOnCode(Before, "na::X", "nb::Y"); - CompareSnippets(Expected, After); -} - -TEST_F(RenameFunctionTest, RenameOutOfLineFunctionDecls) { - std::string Before = R"( - namespace na { - void X(); - } - void na::X() {} - )"; - std::string Expected = R"( - namespace na { - void Y(); - } - void na::Y() {} - )"; - std::string After = runClangRenameOnCode(Before, "na::X", "na::Y"); - CompareSnippets(Expected, After); -} - -TEST_F(RenameFunctionTest, NewNamespaceWithoutLeadingDotDot) { - std::string Before = R"( - namespace old_ns { - void X(); - void X() {} - } - // Assume that the reference is in another file. 
- void f() { old_ns::X(); } - namespace old_ns { void g() { X(); } } - namespace new_ns { void h() { ::old_ns::X(); } } - )"; - std::string Expected = R"( - namespace old_ns { - void Y(); - void Y() {} - } - // Assume that the reference is in another file. - void f() { new_ns::Y(); } - namespace old_ns { void g() { new_ns::Y(); } } - namespace new_ns { void h() { Y(); } } - )"; - std::string After = runClangRenameOnCode(Before, "::old_ns::X", "new_ns::Y"); - CompareSnippets(Expected, After); -} - -TEST_F(RenameFunctionTest, NewNamespaceWithLeadingDotDot) { - std::string Before = R"( - namespace old_ns { - void X(); - void X() {} - } - // Assume that the reference is in another file. - void f() { old_ns::X(); } - namespace old_ns { void g() { X(); } } - namespace new_ns { void h() { ::old_ns::X(); } } - )"; - std::string Expected = R"( - namespace old_ns { - void Y(); - void Y() {} - } - // Assume that the reference is in another file. - void f() { ::new_ns::Y(); } - namespace old_ns { void g() { ::new_ns::Y(); } } - namespace new_ns { void h() { Y(); } } - )"; - std::string After = - runClangRenameOnCode(Before, "::old_ns::X", "::new_ns::Y"); - CompareSnippets(Expected, After); -} - -TEST_F(RenameFunctionTest, DontRenameSymbolsDefinedInAnonymousNamespace) { - std::string Before = R"( - namespace old_ns { - class X {}; - namespace { - void X(); - void X() {} - void f() { X(); } - } - } - )"; - std::string Expected = R"( - namespace old_ns { - class Y {}; - namespace { - void X(); - void X() {} - void f() { X(); } - } - } - )"; - std::string After = - runClangRenameOnCode(Before, "::old_ns::X", "::old_ns::Y"); - CompareSnippets(Expected, After); -} - -TEST_F(RenameFunctionTest, NewNestedNamespace) { - std::string Before = R"( - namespace old_ns { - void X(); - void X() {} - } - // Assume that the reference is in another file. - namespace old_ns { - void f() { X(); } - } - )"; - std::string Expected = R"( - namespace old_ns { - void X(); - void X() {} - } - // Assume that the reference is in another file. - namespace old_ns { - void f() { older_ns::X(); } - } - )"; - std::string After = - runClangRenameOnCode(Before, "::old_ns::X", "::old_ns::older_ns::X"); - CompareSnippets(Expected, After); -} - -TEST_F(RenameFunctionTest, MoveFromGlobalToNamespaceWithoutLeadingDotDot) { - std::string Before = R"( - void X(); - void X() {} - - // Assume that the reference is in another file. - namespace some_ns { - void f() { X(); } - } - )"; - std::string Expected = R"( - void X(); - void X() {} - - // Assume that the reference is in another file. - namespace some_ns { - void f() { ns::X(); } - } - )"; - std::string After = - runClangRenameOnCode(Before, "::X", "ns::X"); - CompareSnippets(Expected, After); -} - -TEST_F(RenameFunctionTest, MoveFromGlobalToNamespaceWithLeadingDotDot) { - std::string Before = R"( - void Y() {} - - // Assume that the reference is in another file. - namespace some_ns { - void f() { Y(); } - } - )"; - std::string Expected = R"( - void Y() {} - - // Assume that the reference is in another file. - namespace some_ns { - void f() { ::ns::Y(); } - } - )"; - std::string After = - runClangRenameOnCode(Before, "::Y", "::ns::Y"); - CompareSnippets(Expected, After); -} - -// FIXME: the rename of overloaded operator is not fully supported yet. 
-TEST_F(RenameFunctionTest, DISABLED_DoNotRenameOverloadedOperatorCalls) { - std::string Before = R"( - namespace old_ns { - class T { public: int x; }; - bool operator==(const T& lhs, const T& rhs) { - return lhs.x == rhs.x; - } - } // namespace old_ns - - // Assume that the reference is in another file. - bool f() { - auto eq = old_ns::operator==; - old_ns::T t1, t2; - old_ns::operator==(t1, t2); - return t1 == t2; - } - )"; - std::string Expected = R"( - namespace old_ns { - class T { public: int x; }; - bool operator==(const T& lhs, const T& rhs) { - return lhs.x == rhs.x; - } - } // namespace old_ns - - // Assume that the reference is in another file. - bool f() { - auto eq = new_ns::operator==; - old_ns::T t1, t2; - new_ns::operator==(t1, t2); - return t1 == t2; - } - )"; - std::string After = - runClangRenameOnCode(Before, "old_ns::operator==", "new_ns::operator=="); - CompareSnippets(Expected, After); -} - -TEST_F(RenameFunctionTest, FunctionRefAsTemplate) { - std::string Before = R"( - void X(); - - // Assume that the reference is in another file. - namespace some_ns { - template - class TIterator {}; - - template - class T { - public: - typedef TIterator IterType; - using TI = TIterator; - void g() { - Func(); - auto func = Func; - TIterator iter; - } - }; - - - void f() { T tx; tx.g(); } - } // namespace some_ns - )"; - std::string Expected = R"( - void X(); - - // Assume that the reference is in another file. - namespace some_ns { - template - class TIterator {}; - - template - class T { - public: - typedef TIterator IterType; - using TI = TIterator; - void g() { - Func(); - auto func = Func; - TIterator iter; - } - }; - - - void f() { T tx; tx.g(); } - } // namespace some_ns - )"; - std::string After = runClangRenameOnCode(Before, "::X", "ns::X"); - CompareSnippets(Expected, After); -} - -TEST_F(RenameFunctionTest, RenameFunctionInUsingDecl) { - std::string Before = R"( - using base::ToNanoSeconds; - namespace old_ns { - using base::ToNanoSeconds; - void f() { - using base::ToNanoSeconds; - } - } - )"; - std::string Expected = R"( - using base::ToInt64NanoSeconds; - namespace old_ns { - using base::ToInt64NanoSeconds; - void f() { - using base::ToInt64NanoSeconds; - } - } - )"; - std::string After = runClangRenameOnCode(Before, "base::ToNanoSeconds", - "base::ToInt64NanoSeconds"); - CompareSnippets(Expected, After); -} - -// FIXME: Fix the complex the case where the symbol being renamed is located in -// `std::function>`. -TEST_F(ClangRenameTest, DISABLED_ReferencesInLambdaFunctionParameters) { - std::string Before = R"( - template - class function; - template - class function { - public: - template - function(Functor f) {} - - function() {} - - R operator()(ArgTypes...) const {} - }; - - namespace ns { - void Old() {} - void f() { - function func; - } - } // namespace ns)"; - std::string Expected = R"( - template - class function; - template - class function { - public: - template - function(Functor f) {} - - function() {} - - R operator()(ArgTypes...) 
const {} - }; - - namespace ns { - void New() {} - void f() { - function func; - } - } // namespace ns)"; - std::string After = runClangRenameOnCode(Before, "ns::Old", "::new_ns::New"); - CompareSnippets(Expected, After); -} - -} // anonymous namespace -} // namespace test -} // namespace clang_rename -} // namesdpace clang diff --git a/clang/unittests/Rename/RenameMemberTest.cpp b/clang/unittests/Rename/RenameMemberTest.cpp deleted file mode 100644 index c16d16aa25f047..00000000000000 --- a/clang/unittests/Rename/RenameMemberTest.cpp +++ /dev/null @@ -1,228 +0,0 @@ -//===-- ClangMemberTests.cpp - unit tests for renaming class members ------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "ClangRenameTest.h" - -namespace clang { -namespace clang_rename { -namespace test { -namespace { - -class RenameMemberTest : public ClangRenameTest { -public: - RenameMemberTest() { - AppendToHeader(R"( - struct NA { - void Foo(); - void NotFoo(); - static void SFoo(); - static void SNotFoo(); - int Moo; - }; - struct A { - virtual void Foo(); - void NotFoo(); - static void SFoo(); - static void SNotFoo(); - int Moo; - int NotMoo; - static int SMoo; - }; - struct B : public A { - void Foo() override; - }; - template struct TA { - T* Foo(); - T* NotFoo(); - static T* SFoo(); - static T* NotSFoo(); - }; - template struct TB : public TA {}; - namespace ns { - template struct TA { - T* Foo(); - T* NotFoo(); - static T* SFoo(); - static T* NotSFoo(); - static int SMoo; - }; - template struct TB : public TA {}; - struct A { - void Foo(); - void NotFoo(); - static void SFoo(); - static void SNotFoo(); - }; - struct B : public A {}; - struct C { - template - void SFoo(const T& t) {} - template - void Foo() {} - }; - })"); - } -}; - -INSTANTIATE_TEST_SUITE_P( - DISABLED_RenameTemplatedClassStaticVariableTest, RenameMemberTest, - testing::ValuesIn(std::vector({ - // FIXME: support renaming static variables for template classes. - {"void f() { ns::TA::SMoo; }", - "void f() { ns::TA::SMeh; }", "ns::TA::SMoo", "ns::TA::SMeh"}, - })) ); - -INSTANTIATE_TEST_SUITE_P( - RenameMemberTest, RenameMemberTest, - testing::ValuesIn(std::vector({ - // Normal methods and fields. - {"void f() { A a; a.Foo(); }", "void f() { A a; a.Bar(); }", "A::Foo", - "A::Bar"}, - {"void f() { ns::A a; a.Foo(); }", "void f() { ns::A a; a.Bar(); }", - "ns::A::Foo", "ns::A::Bar"}, - {"void f() { A a; int x = a.Moo; }", "void f() { A a; int x = a.Meh; }", - "A::Moo", "A::Meh"}, - {"void f() { B b; b.Foo(); }", "void f() { B b; b.Bar(); }", "B::Foo", - "B::Bar"}, - {"void f() { ns::B b; b.Foo(); }", "void f() { ns::B b; b.Bar(); }", - "ns::A::Foo", "ns::A::Bar"}, - {"void f() { B b; int x = b.Moo; }", "void f() { B b; int x = b.Meh; }", - "A::Moo", "A::Meh"}, - - // Static methods. - {"void f() { A::SFoo(); }", "void f() { A::SBar(); }", "A::SFoo", - "A::SBar"}, - {"void f() { ns::A::SFoo(); }", "void f() { ns::A::SBar(); }", - "ns::A::SFoo", "ns::A::SBar"}, - {"void f() { TA::SFoo(); }", "void f() { TA::SBar(); }", - "TA::SFoo", "TA::SBar"}, - {"void f() { ns::TA::SFoo(); }", - "void f() { ns::TA::SBar(); }", "ns::TA::SFoo", "ns::TA::SBar"}, - - // Static variables. - {"void f() { A::SMoo; }", - "void f() { A::SMeh; }", "A::SMoo", "A::SMeh"}, - - // Templated methods. 
- {"void f() { TA a; a.Foo(); }", "void f() { TA a; a.Bar(); }", - "TA::Foo", "TA::Bar"}, - {"void f() { ns::TA a; a.Foo(); }", - "void f() { ns::TA a; a.Bar(); }", "ns::TA::Foo", "ns::TA::Bar"}, - {"void f() { TB b; b.Foo(); }", "void f() { TB b; b.Bar(); }", - "TA::Foo", "TA::Bar"}, - {"void f() { ns::TB b; b.Foo(); }", - "void f() { ns::TB b; b.Bar(); }", "ns::TA::Foo", "ns::TA::Bar"}, - {"void f() { ns::C c; int x; c.SFoo(x); }", - "void f() { ns::C c; int x; c.SBar(x); }", "ns::C::SFoo", - "ns::C::SBar"}, - {"void f() { ns::C c; c.Foo(); }", - "void f() { ns::C c; c.Bar(); }", "ns::C::Foo", "ns::C::Bar"}, - - // Pointers to methods. - {"void f() { auto p = &A::Foo; }", "void f() { auto p = &A::Bar; }", - "A::Foo", "A::Bar"}, - {"void f() { auto p = &A::SFoo; }", "void f() { auto p = &A::SBar; }", - "A::SFoo", "A::SBar"}, - {"void f() { auto p = &B::Foo; }", "void f() { auto p = &B::Bar; }", - "B::Foo", "B::Bar"}, - {"void f() { auto p = &ns::A::Foo; }", - "void f() { auto p = &ns::A::Bar; }", "ns::A::Foo", "ns::A::Bar"}, - {"void f() { auto p = &ns::A::SFoo; }", - "void f() { auto p = &ns::A::SBar; }", "ns::A::SFoo", "ns::A::SBar"}, - {"void f() { auto p = &ns::C::SFoo; }", - "void f() { auto p = &ns::C::SBar; }", "ns::C::SFoo", - "ns::C::SBar"}, - - // These methods are not declared or overridden in the subclass B, we - // have to use the qualified name with parent class A to identify them. - {"void f() { auto p = &ns::B::Foo; }", - "void f() { auto p = &ns::B::Bar; }", "ns::A::Foo", "ns::B::Bar"}, - {"void f() { B::SFoo(); }", "void f() { B::SBar(); }", "A::SFoo", - "B::SBar"}, - {"void f() { ns::B::SFoo(); }", "void f() { ns::B::SBar(); }", - "ns::A::SFoo", "ns::B::SBar"}, - {"void f() { auto p = &B::SFoo; }", "void f() { auto p = &B::SBar; }", - "A::SFoo", "B::SBar"}, - {"void f() { auto p = &ns::B::SFoo; }", - "void f() { auto p = &ns::B::SBar; }", "ns::A::SFoo", "ns::B::SBar"}, - {"void f() { TB::SFoo(); }", "void f() { TB::SBar(); }", - "TA::SFoo", "TB::SBar"}, - {"void f() { ns::TB::SFoo(); }", - "void f() { ns::TB::SBar(); }", "ns::TA::SFoo", "ns::TB::SBar"}, - })) ); - -TEST_P(RenameMemberTest, RenameMembers) { - auto Param = GetParam(); - assert(!Param.OldName.empty()); - assert(!Param.NewName.empty()); - std::string Actual = - runClangRenameOnCode(Param.Before, Param.OldName, Param.NewName); - CompareSnippets(Param.After, Actual); -} - -TEST_F(RenameMemberTest, RenameMemberInsideClassMethods) { - std::string Before = R"( - struct X { - int Moo; - void Baz() { Moo = 1; } - };)"; - std::string Expected = R"( - struct X { - int Meh; - void Baz() { Meh = 1; } - };)"; - std::string After = runClangRenameOnCode(Before, "X::Moo", "Y::Meh"); - CompareSnippets(Expected, After); -} - -TEST_F(RenameMemberTest, RenameMethodInsideClassMethods) { - std::string Before = R"( - struct X { - void Foo() {} - void Baz() { Foo(); } - };)"; - std::string Expected = R"( - struct X { - void Bar() {} - void Baz() { Bar(); } - };)"; - std::string After = runClangRenameOnCode(Before, "X::Foo", "X::Bar"); - CompareSnippets(Expected, After); -} - -TEST_F(RenameMemberTest, RenameCtorInitializer) { - std::string Before = R"( - class X { - public: - X(); - A a; - A a2; - B b; - }; - - X::X():a(), b() {} - )"; - std::string Expected = R"( - class X { - public: - X(); - A bar; - A a2; - B b; - }; - - X::X():bar(), b() {} - )"; - std::string After = runClangRenameOnCode(Before, "X::a", "X::bar"); - CompareSnippets(Expected, After); -} - -} // anonymous namespace -} // namespace test -} // namespace 
clang_rename -} // namesdpace clang diff --git a/llvm/utils/gn/secondary/clang-tools-extra/test/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/test/BUILD.gn index 2227ad42cf40a8..4f9ba335859bfc 100644 --- a/llvm/utils/gn/secondary/clang-tools-extra/test/BUILD.gn +++ b/llvm/utils/gn/secondary/clang-tools-extra/test/BUILD.gn @@ -83,7 +83,6 @@ group("test") { "//clang-tools-extra/unittests", "//clang/lib/Headers", "//clang/tools/c-index-test", - "//clang/tools/clang-rename", "//clang/tools/driver:symlinks", "//llvm/tools/llvm-bcanalyzer", "//llvm/utils/FileCheck", diff --git a/llvm/utils/gn/secondary/clang/test/BUILD.gn b/llvm/utils/gn/secondary/clang/test/BUILD.gn index 1ec94a419f56c8..1d5b8025a12acb 100644 --- a/llvm/utils/gn/secondary/clang/test/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/test/BUILD.gn @@ -171,7 +171,6 @@ group("test") { "//clang/tools/clang-installapi", "//clang/tools/clang-offload-bundler", "//clang/tools/clang-refactor", - "//clang/tools/clang-rename", "//clang/tools/clang-repl", "//clang/tools/clang-scan-deps", "//clang/tools/diagtool", diff --git a/llvm/utils/gn/secondary/clang/tools/clang-rename/BUILD.gn b/llvm/utils/gn/secondary/clang/tools/clang-rename/BUILD.gn deleted file mode 100644 index 1c517b98934515..00000000000000 --- a/llvm/utils/gn/secondary/clang/tools/clang-rename/BUILD.gn +++ /dev/null @@ -1,14 +0,0 @@ -executable("clang-rename") { - configs += [ "//llvm/utils/gn/build:clang_code" ] - deps = [ - "//clang/lib/Basic", - "//clang/lib/Frontend", - "//clang/lib/Rewrite", - "//clang/lib/Tooling", - "//clang/lib/Tooling/Core", - "//clang/lib/Tooling/Refactoring", - "//llvm/lib/Option", - "//llvm/lib/Support", - ] - sources = [ "ClangRename.cpp" ] -} diff --git a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel index 1d0ba8bd4d586d..b39fb8f6795e1d 100644 --- a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel @@ -26,7 +26,6 @@ exports_files([ "tools/clang-format/clang-format.el", "tools/clang-format/clang-format-test.el", "tools/clang-format/clang-format.py", - "tools/clang-rename/clang-rename.el", "tools/extra/clang-include-fixer/tool/clang-include-fixer.el", "tools/extra/clang-include-fixer/tool/clang-include-fixer-test.el", ]) @@ -2590,20 +2589,6 @@ cc_binary( ], ) -cc_binary( - name = "clang-rename", - srcs = glob(["tools/clang-rename/*.cpp"]), - stamp = 0, - deps = [ - ":basic", - ":frontend", - ":rewrite", - ":tooling", - ":tooling_refactoring", - "//llvm:Support", - ], -) - cc_binary( name = "clang-repl", srcs = glob(["tools/clang-repl/*.cpp"]), From 2bb3621faa886ba6df99c751b49011f55ef4ca1e Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Wed, 18 Sep 2024 05:35:26 -0700 Subject: [PATCH 047/321] [LLVM][TableGen] Change DecoderEmitter to use const RecordKeeper (#109040) Change DecoderEmitter to use const RecordKeeper. 
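
In sketch form, the shape this moves the backend toward is an emitter that can
only read the parsed records. The following is an illustrative sketch, not code
from this patch; the emitter name and the queried class name are placeholders:

    #include "llvm/Support/raw_ostream.h"
    #include "llvm/TableGen/Record.h"

    // Hypothetical emitter entry point: taking the RecordKeeper by const
    // reference documents, and lets the compiler enforce, that the backend
    // only reads the records it is handed.
    static void emitExample(const llvm::RecordKeeper &RK,
                            llvm::raw_ostream &OS) {
      // getAllDerivedDefinitions is callable through a const RecordKeeper
      // and yields pointers to const Records, mirroring the loop updated
      // below.
      for (const llvm::Record *R : RK.getAllDerivedDefinitions("Instruction"))
        OS << R->getName() << "\n";
    }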
This is part of an effort to achieve better const correctness in TableGen
backends:
https://discourse.llvm.org/t/psa-planned-changes-to-tablegen-getallderiveddefinitions-api-potential-downstream-breakages/81089
---
 llvm/utils/TableGen/DecoderEmitter.cpp | 28 +++++++++++++-------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/llvm/utils/TableGen/DecoderEmitter.cpp b/llvm/utils/TableGen/DecoderEmitter.cpp
index b5da37b5134696..edecb9067bccf7 100644
--- a/llvm/utils/TableGen/DecoderEmitter.cpp
+++ b/llvm/utils/TableGen/DecoderEmitter.cpp
@@ -155,12 +155,12 @@ raw_ostream &operator<<(raw_ostream &OS, const EncodingAndInst &Value) {
 }

 class DecoderEmitter {
-  RecordKeeper &RK;
+  const RecordKeeper &RK;
   std::vector<EncodingAndInst> NumberedEncodings;

 public:
-  DecoderEmitter(RecordKeeper &R, std::string PredicateNamespace)
-      : RK(R), Target(R), PredicateNamespace(std::move(PredicateNamespace)) {}
+  DecoderEmitter(const RecordKeeper &R, const std::string &PredicateNamespace)
+      : RK(R), Target(R), PredicateNamespace(PredicateNamespace) {}

   // Emit the decoder state machine table.
   void emitTable(formatted_raw_ostream &o, DecoderTable &Table,
@@ -181,7 +181,7 @@ class DecoderEmitter {
   CodeGenTarget Target;

 public:
-  std::string PredicateNamespace;
+  const std::string &PredicateNamespace;
 };

 } // end anonymous namespace
@@ -1302,7 +1302,7 @@ bool FilterChooser::emitPredicateMatch(raw_ostream &o, unsigned &Indentation,
       AllInstructions[Opc].EncodingDef->getValueAsListInit("Predicates");
   bool IsFirstEmission = true;
   for (unsigned i = 0; i < Predicates->size(); ++i) {
-    Record *Pred = Predicates->getElementAsRecord(i);
+    const Record *Pred = Predicates->getElementAsRecord(i);
     if (!Pred->getValue("AssemblerMatcherPredicate"))
       continue;

@@ -1320,10 +1320,10 @@ bool FilterChooser::emitPredicateMatch(raw_ostream &o, unsigned &Indentation,
 }

 bool FilterChooser::doesOpcodeNeedPredicate(unsigned Opc) const {
-  ListInit *Predicates =
+  const ListInit *Predicates =
       AllInstructions[Opc].EncodingDef->getValueAsListInit("Predicates");
   for (unsigned i = 0; i < Predicates->size(); ++i) {
-    Record *Pred = Predicates->getElementAsRecord(i);
+    const Record *Pred = Predicates->getElementAsRecord(i);
     if (!Pred->getValue("AssemblerMatcherPredicate"))
       continue;

@@ -1868,7 +1868,7 @@ static std::string findOperandDecoderMethod(const Record *Record) {
   std::string Decoder;

   const RecordVal *DecoderString = Record->getValue("DecoderMethod");
-  StringInit *String =
+  const StringInit *String =
       DecoderString ? dyn_cast<StringInit>(DecoderString->getValue()) : nullptr;
   if (String) {
     Decoder = std::string(String->getValue());
@@ -2010,7 +2010,7 @@ static void addOneOperandFields(const Record &EncodingDef, const BitsInit &Bits,
 }

 static unsigned
-populateInstruction(CodeGenTarget &Target, const Record &EncodingDef,
+populateInstruction(const CodeGenTarget &Target, const Record &EncodingDef,
                     const CodeGenInstruction &CGI, unsigned Opc,
                     std::map<unsigned, std::vector<OperandInfo>> &Operands,
                     bool IsVarLenInst) {
@@ -2089,12 +2089,12 @@ populateInstruction(CodeGenTarget &Target, const Record &EncodingDef,
     DagInit *SubArgDag = dyn_cast<DagInit>(OpInit);
     if (SubArgDag)
       OpInit = SubArgDag->getOperator();
-    Record *OpTypeRec = cast<DefInit>(OpInit)->getDef();
+    const Record *OpTypeRec = cast<DefInit>(OpInit)->getDef();

     // Lookup the sub-operands from the operand type record (note that only
     // Operand subclasses have MIOperandInfo, see CodeGenInstruction.cpp).
-    DagInit *SubOps = OpTypeRec->isSubClassOf("Operand")
-                          ?
OpTypeRec->getValueAsDag("MIOperandInfo") - : nullptr; + const DagInit *SubOps = OpTypeRec->isSubClassOf("Operand") + ? OpTypeRec->getValueAsDag("MIOperandInfo") + : nullptr; // Lookup the decoder method and construct a new OperandInfo to hold our // result. @@ -2549,7 +2549,7 @@ namespace llvm { handleHwModesUnrelatedEncodings(NumberedInstruction, HwModeNames, NamespacesWithHwModes, NumberedEncodings); } - for (const auto &NumberedAlias : + for (const Record *NumberedAlias : RK.getAllDerivedDefinitions("AdditionalEncoding")) NumberedEncodings.emplace_back( NumberedAlias, From ef71226fcd0bbfe62f4ef71f72005fa98ea9ca24 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Wed, 18 Sep 2024 05:37:54 -0700 Subject: [PATCH 048/321] [LLVM][TableGen] Change WebAsm Emitter to use const RecordKeeper (#109051) Change WebAssemblyDisassemblerEmitter to use const RecordKeeper. This is a part of effort to have better const correctness in TableGen backends: https://discourse.llvm.org/t/psa-planned-changes-to-tablegen-getallderiveddefinitions-api-potential-downstream-breakages/81089 --- llvm/include/llvm/TableGen/Record.h | 1 + llvm/lib/TableGen/Record.cpp | 20 ++++++---- .../WebAssemblyDisassemblerEmitter.cpp | 39 +++++++------------ .../TableGen/WebAssemblyDisassemblerEmitter.h | 3 +- 4 files changed, 30 insertions(+), 33 deletions(-) diff --git a/llvm/include/llvm/TableGen/Record.h b/llvm/include/llvm/TableGen/Record.h index c9e01e3f221bad..f1420731d69081 100644 --- a/llvm/include/llvm/TableGen/Record.h +++ b/llvm/include/llvm/TableGen/Record.h @@ -603,6 +603,7 @@ class BitsInit final : public TypedInit, public FoldingSetNode, Init *convertInitializerTo(RecTy *Ty) const override; Init *convertInitializerBitRange(ArrayRef Bits) const override; + std::optional convertInitializerToInt() const; bool isComplete() const override { for (unsigned i = 0; i != getNumBits(); ++i) diff --git a/llvm/lib/TableGen/Record.cpp b/llvm/lib/TableGen/Record.cpp index 97ae0b092b81b1..567545ec02f666 100644 --- a/llvm/lib/TableGen/Record.cpp +++ b/llvm/lib/TableGen/Record.cpp @@ -497,18 +497,24 @@ Init *BitsInit::convertInitializerTo(RecTy *Ty) const { } if (isa(Ty)) { - int64_t Result = 0; - for (unsigned i = 0, e = getNumBits(); i != e; ++i) - if (auto *Bit = dyn_cast(getBit(i))) - Result |= static_cast(Bit->getValue()) << i; - else - return nullptr; - return IntInit::get(getRecordKeeper(), Result); + std::optional Result = convertInitializerToInt(); + if (Result) + return IntInit::get(getRecordKeeper(), *Result); } return nullptr; } +std::optional BitsInit::convertInitializerToInt() const { + int64_t Result = 0; + for (unsigned i = 0, e = getNumBits(); i != e; ++i) + if (auto *Bit = dyn_cast(getBit(i))) + Result |= static_cast(Bit->getValue()) << i; + else + return std::nullopt; + return Result; +} + Init * BitsInit::convertInitializerBitRange(ArrayRef Bits) const { SmallVector NewBits(Bits.size()); diff --git a/llvm/utils/TableGen/WebAssemblyDisassemblerEmitter.cpp b/llvm/utils/TableGen/WebAssemblyDisassemblerEmitter.cpp index e9436ab16e448a..7373494e8b12f8 100644 --- a/llvm/utils/TableGen/WebAssemblyDisassemblerEmitter.cpp +++ b/llvm/utils/TableGen/WebAssemblyDisassemblerEmitter.cpp @@ -19,28 +19,23 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/TableGen/Record.h" -namespace llvm { - static constexpr int WebAssemblyInstructionTableSize = 256; -void emitWebAssemblyDisassemblerTables( +void llvm::emitWebAssemblyDisassemblerTables( raw_ostream &OS, - const ArrayRef &NumberedInstructions) { + ArrayRef 
NumberedInstructions) { // First lets organize all opcodes by (prefix) byte. Prefix 0 is the // starting table. std::map>> OpcodeTable; for (unsigned I = 0; I != NumberedInstructions.size(); ++I) { - auto &CGI = *NumberedInstructions[I]; - auto &Def = *CGI.TheDef; + const CodeGenInstruction &CGI = *NumberedInstructions[I]; + const Record &Def = *CGI.TheDef; if (!Def.getValue("Inst")) continue; - auto &Inst = *Def.getValueAsBitsInit("Inst"); - RecordKeeper &RK = Inst.getRecordKeeper(); - unsigned Opc = static_cast( - cast(Inst.convertInitializerTo(IntRecTy::get(RK))) - ->getValue()); + const BitsInit &Inst = *Def.getValueAsBitsInit("Inst"); + unsigned Opc = static_cast(*Inst.convertInitializerToInt()); if (Opc == 0xFFFFFFFF) continue; // No opcode defined. assert(Opc <= 0xFFFFFF); @@ -97,14 +92,14 @@ void emitWebAssemblyDisassemblerTables( OS << "};\n\n"; std::vector OperandTable, CurOperandList; // Output one table per prefix. - for (auto &PrefixPair : OpcodeTable) { - if (PrefixPair.second.empty()) + for (const auto &[Prefix, Table] : OpcodeTable) { + if (Table.empty()) continue; - OS << "WebAssemblyInstruction InstructionTable" << PrefixPair.first; + OS << "WebAssemblyInstruction InstructionTable" << Prefix; OS << "[] = {\n"; for (unsigned I = 0; I < WebAssemblyInstructionTableSize; I++) { - auto InstIt = PrefixPair.second.find(I); - if (InstIt != PrefixPair.second.end()) { + auto InstIt = Table.find(I); + if (InstIt != Table.end()) { // Regular instruction. assert(InstIt->second.second); auto &CGI = *InstIt->second.second; @@ -144,7 +139,7 @@ void emitWebAssemblyDisassemblerTables( } else { auto PrefixIt = OpcodeTable.find(I); // If we have a non-empty table for it that's not 0, this is a prefix. - if (PrefixIt != OpcodeTable.end() && I && !PrefixPair.first) { + if (PrefixIt != OpcodeTable.end() && I && !Prefix) { OS << " { 0, ET_Prefix, 0, 0"; } else { OS << " { 0, ET_Unused, 0, 0"; @@ -163,15 +158,11 @@ void emitWebAssemblyDisassemblerTables( // Create a table of all extension tables: OS << "struct { uint8_t Prefix; const WebAssemblyInstruction *Table; }\n"; OS << "PrefixTable[] = {\n"; - for (auto &PrefixPair : OpcodeTable) { - if (PrefixPair.second.empty() || !PrefixPair.first) + for (const auto &[Prefix, Table] : OpcodeTable) { + if (Table.empty() || !Prefix) continue; - OS << " { " << PrefixPair.first << ", InstructionTable" - << PrefixPair.first; - OS << " },\n"; + OS << " { " << Prefix << ", InstructionTable" << Prefix << " },\n"; } OS << " { 0, nullptr }\n};\n\n"; OS << "} // end namespace llvm\n"; } - -} // namespace llvm diff --git a/llvm/utils/TableGen/WebAssemblyDisassemblerEmitter.h b/llvm/utils/TableGen/WebAssemblyDisassemblerEmitter.h index aba3a4bfd3024b..2d814cf0675aeb 100644 --- a/llvm/utils/TableGen/WebAssemblyDisassemblerEmitter.h +++ b/llvm/utils/TableGen/WebAssemblyDisassemblerEmitter.h @@ -22,8 +22,7 @@ class CodeGenInstruction; class raw_ostream; void emitWebAssemblyDisassemblerTables( - raw_ostream &OS, - const ArrayRef &NumberedInstructions); + raw_ostream &OS, ArrayRef NumberedInstructions); } // namespace llvm From 4eee0cfc8a922fc952ce94130505eb0e7aad6935 Mon Sep 17 00:00:00 2001 From: Tulio Magno Quites Machado Filho Date: Wed, 18 Sep 2024 09:48:40 -0300 Subject: [PATCH 049/321] [MLIR] Reuse the path to runner_utils libraries (#108579) Prefer to get the path to libmlir_runner_utils and libmlir_c_runner_utils via %mlir_runner_utils and %mlir_c_runner_utils. Fallback to the previous paths only if they aren't defined. 
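
For illustration, the same override-with-fallback idiom sketched in C++ (the
environment variable name and the in-tree fallback path are taken from the
test below; this helper itself is not part of the patch):

    #include <cstdlib>
    #include <string>

    // Prefer the location the test driver exports via MLIR_RUNNER_UTILS;
    // otherwise assume the historical in-tree build layout.
    static std::string runnerUtilsPath() {
      if (const char *FromEnv = std::getenv("MLIR_RUNNER_UTILS"))
        return FromEnv;
      return "../../../../lib/libmlir_runner_utils.so";
    }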
This ensures the test will pass regardless of the build configuration used downstream. --- mlir/test/python/execution_engine.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/mlir/test/python/execution_engine.py b/mlir/test/python/execution_engine.py index 1cdda63eefe300..7c375ce81de0eb 100644 --- a/mlir/test/python/execution_engine.py +++ b/mlir/test/python/execution_engine.py @@ -1,4 +1,4 @@ -# RUN: %PYTHON %s 2>&1 | FileCheck %s +# RUN: env MLIR_RUNNER_UTILS=%mlir_runner_utils MLIR_C_RUNNER_UTILS=%mlir_c_runner_utils %PYTHON %s 2>&1 | FileCheck %s # REQUIRES: host-supports-jit import gc, sys, os, tempfile from mlir.ir import * @@ -7,6 +7,12 @@ from mlir.runtime import * from ml_dtypes import bfloat16, float8_e5m2 +MLIR_RUNNER_UTILS = os.getenv( + "MLIR_RUNNER_UTILS", "../../../../lib/libmlir_runner_utils.so" +) +MLIR_C_RUNNER_UTILS = os.getenv( + "MLIR_C_RUNNER_UTILS", "../../../../lib/libmlir_c_runner_utils.so" +) # Log everything to stderr and flush so that we have a unified stream to match # errors/info emitted by MLIR to stderr. @@ -700,8 +706,8 @@ def testSharedLibLoad(): ] else: shared_libs = [ - "../../../../lib/libmlir_runner_utils.so", - "../../../../lib/libmlir_c_runner_utils.so", + MLIR_RUNNER_UTILS, + MLIR_C_RUNNER_UTILS, ] execution_engine = ExecutionEngine( @@ -743,8 +749,8 @@ def testNanoTime(): ] else: shared_libs = [ - "../../../../lib/libmlir_runner_utils.so", - "../../../../lib/libmlir_c_runner_utils.so", + MLIR_RUNNER_UTILS, + MLIR_C_RUNNER_UTILS, ] execution_engine = ExecutionEngine( From 13b4d1bfeacc441d792557b42759f258dc4316e6 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Wed, 18 Sep 2024 14:48:08 +0200 Subject: [PATCH 050/321] [SimplifyCFG][LICM] Add additional speculation tests These are related to https://github.com/llvm/llvm-project/issues/108854. 
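
The pattern these tests pin down is speculation of a load whose pointer is
only known to be dereferenceable on one path. A rough C++ analogy of the
hazard, as a sketch for intuition rather than code from the tests:

    // *pp is only guaranteed to point at a valid long when c is true (the
    // IR analogue is !dereferenceable metadata on the pointer load under
    // the branch). Hoisting the dependent load above the branch can fault,
    // which is why one of the new tests carries "FIXME: This is a
    // miscompile".
    long readUnderBranch(bool c, long **pp) {
      long result = 0;
      if (c) {
        long *p = *pp;
        result = *p;
      }
      return result;
    }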
--- llvm/test/Transforms/LICM/hoist-deref-load.ll | 81 ++++++- .../SimplifyCFG/speculate-derefable-load.ll | 198 ++++++++++++++++++ 2 files changed, 275 insertions(+), 4 deletions(-) create mode 100644 llvm/test/Transforms/SimplifyCFG/speculate-derefable-load.ll diff --git a/llvm/test/Transforms/LICM/hoist-deref-load.ll b/llvm/test/Transforms/LICM/hoist-deref-load.ll index 149976ab18746b..c498e85ddd6c29 100644 --- a/llvm/test/Transforms/LICM/hoist-deref-load.ll +++ b/llvm/test/Transforms/LICM/hoist-deref-load.ll @@ -420,7 +420,7 @@ for.end: ; preds = %for.inc, %entry define void @test7(ptr noalias %a, ptr %b, ptr %cptr, i32 %n) #0 { ; CHECK-LABEL: @test7( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[C:%.*]] = load ptr, ptr [[CPTR:%.*]], align 8, !dereferenceable !0, !align !0 +; CHECK-NEXT: [[C:%.*]] = load ptr, ptr [[CPTR:%.*]], align 8, !dereferenceable [[META0:![0-9]+]], !align [[META0]] ; CHECK-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[N:%.*]], 0 ; CHECK-NEXT: br i1 [[CMP11]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] ; CHECK: for.body.preheader: @@ -492,7 +492,7 @@ for.end: ; preds = %for.inc, %entry define void @test8(ptr noalias %a, ptr %b, ptr %cptr, i32 %n) #0 { ; CHECK-LABEL: @test8( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[C:%.*]] = load ptr, ptr [[CPTR:%.*]], align 8, !dereferenceable_or_null !0, !align !0 +; CHECK-NEXT: [[C:%.*]] = load ptr, ptr [[CPTR:%.*]], align 8, !dereferenceable_or_null [[META0]], !align [[META0]] ; CHECK-NEXT: [[NOT_NULL:%.*]] = icmp ne ptr [[C]], null ; CHECK-NEXT: br i1 [[NOT_NULL]], label [[NOT_NULL:%.*]], label [[FOR_END:%.*]] ; CHECK: not.null: @@ -562,7 +562,7 @@ for.end: ; preds = %for.inc, %entry, %n define void @test9(ptr noalias %a, ptr %b, ptr %cptr, i32 %n) #0 { ; CHECK-LABEL: @test9( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[C:%.*]] = load ptr, ptr [[CPTR:%.*]], align 8, !dereferenceable_or_null !0 +; CHECK-NEXT: [[C:%.*]] = load ptr, ptr [[CPTR:%.*]], align 8, !dereferenceable_or_null [[META0]] ; CHECK-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[N:%.*]], 0 ; CHECK-NEXT: br i1 [[CMP11]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] ; CHECK: for.body.preheader: @@ -693,7 +693,7 @@ define void @test11(ptr noalias %a, ptr %b, ptr dereferenceable(8) %cptr, i32 %n ; CHECK-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[N:%.*]], 0 ; CHECK-NEXT: br i1 [[CMP11]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] ; CHECK: for.body.preheader: -; CHECK-NEXT: [[C:%.*]] = load ptr, ptr [[CPTR:%.*]], align 8, !dereferenceable !0 +; CHECK-NEXT: [[C:%.*]] = load ptr, ptr [[CPTR:%.*]], align 8, !dereferenceable [[META0]] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 0, [[FOR_BODY_PREHEADER]] ] @@ -1164,5 +1164,78 @@ for.end: ; preds = %for.inc, %entry ret void } +declare void @use(i64) + +define void @licm_deref_no_hoist(i1 %c1, i1 %c2, ptr align 8 dereferenceable(8) %p1) { +; CHECK-LABEL: @licm_deref_no_hoist( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P2:%.*]] = load ptr, ptr [[P1:%.*]], align 8, !align [[META1:![0-9]+]] +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: br i1 [[C1:%.*]], label [[IF:%.*]], label [[LOOP_LATCH:%.*]] +; CHECK: if: +; CHECK-NEXT: [[V:%.*]] = load i64, ptr [[P2]], align 8 +; CHECK-NEXT: call void @use(i64 [[V]]) #[[ATTR1:[0-9]+]] +; CHECK-NEXT: br label [[LOOP_LATCH]] +; CHECK: loop.latch: +; CHECK-NEXT: br i1 [[C2:%.*]], label [[LOOP]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label 
%loop + +loop: + br i1 %c1, label %if, label %loop.latch + +if: + %p2 = load ptr, ptr %p1, align 8, !dereferenceable !1, !align !1 + %v = load i64, ptr %p2, align 8 + call void @use(i64 %v) memory(none) + br label %loop.latch + +loop.latch: + br i1 %c2, label %loop, label %exit + +exit: + ret void +} + +define void @licm_deref_hoist(i1 %c1, i1 %c2, ptr align 8 dereferenceable(8) %p1) { +; CHECK-LABEL: @licm_deref_hoist( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P2:%.*]] = load ptr, ptr [[P1:%.*]], align 8, !dereferenceable [[META1]], !align [[META1]] +; CHECK-NEXT: [[V:%.*]] = load i64, ptr [[P2]], align 8 +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: br i1 [[C1:%.*]], label [[IF:%.*]], label [[LOOP_LATCH:%.*]] +; CHECK: if: +; CHECK-NEXT: call void @use(i64 [[V]]) #[[ATTR1]] +; CHECK-NEXT: br label [[LOOP_LATCH]] +; CHECK: loop.latch: +; CHECK-NEXT: br i1 [[C2:%.*]], label [[LOOP]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + %p2 = load ptr, ptr %p1, align 8, !dereferenceable !1, !align !1 + br label %loop + +loop: + br i1 %c1, label %if, label %loop.latch + +if: + %v = load i64, ptr %p2, align 8 + call void @use(i64 %v) memory(none) + br label %loop.latch + +loop.latch: + br i1 %c2, label %loop, label %exit + +exit: + ret void +} + attributes #0 = { nounwind uwtable nofree nosync } !0 = !{i64 4} +!1 = !{i64 8} diff --git a/llvm/test/Transforms/SimplifyCFG/speculate-derefable-load.ll b/llvm/test/Transforms/SimplifyCFG/speculate-derefable-load.ll new file mode 100644 index 00000000000000..9e3f333018e680 --- /dev/null +++ b/llvm/test/Transforms/SimplifyCFG/speculate-derefable-load.ll @@ -0,0 +1,198 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -passes=simplifycfg < %s | FileCheck %s + +define i64 @align_deref_align(i1 %c, ptr %p) { +; CHECK-LABEL: define i64 @align_deref_align( +; CHECK-SAME: i1 [[C:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[P]], i64 8), "align"(ptr [[P]], i64 8) ] +; CHECK-NEXT: br i1 [[C]], label %[[IF:.*]], label %[[EXIT:.*]] +; CHECK: [[IF]]: +; CHECK-NEXT: [[V:%.*]] = load i64, ptr [[P]], align 8 +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[V]], %[[IF]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: ret i64 [[RES]] +; +entry: + call void @llvm.assume(i1 true) [ "dereferenceable"(ptr %p, i64 8), "align"(ptr %p, i64 8) ] + br i1 %c, label %if, label %exit + +if: + %v = load i64, ptr %p, align 8 + br label %exit + +exit: + %res = phi i64 [ %v, %if ], [ 0, %entry ] + ret i64 %res +} + +define i64 @assume_deref_align2(i1 %c1, i32 %x, ptr %p) { +; CHECK-LABEL: define i64 @assume_deref_align2( +; CHECK-SAME: i1 [[C1:%.*]], i32 [[X:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[P]], i64 8), "align"(ptr [[P]], i64 8) ] +; CHECK-NEXT: br i1 [[C1]], label %[[IF1:.*]], label %[[EXIT:.*]] +; CHECK: [[IF1]]: +; CHECK-NEXT: [[C2:%.*]] = icmp ugt i32 [[X]], 10 +; CHECK-NEXT: br i1 [[C2]], label %[[IF2:.*]], label %[[EXIT]] +; CHECK: [[IF2]]: +; CHECK-NEXT: [[V:%.*]] = load i64, ptr [[P]], align 8 +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[V]], %[[IF2]] ], [ 1, %[[IF1]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: ret i64 [[RES]] +; +entry: + call void @llvm.assume(i1 true) [ "dereferenceable"(ptr %p, i64 8), "align"(ptr %p, i64 8) 
] + br i1 %c1, label %if1, label %exit + +if1: + %c2 = icmp ugt i32 %x, 10 + br i1 %c2, label %if2, label %exit + +if2: + %v = load i64, ptr %p, align 8 + br label %exit + +exit: + %res = phi i64 [ %v, %if2 ], [ 1, %if1 ], [ 0, %entry ] + ret i64 %res +} + +define i64 @assume_deref_align_not_dominating(i1 %c, ptr %p) { +; CHECK-LABEL: define i64 @assume_deref_align_not_dominating( +; CHECK-SAME: i1 [[C:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br i1 [[C]], label %[[IF:.*]], label %[[EXIT:.*]] +; CHECK: [[IF]]: +; CHECK-NEXT: [[V:%.*]] = load i64, ptr [[P]], align 8 +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[V]], %[[IF]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[P]], i64 8), "align"(ptr [[P]], i64 8) ] +; CHECK-NEXT: ret i64 [[RES]] +; +entry: + br i1 %c, label %if, label %exit + +if: + %v = load i64, ptr %p, align 8 + br label %exit + +exit: + %res = phi i64 [ %v, %if ], [ 0, %entry ] + call void @llvm.assume(i1 true) [ "dereferenceable"(ptr %p, i64 8), "align"(ptr %p, i64 8) ] + ret i64 %res +} + +; FIXME: This is a miscompile. +define i64 @deref_no_hoist(i1 %c, ptr align 8 dereferenceable(8) %p1) { +; CHECK-LABEL: define i64 @deref_no_hoist( +; CHECK-SAME: i1 [[C:%.*]], ptr align 8 dereferenceable(8) [[P1:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[P2:%.*]] = load ptr, ptr [[P1]], align 8, !align [[META0:![0-9]+]] +; CHECK-NEXT: [[V:%.*]] = load i64, ptr [[P2]], align 8 +; CHECK-NEXT: [[RES:%.*]] = select i1 [[C]], i64 [[V]], i64 0 +; CHECK-NEXT: ret i64 [[RES]] +; +entry: + br i1 %c, label %if, label %exit + +if: + %p2 = load ptr, ptr %p1, align 8, !dereferenceable !0, !align !0 + %v = load i64, ptr %p2, align 8 + br label %exit + +exit: + %res = phi i64 [ %v, %if ], [ 0, %entry ] + ret i64 %res +} + +define i64 @deref_hoist(i1 %c, ptr align 8 dereferenceable(8) %p1) { +; CHECK-LABEL: define i64 @deref_hoist( +; CHECK-SAME: i1 [[C:%.*]], ptr align 8 dereferenceable(8) [[P1:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[P2:%.*]] = load ptr, ptr [[P1]], align 8, !dereferenceable [[META0]], !align [[META0]] +; CHECK-NEXT: [[V:%.*]] = load i64, ptr [[P2]], align 8 +; CHECK-NEXT: [[RES:%.*]] = select i1 [[C]], i64 [[V]], i64 0 +; CHECK-NEXT: ret i64 [[RES]] +; +entry: + %p2 = load ptr, ptr %p1, align 8, !dereferenceable !0, !align !0 + br i1 %c, label %if, label %exit + +if: + %v = load i64, ptr %p2, align 8 + br label %exit + +exit: + %res = phi i64 [ %v, %if ], [ 0, %entry ] + ret i64 %res +} + +define i64 @deref_no_hoist2(i1 %c1, i32 %x, ptr align 8 dereferenceable(8) %p1) { +; CHECK-LABEL: define i64 @deref_no_hoist2( +; CHECK-SAME: i1 [[C1:%.*]], i32 [[X:%.*]], ptr align 8 dereferenceable(8) [[P1:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br i1 [[C1]], label %[[IF1:.*]], label %[[EXIT:.*]] +; CHECK: [[IF1]]: +; CHECK-NEXT: [[C2:%.*]] = icmp ugt i32 [[X]], 10 +; CHECK-NEXT: br i1 [[C2]], label %[[IF2:.*]], label %[[EXIT]] +; CHECK: [[IF2]]: +; CHECK-NEXT: [[P2:%.*]] = load ptr, ptr [[P1]], align 8, !dereferenceable [[META0]], !align [[META0]] +; CHECK-NEXT: [[V:%.*]] = load i64, ptr [[P2]], align 8 +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[V]], %[[IF2]] ], [ 1, %[[IF1]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: ret i64 [[RES]] +; +entry: + br i1 %c1, label %if1, label %exit + +if1: + %c2 = icmp ugt i32 %x, 10 + br i1 %c2, label %if2, label %exit + +if2: + %p2 = load ptr, ptr %p1, align 
8, !dereferenceable !0, !align !0 + %v = load i64, ptr %p2, align 8 + br label %exit + +exit: + %res = phi i64 [ %v, %if2 ], [ 1, %if1 ], [ 0, %entry ] + ret i64 %res +} + +define i64 @deref_hoist2(i1 %c1, i32 %x, ptr align 8 dereferenceable(8) %p1) { +; CHECK-LABEL: define i64 @deref_hoist2( +; CHECK-SAME: i1 [[C1:%.*]], i32 [[X:%.*]], ptr align 8 dereferenceable(8) [[P1:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[P2:%.*]] = load ptr, ptr [[P1]], align 8, !dereferenceable [[META0]], !align [[META0]] +; CHECK-NEXT: [[C2:%.*]] = icmp ugt i32 [[X]], 10 +; CHECK-NEXT: [[V:%.*]] = load i64, ptr [[P2]], align 8 +; CHECK-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[C2]], i64 [[V]], i64 1 +; CHECK-NEXT: [[RES:%.*]] = select i1 [[C1]], i64 [[SPEC_SELECT]], i64 0 +; CHECK-NEXT: ret i64 [[RES]] +; +entry: + %p2 = load ptr, ptr %p1, align 8, !dereferenceable !0, !align !0 + br i1 %c1, label %if1, label %exit + +if1: + %c2 = icmp ugt i32 %x, 10 + br i1 %c2, label %if2, label %exit + +if2: + %v = load i64, ptr %p2, align 8 + br label %exit + +exit: + %res = phi i64 [ %v, %if2 ], [ 1, %if1 ], [ 0, %entry ] + ret i64 %res +} + +!0 = !{i64 8} +;. +; CHECK: [[META0]] = !{i64 8} +;. From 9690b30ba9acc3deb1068deb37f3b507826b27fe Mon Sep 17 00:00:00 2001 From: Ilia Kuklin Date: Wed, 18 Sep 2024 17:50:09 +0500 Subject: [PATCH 051/321] [LLDB] Fix operators <= and >= returning a wrong result when comparing to a floating point NaN (#108060) Implement operators `<=` and `>=` to explicitly check the comparison results to be `cmpLessThan` or `cmpEqual` instead of negating the result of `operators<`. Fixes #85947 --- lldb/include/lldb/Utility/Scalar.h | 10 +- lldb/source/Utility/Scalar.cpp | 49 ++++---- lldb/test/API/lang/cpp/fpnan/Makefile | 3 + lldb/test/API/lang/cpp/fpnan/TestFPNaN.py | 130 ++++++++++++++++++++++ lldb/test/API/lang/cpp/fpnan/main.cpp | 8 ++ 5 files changed, 168 insertions(+), 32 deletions(-) create mode 100644 lldb/test/API/lang/cpp/fpnan/Makefile create mode 100644 lldb/test/API/lang/cpp/fpnan/TestFPNaN.py create mode 100644 lldb/test/API/lang/cpp/fpnan/main.cpp diff --git a/lldb/include/lldb/Utility/Scalar.h b/lldb/include/lldb/Utility/Scalar.h index 0d8eba3c9726d5..b4b9c7e1895825 100644 --- a/lldb/include/lldb/Utility/Scalar.h +++ b/lldb/include/lldb/Utility/Scalar.h @@ -210,6 +210,7 @@ class Scalar { static PromotionKey GetFloatPromoKey(const llvm::fltSemantics &semantics); private: + friend llvm::APFloat::cmpResult compare(Scalar lhs, Scalar rhs); friend const Scalar operator+(const Scalar &lhs, const Scalar &rhs); friend const Scalar operator-(Scalar lhs, Scalar rhs); friend const Scalar operator/(Scalar lhs, Scalar rhs); @@ -220,9 +221,9 @@ class Scalar { friend const Scalar operator^(Scalar lhs, Scalar rhs); friend const Scalar operator<<(const Scalar &lhs, const Scalar &rhs); friend const Scalar operator>>(const Scalar &lhs, const Scalar &rhs); - friend bool operator==(Scalar lhs, Scalar rhs); + friend bool operator==(const Scalar &lhs, const Scalar &rhs); friend bool operator!=(const Scalar &lhs, const Scalar &rhs); - friend bool operator<(Scalar lhs, Scalar rhs); + friend bool operator<(const Scalar &lhs, const Scalar &rhs); friend bool operator<=(const Scalar &lhs, const Scalar &rhs); friend bool operator>(const Scalar &lhs, const Scalar &rhs); friend bool operator>=(const Scalar &lhs, const Scalar &rhs); @@ -241,6 +242,7 @@ class Scalar { // Item 19 of "Effective C++ Second Edition" by Scott Meyers // Differentiate among members functions, non-member functions, and // friend 
Fixes #85947
---
 lldb/include/lldb/Utility/Scalar.h        |  10 +-
 lldb/source/Utility/Scalar.cpp            |  49 ++++----
 lldb/test/API/lang/cpp/fpnan/Makefile     |   3 +
 lldb/test/API/lang/cpp/fpnan/TestFPNaN.py | 130 ++++++++++++++++++++++
 lldb/test/API/lang/cpp/fpnan/main.cpp     |   8 ++
 5 files changed, 168 insertions(+), 32 deletions(-)
 create mode 100644 lldb/test/API/lang/cpp/fpnan/Makefile
 create mode 100644 lldb/test/API/lang/cpp/fpnan/TestFPNaN.py
 create mode 100644 lldb/test/API/lang/cpp/fpnan/main.cpp

diff --git a/lldb/include/lldb/Utility/Scalar.h b/lldb/include/lldb/Utility/Scalar.h
index 0d8eba3c9726d5..b4b9c7e1895825 100644
--- a/lldb/include/lldb/Utility/Scalar.h
+++ b/lldb/include/lldb/Utility/Scalar.h
@@ -210,6 +210,7 @@ class Scalar {
   static PromotionKey GetFloatPromoKey(const llvm::fltSemantics &semantics);
 
 private:
+  friend llvm::APFloat::cmpResult compare(Scalar lhs, Scalar rhs);
   friend const Scalar operator+(const Scalar &lhs, const Scalar &rhs);
   friend const Scalar operator-(Scalar lhs, Scalar rhs);
   friend const Scalar operator/(Scalar lhs, Scalar rhs);
@@ -220,9 +221,9 @@ class Scalar {
   friend const Scalar operator^(Scalar lhs, Scalar rhs);
   friend const Scalar operator<<(const Scalar &lhs, const Scalar &rhs);
   friend const Scalar operator>>(const Scalar &lhs, const Scalar &rhs);
-  friend bool operator==(Scalar lhs, Scalar rhs);
+  friend bool operator==(const Scalar &lhs, const Scalar &rhs);
   friend bool operator!=(const Scalar &lhs, const Scalar &rhs);
-  friend bool operator<(Scalar lhs, Scalar rhs);
+  friend bool operator<(const Scalar &lhs, const Scalar &rhs);
   friend bool operator<=(const Scalar &lhs, const Scalar &rhs);
   friend bool operator>(const Scalar &lhs, const Scalar &rhs);
   friend bool operator>=(const Scalar &lhs, const Scalar &rhs);
@@ -241,6 +242,7 @@ class Scalar {
 // Item 19 of "Effective C++ Second Edition" by Scott Meyers
 // Differentiate among members functions, non-member functions, and
 // friend functions
+llvm::APFloat::cmpResult compare(Scalar lhs, Scalar rhs);
 const Scalar operator+(const Scalar &lhs, const Scalar &rhs);
 const Scalar operator-(Scalar lhs, Scalar rhs);
 const Scalar operator/(Scalar lhs, Scalar rhs);
@@ -251,9 +253,9 @@ const Scalar operator%(Scalar lhs, Scalar rhs);
 const Scalar operator^(Scalar lhs, Scalar rhs);
 const Scalar operator<<(const Scalar &lhs, const Scalar &rhs);
 const Scalar operator>>(const Scalar &lhs, const Scalar &rhs);
-bool operator==(Scalar lhs, Scalar rhs);
+bool operator==(const Scalar &lhs, const Scalar &rhs);
 bool operator!=(const Scalar &lhs, const Scalar &rhs);
-bool operator<(Scalar lhs, Scalar rhs);
+bool operator<(const Scalar &lhs, const Scalar &rhs);
 bool operator<=(const Scalar &lhs, const Scalar &rhs);
 bool operator>(const Scalar &lhs, const Scalar &rhs);
 bool operator>=(const Scalar &lhs, const Scalar &rhs);
diff --git a/lldb/source/Utility/Scalar.cpp b/lldb/source/Utility/Scalar.cpp
index 329f5b6e4b9a5b..f07a9f3bed00c7 100644
--- a/lldb/source/Utility/Scalar.cpp
+++ b/lldb/source/Utility/Scalar.cpp
@@ -852,57 +852,50 @@ llvm::APFloat Scalar::CreateAPFloatFromAPFloat(lldb::BasicType basic_type) {
   }
 }
 
-bool lldb_private::operator==(Scalar lhs, Scalar rhs) {
+APFloat::cmpResult lldb_private::compare(Scalar lhs, Scalar rhs) {
   // If either entry is void then we can just compare the types
   if (lhs.m_type == Scalar::e_void || rhs.m_type == Scalar::e_void)
-    return lhs.m_type == rhs.m_type;
+    return lhs.m_type == rhs.m_type ? APFloat::cmpEqual : APFloat::cmpUnordered;
 
-  llvm::APFloat::cmpResult result;
   switch (Scalar::PromoteToMaxType(lhs, rhs)) {
   case Scalar::e_void:
    break;
   case Scalar::e_int:
-    return lhs.m_integer == rhs.m_integer;
+    if (lhs.m_integer < rhs.m_integer)
+      return APFloat::cmpLessThan;
+    if (lhs.m_integer > rhs.m_integer)
+      return APFloat::cmpGreaterThan;
+    return APFloat::cmpEqual;
   case Scalar::e_float:
-    result = lhs.m_float.compare(rhs.m_float);
-    if (result == llvm::APFloat::cmpEqual)
-      return true;
+    return lhs.m_float.compare(rhs.m_float);
   }
-  return false;
+  return APFloat::cmpUnordered;
 }
 
-bool lldb_private::operator!=(const Scalar &lhs, const Scalar &rhs) {
-  return !(lhs == rhs);
+bool lldb_private::operator==(const Scalar &lhs, const Scalar &rhs) {
+  return compare(lhs, rhs) == APFloat::cmpEqual;
 }
 
-bool lldb_private::operator<(Scalar lhs, Scalar rhs) {
-  if (lhs.m_type == Scalar::e_void || rhs.m_type == Scalar::e_void)
-    return false;
+bool lldb_private::operator!=(const Scalar &lhs, const Scalar &rhs) {
+  return compare(lhs, rhs) != APFloat::cmpEqual;
+}
 
-  llvm::APFloat::cmpResult result;
-  switch (Scalar::PromoteToMaxType(lhs, rhs)) {
-  case Scalar::e_void:
-    break;
-  case Scalar::e_int:
-    return lhs.m_integer < rhs.m_integer;
-  case Scalar::e_float:
-    result = lhs.m_float.compare(rhs.m_float);
-    if (result == llvm::APFloat::cmpLessThan)
-      return true;
-  }
-  return false;
+bool lldb_private::operator<(const Scalar &lhs, const Scalar &rhs) {
+  return compare(lhs, rhs) == APFloat::cmpLessThan;
 }
 
 bool lldb_private::operator<=(const Scalar &lhs, const Scalar &rhs) {
-  return !(rhs < lhs);
+  APFloat::cmpResult Res = compare(lhs, rhs);
+  return Res == APFloat::cmpLessThan || Res == APFloat::cmpEqual;
 }
 
 bool lldb_private::operator>(const Scalar &lhs, const Scalar &rhs) {
-  return rhs < lhs;
+  return compare(lhs, rhs) == APFloat::cmpGreaterThan;
 }
 
 bool lldb_private::operator>=(const Scalar &lhs, const Scalar &rhs) {
-  return !(lhs < rhs);
+  APFloat::cmpResult Res = compare(lhs, rhs);
+  return Res == APFloat::cmpGreaterThan || Res == APFloat::cmpEqual;
 }
 
 bool Scalar::ClearBit(uint32_t bit) {
diff --git a/lldb/test/API/lang/cpp/fpnan/Makefile b/lldb/test/API/lang/cpp/fpnan/Makefile
new file mode 100644
index 00000000000000..99998b20bcb050
--- /dev/null
+++ b/lldb/test/API/lang/cpp/fpnan/Makefile
@@ -0,0 +1,3 @@
+CXX_SOURCES := main.cpp
+
+include Makefile.rules
diff --git a/lldb/test/API/lang/cpp/fpnan/TestFPNaN.py b/lldb/test/API/lang/cpp/fpnan/TestFPNaN.py
new file mode 100644
index 00000000000000..6093ef91ac1f03
--- /dev/null
+++ b/lldb/test/API/lang/cpp/fpnan/TestFPNaN.py
@@ -0,0 +1,130 @@
+"""
+Test floating point expressions with zero, NaN, denormalized and infinite
+numbers.
+"""
+
+import lldb
+from lldbsuite.test.lldbtest import *
+from lldbsuite.test import lldbutil
+
+
+class FPNaNTestCase(TestBase):
+    def setUp(self):
+        # Call super's setUp().
+        TestBase.setUp(self)
+        # Find the line number to break inside main().
+        self.line = line_number("main.cpp", "// Set break point at this line.")
+
+    def test(self):
+        self.build()
+        exe = self.getBuildArtifact("a.out")
+        self.runCmd("file " + exe, CURRENT_EXECUTABLE_SET)
+
+        # Break inside the main.
+        lldbutil.run_break_set_by_file_and_line(
+            self, "main.cpp", self.line, num_expected_locations=1
+        )
+
+        self.runCmd("run", RUN_SUCCEEDED)
+        # Zero and denorm
+        self.expect(
+            "expr +0.0",
+            VARIABLES_DISPLAYED_CORRECTLY,
+            substrs=["double", "0"],
+        )
+        self.expect(
+            "expr -0.0",
+            VARIABLES_DISPLAYED_CORRECTLY,
+            substrs=["double", "0"],
+        )
+        self.expect(
+            "expr 0.0 / 0",
+            VARIABLES_DISPLAYED_CORRECTLY,
+            substrs=["double", "NaN"],
+        )
+        self.expect(
+            "expr 0 / 0.0",
+            VARIABLES_DISPLAYED_CORRECTLY,
+            substrs=["double", "NaN"],
+        )
+        self.expect(
+            "expr 1 / +0.0",
+            VARIABLES_DISPLAYED_CORRECTLY,
+            substrs=["double", "+Inf"],
+        )
+        self.expect(
+            "expr 1 / -0.0",
+            VARIABLES_DISPLAYED_CORRECTLY,
+            substrs=["double", "-Inf"],
+        )
+        self.expect(
+            "expr +0.0 / +0.0 != +0.0 / +0.0",
+            VARIABLES_DISPLAYED_CORRECTLY,
+            substrs=["bool", "true"],
+        )
+        self.expect(
+            "expr -1.f * 0",
+            VARIABLES_DISPLAYED_CORRECTLY,
+            substrs=["float", "-0"],
+        )
+        self.expect(
+            "expr 0x0.123p-1",
+            VARIABLES_DISPLAYED_CORRECTLY,
+            substrs=["double", "0.0355224609375"],
+        )
+        # NaN
+        self.expect(
+            "expr fnan < fnan",
+            VARIABLES_DISPLAYED_CORRECTLY,
+            substrs=["bool", "false"],
+        )
+        self.expect(
+            "expr fnan <= fnan",
+            VARIABLES_DISPLAYED_CORRECTLY,
+            substrs=["bool", "false"],
+        )
+        self.expect(
+            "expr fnan > fnan",
+            VARIABLES_DISPLAYED_CORRECTLY,
+            substrs=["bool", "false"],
+        )
+        self.expect(
+            "expr fnan >= fnan",
+            VARIABLES_DISPLAYED_CORRECTLY,
+            substrs=["bool", "false"],
+        )
+        self.expect(
+            "expr fnan == fnan",
+            VARIABLES_DISPLAYED_CORRECTLY,
+            substrs=["bool", "false"],
+        )
+        self.expect(
+            "expr fnan != fnan",
+            VARIABLES_DISPLAYED_CORRECTLY,
+            substrs=["bool", "true"],
+        )
+        self.expect(
+            "expr 1.0 <= fnan",
+            VARIABLES_DISPLAYED_CORRECTLY,
+            substrs=["bool", "false"],
+        )
+        self.expect(
+            "expr 1.0f < fnan",
+            VARIABLES_DISPLAYED_CORRECTLY,
+            substrs=["bool", "false"],
+        )
+        self.expect(
+            "expr 1.0f != fnan",
+            VARIABLES_DISPLAYED_CORRECTLY,
+            substrs=["bool", "true"],
+        )
+        self.expect(
+            "expr (unsigned int) fdenorm",
+            VARIABLES_DISPLAYED_CORRECTLY,
+            substrs=["int", "0"],
+        )
+        self.expect(
+            "expr (unsigned int) (1.0f + fdenorm)",
+            VARIABLES_DISPLAYED_CORRECTLY,
+            substrs=["int", "1"],
+        )
diff --git a/lldb/test/API/lang/cpp/fpnan/main.cpp b/lldb/test/API/lang/cpp/fpnan/main.cpp
new file mode 100644
index 00000000000000..8bcfebfaea8e1e
--- /dev/null
+++ b/lldb/test/API/lang/cpp/fpnan/main.cpp
@@ -0,0 +1,8 @@
+#include <limits>
+
+int main() {
+  float fnan = std::numeric_limits<float>::quiet_NaN();
+  float fdenorm = std::numeric_limits<float>::denorm_min();
+
+  // Set break point at this line.
+}

From 76347ee9584bfcdaceb4ee48d39441c29aeb2124 Mon Sep 17 00:00:00 2001
From: Abid Qadeer
Date: Wed, 18 Sep 2024 13:52:23 +0100
Subject: [PATCH 052/321] [flang][debug] Improve handling of dummy character
 arguments. (#108283)

As described in #107998, we were not handling the case well when the length
of the character is not part of the type. This PR handles one of those
cases, where the length can be calculated by looking at the result of the
corresponding `fir.unboxchar`.

The DIStringTypeAttr has a `stringLength` field that can be a variable. We
create an artificial variable that will hold the length and is used as the
value of the `stringLength` field. The variable is then attached to a
`DbgValueOp`.
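For context, an assumed-length CHARACTER dummy carries its length as a
hidden run-time value next to the data reference, roughly like the following
C sketch (an illustrative analogy, not code from this patch; `fir.unboxchar`
is what recovers the two components):

```c
#include <stddef.h>

/* Conceptual lowering of the Fortran dummy "character(*) :: str": the
   length is not part of the pointee type and only exists at run time. */
void test(char *str, size_t str_len) {
  (void)str;
  (void)str_len; /* the debug info must materialize this hidden length */
}
```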
Fixes #107998.
---
 .../Transforms/DebugTypeGenerator.cpp  | 27 +++++++++++++++++--
 flang/test/Transforms/debug-107988.fir | 23 ++++++++++++++++
 2 files changed, 48 insertions(+), 2 deletions(-)
 create mode 100644 flang/test/Transforms/debug-107988.fir

diff --git a/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp b/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp
index 82c6a6618e0ed8..1390fae062b934 100644
--- a/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp
+++ b/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp
@@ -271,6 +271,7 @@ mlir::LLVM::DITypeAttr DebugTypeGenerator::convertCharacterType(
   uint64_t sizeInBits = 0;
   mlir::LLVM::DIExpressionAttr lenExpr = nullptr;
   mlir::LLVM::DIExpressionAttr locExpr = nullptr;
+  mlir::LLVM::DIVariableAttr varAttr = nullptr;
 
   if (hasDescriptor) {
     llvm::SmallVector<mlir::LLVM::DIExpressionElemAttr> ops;
@@ -289,7 +290,29 @@
     sizeInBits =
         charTy.getLen() * kindMapping.getCharacterBitsize(charTy.getFKind());
   } else {
-    return genPlaceholderType(context);
+    // In assumed length string, the len of the character is not part of the
+    // type but can be found at the runtime. Here we create an artificial
+    // variable that will contain that length. This variable is used as
+    // 'stringLength' in DIStringTypeAttr.
+    if (declOp && !declOp.getTypeparams().empty()) {
+      mlir::Operation *op = declOp.getTypeparams()[0].getDefiningOp();
+      if (auto unbox = mlir::dyn_cast_or_null<fir::UnboxCharOp>(op)) {
+        auto name =
+            mlir::StringAttr::get(context, "." + declOp.getUniqName().str());
+        mlir::OpBuilder builder(context);
+        builder.setInsertionPoint(declOp);
+        mlir::Type i64Ty = builder.getIntegerType(64);
+        auto convOp = builder.create<fir::ConvertOp>(unbox.getLoc(), i64Ty,
+                                                     unbox.getResult(1));
+        mlir::LLVM::DITypeAttr Ty = convertType(i64Ty, fileAttr, scope, declOp);
+        auto lvAttr = mlir::LLVM::DILocalVariableAttr::get(
+            context, scope, name, fileAttr, /*line=*/0, /*argNo=*/0,
+            /*alignInBits=*/0, Ty, mlir::LLVM::DIFlags::Artificial);
+        builder.create<mlir::LLVM::DbgValueOp>(convOp.getLoc(), convOp, lvAttr,
+                                               nullptr);
+        varAttr = mlir::cast<mlir::LLVM::DIVariableAttr>(lvAttr);
+      }
+    }
   }
 
   // FIXME: Currently the DIStringType in llvm does not have the option to set
@@ -299,7 +322,7 @@
   return mlir::LLVM::DIStringTypeAttr::get(
       context, llvm::dwarf::DW_TAG_string_type,
       mlir::StringAttr::get(context, ""), sizeInBits, /*alignInBits=*/0,
-      /*stringLength=*/nullptr, lenExpr, locExpr, encoding);
+      /*stringLength=*/varAttr, lenExpr, locExpr, encoding);
 }
 
 mlir::LLVM::DITypeAttr DebugTypeGenerator::convertPointerLikeType(
diff --git a/flang/test/Transforms/debug-107988.fir b/flang/test/Transforms/debug-107988.fir
new file mode 100644
index 00000000000000..308f78a865120c
--- /dev/null
+++ b/flang/test/Transforms/debug-107988.fir
@@ -0,0 +1,23 @@
+// RUN: fir-opt --add-debug-info --mlir-print-debuginfo %s -o - | FileCheck %s
+
+module attributes {dlti.dl_spec = #dlti.dl_spec<>} {
+  func.func @test(%arg0: !fir.ref<!fir.char<1,?>> {fir.bindc_name = "str"}, %arg1: i64) {
+    %0 = fir.emboxchar %arg0, %arg1 : (!fir.ref<!fir.char<1,?>>, i64) -> !fir.boxchar<1>
+    %1 = fir.undefined !fir.dscope
+    %2:2 = fir.unboxchar %0 : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index) loc(#loc1)
+    %3 = fircg.ext_declare %2#0 typeparams %2#1 dummy_scope %1 {uniq_name = "_QFtestEstr"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> !fir.ref<!fir.char<1,?>> loc(#loc1)
+    return
+  } loc(#loc2)
+}
+
+#loc1 = loc("test.f90":5:1)
+#loc2 = loc("test.f90":15:1)
+
+// CHECK: #[[VAR:.*]] = #llvm.di_local_variable<{{.*}}name = "._QFtestEstr"{{.*}}flags = Artificial>
+// CHECK: func.func @test
+// CHECK: %[[V1:.*]]:2 = fir.unboxchar{{.*}}
+// CHECK: %[[V2:.*]] = fir.convert %[[V1]]#1 : (index) -> i64
+// CHECK: llvm.intr.dbg.value #di_local_variable = %[[V2]] : i64
+// CHECK: #[[STR_TY:.*]] = #llvm.di_string_type
+// CHECK: #llvm.di_local_variable<{{.*}}name = "str"{{.*}}type = #[[STR_TY]]>
+

From 311e4e3245818d42e2bd148157c960f567f37096 Mon Sep 17 00:00:00 2001
From: Mahesh-Attarde <145317060+mahesh-attarde@users.noreply.github.com>
Date: Wed, 18 Sep 2024 06:01:51 -0700
Subject: [PATCH 053/321] [X86][AVX10.2] Support AVX10.2 MOVZXC new
 Instructions. (#108537)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Ref.: https://cdrdv2.intel.com/v1/dl/getContent/828965 Chapter 14 INTEL®
AVX10 ZERO-EXTENDING PARTIAL VECTOR COPY INSTRUCTIONS
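As a usage sketch (illustrative only; assumes an AVX10.2-enabled compiler
and the `avx10_2copyintrin.h` header added by this patch), the new
intrinsics copy the low element and zero the remaining lanes:

```c
#include <immintrin.h>

__m128i keep_low_word(__m128i v) {
  /* Result = { v[0], 0, 0, 0, 0, 0, 0, 0 } for the eight 16-bit lanes.
     With AVX10.2 this lowers to a single vmovw rather than the
     two-instruction xor+blend sequence used on older targets. */
  return _mm_move_epi16(v);
}
```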
---------

Co-authored-by: mattarde
---
 clang/lib/Headers/CMakeLists.txt              |  1 +
 clang/lib/Headers/avx10_2copyintrin.h         | 34 +++++++
 clang/lib/Headers/immintrin.h                 |  1 +
 clang/test/CodeGen/X86/avx512copy-builtins.c  | 17 ++++
 llvm/lib/Target/X86/X86ISelLowering.cpp       |  2 +-
 llvm/lib/Target/X86/X86InstrAVX10.td          | 64 ++++++++++++
 .../test/CodeGen/X86/avx512copy-intrinsics.ll | 35 +++++++
 .../MC/Disassembler/X86/avx10.2-copy-32.txt   | 34 +++++++
 .../MC/Disassembler/X86/avx10.2-copy-64.txt   | 34 +++++++
 llvm/test/MC/X86/avx10.2-copy-32-att.s        | 82 ++++++++++++++++
 llvm/test/MC/X86/avx10.2-copy-32-intel.s      | 81 ++++++++++++++++
 llvm/test/MC/X86/avx10.2-copy-64-att.s        | 97 +++++++++++++++++++
 llvm/test/MC/X86/avx10.2-copy-64-intel.s      | 97 +++++++++++++++++++
 llvm/test/TableGen/x86-fold-tables.inc        |  2 +
 llvm/utils/TableGen/X86ManualInstrMapping.def |  1 +
 15 files changed, 581 insertions(+), 1 deletion(-)
 create mode 100644 clang/lib/Headers/avx10_2copyintrin.h
 create mode 100644 clang/test/CodeGen/X86/avx512copy-builtins.c
 create mode 100644 llvm/test/CodeGen/X86/avx512copy-intrinsics.ll
 create mode 100644 llvm/test/MC/Disassembler/X86/avx10.2-copy-32.txt
 create mode 100644 llvm/test/MC/Disassembler/X86/avx10.2-copy-64.txt
 create mode 100644 llvm/test/MC/X86/avx10.2-copy-32-att.s
 create mode 100644 llvm/test/MC/X86/avx10.2-copy-32-intel.s
 create mode 100644 llvm/test/MC/X86/avx10.2-copy-64-att.s
 create mode 100644 llvm/test/MC/X86/avx10.2-copy-64-intel.s

diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index 4c75c638b41bae..f5cc07c303f9eb 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -156,6 +156,7 @@ set(x86_files
   avx10_2_512satcvtintrin.h
   avx10_2bf16intrin.h
   avx10_2convertintrin.h
+  avx10_2copyintrin.h
   avx10_2minmaxintrin.h
   avx10_2niintrin.h
   avx10_2satcvtdsintrin.h
diff --git a/clang/lib/Headers/avx10_2copyintrin.h b/clang/lib/Headers/avx10_2copyintrin.h
new file mode 100644
index 00000000000000..7fc31190781d91
--- /dev/null
+++ b/clang/lib/Headers/avx10_2copyintrin.h
@@ -0,0 +1,34 @@
+/*===---- avx10_2copyintrin.h - AVX10.2 Copy intrinsics -------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error \
+    "Never use <avx10_2copyintrin.h> directly; include <immintrin.h> instead."
+#endif // __IMMINTRIN_H
+
+#ifndef __AVX10_2COPYINTRIN_H
+#define __AVX10_2COPYINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS128                                                  \
+  __attribute__((__always_inline__, __nodebug__, __target__("avx10.2-256"),   \
+                 __min_vector_width__(128)))
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_move_epi32(__m128i __A) {
+  return (__m128i)__builtin_shufflevector(
+      (__v4si)__A, (__v4si)_mm_setzero_si128(), 0, 4, 4, 4);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_move_epi16(__m128i __A) {
+  return (__m128i)__builtin_shufflevector(
+      (__v8hi)__A, (__v8hi)_mm_setzero_si128(), 0, 8, 8, 8, 8, 8, 8, 8);
+}
+
+#undef __DEFAULT_FN_ATTRS128
+
+#endif // __AVX10_2COPYINTRIN_H
diff --git a/clang/lib/Headers/immintrin.h b/clang/lib/Headers/immintrin.h
index 280154f3c1026e..3fbabffa98df20 100644
--- a/clang/lib/Headers/immintrin.h
+++ b/clang/lib/Headers/immintrin.h
@@ -651,6 +651,7 @@ _storebe_i64(void * __P, long long __D) {
 #if !defined(__SCE__) || __has_feature(modules) || defined(__AVX10_2__)
 #include <avx10_2bf16intrin.h>
 #include <avx10_2convertintrin.h>
+#include <avx10_2copyintrin.h>
 #include <avx10_2minmaxintrin.h>
 #include <avx10_2niintrin.h>
 #include <avx10_2satcvtdsintrin.h>
diff --git a/clang/test/CodeGen/X86/avx512copy-builtins.c b/clang/test/CodeGen/X86/avx512copy-builtins.c
new file mode 100644
index 00000000000000..06f7507bde53ed
--- /dev/null
+++ b/clang/test/CodeGen/X86/avx512copy-builtins.c
@@ -0,0 +1,17 @@
+// RUN: %clang_cc1 %s -flax-vector-conversions=none -ffreestanding -triple=x86_64-unknown-unknown -target-feature +avx10.2-512 \
+// RUN: -emit-llvm -o - -Wall -Werror -pedantic -Wno-gnu-statement-expression | FileCheck %s
+
+#include <immintrin.h>
+#include <stddef.h>
+
+__m128i test_mm_move_epi32(__m128i A) {
+  // CHECK-LABEL: test_mm_move_epi32
+  // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> <i32 0, i32 4, i32 4, i32 4>
+  return _mm_move_epi32(A);
+}
+
+__m128i test_mm_move_epi16(__m128i A) {
+  // CHECK-LABEL: test_mm_move_epi16
+  // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i32> <i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
+  return _mm_move_epi16(A);
+}
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 182f6c08366a99..68563f556ecfb4 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -12348,7 +12348,7 @@ static SDValue lowerShuffleAsElementInsertion(
     }
     V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
   } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
-             EltVT == MVT::i16) {
+             (EltVT == MVT::i16 && !Subtarget.hasAVX10_2())) {
     // Either not inserting from the low element of the input or the input
     // element size is too small to use VZEXT_MOVL to clear the high bits.
return SDValue(); diff --git a/llvm/lib/Target/X86/X86InstrAVX10.td b/llvm/lib/Target/X86/X86InstrAVX10.td index f0334109a32b68..625f2e01d47218 100644 --- a/llvm/lib/Target/X86/X86InstrAVX10.td +++ b/llvm/lib/Target/X86/X86InstrAVX10.td @@ -1583,3 +1583,67 @@ let Defs = [EFLAGS], Uses = [MXCSR], Predicates = [HasAVX10_2] in { "vucomxss", SSEPackedSingle>, TB, XD, VEX_LIG, EVEX_CD8<32, CD8VT1>; } + +//------------------------------------------------- +// AVX10 MOVZXC (COPY) instructions +//------------------------------------------------- +let Predicates = [HasAVX10_2] in { + def VMOVZPDILo2PDIZrr : AVX512XSI<0x7E, MRMSrcReg, (outs VR128X:$dst), + (ins VR128X:$src), + "vmovd\t{$src, $dst|$dst, $src}", + [(set VR128X:$dst, (v4i32 (X86vzmovl + (v4i32 VR128X:$src))))]>, EVEX, + Sched<[WriteVecMoveFromGpr]>; + +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in + def VMOVZPDILo2PDIZrm : AVX512XSI<0x7E, MRMSrcMem, (outs VR128X:$dst), + (ins i32mem:$src), + "vmovd\t{$src, $dst|$dst, $src}", []>, EVEX, + EVEX_CD8<32, CD8VT1>, + Sched<[WriteVecLoad]>; + +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in + def VMOVZPDILo2PDIZmr : AVX512PDI<0xD6, MRMDestMem, (outs), + (ins i32mem:$dst, VR128X:$src), + "vmovd\t{$src, $dst|$dst, $src}", []>, EVEX, + EVEX_CD8<32, CD8VT1>, + Sched<[WriteVecStore]>; + +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in + def VMOVZPDILo2PDIZrr2 : AVX512PDI<0xD6, MRMSrcReg, (outs VR128X:$dst), + (ins VR128X:$src), + "vmovd\t{$src, $dst|$dst, $src}", []>, EVEX, + Sched<[WriteVecMoveFromGpr]>; + def : InstAlias<"vmovd.s\t{$src, $dst|$dst, $src}", + (VMOVZPDILo2PDIZrr2 VR128X:$dst, VR128X:$src), 0>; + +def VMOVZPWILo2PWIZrr : AVX512XSI<0x6E, MRMSrcReg, (outs VR128X:$dst), + (ins VR128X:$src), + "vmovw\t{$src, $dst|$dst, $src}", + [(set VR128X:$dst, (v8i16 (X86vzmovl + (v8i16 VR128X:$src))))]>, EVEX, T_MAP5, + Sched<[WriteVecMoveFromGpr]>; + +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in + def VMOVZPWILo2PWIZrm : AVX512XSI<0x6E, MRMSrcMem, (outs VR128X:$dst), + (ins i16mem:$src), + "vmovw\t{$src, $dst|$dst, $src}", []>, EVEX, + EVEX_CD8<16, CD8VT1>, T_MAP5, + Sched<[WriteVecLoad]>; + +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in + def VMOVZPWILo2PWIZmr : AVX512XSI<0x7E, MRMDestMem, (outs), + (ins i32mem:$dst, VR128X:$src), + "vmovw\t{$src, $dst|$dst, $src}", []>, EVEX, + EVEX_CD8<16, CD8VT1>, T_MAP5, + Sched<[WriteVecStore]>; + +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in + def VMOVZPWILo2PWIZrr2 : AVX512XSI<0x7E, MRMSrcReg, (outs VR128X:$dst), + (ins VR128X:$src), + "vmovw\t{$src, $dst|$dst, $src}", + []>, EVEX, T_MAP5, + Sched<[WriteVecMoveFromGpr]>; + def : InstAlias<"vmovw.s\t{$src, $dst|$dst, $src}", + (VMOVZPWILo2PWIZrr2 VR128X:$dst, VR128X:$src), 0>; +} diff --git a/llvm/test/CodeGen/X86/avx512copy-intrinsics.ll b/llvm/test/CodeGen/X86/avx512copy-intrinsics.ll new file mode 100644 index 00000000000000..a7ca23792e6feb --- /dev/null +++ b/llvm/test/CodeGen/X86/avx512copy-intrinsics.ll @@ -0,0 +1,35 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+avx10.2-256 | FileCheck %s --check-prefixes=AVX102 +; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+avx512f | FileCheck %s --check-prefixes=NOAVX512MOVZXC + +define <4 x i32> 
@test_mm_move_epi32(<4 x i32> %a0) nounwind {
+; AVX102-LABEL: test_mm_move_epi32:
+; AVX102:       # %bb.0:
+; AVX102-NEXT:    vmovd %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7e,0x08,0x7e,0xc0]
+; AVX102-NEXT:    retq # encoding: [0xc3]
+;
+; NOAVX512MOVZXC-LABEL: test_mm_move_epi32:
+; NOAVX512MOVZXC:       # %bb.0:
+; NOAVX512MOVZXC-NEXT:    vxorps %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf0,0x57,0xc9]
+; NOAVX512MOVZXC-NEXT:    vblendps $1, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x01]
+; NOAVX512MOVZXC-NEXT:    # xmm0 = xmm0[0],xmm1[1,2,3]
+; NOAVX512MOVZXC-NEXT:    retq # encoding: [0xc3]
+  %res = shufflevector <4 x i32> %a0, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 4, i32 4>
+  ret <4 x i32> %res
+}
+
+define <8 x i16> @test_mm_move_epi16(<8 x i16> %a0) nounwind {
+; AVX102-LABEL: test_mm_move_epi16:
+; AVX102:       # %bb.0:
+; AVX102-NEXT:    vmovw %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7e,0x08,0x6e,0xc0]
+; AVX102-NEXT:    retq # encoding: [0xc3]
+;
+; NOAVX512MOVZXC-LABEL: test_mm_move_epi16:
+; NOAVX512MOVZXC:       # %bb.0:
+; NOAVX512MOVZXC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xef,0xc9]
+; NOAVX512MOVZXC-NEXT:    vpblendw $1, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0x71,0x0e,0xc0,0x01]
+; NOAVX512MOVZXC-NEXT:    # xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; NOAVX512MOVZXC-NEXT:    retq # encoding: [0xc3]
+  %res = shufflevector <8 x i16> %a0, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
+  ret <8 x i16> %res
+}
diff --git a/llvm/test/MC/Disassembler/X86/avx10.2-copy-32.txt b/llvm/test/MC/Disassembler/X86/avx10.2-copy-32.txt
new file mode 100644
index 00000000000000..e86c2340a486c5
--- /dev/null
+++ b/llvm/test/MC/Disassembler/X86/avx10.2-copy-32.txt
@@ -0,0 +1,34 @@
+# RUN: llvm-mc --disassemble %s -triple=i386 | FileCheck %s --check-prefixes=ATT
+# RUN: llvm-mc --disassemble %s -triple=i386 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL
+
+# ATT: vmovd (%ecx), %xmm5
+# INTEL: vmovd xmm5, dword ptr [ecx]
+0x62 0xf1 0x7e 0x08 0x7e 0x29
+
+# ATT: vmovd %xmm5, (%ecx)
+# INTEL: vmovd dword ptr [ecx], xmm5
+0x62 0xf1 0x7d 0x08 0xd6 0x29
+
+# ATT: vmovd %xmm2, %xmm1
+# INTEL: vmovd xmm1, xmm2
+0x62 0xf1 0x7e 0x08 0x7e 0xca
+
+# ATT: vmovd %xmm2, %xmm1
+# INTEL: vmovd xmm1, xmm2
+0x62 0xf1 0x7d 0x08 0xd6 0xca
+
+# ATT: vmovw %xmm5, (%ecx)
+# INTEL: vmovw dword ptr [ecx], xmm5
+0x62 0xf5 0x7e
0x08 0x7e 0x29 + +# ATT: vmovw (%rcx), %xmm29 +# INTEL: vmovw xmm29, word ptr [rcx] +0x62 0x65 0x7e 0x08 0x6e 0x29 + +# ATT: vmovw %xmm22, %xmm21 +# INTEL: vmovw xmm21, xmm22 +0x62 0xa5 0x7e 0x08 0x6e 0xee + +# ATT: vmovw %xmm22, %xmm21 +# INTEL: vmovw xmm21, xmm22 +0x62 0xa5 0x7e 0x08 0x7e 0xee diff --git a/llvm/test/MC/X86/avx10.2-copy-32-att.s b/llvm/test/MC/X86/avx10.2-copy-32-att.s new file mode 100644 index 00000000000000..2bc498720849c9 --- /dev/null +++ b/llvm/test/MC/X86/avx10.2-copy-32-att.s @@ -0,0 +1,82 @@ +// RUN: llvm-mc -triple i386 --show-encoding %s | FileCheck %s + +// CHECK: vmovd 268435456(%esp,%esi,8), %xmm2 +// CHECK: encoding: [0xc5,0xf9,0x6e,0x94,0xf4,0x00,0x00,0x00,0x10] + vmovd 268435456(%esp,%esi,8), %xmm2 + +// CHECK: vmovd 291(%edi,%eax,4), %xmm2 +// CHECK: encoding: [0xc5,0xf9,0x6e,0x94,0x87,0x23,0x01,0x00,0x00] + vmovd 291(%edi,%eax,4), %xmm2 + +// CHECK: vmovd (%eax), %xmm2 +// CHECK: encoding: [0xc5,0xf9,0x6e,0x10] + vmovd (%eax), %xmm2 + +// CHECK: vmovd -128(,%ebp,2), %xmm2 +// CHECK: encoding: [0xc5,0xf9,0x6e,0x14,0x6d,0x80,0xff,0xff,0xff] + vmovd -128(,%ebp,2), %xmm2 + +// CHECK: vmovd %xmm3, 268435456(%esp,%esi,8) +// CHECK: encoding: [0xc5,0xf9,0x7e,0x9c,0xf4,0x00,0x00,0x00,0x10] + vmovd %xmm3, 268435456(%esp,%esi,8) + +// CHECK: vmovd %xmm3, 291(%edi,%eax,4) +// CHECK: encoding: [0xc5,0xf9,0x7e,0x9c,0x87,0x23,0x01,0x00,0x00] + vmovd %xmm3, 291(%edi,%eax,4) + +// CHECK: vmovd %xmm3, (%eax) +// CHECK: encoding: [0xc5,0xf9,0x7e,0x18] + vmovd %xmm3, (%eax) + +// CHECK: vmovd %xmm3, -128(,%ebp,2) +// CHECK: encoding: [0xc5,0xf9,0x7e,0x1c,0x6d,0x80,0xff,0xff,0xff] + vmovd %xmm3, -128(,%ebp,2) + +// CHECK: vmovw 268435456(%esp,%esi,8), %xmm2 +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x6e,0x94,0xf4,0x00,0x00,0x00,0x10] + vmovw 268435456(%esp,%esi,8), %xmm2 + +// CHECK: vmovw 291(%edi,%eax,4), %xmm2 +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x6e,0x94,0x87,0x23,0x01,0x00,0x00] + vmovw 291(%edi,%eax,4), %xmm2 + +// CHECK: vmovw (%eax), %xmm2 +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x6e,0x10] + vmovw (%eax), %xmm2 + +// CHECK: vmovw -64(,%ebp,2), %xmm2 +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x6e,0x14,0x6d,0xc0,0xff,0xff,0xff] + vmovw -64(,%ebp,2), %xmm2 + +// CHECK: vmovw 254(%ecx), %xmm2 +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x6e,0x51,0x7f] + vmovw 254(%ecx), %xmm2 + +// CHECK: vmovw -256(%edx), %xmm2 +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x6e,0x52,0x80] + vmovw -256(%edx), %xmm2 + +// CHECK: vmovw %xmm3, 268435456(%esp,%esi,8) +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x7e,0x9c,0xf4,0x00,0x00,0x00,0x10] + vmovw %xmm3, 268435456(%esp,%esi,8) + +// CHECK: vmovw %xmm3, 291(%edi,%eax,4) +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x7e,0x9c,0x87,0x23,0x01,0x00,0x00] + vmovw %xmm3, 291(%edi,%eax,4) + +// CHECK: vmovw %xmm3, (%eax) +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x7e,0x18] + vmovw %xmm3, (%eax) + +// CHECK: vmovw %xmm3, -64(,%ebp,2) +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x7e,0x1c,0x6d,0xc0,0xff,0xff,0xff] + vmovw %xmm3, -64(,%ebp,2) + +// CHECK: vmovw %xmm3, 254(%ecx) +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x7e,0x59,0x7f] + vmovw %xmm3, 254(%ecx) + +// CHECK: vmovw %xmm3, -256(%edx) +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x7e,0x5a,0x80] + vmovw %xmm3, -256(%edx) + diff --git a/llvm/test/MC/X86/avx10.2-copy-32-intel.s b/llvm/test/MC/X86/avx10.2-copy-32-intel.s new file mode 100644 index 00000000000000..aa84548e5f75dd --- /dev/null +++ b/llvm/test/MC/X86/avx10.2-copy-32-intel.s @@ -0,0 +1,81 @@ +// RUN: llvm-mc -triple i386 
-x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s + +// CHECK: vmovd xmm2, dword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0xc5,0xf9,0x6e,0x94,0xf4,0x00,0x00,0x00,0x10] + vmovd xmm2, dword ptr [esp + 8*esi + 268435456] + +// CHECK: vmovd xmm2, dword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0xc5,0xf9,0x6e,0x94,0x87,0x23,0x01,0x00,0x00] + vmovd xmm2, dword ptr [edi + 4*eax + 291] + +// CHECK: vmovd xmm2, dword ptr [eax] +// CHECK: encoding: [0xc5,0xf9,0x6e,0x10] + vmovd xmm2, dword ptr [eax] + +// CHECK: vmovd xmm2, dword ptr [2*ebp - 128] +// CHECK: encoding: [0xc5,0xf9,0x6e,0x14,0x6d,0x80,0xff,0xff,0xff] + vmovd xmm2, dword ptr [2*ebp - 128] + +// CHECK: vmovd dword ptr [esp + 8*esi + 268435456], xmm3 +// CHECK: encoding: [0xc5,0xf9,0x7e,0x9c,0xf4,0x00,0x00,0x00,0x10] + vmovd dword ptr [esp + 8*esi + 268435456], xmm3 + +// CHECK: vmovd dword ptr [edi + 4*eax + 291], xmm3 +// CHECK: encoding: [0xc5,0xf9,0x7e,0x9c,0x87,0x23,0x01,0x00,0x00] + vmovd dword ptr [edi + 4*eax + 291], xmm3 + +// CHECK: vmovd dword ptr [eax], xmm3 +// CHECK: encoding: [0xc5,0xf9,0x7e,0x18] + vmovd dword ptr [eax], xmm3 + +// CHECK: vmovd dword ptr [2*ebp - 128], xmm3 +// CHECK: encoding: [0xc5,0xf9,0x7e,0x1c,0x6d,0x80,0xff,0xff,0xff] + vmovd dword ptr [2*ebp - 128], xmm3 + +// CHECK: vmovw xmm2, word ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x6e,0x94,0xf4,0x00,0x00,0x00,0x10] + vmovw xmm2, word ptr [esp + 8*esi + 268435456] + +// CHECK: vmovw xmm2, word ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x6e,0x94,0x87,0x23,0x01,0x00,0x00] + vmovw xmm2, word ptr [edi + 4*eax + 291] + +// CHECK: vmovw xmm2, word ptr [eax] +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x6e,0x10] + vmovw xmm2, word ptr [eax] + +// CHECK: vmovw xmm2, word ptr [2*ebp - 64] +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x6e,0x14,0x6d,0xc0,0xff,0xff,0xff] + vmovw xmm2, word ptr [2*ebp - 64] + +// CHECK: vmovw xmm2, word ptr [ecx + 254] +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x6e,0x51,0x7f] + vmovw xmm2, word ptr [ecx + 254] + +// CHECK: vmovw xmm2, word ptr [edx - 256] +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x6e,0x52,0x80] + vmovw xmm2, word ptr [edx - 256] + +// CHECK: vmovw word ptr [esp + 8*esi + 268435456], xmm3 +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x7e,0x9c,0xf4,0x00,0x00,0x00,0x10] + vmovw word ptr [esp + 8*esi + 268435456], xmm3 + +// CHECK: vmovw word ptr [edi + 4*eax + 291], xmm3 +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x7e,0x9c,0x87,0x23,0x01,0x00,0x00] + vmovw word ptr [edi + 4*eax + 291], xmm3 + +// CHECK: vmovw word ptr [eax], xmm3 +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x7e,0x18] + vmovw word ptr [eax], xmm3 + +// CHECK: vmovw word ptr [2*ebp - 64], xmm3 +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x7e,0x1c,0x6d,0xc0,0xff,0xff,0xff] + vmovw word ptr [2*ebp - 64], xmm3 + +// CHECK: vmovw word ptr [ecx + 254], xmm3 +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x7e,0x59,0x7f] + vmovw word ptr [ecx + 254], xmm3 + +// CHECK: vmovw word ptr [edx - 256], xmm3 +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x7e,0x5a,0x80] + vmovw word ptr [edx - 256], xmm3 diff --git a/llvm/test/MC/X86/avx10.2-copy-64-att.s b/llvm/test/MC/X86/avx10.2-copy-64-att.s new file mode 100644 index 00000000000000..a672b2d842240c --- /dev/null +++ b/llvm/test/MC/X86/avx10.2-copy-64-att.s @@ -0,0 +1,97 @@ +// RUN: llvm-mc -triple x86_64 --show-encoding %s | FileCheck %s + +// CHECK: vmovd 268435456(%rbp,%r14,8), %xmm22 +// CHECK: encoding: 
[0x62,0xa1,0x7d,0x08,0x6e,0xb4,0xf5,0x00,0x00,0x00,0x10] + vmovd 268435456(%rbp,%r14,8), %xmm22 + +// CHECK: vmovd 291(%r8,%rax,4), %xmm22 +// CHECK: encoding: [0x62,0xc1,0x7d,0x08,0x6e,0xb4,0x80,0x23,0x01,0x00,0x00] + vmovd 291(%r8,%rax,4), %xmm22 + +// CHECK: vmovd (%rip), %xmm22 +// CHECK: encoding: [0x62,0xe1,0x7d,0x08,0x6e,0x35,0x00,0x00,0x00,0x00] + vmovd (%rip), %xmm22 + +// CHECK: vmovd -128(,%rbp,2), %xmm22 +// CHECK: encoding: [0x62,0xe1,0x7d,0x08,0x6e,0x34,0x6d,0x80,0xff,0xff,0xff] + vmovd -128(,%rbp,2), %xmm22 + +// CHECK: vmovd 508(%rcx), %xmm22 +// CHECK: encoding: [0x62,0xe1,0x7d,0x08,0x6e,0x71,0x7f] + vmovd 508(%rcx), %xmm22 + +// CHECK: vmovd -512(%rdx), %xmm22 +// CHECK: encoding: [0x62,0xe1,0x7d,0x08,0x6e,0x72,0x80] + vmovd -512(%rdx), %xmm22 + +// CHECK: vmovd %xmm23, 268435456(%rbp,%r14,8) +// CHECK: encoding: [0x62,0xa1,0x7d,0x08,0x7e,0xbc,0xf5,0x00,0x00,0x00,0x10] + vmovd %xmm23, 268435456(%rbp,%r14,8) + +// CHECK: vmovd %xmm23, 291(%r8,%rax,4) +// CHECK: encoding: [0x62,0xc1,0x7d,0x08,0x7e,0xbc,0x80,0x23,0x01,0x00,0x00] + vmovd %xmm23, 291(%r8,%rax,4) + +// CHECK: vmovd %xmm23, (%rip) +// CHECK: encoding: [0x62,0xe1,0x7d,0x08,0x7e,0x3d,0x00,0x00,0x00,0x00] + vmovd %xmm23, (%rip) + +// CHECK: vmovd %xmm23, -128(,%rbp,2) +// CHECK: encoding: [0x62,0xe1,0x7d,0x08,0x7e,0x3c,0x6d,0x80,0xff,0xff,0xff] + vmovd %xmm23, -128(,%rbp,2) + +// CHECK: vmovd %xmm23, 508(%rcx) +// CHECK: encoding: [0x62,0xe1,0x7d,0x08,0x7e,0x79,0x7f] + vmovd %xmm23, 508(%rcx) + +// CHECK: vmovd %xmm23, -512(%rdx) +// CHECK: encoding: [0x62,0xe1,0x7d,0x08,0x7e,0x7a,0x80] + vmovd %xmm23, -512(%rdx) + +// CHECK: vmovw 268435456(%rbp,%r14,8), %xmm22 +// CHECK: encoding: [0x62,0xa5,0x7d,0x08,0x6e,0xb4,0xf5,0x00,0x00,0x00,0x10] + vmovw 268435456(%rbp,%r14,8), %xmm22 + +// CHECK: vmovw 291(%r8,%rax,4), %xmm22 +// CHECK: encoding: [0x62,0xc5,0x7d,0x08,0x6e,0xb4,0x80,0x23,0x01,0x00,0x00] + vmovw 291(%r8,%rax,4), %xmm22 + +// CHECK: vmovw (%rip), %xmm22 +// CHECK: encoding: [0x62,0xe5,0x7d,0x08,0x6e,0x35,0x00,0x00,0x00,0x00] + vmovw (%rip), %xmm22 + +// CHECK: vmovw -64(,%rbp,2), %xmm22 +// CHECK: encoding: [0x62,0xe5,0x7d,0x08,0x6e,0x34,0x6d,0xc0,0xff,0xff,0xff] + vmovw -64(,%rbp,2), %xmm22 + +// CHECK: vmovw 254(%rcx), %xmm22 +// CHECK: encoding: [0x62,0xe5,0x7d,0x08,0x6e,0x71,0x7f] + vmovw 254(%rcx), %xmm22 + +// CHECK: vmovw -256(%rdx), %xmm22 +// CHECK: encoding: [0x62,0xe5,0x7d,0x08,0x6e,0x72,0x80] + vmovw -256(%rdx), %xmm22 + +// CHECK: vmovw %xmm23, 268435456(%rbp,%r14,8) +// CHECK: encoding: [0x62,0xa5,0x7d,0x08,0x7e,0xbc,0xf5,0x00,0x00,0x00,0x10] + vmovw %xmm23, 268435456(%rbp,%r14,8) + +// CHECK: vmovw %xmm23, 291(%r8,%rax,4) +// CHECK: encoding: [0x62,0xc5,0x7d,0x08,0x7e,0xbc,0x80,0x23,0x01,0x00,0x00] + vmovw %xmm23, 291(%r8,%rax,4) + +// CHECK: vmovw %xmm23, (%rip) +// CHECK: encoding: [0x62,0xe5,0x7d,0x08,0x7e,0x3d,0x00,0x00,0x00,0x00] + vmovw %xmm23, (%rip) + +// CHECK: vmovw %xmm23, -64(,%rbp,2) +// CHECK: encoding: [0x62,0xe5,0x7d,0x08,0x7e,0x3c,0x6d,0xc0,0xff,0xff,0xff] + vmovw %xmm23, -64(,%rbp,2) + +// CHECK: vmovw %xmm23, 254(%rcx) +// CHECK: encoding: [0x62,0xe5,0x7d,0x08,0x7e,0x79,0x7f] + vmovw %xmm23, 254(%rcx) + +// CHECK: vmovw %xmm23, -256(%rdx) +// CHECK: encoding: [0x62,0xe5,0x7d,0x08,0x7e,0x7a,0x80] + vmovw %xmm23, -256(%rdx) diff --git a/llvm/test/MC/X86/avx10.2-copy-64-intel.s b/llvm/test/MC/X86/avx10.2-copy-64-intel.s new file mode 100644 index 00000000000000..4fd7b67dfa5db5 --- /dev/null +++ b/llvm/test/MC/X86/avx10.2-copy-64-intel.s @@ -0,0 +1,97 @@ +// RUN: llvm-mc -triple 
x86_64 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s + +// CHECK: vmovd xmm22, dword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa1,0x7d,0x08,0x6e,0xb4,0xf5,0x00,0x00,0x00,0x10] + vmovd xmm22, dword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vmovd xmm22, dword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc1,0x7d,0x08,0x6e,0xb4,0x80,0x23,0x01,0x00,0x00] + vmovd xmm22, dword ptr [r8 + 4*rax + 291] + +// CHECK: vmovd xmm22, dword ptr [rip] +// CHECK: encoding: [0x62,0xe1,0x7d,0x08,0x6e,0x35,0x00,0x00,0x00,0x00] + vmovd xmm22, dword ptr [rip] + +// CHECK: vmovd xmm22, dword ptr [2*rbp - 128] +// CHECK: encoding: [0x62,0xe1,0x7d,0x08,0x6e,0x34,0x6d,0x80,0xff,0xff,0xff] + vmovd xmm22, dword ptr [2*rbp - 128] + +// CHECK: vmovd xmm22, dword ptr [rcx + 508] +// CHECK: encoding: [0x62,0xe1,0x7d,0x08,0x6e,0x71,0x7f] + vmovd xmm22, dword ptr [rcx + 508] + +// CHECK: vmovd xmm22, dword ptr [rdx - 512] +// CHECK: encoding: [0x62,0xe1,0x7d,0x08,0x6e,0x72,0x80] + vmovd xmm22, dword ptr [rdx - 512] + +// CHECK: vmovd dword ptr [rbp + 8*r14 + 268435456], xmm23 +// CHECK: encoding: [0x62,0xa1,0x7d,0x08,0x7e,0xbc,0xf5,0x00,0x00,0x00,0x10] + vmovd dword ptr [rbp + 8*r14 + 268435456], xmm23 + +// CHECK: vmovd dword ptr [r8 + 4*rax + 291], xmm23 +// CHECK: encoding: [0x62,0xc1,0x7d,0x08,0x7e,0xbc,0x80,0x23,0x01,0x00,0x00] + vmovd dword ptr [r8 + 4*rax + 291], xmm23 + +// CHECK: vmovd dword ptr [rip], xmm23 +// CHECK: encoding: [0x62,0xe1,0x7d,0x08,0x7e,0x3d,0x00,0x00,0x00,0x00] + vmovd dword ptr [rip], xmm23 + +// CHECK: vmovd dword ptr [2*rbp - 128], xmm23 +// CHECK: encoding: [0x62,0xe1,0x7d,0x08,0x7e,0x3c,0x6d,0x80,0xff,0xff,0xff] + vmovd dword ptr [2*rbp - 128], xmm23 + +// CHECK: vmovd dword ptr [rcx + 508], xmm23 +// CHECK: encoding: [0x62,0xe1,0x7d,0x08,0x7e,0x79,0x7f] + vmovd dword ptr [rcx + 508], xmm23 + +// CHECK: vmovd dword ptr [rdx - 512], xmm23 +// CHECK: encoding: [0x62,0xe1,0x7d,0x08,0x7e,0x7a,0x80] + vmovd dword ptr [rdx - 512], xmm23 + +// CHECK: vmovw xmm22, word ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa5,0x7d,0x08,0x6e,0xb4,0xf5,0x00,0x00,0x00,0x10] + vmovw xmm22, word ptr [rbp + 8*r14 + 268435456] + +// CHECK: vmovw xmm22, word ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc5,0x7d,0x08,0x6e,0xb4,0x80,0x23,0x01,0x00,0x00] + vmovw xmm22, word ptr [r8 + 4*rax + 291] + +// CHECK: vmovw xmm22, word ptr [rip] +// CHECK: encoding: [0x62,0xe5,0x7d,0x08,0x6e,0x35,0x00,0x00,0x00,0x00] + vmovw xmm22, word ptr [rip] + +// CHECK: vmovw xmm22, word ptr [2*rbp - 64] +// CHECK: encoding: [0x62,0xe5,0x7d,0x08,0x6e,0x34,0x6d,0xc0,0xff,0xff,0xff] + vmovw xmm22, word ptr [2*rbp - 64] + +// CHECK: vmovw xmm22, word ptr [rcx + 254] +// CHECK: encoding: [0x62,0xe5,0x7d,0x08,0x6e,0x71,0x7f] + vmovw xmm22, word ptr [rcx + 254] + +// CHECK: vmovw xmm22, word ptr [rdx - 256] +// CHECK: encoding: [0x62,0xe5,0x7d,0x08,0x6e,0x72,0x80] + vmovw xmm22, word ptr [rdx - 256] + +// CHECK: vmovw word ptr [rbp + 8*r14 + 268435456], xmm23 +// CHECK: encoding: [0x62,0xa5,0x7d,0x08,0x7e,0xbc,0xf5,0x00,0x00,0x00,0x10] + vmovw word ptr [rbp + 8*r14 + 268435456], xmm23 + +// CHECK: vmovw word ptr [r8 + 4*rax + 291], xmm23 +// CHECK: encoding: [0x62,0xc5,0x7d,0x08,0x7e,0xbc,0x80,0x23,0x01,0x00,0x00] + vmovw word ptr [r8 + 4*rax + 291], xmm23 + +// CHECK: vmovw word ptr [rip], xmm23 +// CHECK: encoding: [0x62,0xe5,0x7d,0x08,0x7e,0x3d,0x00,0x00,0x00,0x00] + vmovw word ptr [rip], xmm23 + +// CHECK: vmovw word ptr [2*rbp - 64], xmm23 +// CHECK: encoding: 
[0x62,0xe5,0x7d,0x08,0x7e,0x3c,0x6d,0xc0,0xff,0xff,0xff] + vmovw word ptr [2*rbp - 64], xmm23 + +// CHECK: vmovw word ptr [rcx + 254], xmm23 +// CHECK: encoding: [0x62,0xe5,0x7d,0x08,0x7e,0x79,0x7f] + vmovw word ptr [rcx + 254], xmm23 + +// CHECK: vmovw word ptr [rdx - 256], xmm23 +// CHECK: encoding: [0x62,0xe5,0x7d,0x08,0x7e,0x7a,0x80] + vmovw word ptr [rdx - 256], xmm23 diff --git a/llvm/test/TableGen/x86-fold-tables.inc b/llvm/test/TableGen/x86-fold-tables.inc index 94347839d281f9..85d9b02ac0cbf1 100644 --- a/llvm/test/TableGen/x86-fold-tables.inc +++ b/llvm/test/TableGen/x86-fold-tables.inc @@ -1617,8 +1617,10 @@ static const X86FoldTableEntry Table1[] = { {X86::VMOVUPSZrr, X86::VMOVUPSZrm, 0}, {X86::VMOVUPSrr, X86::VMOVUPSrm, 0}, {X86::VMOVW2SHrr, X86::VMOVWrm, TB_NO_REVERSE}, + {X86::VMOVZPDILo2PDIZrr, X86::VMOVZPDILo2PDIZrm, TB_NO_REVERSE}, {X86::VMOVZPQILo2PQIZrr, X86::VMOVQI2PQIZrm, TB_NO_REVERSE}, {X86::VMOVZPQILo2PQIrr, X86::VMOVQI2PQIrm, TB_NO_REVERSE}, + {X86::VMOVZPWILo2PWIZrr, X86::VMOVZPWILo2PWIZrm, TB_NO_REVERSE}, {X86::VPABSBYrr, X86::VPABSBYrm, 0}, {X86::VPABSBZ128rr, X86::VPABSBZ128rm, 0}, {X86::VPABSBZ256rr, X86::VPABSBZ256rm, 0}, diff --git a/llvm/utils/TableGen/X86ManualInstrMapping.def b/llvm/utils/TableGen/X86ManualInstrMapping.def index d76c404722b0ac..bc539d792f38df 100644 --- a/llvm/utils/TableGen/X86ManualInstrMapping.def +++ b/llvm/utils/TableGen/X86ManualInstrMapping.def @@ -32,6 +32,7 @@ NOCOMP(VPSRAQZ128ri) NOCOMP(VPSRAQZ128rm) NOCOMP(VPSRAQZ128rr) NOCOMP(VSCALEFPSZ128rm) +NOCOMP(VMOVZPDILo2PDIZrr) NOCOMP(VDBPSADBWZ256rmi) NOCOMP(VDBPSADBWZ256rri) NOCOMP(VPMAXSQZ256rm) From a4586bd2d4fa7d6c0100893496a9383fd581e2e9 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Wed, 18 Sep 2024 15:01:12 +0200 Subject: [PATCH 054/321] [Loads] Extract some checks into a lambda (NFC) This makes it easier to add additional checks. --- llvm/lib/Analysis/Loads.cpp | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp index a88469ab81a8c8..957ac883490c45 100644 --- a/llvm/lib/Analysis/Loads.cpp +++ b/llvm/lib/Analysis/Loads.cpp @@ -93,20 +93,26 @@ static bool isDereferenceableAndAlignedPointer( Visited, MaxDepth); } - bool CheckForNonNull, CheckForFreed; - APInt KnownDerefBytes(Size.getBitWidth(), - V->getPointerDereferenceableBytes(DL, CheckForNonNull, - CheckForFreed)); - if (KnownDerefBytes.getBoolValue() && KnownDerefBytes.uge(Size) && - !CheckForFreed) - if (!CheckForNonNull || - isKnownNonZero(V, SimplifyQuery(DL, DT, AC, CtxI))) { - // As we recursed through GEPs to get here, we've incrementally checked - // that each step advanced by a multiple of the alignment. If our base is - // properly aligned, then the original offset accessed must also be. - APInt Offset(DL.getTypeStoreSizeInBits(V->getType()), 0); - return isAligned(V, Offset, Alignment, DL); - } + auto IsKnownDeref = [&]() { + bool CheckForNonNull, CheckForFreed; + APInt KnownDerefBytes(Size.getBitWidth(), + V->getPointerDereferenceableBytes(DL, CheckForNonNull, + CheckForFreed)); + if (!KnownDerefBytes.getBoolValue() || !KnownDerefBytes.uge(Size) || + CheckForFreed) + return false; + if (CheckForNonNull && + !isKnownNonZero(V, SimplifyQuery(DL, DT, AC, CtxI))) + return false; + return true; + }; + if (IsKnownDeref()) { + // As we recursed through GEPs to get here, we've incrementally checked + // that each step advanced by a multiple of the alignment. 
If our base is
+  // properly aligned, then the original offset accessed must also be.
+  APInt Offset(DL.getTypeStoreSizeInBits(V->getType()), 0);
+  return isAligned(V, Offset, Alignment, DL);
+  }
 
   /// TODO refactor this function to be able to search independently for
   /// Dereferencability and Alignment requirements.

From 5ac97d397c2088c3ac0a113506e57ab9b1e69ac8 Mon Sep 17 00:00:00 2001
From: Nico Weber
Date: Wed, 18 Sep 2024 09:16:17 -0400
Subject: [PATCH 055/321] [gn] port 40c45b6b4318 more (rm clang-rename)

40c45b6b4318 already removed most traces of clang-rename from the GN build
(thanks!), but it didn't delete the build file for unit tests.
---
 .../gn/secondary/clang/unittests/BUILD.gn     |  1 -
 .../secondary/clang/unittests/Rename/BUILD.gn | 28 -------------------
 2 files changed, 29 deletions(-)
 delete mode 100644 llvm/utils/gn/secondary/clang/unittests/Rename/BUILD.gn

diff --git a/llvm/utils/gn/secondary/clang/unittests/BUILD.gn b/llvm/utils/gn/secondary/clang/unittests/BUILD.gn
index a6a4a5708341f0..4aa844ac5a3c2f 100644
--- a/llvm/utils/gn/secondary/clang/unittests/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/unittests/BUILD.gn
@@ -16,7 +16,6 @@ group("unittests") {
     "InstallAPI:InstallAPITests",
     "Interpreter:ClangReplInterpreterTests",
     "Lex:LexTests",
-    "Rename:ClangRenameTests",
     "Rewrite:RewriteTests",
     "Sema:SemaTests",
     "Serialization:SerializationTests",
diff --git a/llvm/utils/gn/secondary/clang/unittests/Rename/BUILD.gn b/llvm/utils/gn/secondary/clang/unittests/Rename/BUILD.gn
deleted file mode 100644
index 10c922424186ff..00000000000000
--- a/llvm/utils/gn/secondary/clang/unittests/Rename/BUILD.gn
+++ /dev/null
@@ -1,28 +0,0 @@
-import("//third-party/unittest/unittest.gni")
-
-unittest("ClangRenameTests") {
-  configs += [ "//llvm/utils/gn/build:clang_code" ]
-
-  # We'd like clang/unittests/Tooling/RewriterTestContext.h in the test.
-  include_dirs = [ "../.." ]
-
-  deps = [
-    "//clang/lib/AST",
-    "//clang/lib/ASTMatchers",
-    "//clang/lib/Basic",
-    "//clang/lib/Format",
-    "//clang/lib/Frontend",
-    "//clang/lib/Rewrite",
-    "//clang/lib/Tooling",
-    "//clang/lib/Tooling/Core",
-    "//clang/lib/Tooling/Refactoring",
-    "//llvm/lib/Support",
-  ]
-  sources = [
-    "RenameAliasTest.cpp",
-    "RenameClassTest.cpp",
-    "RenameEnumTest.cpp",
-    "RenameFunctionTest.cpp",
-    "RenameMemberTest.cpp",
-  ]
-}

From 76347ee9584bfcdaceb4ee48d39441c29aeb2124 Mon Sep 17 00:00:00 2001
From: Phoebe Wang
Date: Wed, 18 Sep 2024 21:23:10 +0800
Subject: [PATCH 056/321] [X86][BF16] Add libcall for F80 -> BF16 (#109116)

This fixes #108936, but the calling convention doesn't match GCC's. I doubt
any runtime library provides such a function for now, so leave the calling
convention as is.
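For reference, a minimal reproducer sketch (assumes a compiler with `__bf16`
support and a target where `long double` is the 80-bit x87 format, e.g.
x86-64 Linux; it mirrors the PR108936 test added below):

```c
__bf16 trunc_f80(long double x) {
  /* Previously failed to lower; now emitted as a __truncxfbf2 libcall. */
  return (__bf16)x;
}
```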
--- compiler-rt/lib/builtins/CMakeLists.txt | 1 + compiler-rt/lib/builtins/truncxfbf2.c | 13 +++++++++++++ llvm/include/llvm/IR/RuntimeLibcalls.def | 1 + llvm/lib/CodeGen/TargetLoweringBase.cpp | 2 ++ llvm/test/CodeGen/X86/bfloat.ll | 22 ++++++++++++++++++++++ 5 files changed, 39 insertions(+) create mode 100644 compiler-rt/lib/builtins/truncxfbf2.c diff --git a/compiler-rt/lib/builtins/CMakeLists.txt b/compiler-rt/lib/builtins/CMakeLists.txt index 2c3b0fa84a4782..9a0a50ee7003f1 100644 --- a/compiler-rt/lib/builtins/CMakeLists.txt +++ b/compiler-rt/lib/builtins/CMakeLists.txt @@ -192,6 +192,7 @@ set(GENERIC_SOURCES set(BF16_SOURCES extendbfsf2.c truncdfbf2.c + truncxfbf2.c truncsfbf2.c ) diff --git a/compiler-rt/lib/builtins/truncxfbf2.c b/compiler-rt/lib/builtins/truncxfbf2.c new file mode 100644 index 00000000000000..cc6f70b25149d6 --- /dev/null +++ b/compiler-rt/lib/builtins/truncxfbf2.c @@ -0,0 +1,13 @@ +//===-- lib/truncxfbf2.c - long double -> bfloat conversion -------*- C -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#define SRC_80 +#define DST_BFLOAT +#include "fp_trunc_impl.inc" + +COMPILER_RT_ABI dst_t __truncxfbf2(long double a) { return __truncXfYf2__(a); } diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.def b/llvm/include/llvm/IR/RuntimeLibcalls.def index c3d5ef9f4e4f82..69cf43140ad4bd 100644 --- a/llvm/include/llvm/IR/RuntimeLibcalls.def +++ b/llvm/include/llvm/IR/RuntimeLibcalls.def @@ -367,6 +367,7 @@ HANDLE_LIBCALL(FPROUND_F128_F16, "__trunctfhf2") HANDLE_LIBCALL(FPROUND_PPCF128_F16, "__trunctfhf2") HANDLE_LIBCALL(FPROUND_F32_BF16, "__truncsfbf2") HANDLE_LIBCALL(FPROUND_F64_BF16, "__truncdfbf2") +HANDLE_LIBCALL(FPROUND_F80_BF16, "__truncxfbf2") HANDLE_LIBCALL(FPROUND_F64_F32, "__truncdfsf2") HANDLE_LIBCALL(FPROUND_F80_F32, "__truncxfsf2") HANDLE_LIBCALL(FPROUND_F128_F32, "__trunctfsf2") diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index eb3190c7cd247a..9fdde454559171 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -169,6 +169,8 @@ RTLIB::Libcall RTLIB::getFPROUND(EVT OpVT, EVT RetVT) { return FPROUND_F32_BF16; if (OpVT == MVT::f64) return FPROUND_F64_BF16; + if (OpVT == MVT::f80) + return FPROUND_F80_BF16; } else if (RetVT == MVT::f32) { if (OpVT == MVT::f64) return FPROUND_F64_F32; diff --git a/llvm/test/CodeGen/X86/bfloat.ll b/llvm/test/CodeGen/X86/bfloat.ll index 3759909a2ccc8e..3144fd56d9ccf3 100644 --- a/llvm/test/CodeGen/X86/bfloat.ll +++ b/llvm/test/CodeGen/X86/bfloat.ll @@ -1953,3 +1953,25 @@ define void @PR92471(ptr %0, ptr %1) nounwind { store <7 x float> %4, ptr %1, align 4 ret void } + +define bfloat @PR108936(x86_fp80 %0) nounwind { +; X86-LABEL: PR108936: +; X86: # %bb.0: +; X86-NEXT: subl $12, %esp +; X86-NEXT: fldt {{[0-9]+}}(%esp) +; X86-NEXT: fstpt (%esp) +; X86-NEXT: calll __truncxfbf2 +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl +; +; CHECK-LABEL: PR108936: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $24, %rsp +; CHECK-NEXT: fldt {{[0-9]+}}(%rsp) +; CHECK-NEXT: fstpt (%rsp) +; CHECK-NEXT: callq __truncxfbf2@PLT +; CHECK-NEXT: addq $24, %rsp +; CHECK-NEXT: retq + %2 = fptrunc x86_fp80 %0 to bfloat + ret bfloat %2 +} From e8e42999a559457292190b9faf7b2a83ec8d1ac5 Mon Sep 17 00:00:00 2001 From: Nico 
Weber Date: Wed, 18 Sep 2024 09:27:49 -0400 Subject: [PATCH 057/321] [gn] port 76eda76f9f36 --- llvm/utils/gn/secondary/compiler-rt/lib/builtins/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/compiler-rt/lib/builtins/BUILD.gn b/llvm/utils/gn/secondary/compiler-rt/lib/builtins/BUILD.gn index 1e2c0cdc863061..c2075f3a3ab4e7 100644 --- a/llvm/utils/gn/secondary/compiler-rt/lib/builtins/BUILD.gn +++ b/llvm/utils/gn/secondary/compiler-rt/lib/builtins/BUILD.gn @@ -278,6 +278,7 @@ static_library("builtins") { "i386/fp_mode.c", "extendbfsf2.c", "truncdfbf2.c", + "truncxfbf2.c", "truncsfbf2.c", ] if (long_double_is_80_bits) { From 07d7fdd7a58ee34c3e518e2474f2c28d461796d6 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Wed, 18 Sep 2024 09:28:23 -0400 Subject: [PATCH 058/321] [gn build] Port 311e4e324581 --- llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn index 87a2e771dda698..cba7867854dff4 100644 --- a/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn @@ -142,6 +142,7 @@ copy("Headers") { "avx10_2_512satcvtintrin.h", "avx10_2bf16intrin.h", "avx10_2convertintrin.h", + "avx10_2copyintrin.h", "avx10_2minmaxintrin.h", "avx10_2niintrin.h", "avx10_2satcvtdsintrin.h", From aa43f3abe0e9a7199a8df3f71364d7084f968825 Mon Sep 17 00:00:00 2001 From: Chris Apple Date: Wed, 18 Sep 2024 07:34:18 -0600 Subject: [PATCH 059/321] [compiler-rt][rtsan] Use Die instead of exit, define cf.exitcode (#107635) --- compiler-rt/lib/rtsan/rtsan_context.cpp | 2 +- compiler-rt/lib/rtsan/rtsan_flags.cpp | 1 + compiler-rt/lib/rtsan/tests/rtsan_test_main.cpp | 17 +++++++++++++++++ .../lib/rtsan/tests/rtsan_test_utilities.h | 4 ++-- compiler-rt/test/rtsan/lit.cfg.py | 16 ++++++++++++++++ 5 files changed, 37 insertions(+), 3 deletions(-) diff --git a/compiler-rt/lib/rtsan/rtsan_context.cpp b/compiler-rt/lib/rtsan/rtsan_context.cpp index 8609394fa222fc..e69fb259798d94 100644 --- a/compiler-rt/lib/rtsan/rtsan_context.cpp +++ b/compiler-rt/lib/rtsan/rtsan_context.cpp @@ -62,7 +62,7 @@ static __rtsan::Context &GetContextForThisThreadImpl() { Until then, and to keep the first PRs small, only the exit mode is available. */ -static void InvokeViolationDetectedAction() { exit(EXIT_FAILURE); } +static void InvokeViolationDetectedAction() { Die(); } __rtsan::Context::Context() = default; diff --git a/compiler-rt/lib/rtsan/rtsan_flags.cpp b/compiler-rt/lib/rtsan/rtsan_flags.cpp index beab2a2fc5d895..9c90d23d742630 100644 --- a/compiler-rt/lib/rtsan/rtsan_flags.cpp +++ b/compiler-rt/lib/rtsan/rtsan_flags.cpp @@ -35,6 +35,7 @@ void __rtsan::InitializeFlags() { { CommonFlags cf; cf.CopyFrom(*common_flags()); + cf.exitcode = 43; cf.external_symbolizer_path = GetEnv("RTSAN_SYMBOLIZER_PATH"); OverrideCommonFlags(cf); } diff --git a/compiler-rt/lib/rtsan/tests/rtsan_test_main.cpp b/compiler-rt/lib/rtsan/tests/rtsan_test_main.cpp index 255ac9497103e9..50c726e09f287f 100644 --- a/compiler-rt/lib/rtsan/tests/rtsan_test_main.cpp +++ b/compiler-rt/lib/rtsan/tests/rtsan_test_main.cpp @@ -8,8 +8,25 @@ // //===----------------------------------------------------------------------===// +#include "sanitizer_common/sanitizer_platform.h" #include "sanitizer_test_utils.h" +// Default RTSAN_OPTIONS for the unit tests. 
+extern "C" const char *__rtsan_default_options() { +#if SANITIZER_APPLE + // On Darwin, we default to `abort_on_error=1`, which would make tests run + // much slower. Let's override this and run lit tests with 'abort_on_error=0' + // and make sure we do not overwhelm the syslog while testing. Also, let's + // turn symbolization off to speed up testing, especially when not running + // with llvm-symbolizer but with atos. + return "symbolize=false:abort_on_error=0:log_to_syslog=0"; +#else + // Let's turn symbolization off to speed up testing (more than 3 times speedup + // observed). + return "symbolize=false"; +#endif +} + int main(int argc, char **argv) { testing::GTEST_FLAG(death_test_style) = "threadsafe"; testing::InitGoogleTest(&argc, argv); diff --git a/compiler-rt/lib/rtsan/tests/rtsan_test_utilities.h b/compiler-rt/lib/rtsan/tests/rtsan_test_utilities.h index 4ba4fc5e530864..f0cf90e057e36e 100644 --- a/compiler-rt/lib/rtsan/tests/rtsan_test_utilities.h +++ b/compiler-rt/lib/rtsan/tests/rtsan_test_utilities.h @@ -37,8 +37,8 @@ void ExpectRealtimeDeath(Function &&Func, : ""; }; - EXPECT_EXIT(RealtimeInvoke(std::forward(Func)), - ExitedWithCode(EXIT_FAILURE), GetExpectedErrorSubstring()); + EXPECT_EXIT(RealtimeInvoke(std::forward(Func)), ExitedWithCode(43), + GetExpectedErrorSubstring()); } template void ExpectNonRealtimeSurvival(Function &&Func) { diff --git a/compiler-rt/test/rtsan/lit.cfg.py b/compiler-rt/test/rtsan/lit.cfg.py index b262ecfa7fb4bb..7c75515a7608d0 100644 --- a/compiler-rt/test/rtsan/lit.cfg.py +++ b/compiler-rt/test/rtsan/lit.cfg.py @@ -3,6 +3,22 @@ # Setup config name. config.name = "RTSAN" + config.name_suffix + +default_rtsan_opts = "atexit_sleep_ms=0" + +if config.host_os == "Darwin": + # On Darwin, we default to `abort_on_error=1`, which would make tests run + # much slower. Let's override this and run lit tests with 'abort_on_error=0'. + default_rtsan_opts += ":abort_on_error=0" + +if default_rtsan_opts: + config.environment["RTSAN_OPTIONS"] = default_rtsan_opts + default_rtsan_opts += ":" + +config.substitutions.append( + ("%env_rtsan_opts=", "env RTSAN_OPTIONS=" + default_rtsan_opts) +) + # Setup source root. 
config.test_source_root = os.path.dirname(__file__) From a10c9f994be143e2ac63918aa495bc2aeb3ffb48 Mon Sep 17 00:00:00 2001 From: Phoebe Wang Date: Wed, 18 Sep 2024 21:35:38 +0800 Subject: [PATCH 060/321] Revert "[X86][BF16] Add libcall for F80 -> BF16" (#109140) Reverts llvm/llvm-project#109116 --- compiler-rt/lib/builtins/CMakeLists.txt | 1 - compiler-rt/lib/builtins/truncxfbf2.c | 13 ------------- llvm/include/llvm/IR/RuntimeLibcalls.def | 1 - llvm/lib/CodeGen/TargetLoweringBase.cpp | 2 -- llvm/test/CodeGen/X86/bfloat.ll | 22 ---------------------- 5 files changed, 39 deletions(-) delete mode 100644 compiler-rt/lib/builtins/truncxfbf2.c diff --git a/compiler-rt/lib/builtins/CMakeLists.txt b/compiler-rt/lib/builtins/CMakeLists.txt index 9a0a50ee7003f1..2c3b0fa84a4782 100644 --- a/compiler-rt/lib/builtins/CMakeLists.txt +++ b/compiler-rt/lib/builtins/CMakeLists.txt @@ -192,7 +192,6 @@ set(GENERIC_SOURCES set(BF16_SOURCES extendbfsf2.c truncdfbf2.c - truncxfbf2.c truncsfbf2.c ) diff --git a/compiler-rt/lib/builtins/truncxfbf2.c b/compiler-rt/lib/builtins/truncxfbf2.c deleted file mode 100644 index cc6f70b25149d6..00000000000000 --- a/compiler-rt/lib/builtins/truncxfbf2.c +++ /dev/null @@ -1,13 +0,0 @@ -//===-- lib/truncxfbf2.c - long double -> bfloat conversion -------*- C -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#define SRC_80 -#define DST_BFLOAT -#include "fp_trunc_impl.inc" - -COMPILER_RT_ABI dst_t __truncxfbf2(long double a) { return __truncXfYf2__(a); } diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.def b/llvm/include/llvm/IR/RuntimeLibcalls.def index 69cf43140ad4bd..c3d5ef9f4e4f82 100644 --- a/llvm/include/llvm/IR/RuntimeLibcalls.def +++ b/llvm/include/llvm/IR/RuntimeLibcalls.def @@ -367,7 +367,6 @@ HANDLE_LIBCALL(FPROUND_F128_F16, "__trunctfhf2") HANDLE_LIBCALL(FPROUND_PPCF128_F16, "__trunctfhf2") HANDLE_LIBCALL(FPROUND_F32_BF16, "__truncsfbf2") HANDLE_LIBCALL(FPROUND_F64_BF16, "__truncdfbf2") -HANDLE_LIBCALL(FPROUND_F80_BF16, "__truncxfbf2") HANDLE_LIBCALL(FPROUND_F64_F32, "__truncdfsf2") HANDLE_LIBCALL(FPROUND_F80_F32, "__truncxfsf2") HANDLE_LIBCALL(FPROUND_F128_F32, "__trunctfsf2") diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index 9fdde454559171..eb3190c7cd247a 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -169,8 +169,6 @@ RTLIB::Libcall RTLIB::getFPROUND(EVT OpVT, EVT RetVT) { return FPROUND_F32_BF16; if (OpVT == MVT::f64) return FPROUND_F64_BF16; - if (OpVT == MVT::f80) - return FPROUND_F80_BF16; } else if (RetVT == MVT::f32) { if (OpVT == MVT::f64) return FPROUND_F64_F32; diff --git a/llvm/test/CodeGen/X86/bfloat.ll b/llvm/test/CodeGen/X86/bfloat.ll index 3144fd56d9ccf3..3759909a2ccc8e 100644 --- a/llvm/test/CodeGen/X86/bfloat.ll +++ b/llvm/test/CodeGen/X86/bfloat.ll @@ -1953,25 +1953,3 @@ define void @PR92471(ptr %0, ptr %1) nounwind { store <7 x float> %4, ptr %1, align 4 ret void } - -define bfloat @PR108936(x86_fp80 %0) nounwind { -; X86-LABEL: PR108936: -; X86: # %bb.0: -; X86-NEXT: subl $12, %esp -; X86-NEXT: fldt {{[0-9]+}}(%esp) -; X86-NEXT: fstpt (%esp) -; X86-NEXT: calll __truncxfbf2 -; X86-NEXT: addl $12, %esp -; X86-NEXT: retl -; -; CHECK-LABEL: PR108936: -; CHECK: # %bb.0: -; CHECK-NEXT: subq $24, 
%rsp -; CHECK-NEXT: fldt {{[0-9]+}}(%rsp) -; CHECK-NEXT: fstpt (%rsp) -; CHECK-NEXT: callq __truncxfbf2@PLT -; CHECK-NEXT: addq $24, %rsp -; CHECK-NEXT: retq - %2 = fptrunc x86_fp80 %0 to bfloat - ret bfloat %2 -} From ce74d5ff878b901d6582e407fddd808ad1236d20 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Wed, 18 Sep 2024 09:40:04 -0400 Subject: [PATCH 061/321] [gn build] Port a10c9f994be1 --- llvm/utils/gn/secondary/compiler-rt/lib/builtins/BUILD.gn | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llvm/utils/gn/secondary/compiler-rt/lib/builtins/BUILD.gn b/llvm/utils/gn/secondary/compiler-rt/lib/builtins/BUILD.gn index c2075f3a3ab4e7..2fd3b9a7dd3db5 100644 --- a/llvm/utils/gn/secondary/compiler-rt/lib/builtins/BUILD.gn +++ b/llvm/utils/gn/secondary/compiler-rt/lib/builtins/BUILD.gn @@ -275,10 +275,9 @@ static_library("builtins") { sources -= [ "fp_mode.c" ] sources += [ "cpu_model/x86.c", - "i386/fp_mode.c", "extendbfsf2.c", + "i386/fp_mode.c", "truncdfbf2.c", - "truncxfbf2.c", "truncsfbf2.c", ] if (long_double_is_80_bits) { From 0d736e296c2feed7709e002d5972ed60844b6b56 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Wed, 18 Sep 2024 14:41:56 +0100 Subject: [PATCH 062/321] [VPlan] Add getSCEVExprForVPValue util, use to get trip count SCEV (NFC) (#94464) Add a new getSCEVExprForVPValue utility which can be used to get a SCEV expression for a VPValue. The initial implementation only returns SCEVs for live-in IR values (by constructing a SCEV based on the live-in IR value) and VPExpandSCEVRecipe. This is enough to serve its first use, getting a SCEV for a VPlan's trip count, but will be extended in the future. It also removes createTripCountSCEV, as the new helper can be used to retrieve the SCEV from the VPlan. PR: https://github.com/llvm/llvm-project/pull/94464 --- .../Transforms/Vectorize/LoopVectorize.cpp | 26 +++++++------------ llvm/lib/Transforms/Vectorize/VPlan.cpp | 11 +++++++- llvm/lib/Transforms/Vectorize/VPlan.h | 9 +++---- .../Transforms/Vectorize/VPlanAnalysis.cpp | 1 + .../Transforms/Vectorize/VPlanTransforms.cpp | 7 ++--- llvm/lib/Transforms/Vectorize/VPlanUtils.cpp | 12 +++++++++ llvm/lib/Transforms/Vectorize/VPlanUtils.h | 9 +++++++ .../Transforms/Vectorize/VPlanHCFGTest.cpp | 7 +++-- .../Transforms/Vectorize/VPlanTestBase.h | 18 +++++++------ 9 files changed, 62 insertions(+), 38 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 0fa7c2add1faa2..9fb684427cfe9d 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -905,15 +905,6 @@ Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) { return B.CreateElementCount(Ty, VF); } -const SCEV *createTripCountSCEV(Type *IdxTy, PredicatedScalarEvolution &PSE, - Loop *OrigLoop) { - const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); - assert(!isa(BackedgeTakenCount) && "Invalid loop count"); - - ScalarEvolution &SE = *PSE.getSE(); - return SE.getTripCountFromExitCount(BackedgeTakenCount, IdxTy, OrigLoop); -} - void reportVectorizationFailure(const StringRef DebugMsg, const StringRef OREMsg, const StringRef ORETag, OptimizationRemarkEmitter *ORE, Loop *TheLoop, @@ -4750,7 +4741,10 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor( if (!MainLoopVF.isScalable() && !NextVF.Width.isScalable()) { // TODO: extend to support scalable VFs. 
if (!RemainingIterations) { - const SCEV *TC = createTripCountSCEV(TCType, PSE, OrigLoop); + const SCEV *TC = vputils::getSCEVExprForVPValue( + getPlanFor(NextVF.Width).getTripCount(), SE); + assert(!isa(TC) && + "Trip count SCEV must be computable"); RemainingIterations = SE.getURemExpr( TC, SE.getConstant(TCType, MainLoopVF.getKnownMinValue() * IC)); } @@ -8863,10 +8857,9 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { return !CM.requiresScalarEpilogue(VF.isVector()); }, Range); - VPlanPtr Plan = VPlan::createInitialVPlan( - createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop), - *PSE.getSE(), RequiresScalarEpilogueCheck, CM.foldTailByMasking(), - OrigLoop); + VPlanPtr Plan = VPlan::createInitialVPlan(Legal->getWidestInductionType(), + PSE, RequiresScalarEpilogueCheck, + CM.foldTailByMasking(), OrigLoop); // Don't use getDecisionAndClampRange here, because we don't know the UF // so this function is better to be conservative, rather than to split @@ -9081,9 +9074,8 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); // Create new empty VPlan - auto Plan = VPlan::createInitialVPlan( - createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop), - *PSE.getSE(), true, false, OrigLoop); + auto Plan = VPlan::createInitialVPlan(Legal->getWidestInductionType(), PSE, + true, false, OrigLoop); // Build hierarchical CFG VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 41e281f3fa9973..2169d78542cbaf 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -869,14 +869,23 @@ static VPIRBasicBlock *createVPIRBasicBlockFor(BasicBlock *BB) { return VPIRBB; } -VPlanPtr VPlan::createInitialVPlan(const SCEV *TripCount, ScalarEvolution &SE, +VPlanPtr VPlan::createInitialVPlan(Type *InductionTy, + PredicatedScalarEvolution &PSE, bool RequiresScalarEpilogueCheck, bool TailFolded, Loop *TheLoop) { VPIRBasicBlock *Entry = createVPIRBasicBlockFor(TheLoop->getLoopPreheader()); VPBasicBlock *VecPreheader = new VPBasicBlock("vector.ph"); auto Plan = std::make_unique(Entry, VecPreheader); + + // Create SCEV and VPValue for the trip count. + const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); + assert(!isa(BackedgeTakenCount) && "Invalid loop count"); + ScalarEvolution &SE = *PSE.getSE(); + const SCEV *TripCount = + SE.getTripCountFromExitCount(BackedgeTakenCount, InductionTy, TheLoop); Plan->TripCount = vputils::getOrCreateVPValueForSCEVExpr(*Plan, TripCount, SE); + // Create VPRegionBlock, with empty header and latch blocks, to be filled // during processing later. VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body"); diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index eac4fe8ce8b0f2..9b9e710ddc88cb 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -83,9 +83,6 @@ Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF); Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF, int64_t Step); -const SCEV *createTripCountSCEV(Type *IdxTy, PredicatedScalarEvolution &PSE, - Loop *CurLoop = nullptr); - /// A helper function that returns the reciprocal of the block probability of /// predicated blocks. 
If we return X, we are assuming the predicated block /// will execute once for every X iterations of the loop header. @@ -3477,8 +3474,10 @@ class VPlan { /// middle VPBasicBlock. If a check is needed to guard executing the scalar /// epilogue loop, it will be added to the middle block, together with /// VPBasicBlocks for the scalar preheader and exit blocks. - static VPlanPtr createInitialVPlan(const SCEV *TripCount, - ScalarEvolution &PSE, + /// \p InductionTy is the type of the canonical induction and used for related + /// values, like the trip count expression. + static VPlanPtr createInitialVPlan(Type *InductionTy, + PredicatedScalarEvolution &PSE, bool RequiresScalarEpilogueCheck, bool TailFolded, Loop *TheLoop); diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index f091ee5a71b297..277df0637372d8 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -11,6 +11,7 @@ #include "VPlanCFG.h" #include "VPlanDominatorTree.h" #include "llvm/ADT/TypeSwitch.h" +#include "llvm/Analysis/ScalarEvolution.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/PatternMatch.h" #include "llvm/Support/GenericDomTreeConstruction.h" diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 1d84550010017f..edcd7d26e60daa 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -685,10 +685,11 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, m_BranchOnCond(m_Not(m_ActiveLaneMask(m_VPValue(), m_VPValue()))))) return; - Type *IdxTy = - Plan.getCanonicalIV()->getStartValue()->getLiveInIRValue()->getType(); - const SCEV *TripCount = createTripCountSCEV(IdxTy, PSE); ScalarEvolution &SE = *PSE.getSE(); + const SCEV *TripCount = + vputils::getSCEVExprForVPValue(Plan.getTripCount(), SE); + assert(!isa(TripCount) && + "Trip count SCEV must be computable"); ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF); const SCEV *C = SE.getElementCount(TripCount->getType(), NumElements); if (TripCount->isZero() || diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp index c18bea4f4c5926..414f8866d24f0f 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp @@ -8,6 +8,7 @@ #include "VPlanUtils.h" #include "VPlanPatternMatch.h" +#include "llvm/ADT/TypeSwitch.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" using namespace llvm; @@ -60,3 +61,14 @@ bool vputils::isHeaderMask(const VPValue *V, VPlan &Plan) { return match(V, m_Binary(m_VPValue(A), m_VPValue(B))) && IsWideCanonicalIV(A) && B == Plan.getOrCreateBackedgeTakenCount(); } + +const SCEV *vputils::getSCEVExprForVPValue(VPValue *V, ScalarEvolution &SE) { + if (V->isLiveIn()) + return SE.getSCEV(V->getLiveInIRValue()); + + // TODO: Support constructing SCEVs for more recipes as needed. 
+ return TypeSwitch(V->getDefiningRecipe()) + .Case( + [](const VPExpandSCEVRecipe *R) { return R->getSCEV(); }) + .Default([&SE](const VPRecipeBase *) { return SE.getCouldNotCompute(); }); +} diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h index fc11208a433961..7b5d4300655f5a 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h @@ -11,6 +11,11 @@ #include "VPlan.h" +namespace llvm { +class ScalarEvolution; +class SCEV; +} // namespace llvm + namespace llvm::vputils { /// Returns true if only the first lane of \p Def is used. bool onlyFirstLaneUsed(const VPValue *Def); @@ -26,6 +31,10 @@ bool onlyFirstPartUsed(const VPValue *Def); VPValue *getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr, ScalarEvolution &SE); +/// Return the SCEV expression for \p V. Returns SCEVCouldNotCompute if no +/// SCEV expression could be constructed. +const SCEV *getSCEVExprForVPValue(VPValue *V, ScalarEvolution &SE); + /// Returns true if \p VPV is uniform after vectorization. inline bool isUniformAfterVectorization(const VPValue *VPV) { // A value defined outside the vector region must be uniform after diff --git a/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp index 6dd6d860273ce7..4926afbfc6d8ce 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp @@ -101,13 +101,12 @@ TEST_F(VPlanHCFGTest, testBuildHCFGInnerLoop) { raw_string_ostream OS(FullDump); Plan->printDOT(OS); const char *ExpectedStr = R"(digraph VPlan { -graph [labelloc=t, fontsize=30; label="Vectorization Plan\n for UF\>=1\nLive-in vp\<%0\> = vector-trip-count\nvp\<%1\> = original trip-count\n"] +graph [labelloc=t, fontsize=30; label="Vectorization Plan\n for UF\>=1\nLive-in vp\<%0\> = vector-trip-count\nLive-in ir\<%N\> = original trip-count\n"] node [shape=rect, fontname=Courier, fontsize=30] edge [fontname=Courier, fontsize=30] compound=true N0 [label = "ir-bb\:\l" + - " EMIT vp\<%1\> = EXPAND SCEV (-1 + %N)\l" + "No successors\l" ] N1 [label = @@ -134,8 +133,8 @@ compound=true N2 -> N4 [ label="" ltail=cluster_N3] N4 [label = "middle.block:\l" + - " EMIT vp\<%2\> = icmp eq vp\<%1\>, vp\<%0\>\l" + - " EMIT branch-on-cond vp\<%2\>\l" + + " EMIT vp\<%1\> = icmp eq ir\<%N\>, vp\<%0\>\l" + + " EMIT branch-on-cond vp\<%1\>\l" + "Successor(s): ir-bb\, scalar.ph\l" ] N4 -> N5 [ label="T"] diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTestBase.h b/llvm/unittests/Transforms/Vectorize/VPlanTestBase.h index e7b51190489159..06e091da9054e3 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanTestBase.h +++ b/llvm/unittests/Transforms/Vectorize/VPlanTestBase.h @@ -67,10 +67,11 @@ class VPlanTestBase : public testing::Test { assert(!verifyFunction(F) && "input function must be valid"); doAnalysis(F); - auto Plan = VPlan::createInitialVPlan( - SE->getBackedgeTakenCount(LI->getLoopFor(LoopHeader)), *SE, true, false, - LI->getLoopFor(LoopHeader)); - VPlanHCFGBuilder HCFGBuilder(LI->getLoopFor(LoopHeader), LI.get(), *Plan); + Loop *L = LI->getLoopFor(LoopHeader); + PredicatedScalarEvolution PSE(*SE, *L); + auto Plan = VPlan::createInitialVPlan(IntegerType::get(*Ctx, 64), PSE, true, + false, L); + VPlanHCFGBuilder HCFGBuilder(L, LI.get(), *Plan); HCFGBuilder.buildHierarchicalCFG(); return Plan; } @@ -81,10 +82,11 @@ class VPlanTestBase : public testing::Test { assert(!verifyFunction(F) && "input function must 
be valid"); doAnalysis(F); - auto Plan = VPlan::createInitialVPlan( - SE->getBackedgeTakenCount(LI->getLoopFor(LoopHeader)), *SE, true, false, - LI->getLoopFor(LoopHeader)); - VPlanHCFGBuilder HCFGBuilder(LI->getLoopFor(LoopHeader), LI.get(), *Plan); + Loop *L = LI->getLoopFor(LoopHeader); + PredicatedScalarEvolution PSE(*SE, *L); + auto Plan = VPlan::createInitialVPlan(IntegerType::get(*Ctx, 64), PSE, true, + false, L); + VPlanHCFGBuilder HCFGBuilder(L, LI.get(), *Plan); HCFGBuilder.buildPlainCFG(); return Plan; } From ba8c96593c78cda44523abf2abcd7faeef0471af Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Wed, 18 Sep 2024 06:44:07 -0700 Subject: [PATCH 063/321] [Clang] Do not implicitly link C libraries for the GPU targets (#109052) Summary: I initially thought that it would be convenient to automatically link these libraries like they are for standard C/C++ targets. However, this created issues when trying to use C++ as a GPU target. This patch moves the logic to now implicitly pass it as part of the offloading toolchain instead, if found. This means that the user needs to set the target toolchain for the link job for automatic detection, but can still be done manually via `-Xoffload-linker -lc`. --- clang/lib/Driver/ToolChains/AMDGPU.cpp | 4 +--- clang/lib/Driver/ToolChains/Clang.cpp | 19 +++++++++++++++++++ clang/lib/Driver/ToolChains/CommonArgs.cpp | 16 ---------------- clang/lib/Driver/ToolChains/CommonArgs.h | 3 --- clang/lib/Driver/ToolChains/Cuda.cpp | 2 -- clang/test/Driver/openmp-offload-gpu.c | 2 +- .../modules/prepare_libc_gpu_build.cmake | 4 ++-- libcxx/cmake/caches/AMDGPU.cmake | 2 +- libcxx/cmake/caches/NVPTX.cmake | 2 +- 9 files changed, 25 insertions(+), 29 deletions(-) diff --git a/clang/lib/Driver/ToolChains/AMDGPU.cpp b/clang/lib/Driver/ToolChains/AMDGPU.cpp index 74f70573c5feb8..2c85d21ebd738c 100644 --- a/clang/lib/Driver/ToolChains/AMDGPU.cpp +++ b/clang/lib/Driver/ToolChains/AMDGPU.cpp @@ -648,8 +648,6 @@ void amdgpu::Linker::ConstructJob(Compilation &C, const JobAction &JA, Args.MakeArgString("-plugin-opt=-mattr=" + llvm::join(Features, ","))); } - addGPULibraries(getToolChain(), Args, CmdArgs); - CmdArgs.push_back("-o"); CmdArgs.push_back(Output.getFilename()); C.addCommand(std::make_unique( @@ -1089,4 +1087,4 @@ bool AMDGPUToolChain::shouldSkipSanitizeOption( return true; } return false; -} \ No newline at end of file +} diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 494883500342e4..c00df5f5bc729c 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -9223,6 +9223,25 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA, A->claim(); } + // Pass in the C library for GPUs if present and not disabled. + if (!Args.hasArg(options::OPT_nostdlib, options::OPT_r, options::OPT_nogpulib, + options::OPT_nodefaultlibs, options::OPT_nolibc, + options::OPT_nogpulibc)) { + forAllAssociatedToolChains(C, JA, getToolChain(), [&](const ToolChain &TC) { + // The device C library is only available for NVPTX and AMDGPU targets + // currently. 
+ if (!TC.getTriple().isNVPTX() && !TC.getTriple().isAMDGPU()) + return; + bool HasLibC = TC.getStdlibIncludePath().has_value(); + if (HasLibC) { + CmdArgs.push_back(Args.MakeArgString( + "--device-linker=" + TC.getTripleString() + "=" + "-lc")); + CmdArgs.push_back(Args.MakeArgString( + "--device-linker=" + TC.getTripleString() + "=" + "-lm")); + } + }); + } + // If we disable the GPU C library support it needs to be forwarded to the // link job. if (!Args.hasFlag(options::OPT_gpulibc, options::OPT_nogpulibc, true)) diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp index 502aba2ce4aa9c..043d9e48764439 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.cpp +++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp @@ -510,22 +510,6 @@ void tools::addLinkerCompressDebugSectionsOption( } } -void tools::addGPULibraries(const ToolChain &TC, const llvm::opt::ArgList &Args, - llvm::opt::ArgStringList &CmdArgs) { - if (Args.hasArg(options::OPT_nostdlib, options::OPT_r, - options::OPT_nodefaultlibs, options::OPT_nolibc, - options::OPT_nogpulibc)) - return; - - // If the user's toolchain has the 'include//` path, we assume it - // supports the standard C libraries for the GPU and include them. - bool HasLibC = TC.getStdlibIncludePath().has_value(); - if (HasLibC) { - CmdArgs.push_back("-lc"); - CmdArgs.push_back("-lm"); - } -} - void tools::AddTargetFeature(const ArgList &Args, std::vector &Features, OptSpecifier OnOpt, OptSpecifier OffOpt, diff --git a/clang/lib/Driver/ToolChains/CommonArgs.h b/clang/lib/Driver/ToolChains/CommonArgs.h index 0c97398dfcfa34..8695d3fe5b55b8 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.h +++ b/clang/lib/Driver/ToolChains/CommonArgs.h @@ -35,9 +35,6 @@ void addLinkerCompressDebugSectionsOption(const ToolChain &TC, const llvm::opt::ArgList &Args, llvm::opt::ArgStringList &CmdArgs); -void addGPULibraries(const ToolChain &TC, const llvm::opt::ArgList &Args, - llvm::opt::ArgStringList &CmdArgs); - void claimNoWarnArgs(const llvm::opt::ArgList &Args); bool addSanitizerRuntimes(const ToolChain &TC, const llvm::opt::ArgList &Args, diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp index ef44ffa5594daf..509cd87b28c37e 100644 --- a/clang/lib/Driver/ToolChains/Cuda.cpp +++ b/clang/lib/Driver/ToolChains/Cuda.cpp @@ -635,8 +635,6 @@ void NVPTX::Linker::ConstructJob(Compilation &C, const JobAction &JA, for (StringRef Feature : Features) CmdArgs.append({"--feature", Args.MakeArgString(Feature)}); - addGPULibraries(getToolChain(), Args, CmdArgs); - // Add paths for the default clang library path. 
SmallString<256> DefaultLibPath = llvm::sys::path::parent_path(TC.getDriver().Dir); diff --git a/clang/test/Driver/openmp-offload-gpu.c b/clang/test/Driver/openmp-offload-gpu.c index ef6cbdded6a6f2..f6e2245dcdbc05 100644 --- a/clang/test/Driver/openmp-offload-gpu.c +++ b/clang/test/Driver/openmp-offload-gpu.c @@ -377,4 +377,4 @@ // RUN: --cuda-path=%S/Inputs/CUDA_102/usr/local/cuda \ // RUN: --offload-arch=sm_52 -nogpulibc -nogpuinc %s 2>&1 \ // RUN: | FileCheck --check-prefix=LIBC-GPU %s -// LIBC-GPU: clang-linker-wrapper{{.*}}"--device-compiler=-nolibc" +// LIBC-GPU-NOT: clang-linker-wrapper{{.*}}"--device-linker" diff --git a/libc/cmake/modules/prepare_libc_gpu_build.cmake b/libc/cmake/modules/prepare_libc_gpu_build.cmake index 14ae8f6e9eecd7..e20591b80e6f29 100644 --- a/libc/cmake/modules/prepare_libc_gpu_build.cmake +++ b/libc/cmake/modules/prepare_libc_gpu_build.cmake @@ -21,10 +21,10 @@ if(LIBC_TARGET_TRIPLE) set(CMAKE_REQUIRED_FLAGS "--target=${LIBC_TARGET_TRIPLE}") endif() if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU) - set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} -nogpulib -nostdlib") + set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} -nogpulib") elseif(LIBC_TARGET_ARCHITECTURE_IS_NVPTX) set(CMAKE_REQUIRED_FLAGS - "${CMAKE_REQUIRED_FLAGS} -flto -c -Wno-unused-command-line-argument -nostdlib") + "${CMAKE_REQUIRED_FLAGS} -flto -c -Wno-unused-command-line-argument") endif() # Optionally set up a job pool to limit the number of GPU tests run in parallel. diff --git a/libcxx/cmake/caches/AMDGPU.cmake b/libcxx/cmake/caches/AMDGPU.cmake index 0cd2eebfb9c16a..7443470b2e8a8e 100644 --- a/libcxx/cmake/caches/AMDGPU.cmake +++ b/libcxx/cmake/caches/AMDGPU.cmake @@ -33,4 +33,4 @@ set(LIBCXX_ADDITIONAL_COMPILE_FLAGS "-nogpulib;-flto;-fconvergent-functions;-Xclang;-mcode-object-version=none" CACHE STRING "") set(LIBCXXABI_ADDITIONAL_COMPILE_FLAGS "-nogpulib;-flto;-fconvergent-functions;-Xclang;-mcode-object-version=none" CACHE STRING "") -set(CMAKE_REQUIRED_FLAGS "-nogpulib -nodefaultlibs" CACHE STRING "") +set(CMAKE_REQUIRED_FLAGS "-nogpulib" CACHE STRING "") diff --git a/libcxx/cmake/caches/NVPTX.cmake b/libcxx/cmake/caches/NVPTX.cmake index 47a24a349e996e..3685ddcbb66624 100644 --- a/libcxx/cmake/caches/NVPTX.cmake +++ b/libcxx/cmake/caches/NVPTX.cmake @@ -33,4 +33,4 @@ set(LIBCXX_ADDITIONAL_COMPILE_FLAGS "-nogpulib;-flto;-fconvergent-functions;--cuda-feature=+ptx63" CACHE STRING "") set(LIBCXXABI_ADDITIONAL_COMPILE_FLAGS "-nogpulib;-flto;-fconvergent-functions;--cuda-feature=+ptx63" CACHE STRING "") -set(CMAKE_REQUIRED_FLAGS "-nogpulib -nodefaultlibs -flto -c" CACHE STRING "") +set(CMAKE_REQUIRED_FLAGS "-nogpulib -flto -c" CACHE STRING "") From 11b95deab9a00d53e94a089b5d4bf3c05e5d5370 Mon Sep 17 00:00:00 2001 From: Congcong Cai Date: Wed, 18 Sep 2024 21:51:14 +0800 Subject: [PATCH 064/321] [clang-tidy][NFC] fix typo in clang-tools-extra/clang-tidy/modernize/LoopConvertCheck.cpp --- clang-tools-extra/clang-tidy/modernize/LoopConvertCheck.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/clang-tools-extra/clang-tidy/modernize/LoopConvertCheck.cpp b/clang-tools-extra/clang-tidy/modernize/LoopConvertCheck.cpp index a1786ba5acfdf5..1c6a1618ebbc4f 100644 --- a/clang-tools-extra/clang-tidy/modernize/LoopConvertCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/LoopConvertCheck.cpp @@ -262,7 +262,7 @@ StatementMatcher makeIteratorLoopMatcher(bool IsReverse) { /// EndVarName: 'j' (as a VarDecl) /// In the second example only: /// EndCallName: 'container.size()' (as a 
CXXMemberCallExpr) or -/// 'size(contaner)' (as a CallExpr) +/// 'size(container)' (as a CallExpr) /// /// Client code will need to make sure that: /// - The containers on which 'size()' is called is the container indexed. @@ -491,7 +491,7 @@ static bool isDirectMemberExpr(const Expr *E) { } /// Given an expression that represents an usage of an element from the -/// containter that we are iterating over, returns false when it can be +/// container that we are iterating over, returns false when it can be /// guaranteed this element cannot be modified as a result of this usage. static bool canBeModified(ASTContext *Context, const Expr *E) { if (E->getType().isConstQualified()) @@ -922,7 +922,7 @@ bool LoopConvertCheck::isConvertible(ASTContext *Context, const ast_matchers::BoundNodes &Nodes, const ForStmt *Loop, LoopFixerKind FixerKind) { - // In self contained diagnosics mode we don't want dependancies on other + // In self contained diagnostic mode we don't want dependencies on other // loops, otherwise, If we already modified the range of this for loop, don't // do any further updates on this iteration. if (areDiagsSelfContained()) From a9d9b0a03daf7ca986182477a0866df525cfceff Mon Sep 17 00:00:00 2001 From: Mariya Podchishchaeva Date: Wed, 18 Sep 2024 16:05:23 +0200 Subject: [PATCH 065/321] [clang][C23] Claim N3030 Enhancements to Enumerations supported (#107260) Clang already implemented functionality as an extension. --- .../clang/Basic/DiagnosticParseKinds.td | 9 +- clang/lib/Parse/ParseDecl.cpp | 8 +- clang/test/C/C23/n3030.c | 93 +++++++++++++++++++ clang/test/Sema/fixed-enum.c | 25 +++-- clang/www/c_status.html | 2 +- 5 files changed, 122 insertions(+), 15 deletions(-) create mode 100644 clang/test/C/C23/n3030.c diff --git a/clang/include/clang/Basic/DiagnosticParseKinds.td b/clang/include/clang/Basic/DiagnosticParseKinds.td index 1afadb3bff750d..78510e61a639fa 100644 --- a/clang/include/clang/Basic/DiagnosticParseKinds.td +++ b/clang/include/clang/Basic/DiagnosticParseKinds.td @@ -113,9 +113,12 @@ def ext_cxx11_enum_fixed_underlying_type : Extension< def ext_ms_c_enum_fixed_underlying_type : Extension< "enumeration types with a fixed underlying type are a Microsoft extension">, InGroup; -def ext_clang_c_enum_fixed_underlying_type : Extension< - "enumeration types with a fixed underlying type are a Clang extension">, - InGroup>; +def ext_c23_enum_fixed_underlying_type : Extension< + "enumeration types with a fixed underlying type are a C23 extension">, + InGroup; +def warn_c17_compat_enum_fixed_underlying_type : Warning< + "enumeration types with a fixed underlying type are incompatible with C standards before C23">, + DefaultIgnore, InGroup; def warn_cxx98_compat_enum_fixed_underlying_type : Warning< "enumeration types with a fixed underlying type are incompatible with C++98">, InGroup, DefaultIgnore; diff --git a/clang/lib/Parse/ParseDecl.cpp b/clang/lib/Parse/ParseDecl.cpp index 1f56884be392d6..a04eed9873c0d4 100644 --- a/clang/lib/Parse/ParseDecl.cpp +++ b/clang/lib/Parse/ParseDecl.cpp @@ -5439,18 +5439,20 @@ void Parser::ParseEnumSpecifier(SourceLocation StartLoc, DeclSpec &DS, BaseRange = SourceRange(ColonLoc, DeclaratorInfo.getSourceRange().getEnd()); - if (!getLangOpts().ObjC && !getLangOpts().C23) { + if (!getLangOpts().ObjC) { if (getLangOpts().CPlusPlus11) Diag(ColonLoc, diag::warn_cxx98_compat_enum_fixed_underlying_type) << BaseRange; else if (getLangOpts().CPlusPlus) Diag(ColonLoc, diag::ext_cxx11_enum_fixed_underlying_type) << BaseRange; - else if 
(getLangOpts().MicrosoftExt) + else if (getLangOpts().MicrosoftExt && !getLangOpts().C23) Diag(ColonLoc, diag::ext_ms_c_enum_fixed_underlying_type) << BaseRange; else - Diag(ColonLoc, diag::ext_clang_c_enum_fixed_underlying_type) + Diag(ColonLoc, getLangOpts().C23 + ? diag::warn_c17_compat_enum_fixed_underlying_type + : diag::ext_c23_enum_fixed_underlying_type) << BaseRange; } } diff --git a/clang/test/C/C23/n3030.c b/clang/test/C/C23/n3030.c new file mode 100644 index 00000000000000..9e1405a2e0e1fd --- /dev/null +++ b/clang/test/C/C23/n3030.c @@ -0,0 +1,93 @@ +// RUN: %clang_cc1 -verify -triple x86_64-unknown-linux-gnu -fsyntax-only -std=c23 %s -pedantic -Wall + +#include + +enum us : unsigned short { + us_max = USHRT_MAX, + us_violation, // expected-error {{enumerator value 65536 is not representable in the underlying type 'unsigned short'}} + us_violation_2 = us_max + 1, // expected-error {{enumerator value is not representable in the underlying type 'unsigned short'}} + us_wrap_around_to_zero = (unsigned short)(USHRT_MAX + 1) /* Okay: conversion + done in constant expression before conversion to + underlying type: unsigned semantics okay. */ +}; + +enum ui : unsigned int { + ui_max = UINT_MAX, + ui_violation, // expected-error {{enumerator value 4294967296 is not representable in the underlying type 'unsigned int'}} + ui_no_violation = ui_max + 1, + ui_wrap_around_to_zero = (unsigned int)(UINT_MAX + 1) +}; + +enum E1 : short; +enum E2 : short; // expected-note {{previous}} +enum E3; // expected-warning {{ISO C forbids forward references to 'enum' types}} +enum E4 : unsigned long long; + +enum E1 : short { m11, m12 }; +enum E1 x = m11; + +enum E2 : long { // expected-error {{enumeration redeclared with different underlying type 'long' (was 'short')}} + m21, + m22 +}; + +enum E3 { // expected-note {{definition of 'enum E3' is not complete until the closing '}'}} + // expected-note@-1 {{previous}} + m31, + m32, + m33 = sizeof(enum E3) // expected-error {{invalid application of 'sizeof' to an incomplete type 'enum E3'}} +}; +enum E3 : int; // expected-error {{enumeration previously declared with nonfixed underlying type}} + +enum E4 : unsigned long long { + m40 = sizeof(enum E4), + m41 = ULLONG_MAX, + m42 // expected-error {{enumerator value 18446744073709551616 is not representable in the underlying type 'unsigned long long'}} +}; + +enum E5 y; // expected-error {{tentative definition has type 'enum E5' that is never completed}} + // expected-warning@-1 {{ISO C forbids forward references to 'enum' types}} + // expected-note@-2 {{forward declaration of 'enum E5'}} +enum E6 : long int z; // expected-error {{non-defining declaration of enumeration with a fixed underlying type is only permitted as a standalone declaration; missing list of enumerators?}} +enum E7 : long int = 0; // expected-error {{non-defining declaration of enumeration with a fixed underlying type is only permitted as a standalone declaration; missing list of enumerators?}} + // expected-error@-1 {{expected identifier or '('}} + +enum underlying : unsigned char { b0 }; + +constexpr int a = _Generic(b0, int: 2, unsigned char: 1, default: 0); +constexpr int b = _Generic((enum underlying)b0, int: 2, unsigned char: 1, default: 0); +static_assert(a == 1); +static_assert(b == 1); + +void f1(enum a : long b); // expected-error {{non-defining declaration of enumeration with a fixed underlying type is only permitted as a standalone declaration; missing list of enumerators?}} + // expected-warning@-1 {{declaration of 'enum a' will not 
be visible outside of this function}} +void f2(enum c : long{x} d); // expected-warning {{declaration of 'enum c' will not be visible outside of this function}} +enum e : int f3(); // expected-error {{non-defining declaration of enumeration with a fixed underlying type is only permitted as a standalone declaration; missing list of enumerators?}} + +typedef enum t u; // expected-warning {{ISO C forbids forward references to 'enum' types}} +typedef enum v : short W; // expected-error {{non-defining declaration of enumeration with a fixed underlying type is only permitted as a standalone declaration; missing list of enumerators?}} +typedef enum q : short { s } R; + +struct s1 { + int x; + enum e:int : 1; // expected-error {{non-defining declaration of enumeration with a fixed underlying type is only permitted as a standalone declaration; missing list of enumerators?}} + int y; +}; + +enum forward; // expected-warning {{ISO C forbids forward references to 'enum' types}} +extern enum forward fwd_val0; /* Constraint violation: incomplete type */ +extern enum forward *fwd_ptr0; // expected-note {{previous}} +extern int + *fwd_ptr0; // expected-error {{redeclaration of 'fwd_ptr0' with a different type: 'int *' vs 'enum forward *'}} + +enum forward1 : int; +extern enum forward1 fwd_val1; +extern int fwd_val1; +extern enum forward1 *fwd_ptr1; +extern int *fwd_ptr1; + +enum ee1 : short; +enum e : short f = 0; // expected-error {{non-defining declaration of enumeration with a fixed underlying type is only permitted as a standalone declaration; missing list of enumerators?}} +enum g : short { yyy } h = yyy; + +enum ee2 : typeof ((enum ee3 : short { A })0, (short)0); diff --git a/clang/test/Sema/fixed-enum.c b/clang/test/Sema/fixed-enum.c index 954ff8c452b80c..2b02def0e1788d 100644 --- a/clang/test/Sema/fixed-enum.c +++ b/clang/test/Sema/fixed-enum.c @@ -5,9 +5,9 @@ // RUN: %clang_cc1 -pedantic -std=c11 -xc -DC11 -verify %s // RUN: %clang_cc1 -Weverything -std=c11 -xc -fms-extensions -DMS -verify %s // RUN: %clang_cc1 -Weverything -std=c2x -xc -DC23 -verify %s -// RUN: %clang_cc1 -pedantic -std=c2x -xc -DC23 -verify %s +// RUN: %clang_cc1 -pedantic -std=c2x -xc -DC23 -verify -Wpre-c23-compat %s // RUN: %clang_cc1 -Weverything -std=c23 -xc -DC23 -verify %s -// RUN: %clang_cc1 -pedantic -std=c23 -xc -DC23 -verify %s +// RUN: %clang_cc1 -pedantic -std=c23 -xc -DC23 -verify -Wpre-c23-compat %s // RUN: %clang_cc1 -Weverything -std=c23 -xc -fms-extensions -DC23 -verify %s enum X : int {e}; @@ -15,12 +15,14 @@ enum X : int {e}; // expected-warning@-2{{enumeration types with a fixed underlying type are incompatible with C++98}} #elif defined(CXX03) // expected-warning@-4{{enumeration types with a fixed underlying type are a C++11 extension}} -#elif defined(OBJC) || defined(C23) -// No diagnostic +#elif defined(OBJC) +// diagnostic +#elif defined(C23) +// expected-warning@-8{{enumeration types with a fixed underlying type are incompatible with C standards before C23}} #elif defined(C11) -// expected-warning@-8{{enumeration types with a fixed underlying type are a Clang extension}} +// expected-warning@-10{{enumeration types with a fixed underlying type are a C23 extension}} #elif defined(MS) -// expected-warning@-10{{enumeration types with a fixed underlying type are a Microsoft extension}} +// expected-warning@-12{{enumeration types with a fixed underlying type are a Microsoft extension}} #endif // Don't warn about the forward declaration in any language mode. 
@@ -29,16 +31,23 @@ enum Fwd : int { e2 }; #if !defined(OBJC) && !defined(C23) // expected-warning@-3 {{enumeration types with a fixed underlying type}} // expected-warning@-3 {{enumeration types with a fixed underlying type}} +#elif defined(C23) +// expected-warning@-6 {{enumeration types with a fixed underlying type are incompatible with C standards before C23}} +// expected-warning@-6 {{enumeration types with a fixed underlying type are incompatible with C standards before C23}} #endif // Always error on the incompatible redeclaration. enum BadFwd : int; #if !defined(OBJC) && !defined(C23) // expected-warning@-2 {{enumeration types with a fixed underlying type}} +#elif defined(C23) +// expected-warning@-4 {{enumeration types with a fixed underlying type are incompatible with C standards before C23}} #endif -// expected-note@-4 {{previous declaration is here}} +// expected-note@-6 {{previous declaration is here}} enum BadFwd : char { e3 }; #if !defined(OBJC) && !defined(C23) // expected-warning@-2 {{enumeration types with a fixed underlying type}} +#elif defined(C23) +// expected-warning@-4 {{enumeration types with a fixed underlying type are incompatible with C standards before C23}} #endif -// expected-error@-4 {{enumeration redeclared with different underlying type 'char' (was 'int')}} +// expected-error@-6 {{enumeration redeclared with different underlying type 'char' (was 'int')}} diff --git a/clang/www/c_status.html b/clang/www/c_status.html index 148405ec31a9ba..e5da7f3c87a519 100644 --- a/clang/www/c_status.html +++ b/clang/www/c_status.html @@ -697,7 +697,7 @@
C23 implementation status
Enhanced enumerations N3030 - Unknown + Clang 20 Freestanding C and IEC 60559 conformance scope reduction From 620738e66260f00f08808b3e676a697fc32db92e Mon Sep 17 00:00:00 2001 From: Adrian Vogelsgesang Date: Wed, 18 Sep 2024 16:16:31 +0200 Subject: [PATCH 066/321] [lldb-dap][test] Fix `readMemory` test (#109057) So far, the test case was also testing the offset -1. This test cases failed if the string is immediately at the beginning of the memory region, though, and the offset -1 hence belonged to a different memory region. The fix is rather straightforward: Passing an offset of -1 is not used by any actual clients of lldb-dap, anyway. As such, this commit simply removes the corresponding test case. --- lldb/test/API/tools/lldb-dap/memory/TestDAP_memory.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/lldb/test/API/tools/lldb-dap/memory/TestDAP_memory.py b/lldb/test/API/tools/lldb-dap/memory/TestDAP_memory.py index 3d8aaeda7f4b85..1082541aebcf7c 100644 --- a/lldb/test/API/tools/lldb-dap/memory/TestDAP_memory.py +++ b/lldb/test/API/tools/lldb-dap/memory/TestDAP_memory.py @@ -74,10 +74,6 @@ def test_memory_refs_set_variable(self): ].keys(), ) - # lldb-dap assumes that all reads will be within the same region. On Windows - # the target string is at the very start of a region so the -1 offset causes - # the read to only read from the previous region and only return 1 byte. - @skipIfWindows def test_readMemory(self): """ Tests the 'readMemory' request @@ -104,10 +100,6 @@ def test_readMemory(self): mem = self.dap_server.request_readMemory(memref, 2, 3)["body"] self.assertEqual(b64decode(mem["data"]), b"ad\0") - # Use a negative offset - mem = self.dap_server.request_readMemory(memref, -1, 6)["body"] - self.assertEqual(b64decode(mem["data"])[1:], b"dead\0") - # Reads of size 0 are successful # VS-Code sends those in order to check if a `memoryReference` can actually be dereferenced. mem = self.dap_server.request_readMemory(memref, 0, 0) From 47c3df2a7fcdfb33064d4d5e7d82dde1ea379023 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Wed, 18 Sep 2024 07:19:40 -0700 Subject: [PATCH 067/321] [LLVM][TableGen] Change CallingConvEmitter to use const RecordKeeper (#108955) Change CallingConvEmitter to use const RecordKeeper. This is a part of effort to have better const correctness in TableGen backends: https://discourse.llvm.org/t/psa-planned-changes-to-tablegen-getallderiveddefinitions-api-potential-downstream-breakages/81089 --- llvm/include/llvm/TableGen/Record.h | 9 +++-- llvm/lib/TableGen/Record.cpp | 2 +- llvm/utils/TableGen/CallingConvEmitter.cpp | 45 ++++++++++++---------- 3 files changed, 30 insertions(+), 26 deletions(-) diff --git a/llvm/include/llvm/TableGen/Record.h b/llvm/include/llvm/TableGen/Record.h index f1420731d69081..5348c1177f63ed 100644 --- a/llvm/include/llvm/TableGen/Record.h +++ b/llvm/include/llvm/TableGen/Record.h @@ -2036,7 +2036,7 @@ class RecordKeeper { } /// Start timing a phase. Automatically stops any previous phase timer. - void startTimer(StringRef Name); + void startTimer(StringRef Name) const; /// Stop timing a phase. void stopTimer(); @@ -2110,12 +2110,13 @@ class RecordKeeper { mutable std::map> ClassRecordsMap; GlobalMap ExtraGlobals; + // TODO: Move timing related code out of RecordKeeper. // These members are for the phase timing feature. We need a timer group, // the last timer started, and a flag to say whether the last timer // is the special "backend overall timer." 
- TimerGroup *TimingGroup = nullptr; - Timer *LastTimer = nullptr; - bool BackendTimer = false; + mutable TimerGroup *TimingGroup = nullptr; + mutable Timer *LastTimer = nullptr; + mutable bool BackendTimer = false; /// The internal uniquer implementation of the RecordKeeper. std::unique_ptr Impl; diff --git a/llvm/lib/TableGen/Record.cpp b/llvm/lib/TableGen/Record.cpp index 567545ec02f666..ff2da3badb3628 100644 --- a/llvm/lib/TableGen/Record.cpp +++ b/llvm/lib/TableGen/Record.cpp @@ -3225,7 +3225,7 @@ Init *RecordKeeper::getNewAnonymousName() { // These functions implement the phase timing facility. Starting a timer // when one is already running stops the running one. -void RecordKeeper::startTimer(StringRef Name) { +void RecordKeeper::startTimer(StringRef Name) const { if (TimingGroup) { if (LastTimer && LastTimer->isRunning()) { LastTimer->stopTimer(); diff --git a/llvm/utils/TableGen/CallingConvEmitter.cpp b/llvm/utils/TableGen/CallingConvEmitter.cpp index 6a3030bfc1b7e3..8876bb3ad31e19 100644 --- a/llvm/utils/TableGen/CallingConvEmitter.cpp +++ b/llvm/utils/TableGen/CallingConvEmitter.cpp @@ -22,7 +22,7 @@ using namespace llvm; namespace { class CallingConvEmitter { - RecordKeeper &Records; + const RecordKeeper &Records; unsigned Counter = 0u; std::string CurrentAction; bool SwiftAction = false; @@ -32,13 +32,13 @@ class CallingConvEmitter { std::map> DelegateToMap; public: - explicit CallingConvEmitter(RecordKeeper &R) : Records(R) {} + explicit CallingConvEmitter(const RecordKeeper &R) : Records(R) {} void run(raw_ostream &o); private: - void EmitCallingConv(Record *CC, raw_ostream &O); - void EmitAction(Record *Action, unsigned Indent, raw_ostream &O); + void EmitCallingConv(const Record *CC, raw_ostream &O); + void EmitAction(const Record *Action, unsigned Indent, raw_ostream &O); void EmitArgRegisterLists(raw_ostream &O); }; } // End anonymous namespace @@ -46,13 +46,14 @@ class CallingConvEmitter { void CallingConvEmitter::run(raw_ostream &O) { emitSourceFileHeader("Calling Convention Implementation Fragment", O); - std::vector CCs = Records.getAllDerivedDefinitions("CallingConv"); + ArrayRef CCs = + Records.getAllDerivedDefinitions("CallingConv"); // Emit prototypes for all of the non-custom CC's so that they can forward ref // each other. Records.startTimer("Emit prototypes"); O << "#ifndef GET_CC_REGISTER_LISTS\n\n"; - for (Record *CC : CCs) { + for (const Record *CC : CCs) { if (!CC->getValueAsBit("Custom")) { unsigned Pad = CC->getName().size(); if (CC->getValueAsBit("Entry")) { @@ -71,7 +72,7 @@ void CallingConvEmitter::run(raw_ostream &O) { // Emit each non-custom calling convention description in full. Records.startTimer("Emit full descriptions"); - for (Record *CC : CCs) { + for (const Record *CC : CCs) { if (!CC->getValueAsBit("Custom")) { EmitCallingConv(CC, O); } @@ -82,8 +83,8 @@ void CallingConvEmitter::run(raw_ostream &O) { O << "\n#endif // CC_REGISTER_LIST\n"; } -void CallingConvEmitter::EmitCallingConv(Record *CC, raw_ostream &O) { - ListInit *CCActions = CC->getValueAsListInit("Actions"); +void CallingConvEmitter::EmitCallingConv(const Record *CC, raw_ostream &O) { + const ListInit *CCActions = CC->getValueAsListInit("Actions"); Counter = 0; CurrentAction = CC->getName().str(); @@ -106,7 +107,7 @@ void CallingConvEmitter::EmitCallingConv(Record *CC, raw_ostream &O) { << std::string(Pad, ' ') << "ISD::ArgFlagsTy ArgFlags, CCState &State) {\n"; // Emit all of the actions, in order. 
for (unsigned i = 0, e = CCActions->size(); i != e; ++i) { - Record *Action = CCActions->getElementAsRecord(i); + const Record *Action = CCActions->getElementAsRecord(i); SwiftAction = llvm::any_of(Action->getSuperClasses(), [](const std::pair &Class) { @@ -122,7 +123,7 @@ void CallingConvEmitter::EmitCallingConv(Record *CC, raw_ostream &O) { O << "}\n"; } -void CallingConvEmitter::EmitAction(Record *Action, unsigned Indent, +void CallingConvEmitter::EmitAction(const Record *Action, unsigned Indent, raw_ostream &O) { std::string IndentStr = std::string(Indent, ' '); @@ -150,14 +151,14 @@ void CallingConvEmitter::EmitAction(Record *Action, unsigned Indent, O << IndentStr << "}\n"; } else { if (Action->isSubClassOf("CCDelegateTo")) { - Record *CC = Action->getValueAsDef("CC"); + const Record *CC = Action->getValueAsDef("CC"); O << IndentStr << "if (!" << CC->getName() << "(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State))\n" << IndentStr << " return false;\n"; DelegateToMap[CurrentAction].insert(CC->getName().str()); } else if (Action->isSubClassOf("CCAssignToReg") || Action->isSubClassOf("CCAssignToRegAndStack")) { - ListInit *RegList = Action->getValueAsListInit("RegList"); + const ListInit *RegList = Action->getValueAsListInit("RegList"); if (RegList->size() == 1) { std::string Name = getQualifiedName(RegList->getElementAsRecord(0)); O << IndentStr << "if (MCRegister Reg = State.AllocateReg(" << Name @@ -210,8 +211,9 @@ void CallingConvEmitter::EmitAction(Record *Action, unsigned Indent, O << IndentStr << " return false;\n"; O << IndentStr << "}\n"; } else if (Action->isSubClassOf("CCAssignToRegWithShadow")) { - ListInit *RegList = Action->getValueAsListInit("RegList"); - ListInit *ShadowRegList = Action->getValueAsListInit("ShadowRegList"); + const ListInit *RegList = Action->getValueAsListInit("RegList"); + const ListInit *ShadowRegList = + Action->getValueAsListInit("ShadowRegList"); if (!ShadowRegList->empty() && ShadowRegList->size() != RegList->size()) PrintFatalError(Action->getLoc(), "Invalid length of list of shadowed registers"); @@ -278,7 +280,8 @@ void CallingConvEmitter::EmitAction(Record *Action, unsigned Indent, } else if (Action->isSubClassOf("CCAssignToStackWithShadow")) { int Size = Action->getValueAsInt("Size"); int Align = Action->getValueAsInt("Align"); - ListInit *ShadowRegList = Action->getValueAsListInit("ShadowRegList"); + const ListInit *ShadowRegList = + Action->getValueAsListInit("ShadowRegList"); unsigned ShadowRegListNumber = ++Counter; @@ -297,7 +300,7 @@ void CallingConvEmitter::EmitAction(Record *Action, unsigned Indent, << Counter << ", LocVT, LocInfo));\n"; O << IndentStr << "return false;\n"; } else if (Action->isSubClassOf("CCPromoteToType")) { - Record *DestTy = Action->getValueAsDef("DestTy"); + const Record *DestTy = Action->getValueAsDef("DestTy"); MVT::SimpleValueType DestVT = getValueType(DestTy); O << IndentStr << "LocVT = " << getEnumName(DestVT) << ";\n"; if (MVT(DestVT).isFloatingPoint()) { @@ -311,7 +314,7 @@ void CallingConvEmitter::EmitAction(Record *Action, unsigned Indent, << IndentStr << " LocInfo = CCValAssign::AExt;\n"; } } else if (Action->isSubClassOf("CCPromoteToUpperBitsInType")) { - Record *DestTy = Action->getValueAsDef("DestTy"); + const Record *DestTy = Action->getValueAsDef("DestTy"); MVT::SimpleValueType DestVT = getValueType(DestTy); O << IndentStr << "LocVT = " << getEnumName(DestVT) << ";\n"; if (MVT(DestVT).isFloatingPoint()) { @@ -327,17 +330,17 @@ void CallingConvEmitter::EmitAction(Record *Action, unsigned Indent, << 
IndentStr << " LocInfo = CCValAssign::AExtUpper;\n"; } } else if (Action->isSubClassOf("CCBitConvertToType")) { - Record *DestTy = Action->getValueAsDef("DestTy"); + const Record *DestTy = Action->getValueAsDef("DestTy"); O << IndentStr << "LocVT = " << getEnumName(getValueType(DestTy)) << ";\n"; O << IndentStr << "LocInfo = CCValAssign::BCvt;\n"; } else if (Action->isSubClassOf("CCTruncToType")) { - Record *DestTy = Action->getValueAsDef("DestTy"); + const Record *DestTy = Action->getValueAsDef("DestTy"); O << IndentStr << "LocVT = " << getEnumName(getValueType(DestTy)) << ";\n"; O << IndentStr << "LocInfo = CCValAssign::Trunc;\n"; } else if (Action->isSubClassOf("CCPassIndirect")) { - Record *DestTy = Action->getValueAsDef("DestTy"); + const Record *DestTy = Action->getValueAsDef("DestTy"); O << IndentStr << "LocVT = " << getEnumName(getValueType(DestTy)) << ";\n"; O << IndentStr << "LocInfo = CCValAssign::Indirect;\n"; From b334ca6739fb069b7259a89ba246cc600f07c68f Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Wed, 18 Sep 2024 10:26:40 -0400 Subject: [PATCH 068/321] [NFC] Remove trailing whitespaces in `llvm/docs/LangRef.rst` --- llvm/docs/LangRef.rst | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 144b4497ca63ce..abeafb7616201a 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -354,7 +354,7 @@ added in the future: not be used lightly but only for specific situations such as an alternative to the *register pinning* performance technique often used when implementing functional programming languages. At the - moment only X86, AArch64, and RISCV support this convention. The + moment only X86, AArch64, and RISCV support this convention. The following limitations exist: - On *X86-32* only up to 4 bit type parameters are supported. No @@ -685,10 +685,10 @@ implementation defined, the optimizer can't do the latter. The former is challenging as many commonly expected properties, such as ``ptrtoint(v)-ptrtoint(v) == 0``, don't hold for non-integral types. Similar restrictions apply to intrinsics that might examine the pointer bits, -such as :ref:`llvm.ptrmask`. +such as :ref:`llvm.ptrmask`. The alignment information provided by the frontend for a non-integral pointer -(typically using attributes or metadata) must be valid for every possible +(typically using attributes or metadata) must be valid for every possible representation of the pointer. .. _globalvars: @@ -1677,10 +1677,10 @@ Currently, only the following parameter attributes are defined: - The range is allowed to wrap. - The empty range is represented using ``0,0``. - Otherwise, ``a`` and ``b`` are not allowed to be equal. - - This attribute may only be applied to parameters or return values with integer + + This attribute may only be applied to parameters or return values with integer or vector of integer types. - + For vector-typed parameters, the range is applied element-wise. .. _gc: @@ -14346,7 +14346,7 @@ Arguments: """""""""" The first 4 arguments are similar to ``llvm.instrprof.increment``. The indexing is specific to callsites, meaning callsites are indexed from 0, independent from -the indexes used by the other intrinsics (such as +the indexes used by the other intrinsics (such as ``llvm.instrprof.increment[.step]``). The last argument is the called value of the callsite this intrinsic precedes. @@ -14360,7 +14360,7 @@ a buffer LLVM can use to perform counter increments (i.e. 
the lowering of ``llvm.instrprof.increment[.step]``. The address range following the counter buffer, ```` x ``sizeof(ptr)`` - sized, is expected to contain pointers to contexts of functions called from this function ("subcontexts"). -LLVM does not dereference into that memory region, just calculates GEPs. +LLVM does not dereference into that memory region, just calculates GEPs. The lowering of ``llvm.instrprof.callsite`` consists of: @@ -14929,8 +14929,8 @@ integer bit width or any vector of integer elements. Overview: """"""""" -Return ``-1`` if ``%a`` is signed less than ``%b``, ``0`` if they are equal, and -``1`` if ``%a`` is signed greater than ``%b``. Vector intrinsics operate on a per-element basis. +Return ``-1`` if ``%a`` is signed less than ``%b``, ``0`` if they are equal, and +``1`` if ``%a`` is signed greater than ``%b``. Vector intrinsics operate on a per-element basis. Arguments: """""""""" @@ -14958,8 +14958,8 @@ integer bit width or any vector of integer elements. Overview: """"""""" -Return ``-1`` if ``%a`` is unsigned less than ``%b``, ``0`` if they are equal, and -``1`` if ``%a`` is unsigned greater than ``%b``. Vector intrinsics operate on a per-element basis. +Return ``-1`` if ``%a`` is unsigned less than ``%b``, ``0`` if they are equal, and +``1`` if ``%a`` is unsigned greater than ``%b``. Vector intrinsics operate on a per-element basis. Arguments: """""""""" @@ -21556,9 +21556,9 @@ Semantics: """""""""" The '``llvm.vp.minimum``' intrinsic performs floating-point minimum (:ref:`minimum `) -of the first and second vector arguments on each enabled lane, the result being +of the first and second vector arguments on each enabled lane, the result being NaN if either argument is a NaN. -0.0 is considered to be less than +0.0 for this -intrinsic. The result on disabled lanes is a :ref:`poison value `. +intrinsic. The result on disabled lanes is a :ref:`poison value `. The operation is performed in the default floating-point environment. Examples: @@ -29191,7 +29191,7 @@ Semantics: """""""""" The intrinsic ``@llvm.allow.ubsan.check()`` returns either ``true`` or -``false``, depending on compiler options. +``false``, depending on compiler options. For each evaluation of a call to this intrinsic, the program must be valid and correct both if it returns ``true`` and if it returns ``false``. @@ -29250,13 +29250,13 @@ Semantics: """""""""" The intrinsic ``@llvm.allow.runtime.check()`` returns either ``true`` or -``false``, depending on compiler options. +``false``, depending on compiler options. For each evaluation of a call to this intrinsic, the program must be valid and correct both if it returns ``true`` and if it returns ``false``. When used in a branch condition, it allows us to choose between -two alternative correct solutions for the same problem. +two alternative correct solutions for the same problem. If the intrinsic is evaluated as ``true``, program should execute a guarded check. If the intrinsic is evaluated as ``false``, the program should avoid any From 67518a44fec0f59b2f926059cf15ec77ec72da13 Mon Sep 17 00:00:00 2001 From: Sarah Spall Date: Wed, 18 Sep 2024 08:19:52 -0700 Subject: [PATCH 069/321] [HLSL] Implement elementwise popcount (#108121) Add new elementwise popcount builtin to support HLSL function 'countbits'. elementwise popcount only accepts integer types. 
Add hlsl intrinsic 'countbits' Closes #99094 --- clang/docs/LanguageExtensions.rst | 1 + clang/docs/ReleaseNotes.rst | 1 + clang/include/clang/Basic/Builtins.td | 6 ++ clang/lib/CodeGen/CGBuiltin.cpp | 3 + clang/lib/Headers/hlsl/hlsl_intrinsics.h | 71 ++++++++++++++++ clang/lib/Sema/SemaChecking.cpp | 2 +- .../test/CodeGen/builtins-elementwise-math.c | 37 +++++++++ .../test/CodeGenHLSL/builtins/countbits.hlsl | 80 +++++++++++++++++++ clang/test/Sema/builtins-elementwise-math.c | 33 ++++++++ clang/test/Sema/countbits-errors.hlsl | 28 +++++++ .../SemaCXX/builtins-elementwise-math.cpp | 8 ++ .../SemaHLSL/BuiltIns/countbits-errors.hlsl | 21 +++++ llvm/lib/Target/DirectX/DXIL.td | 11 +++ llvm/test/CodeGen/DirectX/countbits.ll | 47 +++++++++++ .../SPIRV/hlsl-intrinsics/countbits.ll | 21 +++++ 15 files changed, 369 insertions(+), 1 deletion(-) create mode 100644 clang/test/CodeGenHLSL/builtins/countbits.hlsl create mode 100644 clang/test/Sema/countbits-errors.hlsl create mode 100644 clang/test/SemaHLSL/BuiltIns/countbits-errors.hlsl create mode 100644 llvm/test/CodeGen/DirectX/countbits.ll create mode 100644 llvm/test/CodeGen/SPIRV/hlsl-intrinsics/countbits.ll diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst index c08697282cbfe8..f62f90fb9650a9 100644 --- a/clang/docs/LanguageExtensions.rst +++ b/clang/docs/LanguageExtensions.rst @@ -667,6 +667,7 @@ Unless specified otherwise operation(±0) = ±0 and operation(±infinity) = ±in T __builtin_elementwise_log(T x) return the natural logarithm of x floating point types T __builtin_elementwise_log2(T x) return the base 2 logarithm of x floating point types T __builtin_elementwise_log10(T x) return the base 10 logarithm of x floating point types + T __builtin_elementwise_popcount(T x) return the number of 1 bits in x integer types T __builtin_elementwise_pow(T x, T y) return x raised to the power of y floating point types T __builtin_elementwise_bitreverse(T x) return the integer represented after reversing the bits of x integer types T __builtin_elementwise_exp(T x) returns the base-e exponential, e^x, of the specified value floating point types diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 7b612e3c65f494..d10b284310071e 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -116,6 +116,7 @@ C++ Language Changes - Accept C++26 user-defined ``static_assert`` messages in C++11 as an extension. +- Add ``__builtin_elementwise_popcount`` builtin for integer types only. 
C++2c Feature Support ^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td index 6cf03d27055cd9..8c5d7ad763bf97 100644 --- a/clang/include/clang/Basic/Builtins.td +++ b/clang/include/clang/Basic/Builtins.td @@ -1322,6 +1322,12 @@ def ElementwiseLog10 : Builtin { let Prototype = "void(...)"; } +def ElementwisePopcount : Builtin { + let Spellings = ["__builtin_elementwise_popcount"]; + let Attributes = [NoThrow, Const, CustomTypeChecking]; + let Prototype = "void(...)"; +} + def ElementwisePow : Builtin { let Spellings = ["__builtin_elementwise_pow"]; let Attributes = [NoThrow, Const, CustomTypeChecking]; diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index a52e880a764252..7e18aafcdd4b8a 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -3834,6 +3834,9 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, case Builtin::BI__builtin_elementwise_floor: return RValue::get(emitBuiltinWithOneOverloadedType<1>( *this, E, llvm::Intrinsic::floor, "elt.floor")); + case Builtin::BI__builtin_elementwise_popcount: + return RValue::get(emitBuiltinWithOneOverloadedType<1>( + *this, E, llvm::Intrinsic::ctpop, "elt.ctpop")); case Builtin::BI__builtin_elementwise_roundeven: return RValue::get(emitBuiltinWithOneOverloadedType<1>( *this, E, llvm::Intrinsic::roundeven, "elt.roundeven")); diff --git a/clang/lib/Headers/hlsl/hlsl_intrinsics.h b/clang/lib/Headers/hlsl/hlsl_intrinsics.h index 6a50d50ebd3479..6cd6a2caf19994 100644 --- a/clang/lib/Headers/hlsl/hlsl_intrinsics.h +++ b/clang/lib/Headers/hlsl/hlsl_intrinsics.h @@ -650,6 +650,77 @@ float3 cosh(float3); _HLSL_BUILTIN_ALIAS(__builtin_elementwise_cosh) float4 cosh(float4); +//===----------------------------------------------------------------------===// +// count bits builtins +//===----------------------------------------------------------------------===// + +/// \fn T countbits(T Val) +/// \brief Return the number of bits (per component) set in the input integer. +/// \param Val The input value. 
+ +#ifdef __HLSL_ENABLE_16_BIT +_HLSL_AVAILABILITY(shadermodel, 6.2) +_HLSL_BUILTIN_ALIAS(__builtin_elementwise_popcount) +int16_t countbits(int16_t); +_HLSL_AVAILABILITY(shadermodel, 6.2) +_HLSL_BUILTIN_ALIAS(__builtin_elementwise_popcount) +int16_t2 countbits(int16_t2); +_HLSL_AVAILABILITY(shadermodel, 6.2) +_HLSL_BUILTIN_ALIAS(__builtin_elementwise_popcount) +int16_t3 countbits(int16_t3); +_HLSL_AVAILABILITY(shadermodel, 6.2) +_HLSL_BUILTIN_ALIAS(__builtin_elementwise_popcount) +int16_t4 countbits(int16_t4); +_HLSL_AVAILABILITY(shadermodel, 6.2) +_HLSL_BUILTIN_ALIAS(__builtin_elementwise_popcount) +uint16_t countbits(uint16_t); +_HLSL_AVAILABILITY(shadermodel, 6.2) +_HLSL_BUILTIN_ALIAS(__builtin_elementwise_popcount) +uint16_t2 countbits(uint16_t2); +_HLSL_AVAILABILITY(shadermodel, 6.2) +_HLSL_BUILTIN_ALIAS(__builtin_elementwise_popcount) +uint16_t3 countbits(uint16_t3); +_HLSL_AVAILABILITY(shadermodel, 6.2) +_HLSL_BUILTIN_ALIAS(__builtin_elementwise_popcount) +uint16_t4 countbits(uint16_t4); +#endif + +_HLSL_BUILTIN_ALIAS(__builtin_elementwise_popcount) +int countbits(int); +_HLSL_BUILTIN_ALIAS(__builtin_elementwise_popcount) +int2 countbits(int2); +_HLSL_BUILTIN_ALIAS(__builtin_elementwise_popcount) +int3 countbits(int3); +_HLSL_BUILTIN_ALIAS(__builtin_elementwise_popcount) +int4 countbits(int4); + +_HLSL_BUILTIN_ALIAS(__builtin_elementwise_popcount) +uint countbits(uint); +_HLSL_BUILTIN_ALIAS(__builtin_elementwise_popcount) +uint2 countbits(uint2); +_HLSL_BUILTIN_ALIAS(__builtin_elementwise_popcount) +uint3 countbits(uint3); +_HLSL_BUILTIN_ALIAS(__builtin_elementwise_popcount) +uint4 countbits(uint4); + +_HLSL_BUILTIN_ALIAS(__builtin_elementwise_popcount) +int64_t countbits(int64_t); +_HLSL_BUILTIN_ALIAS(__builtin_elementwise_popcount) +int64_t2 countbits(int64_t2); +_HLSL_BUILTIN_ALIAS(__builtin_elementwise_popcount) +int64_t3 countbits(int64_t3); +_HLSL_BUILTIN_ALIAS(__builtin_elementwise_popcount) +int64_t4 countbits(int64_t4); + +_HLSL_BUILTIN_ALIAS(__builtin_elementwise_popcount) +uint64_t countbits(uint64_t); +_HLSL_BUILTIN_ALIAS(__builtin_elementwise_popcount) +uint64_t2 countbits(uint64_t2); +_HLSL_BUILTIN_ALIAS(__builtin_elementwise_popcount) +uint64_t3 countbits(uint64_t3); +_HLSL_BUILTIN_ALIAS(__builtin_elementwise_popcount) +uint64_t4 countbits(uint64_t4); + //===----------------------------------------------------------------------===// // dot product builtins //===----------------------------------------------------------------------===// diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index 99500daca295c9..d2570119c3432d 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -2795,7 +2795,7 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID, if (BuiltinElementwiseMath(TheCall)) return ExprError(); break; - + case Builtin::BI__builtin_elementwise_popcount: case Builtin::BI__builtin_elementwise_bitreverse: { if (PrepareBuiltinElementwiseMathOneArgCall(TheCall)) return ExprError(); diff --git a/clang/test/CodeGen/builtins-elementwise-math.c b/clang/test/CodeGen/builtins-elementwise-math.c index 8fb52992c0fe68..7e094a52653ef0 100644 --- a/clang/test/CodeGen/builtins-elementwise-math.c +++ b/clang/test/CodeGen/builtins-elementwise-math.c @@ -570,6 +570,43 @@ void test_builtin_elementwise_log2(float f1, float f2, double d1, double d2, vf2 = __builtin_elementwise_log2(vf1); } +void test_builtin_elementwise_popcount(si8 vi1, si8 vi2, + long long int i1, long long int i2, short si, + 
_BitInt(31) bi1, _BitInt(31) bi2) { + + + // CHECK: [[I1:%.+]] = load i64, ptr %i1.addr, align 8 + // CHECK-NEXT: call i64 @llvm.ctpop.i64(i64 [[I1]]) + i2 = __builtin_elementwise_popcount(i1); + + // CHECK: [[VI1:%.+]] = load <8 x i16>, ptr %vi1.addr, align 16 + // CHECK-NEXT: call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> [[VI1]]) + vi2 = __builtin_elementwise_popcount(vi1); + + // CHECK: [[CVI2:%.+]] = load <8 x i16>, ptr %cvi2, align 16 + // CHECK-NEXT: call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> [[CVI2]]) + const si8 cvi2 = vi2; + vi2 = __builtin_elementwise_popcount(cvi2); + + // CHECK: [[BI1:%.+]] = load i32, ptr %bi1.addr, align 4 + // CHECK-NEXT: [[LOADEDV:%.+]] = trunc i32 [[BI1]] to i31 + // CHECK-NEXT: call i31 @llvm.ctpop.i31(i31 [[LOADEDV]]) + bi2 = __builtin_elementwise_popcount(bi1); + + // CHECK: [[IA1:%.+]] = load i32, ptr addrspace(1) @int_as_one, align 4 + // CHECK-NEXT: call i32 @llvm.ctpop.i32(i32 [[IA1]]) + b = __builtin_elementwise_popcount(int_as_one); + + // CHECK: call i32 @llvm.ctpop.i32(i32 -10) + b = __builtin_elementwise_popcount(-10); + + // CHECK: [[SI:%.+]] = load i16, ptr %si.addr, align 2 + // CHECK-NEXT: [[SI_EXT:%.+]] = sext i16 [[SI]] to i32 + // CHECK-NEXT: [[RES:%.+]] = call i32 @llvm.ctpop.i32(i32 [[SI_EXT]]) + // CHECK-NEXT: = trunc i32 [[RES]] to i16 + si = __builtin_elementwise_popcount(si); +} + void test_builtin_elementwise_pow(float f1, float f2, double d1, double d2, float4 vf1, float4 vf2) { diff --git a/clang/test/CodeGenHLSL/builtins/countbits.hlsl b/clang/test/CodeGenHLSL/builtins/countbits.hlsl new file mode 100644 index 00000000000000..8dfe977bfae626 --- /dev/null +++ b/clang/test/CodeGenHLSL/builtins/countbits.hlsl @@ -0,0 +1,80 @@ +// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \ +// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ +// RUN: -emit-llvm -disable-llvm-passes -O3 -o - | FileCheck %s + +#ifdef __HLSL_ENABLE_16_BIT +// CHECK-LABEL: test_countbits_ushort +// CHECK: call i16 @llvm.ctpop.i16 +uint16_t test_countbits_ushort(uint16_t p0) +{ + return countbits(p0); +} +// CHECK-LABEL: test_countbits_ushort2 +// CHECK: call <2 x i16> @llvm.ctpop.v2i16 +uint16_t2 test_countbits_ushort2(uint16_t2 p0) +{ + return countbits(p0); +} +// CHECK-LABEL: test_countbits_ushort3 +// CHECK: call <3 x i16> @llvm.ctpop.v3i16 +uint16_t3 test_countbits_ushort3(uint16_t3 p0) +{ + return countbits(p0); +} +// CHECK-LABEL: test_countbits_ushort4 +// CHECK: call <4 x i16> @llvm.ctpop.v4i16 +uint16_t4 test_countbits_ushort4(uint16_t4 p0) +{ + return countbits(p0); +} +#endif + +// CHECK-LABEL: test_countbits_uint +// CHECK: call i32 @llvm.ctpop.i32 +int test_countbits_uint(uint p0) +{ + return countbits(p0); +} +// CHECK-LABEL: test_countbits_uint2 +// CHECK: call <2 x i32> @llvm.ctpop.v2i32 +uint2 test_countbits_uint2(uint2 p0) +{ + return countbits(p0); +} +// CHECK-LABEL: test_countbits_uint3 +// CHECK: call <3 x i32> @llvm.ctpop.v3i32 +uint3 test_countbits_uint3(uint3 p0) +{ + return countbits(p0); +} +// CHECK-LABEL: test_countbits_uint4 +// CHECK: call <4 x i32> @llvm.ctpop.v4i32 +uint4 test_countbits_uint4(uint4 p0) +{ + return countbits(p0); +} + +// CHECK-LABEL: test_countbits_long +// CHECK: call i64 @llvm.ctpop.i64 +uint64_t test_countbits_long(uint64_t p0) +{ + return countbits(p0); +} +// CHECK-LABEL: test_countbits_long2 +// CHECK: call <2 x i64> @llvm.ctpop.v2i64 +uint64_t2 test_countbits_long2(uint64_t2 p0) +{ + return countbits(p0); +} +// CHECK-LABEL: test_countbits_long3 +// CHECK: call <3 x i64> 
@llvm.ctpop.v3i64 +uint64_t3 test_countbits_long3(uint64_t3 p0) +{ + return countbits(p0); +} +// CHECK-LABEL: test_countbits_long4 +// CHECK: call <4 x i64> @llvm.ctpop.v4i64 +uint64_t4 test_countbits_long4(uint64_t4 p0) +{ + return countbits(p0); +} diff --git a/clang/test/Sema/builtins-elementwise-math.c b/clang/test/Sema/builtins-elementwise-math.c index 628274380ae5f2..1727be1d6286d5 100644 --- a/clang/test/Sema/builtins-elementwise-math.c +++ b/clang/test/Sema/builtins-elementwise-math.c @@ -505,6 +505,39 @@ void test_builtin_elementwise_log2(int i, float f, double d, float4 v, int3 iv, // expected-error@-1 {{1st argument must be a floating point type (was 'unsigned4' (vector of 4 'unsigned int' values))}} } +void test_builtin_elementwise_popcount(int i, float f, double d, float4 v, int3 iv, unsigned u, unsigned4 uv) { + + struct Foo s = __builtin_elementwise_popcount(i); + // expected-error@-1 {{initializing 'struct Foo' with an expression of incompatible type 'int'}} + + i = __builtin_elementwise_popcount(); + // expected-error@-1 {{too few arguments to function call, expected 1, have 0}} + + i = __builtin_elementwise_popcount(f); + // expected-error@-1 {{1st argument must be a vector of integers (was 'float')}} + + i = __builtin_elementwise_popcount(f, f); + // expected-error@-1 {{too many arguments to function call, expected 1, have 2}} + + u = __builtin_elementwise_popcount(d); + // expected-error@-1 {{1st argument must be a vector of integers (was 'double')}} + + v = __builtin_elementwise_popcount(v); + // expected-error@-1 {{1st argument must be a vector of integers (was 'float4' (vector of 4 'float' values))}} + + int2 i2 = __builtin_elementwise_popcount(iv); + // expected-error@-1 {{initializing 'int2' (vector of 2 'int' values) with an expression of incompatible type 'int3' (vector of 3 'int' values)}} + + iv = __builtin_elementwise_popcount(i2); + // expected-error@-1 {{assigning to 'int3' (vector of 3 'int' values) from incompatible type 'int2' (vector of 2 'int' values)}} + + unsigned3 u3 = __builtin_elementwise_popcount(iv); + // expected-error@-1 {{initializing 'unsigned3' (vector of 3 'unsigned int' values) with an expression of incompatible type 'int3' (vector of 3 'int' values)}} + + iv = __builtin_elementwise_popcount(u3); + // expected-error@-1 {{assigning to 'int3' (vector of 3 'int' values) from incompatible type 'unsigned3' (vector of 3 'unsigned int' values)}} +} + void test_builtin_elementwise_pow(int i, short s, double d, float4 v, int3 iv, unsigned3 uv, int *p) { i = __builtin_elementwise_pow(p, d); // expected-error@-1 {{arguments are of different types ('int *' vs 'double')}} diff --git a/clang/test/Sema/countbits-errors.hlsl b/clang/test/Sema/countbits-errors.hlsl new file mode 100644 index 00000000000000..0fd36fe78d79f0 --- /dev/null +++ b/clang/test/Sema/countbits-errors.hlsl @@ -0,0 +1,28 @@ +// RUN: %clang_cc1 -finclude-default-header +// -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only +// -disable-llvm-passes -verify + +double2 test_int_builtin(double2 p0) { + return __builtin_hlsl_elementwise_countbits(p0); + // expected-error@-1 {{passing 'double2' (aka 'vector') to + // parameter of incompatible type + // '__attribute__((__vector_size__(2 * sizeof(int)))) int' + // (vector of 2 'int' values)}} +} + +float test_ambiguous(float p0) { + return countbits(p0); + // expected-error@-1 {{call to 'countbits' is ambiguous}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function}} + // 
expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function}} +} + +float test_float_builtin(float p0) { + return __builtin_hlsl_elementwise_countbits(p0); + // expected-error@-1 {{passing 'double' to parameter of incompatible type + // 'int'}} +} diff --git a/clang/test/SemaCXX/builtins-elementwise-math.cpp b/clang/test/SemaCXX/builtins-elementwise-math.cpp index 898d869f4c81be..c3d8bc593c0bbc 100644 --- a/clang/test/SemaCXX/builtins-elementwise-math.cpp +++ b/clang/test/SemaCXX/builtins-elementwise-math.cpp @@ -269,3 +269,11 @@ void test_builtin_elementwise_bitreverse() { static_assert(!is_const::value); static_assert(!is_const::value); } + +void test_builtin_elementwise_popcount() { + const int a = 2; + int b = 1; + static_assert(!is_const::value); + static_assert(!is_const::value); +} + diff --git a/clang/test/SemaHLSL/BuiltIns/countbits-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/countbits-errors.hlsl new file mode 100644 index 00000000000000..8d5f0abb2860f8 --- /dev/null +++ b/clang/test/SemaHLSL/BuiltIns/countbits-errors.hlsl @@ -0,0 +1,21 @@ +// RUN: %clang_cc1 -finclude-default-header +// -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only +// -disable-llvm-passes -verify -verify-ignore-unexpected + + +double test_int_builtin(double p0) { + return countbits(p0); + // expected-error@-1 {{call to 'countbits' is ambiguous}} +} + +double2 test_int_builtin_2(double2 p0) { + return __builtin_elementwise_popcount(p0); + // expected-error@-1 {{1st argument must be a vector of integers + // (was 'double2' (aka 'vector'))}} +} + +double test_int_builtin_3(float p0) { + return __builtin_elementwise_popcount(p0); + // expected-error@-1 {{1st argument must be a vector of integers + // (was 'float')}} +} diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td index 902ab37bf741ed..9aa0af3e3a6b17 100644 --- a/llvm/lib/Target/DirectX/DXIL.td +++ b/llvm/lib/Target/DirectX/DXIL.td @@ -553,6 +553,17 @@ def Rbits : DXILOp<30, unary> { let attributes = [Attributes]; } +def CBits : DXILOp<31, unary> { + let Doc = "Returns the number of 1 bits in the specified value."; + let LLVMIntrinsic = int_ctpop; + let arguments = [OverloadTy]; + let result = OverloadTy; + let overloads = + [Overloads]; + let stages = [Stages]; + let attributes = [Attributes]; +} + def FMax : DXILOp<35, binary> { let Doc = "Float maximum. FMax(a,b) = a > b ? a : b"; let LLVMIntrinsic = int_maxnum; diff --git a/llvm/test/CodeGen/DirectX/countbits.ll b/llvm/test/CodeGen/DirectX/countbits.ll new file mode 100644 index 00000000000000..c6bc2b6790948e --- /dev/null +++ b/llvm/test/CodeGen/DirectX/countbits.ll @@ -0,0 +1,47 @@ +; RUN: opt -S -scalarizer -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s + +; Make sure dxil operation function calls for countbits are generated for all integer types. 
+ +define noundef i16 @test_countbits_short(i16 noundef %a) { +entry: +; CHECK: call i16 @dx.op.unary.i16(i32 31, i16 %{{.*}}) + %elt.ctpop = call i16 @llvm.ctpop.i16(i16 %a) + ret i16 %elt.ctpop +} + +define noundef i32 @test_countbits_int(i32 noundef %a) { +entry: +; CHECK: call i32 @dx.op.unary.i32(i32 31, i32 %{{.*}}) + %elt.ctpop = call i32 @llvm.ctpop.i32(i32 %a) + ret i32 %elt.ctpop +} + +define noundef i64 @test_countbits_long(i64 noundef %a) { +entry: +; CHECK: call i64 @dx.op.unary.i64(i32 31, i64 %{{.*}}) + %elt.ctpop = call i64 @llvm.ctpop.i64(i64 %a) + ret i64 %elt.ctpop +} + +define noundef <4 x i32> @countbits_vec4_i32(<4 x i32> noundef %a) { +entry: + ; CHECK: [[ee0:%.*]] = extractelement <4 x i32> %a, i64 0 + ; CHECK: [[ie0:%.*]] = call i32 @dx.op.unary.i32(i32 31, i32 [[ee0]]) + ; CHECK: [[ee1:%.*]] = extractelement <4 x i32> %a, i64 1 + ; CHECK: [[ie1:%.*]] = call i32 @dx.op.unary.i32(i32 31, i32 [[ee1]]) + ; CHECK: [[ee2:%.*]] = extractelement <4 x i32> %a, i64 2 + ; CHECK: [[ie2:%.*]] = call i32 @dx.op.unary.i32(i32 31, i32 [[ee2]]) + ; CHECK: [[ee3:%.*]] = extractelement <4 x i32> %a, i64 3 + ; CHECK: [[ie3:%.*]] = call i32 @dx.op.unary.i32(i32 31, i32 [[ee3]]) + ; CHECK: insertelement <4 x i32> poison, i32 [[ie0]], i64 0 + ; CHECK: insertelement <4 x i32> %{{.*}}, i32 [[ie1]], i64 1 + ; CHECK: insertelement <4 x i32> %{{.*}}, i32 [[ie2]], i64 2 + ; CHECK: insertelement <4 x i32> %{{.*}}, i32 [[ie3]], i64 3 + %2 = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %a) + ret <4 x i32> %2 +} + +declare i16 @llvm.ctpop.i16(i16) +declare i32 @llvm.ctpop.i32(i32) +declare i64 @llvm.ctpop.i64(i64) +declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32>) diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/countbits.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/countbits.ll new file mode 100644 index 00000000000000..57ec0bda2e1890 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/countbits.ll @@ -0,0 +1,21 @@ +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; CHECK: OpMemoryModel Logical GLSL450 + +define noundef i32 @countbits_i32(i32 noundef %a) { +entry: +; CHECK: %[[#]] = OpBitCount %[[#]] %[[#]] + %elt.bitreverse = call i32 @llvm.ctpop.i32(i32 %a) + ret i32 %elt.bitreverse +} + +define noundef i16 @countbits_i16(i16 noundef %a) { +entry: +; CHECK: %[[#]] = OpBitCount %[[#]] %[[#]] + %elt.ctpop = call i16 @llvm.ctpop.i16(i16 %a) + ret i16 %elt.ctpop +} + +declare i16 @llvm.ctpop.i16(i16) +declare i32 @llvm.ctpop.i32(i32) From 475ceca859233b387c22f13ecef581158ef36346 Mon Sep 17 00:00:00 2001 From: Mainak Sil Date: Wed, 18 Sep 2024 20:59:49 +0530 Subject: [PATCH 070/321] [clang] Increase VecLib bitfield size to 4 bits in CodeGenOptions.def (#108804) Summary: This PR fixes the issue where the VecLib bitfield in CodeGenOptions.def is too small to accommodate the increasing number of vector libraries. Specifically, the bitfield size was previously set to 3, but with the introduction of more vector libraries (currently 9), the bitfield needed to be expanded to avoid potential issues in vectorization. In this PR, I have increased the size of the VecLib bitfield from 3 to 4 to account for the additional libraries. This ensures that all 9 vector libraries are correctly encoded and available for use without errors. Changes Made: Modified: Increased the VecLib bitfield size from 3 to 4 in clang/include/clang/Basic/CodeGenOptions.def. 
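Background on the field width (an illustrative aside, not text from the patch):
a bitfield that must distinguish N enumerators needs ceil(log2(N)) bits, and 9
options overflow a 3-bit field, which can encode only 8 distinct values. Below
is a minimal standalone sketch of that arithmetic; the enumerator names are
borrowed from the SimulatedVectorLibrary enum in the tests further down
(without the simulated NewLibrary), and bitsFor is a hypothetical helper, not
a Clang API:

    #include <cstdint>

    // Hypothetical stand-in for llvm::driver::VectorLibrary. MaxLibrary is a
    // sentinel, so its integer value equals the number of real options (9).
    enum class VectorLibrary : uint8_t {
      Accelerate, LIBMVEC, MASSV, SVML, SLEEF,
      Darwin_libsystem_m, ArmPL, AMDLIBM, NoLibrary,
      MaxLibrary
    };

    // Smallest width that can encode values 0..N-1, i.e. ceil(log2(N)).
    constexpr unsigned bitsFor(unsigned N) {
      unsigned Bits = 0;
      while ((1u << Bits) < N)
        ++Bits;
      return Bits;
    }

    static_assert(bitsFor(static_cast<unsigned>(VectorLibrary::MaxLibrary)) == 4,
                  "9 vector libraries need a 4-bit bitfield");
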
Motivation: This change is necessary to ensure that all vector libraries are properly represented and selectable. The current limitation of the VecLib bitfield size was causing some vectorization opportunities to be lost when more than 3 bits were needed to represent the library options. Closes: Fixes https://github.com/llvm/llvm-project/issues/108704 --- clang/include/clang/Basic/CodeGenOptions.def | 14 ++++++++-- clang/unittests/CodeGen/AllLibrariesFit.cpp | 10 +++++++ .../CodeGen/EncodingDecodingTest.cpp | 17 ++++++++++++ .../CodeGen/SimulatedOverflowTest.cpp | 26 +++++++++++++++++++ 4 files changed, 65 insertions(+), 2 deletions(-) create mode 100644 clang/unittests/CodeGen/AllLibrariesFit.cpp create mode 100644 clang/unittests/CodeGen/EncodingDecodingTest.cpp create mode 100644 clang/unittests/CodeGen/SimulatedOverflowTest.cpp diff --git a/clang/include/clang/Basic/CodeGenOptions.def b/clang/include/clang/Basic/CodeGenOptions.def index b600198998d85b..b78ae61e6509ea 100644 --- a/clang/include/clang/Basic/CodeGenOptions.def +++ b/clang/include/clang/Basic/CodeGenOptions.def @@ -375,8 +375,18 @@ ENUM_CODEGENOPT(Inlining, InliningMethod, 2, NormalInlining) /// The maximum stack size a function can have to be considered for inlining. VALUE_CODEGENOPT(InlineMaxStackSize, 32, UINT_MAX) -// Vector functions library to use. -ENUM_CODEGENOPT(VecLib, llvm::driver::VectorLibrary, 3, llvm::driver::VectorLibrary::NoLibrary) +// Define the number of bits required for the VecLib enum +#define VECLIB_BIT_COUNT (llvm::countPopulation(llvm::driver::VectorLibrary::MaxLibrary)) + +// Ensure the VecLib bitfield has enough space for future vector libraries. +// The number of bits is determined automatically based on the number of enum values. +static_assert(static_cast(llvm::driver::VectorLibrary::MaxLibrary) <= (1 << VECLIB_BIT_COUNT), + "VecLib bitfield size is too small to accommodate all vector libraries."); + +// VecLib definition in CodeGenOptions.def +ENUM_CODEGENOPT(VecLib, llvm::driver::VectorLibrary, VECLIB_BIT_COUNT, llvm::driver::VectorLibrary::NoLibrary) + +#undef VECLIB_BIT_COUNT /// The default TLS model to use. 
ENUM_CODEGENOPT(DefaultTLSModel, TLSModel, 2, GeneralDynamicTLSModel) diff --git a/clang/unittests/CodeGen/AllLibrariesFit.cpp b/clang/unittests/CodeGen/AllLibrariesFit.cpp new file mode 100644 index 00000000000000..dfe63b557729ee --- /dev/null +++ b/clang/unittests/CodeGen/AllLibrariesFit.cpp @@ -0,0 +1,10 @@ +#include "clang/Basic/CodeGenOptions.h" +#include "llvm/Driver/Options.h" +#include "gtest/gtest.h" + +TEST(VecLibBitfieldTest, AllLibrariesFit) { + // We expect that all vector libraries fit in the bitfield size + EXPECT_LE(static_cast(llvm::driver::VectorLibrary::MaxLibrary), + (1 << VECLIB_BIT_COUNT)) + << "VecLib bitfield size is too small!"; + } diff --git a/clang/unittests/CodeGen/EncodingDecodingTest.cpp b/clang/unittests/CodeGen/EncodingDecodingTest.cpp new file mode 100644 index 00000000000000..67c89ef07c428b --- /dev/null +++ b/clang/unittests/CodeGen/EncodingDecodingTest.cpp @@ -0,0 +1,17 @@ +TEST(VecLibBitfieldTest, EncodingDecodingTest) { + clang::CodeGenOptions Opts; + + // Test encoding and decoding for each vector library + for (int i = static_cast(llvm::driver::VectorLibrary::Accelerate); + i <= static_cast(llvm::driver::VectorLibrary::MaxLibrary); ++i) { + + Opts.VecLib = static_cast(i); + + // Encode and then decode + llvm::driver::VectorLibrary decodedValue = + static_cast(Opts.VecLib); + + EXPECT_EQ(decodedValue, Opts.VecLib) + << "Encoding/Decoding failed for vector library " << i; + } +} diff --git a/clang/unittests/CodeGen/SimulatedOverflowTest.cpp b/clang/unittests/CodeGen/SimulatedOverflowTest.cpp new file mode 100644 index 00000000000000..acfeaf7498b6d0 --- /dev/null +++ b/clang/unittests/CodeGen/SimulatedOverflowTest.cpp @@ -0,0 +1,26 @@ +// Simulate the addition of a new library without increasing the bitfield size +enum class SimulatedVectorLibrary { + Accelerate = 0, + LIBMVEC, + MASSV, + SVML, + SLEEF, + Darwin_libsystem_m, + ArmPL, + AMDLIBM, + NoLibrary, + // Simulate new addition + NewLibrary, + MaxLibrary +}; + +#define SIMULATED_VECLIB_BIT_COUNT \ + 4 // The current bitfield size (should be 4 for 9 options) + +TEST(VecLibBitfieldTest, SimulatedOverflowTest) { + // Simulate the addition of a new library and check if the bitfield size is + // sufficient + EXPECT_LE(static_cast(SimulatedVectorLibrary::MaxLibrary), + (1 << SIMULATED_VECLIB_BIT_COUNT)) + << "Simulated VecLib bitfield size overflow!"; +} From 3f0dfab54184dcf9b00f37a2b8ac4f1f6ab14701 Mon Sep 17 00:00:00 2001 From: Aaron Ballman Date: Wed, 18 Sep 2024 11:34:09 -0400 Subject: [PATCH 071/321] Revert "[clang] Increase VecLib bitfield size to 4 bits in CodeGenOptions.def" (#109161) Reverts llvm/llvm-project#108804 Bots are failing: https://lab.llvm.org/buildbot/#/builders/140/builds/6859 --- clang/include/clang/Basic/CodeGenOptions.def | 14 ++-------- clang/unittests/CodeGen/AllLibrariesFit.cpp | 10 ------- .../CodeGen/EncodingDecodingTest.cpp | 17 ------------ .../CodeGen/SimulatedOverflowTest.cpp | 26 ------------------- 4 files changed, 2 insertions(+), 65 deletions(-) delete mode 100644 clang/unittests/CodeGen/AllLibrariesFit.cpp delete mode 100644 clang/unittests/CodeGen/EncodingDecodingTest.cpp delete mode 100644 clang/unittests/CodeGen/SimulatedOverflowTest.cpp diff --git a/clang/include/clang/Basic/CodeGenOptions.def b/clang/include/clang/Basic/CodeGenOptions.def index b78ae61e6509ea..b600198998d85b 100644 --- a/clang/include/clang/Basic/CodeGenOptions.def +++ b/clang/include/clang/Basic/CodeGenOptions.def @@ -375,18 +375,8 @@ ENUM_CODEGENOPT(Inlining, InliningMethod, 2, 
NormalInlining) /// The maximum stack size a function can have to be considered for inlining. VALUE_CODEGENOPT(InlineMaxStackSize, 32, UINT_MAX) -// Define the number of bits required for the VecLib enum -#define VECLIB_BIT_COUNT (llvm::countPopulation(llvm::driver::VectorLibrary::MaxLibrary)) - -// Ensure the VecLib bitfield has enough space for future vector libraries. -// The number of bits is determined automatically based on the number of enum values. -static_assert(static_cast(llvm::driver::VectorLibrary::MaxLibrary) <= (1 << VECLIB_BIT_COUNT), - "VecLib bitfield size is too small to accommodate all vector libraries."); - -// VecLib definition in CodeGenOptions.def -ENUM_CODEGENOPT(VecLib, llvm::driver::VectorLibrary, VECLIB_BIT_COUNT, llvm::driver::VectorLibrary::NoLibrary) - -#undef VECLIB_BIT_COUNT +// Vector functions library to use. +ENUM_CODEGENOPT(VecLib, llvm::driver::VectorLibrary, 3, llvm::driver::VectorLibrary::NoLibrary) /// The default TLS model to use. ENUM_CODEGENOPT(DefaultTLSModel, TLSModel, 2, GeneralDynamicTLSModel) diff --git a/clang/unittests/CodeGen/AllLibrariesFit.cpp b/clang/unittests/CodeGen/AllLibrariesFit.cpp deleted file mode 100644 index dfe63b557729ee..00000000000000 --- a/clang/unittests/CodeGen/AllLibrariesFit.cpp +++ /dev/null @@ -1,10 +0,0 @@ -#include "clang/Basic/CodeGenOptions.h" -#include "llvm/Driver/Options.h" -#include "gtest/gtest.h" - -TEST(VecLibBitfieldTest, AllLibrariesFit) { - // We expect that all vector libraries fit in the bitfield size - EXPECT_LE(static_cast(llvm::driver::VectorLibrary::MaxLibrary), - (1 << VECLIB_BIT_COUNT)) - << "VecLib bitfield size is too small!"; - } diff --git a/clang/unittests/CodeGen/EncodingDecodingTest.cpp b/clang/unittests/CodeGen/EncodingDecodingTest.cpp deleted file mode 100644 index 67c89ef07c428b..00000000000000 --- a/clang/unittests/CodeGen/EncodingDecodingTest.cpp +++ /dev/null @@ -1,17 +0,0 @@ -TEST(VecLibBitfieldTest, EncodingDecodingTest) { - clang::CodeGenOptions Opts; - - // Test encoding and decoding for each vector library - for (int i = static_cast(llvm::driver::VectorLibrary::Accelerate); - i <= static_cast(llvm::driver::VectorLibrary::MaxLibrary); ++i) { - - Opts.VecLib = static_cast(i); - - // Encode and then decode - llvm::driver::VectorLibrary decodedValue = - static_cast(Opts.VecLib); - - EXPECT_EQ(decodedValue, Opts.VecLib) - << "Encoding/Decoding failed for vector library " << i; - } -} diff --git a/clang/unittests/CodeGen/SimulatedOverflowTest.cpp b/clang/unittests/CodeGen/SimulatedOverflowTest.cpp deleted file mode 100644 index acfeaf7498b6d0..00000000000000 --- a/clang/unittests/CodeGen/SimulatedOverflowTest.cpp +++ /dev/null @@ -1,26 +0,0 @@ -// Simulate the addition of a new library without increasing the bitfield size -enum class SimulatedVectorLibrary { - Accelerate = 0, - LIBMVEC, - MASSV, - SVML, - SLEEF, - Darwin_libsystem_m, - ArmPL, - AMDLIBM, - NoLibrary, - // Simulate new addition - NewLibrary, - MaxLibrary -}; - -#define SIMULATED_VECLIB_BIT_COUNT \ - 4 // The current bitfield size (should be 4 for 9 options) - -TEST(VecLibBitfieldTest, SimulatedOverflowTest) { - // Simulate the addition of a new library and check if the bitfield size is - // sufficient - EXPECT_LE(static_cast(SimulatedVectorLibrary::MaxLibrary), - (1 << SIMULATED_VECLIB_BIT_COUNT)) - << "Simulated VecLib bitfield size overflow!"; -} From 6ce14099ffa1194a5ed1f1ae6c35a4e811706fae Mon Sep 17 00:00:00 2001 From: Luc Blaeser <112870813+luc-blaeser@users.noreply.github.com> Date: Wed, 18 Sep 2024 17:40:30 
+0200
Subject: [PATCH 072/321] [lld][WebAssembly] Report unsupported PIC relocations as errors (#104926)

`WASM_MEMORY_ADDR_REL_` and `WASM_TABLE_INDEX_REL_` relocations against
**undefined symbols** are not supported and, except for
`UnresolvedPolicy::ReportError`, lead to incorrect Wasm code, such as an
invalid data address or an invalid table index that cannot be patched during
later dynamic Wasm linking with modules declaring those symbols. This differs
from other relocations, which support undefined symbols by declaring
corresponding Wasm imports. For more robust behavior, `wasm-ld` should
probably report an error for such unsupported PIC relocations, independent of
the `UnresolvedPolicy`.
---
 lld/test/wasm/unsupported-pic-relocations.s   | 39 +++++++++++++++++++
 lld/test/wasm/unsupported-pic-relocations64.s | 39 +++++++++++++++++++
 lld/wasm/Relocations.cpp                      | 16 ++++++++
 3 files changed, 94 insertions(+)
 create mode 100644 lld/test/wasm/unsupported-pic-relocations.s
 create mode 100644 lld/test/wasm/unsupported-pic-relocations64.s

diff --git a/lld/test/wasm/unsupported-pic-relocations.s b/lld/test/wasm/unsupported-pic-relocations.s
new file mode 100644
index 00000000000000..ea32e8468cdb4d
--- /dev/null
+++ b/lld/test/wasm/unsupported-pic-relocations.s
@@ -0,0 +1,39 @@
+# RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown -o %t.o %s
+
+# RUN: not wasm-ld --experimental-pic -shared %t.o -o /dev/null 2>&1 | \
+# RUN:   FileCheck %s
+
+# RUN: not wasm-ld --experimental-pic -shared %t.o -o /dev/null --unresolved-symbols=report-all 2>&1 | \
+# RUN:   FileCheck %s
+
+# RUN: not wasm-ld --experimental-pic -shared %t.o -o /dev/null --warn-unresolved-symbols 2>&1 | \
+# RUN:   FileCheck %s
+
+# RUN: not wasm-ld --experimental-pic -shared %t.o -o /dev/null --unresolved-symbols=ignore-all 2>&1 | \
+# RUN:   FileCheck %s
+
+# RUN: not wasm-ld --experimental-pic -shared %t.o -o /dev/null --unresolved-symbols=import-dynamic 2>&1 | \
+# RUN:   FileCheck %s
+
+.functype external_func () -> ()
+
+use_undefined_function:
+    .functype use_undefined_function () -> ()
+    i32.const external_func@TBREL
+    # CHECK: error: {{.*}}.o: relocation R_WASM_TABLE_INDEX_REL_SLEB is not supported against an undefined symbol `external_func`
+    drop
+    end_function
+
+use_undefined_data:
+    .functype use_undefined_data () -> ()
+    i32.const external_data@MBREL
+    # CHECK: error: {{.*}}.o: relocation R_WASM_MEMORY_ADDR_REL_SLEB is not supported against an undefined symbol `external_data`
+    drop
+    end_function
+
+.globl _start
+_start:
+    .functype _start () -> ()
+    call use_undefined_function
+    call use_undefined_data
+    end_function
diff --git a/lld/test/wasm/unsupported-pic-relocations64.s b/lld/test/wasm/unsupported-pic-relocations64.s
new file mode 100644
index 00000000000000..db9707b7fbac5e
--- /dev/null
+++ b/lld/test/wasm/unsupported-pic-relocations64.s
@@ -0,0 +1,39 @@
+# RUN: llvm-mc -filetype=obj -triple=wasm64-unknown-unknown -o %t.o %s
+
+# RUN: not wasm-ld -mwasm64 --experimental-pic -shared %t.o -o /dev/null 2>&1 | \
+# RUN:   FileCheck %s
+
+# RUN: not wasm-ld -mwasm64 --experimental-pic -shared %t.o -o /dev/null --unresolved-symbols=report-all 2>&1 | \
+# RUN:   FileCheck %s
+
+# RUN: not wasm-ld -mwasm64 --experimental-pic -shared %t.o -o /dev/null --warn-unresolved-symbols 2>&1 | \
+# RUN:   FileCheck %s
+
+# RUN: not wasm-ld -mwasm64 --experimental-pic -shared %t.o -o /dev/null --unresolved-symbols=ignore-all 2>&1 | \
+# RUN:   FileCheck %s
+
+# RUN: not wasm-ld -mwasm64 --experimental-pic -shared %t.o -o /dev/null
--unresolved-symbols=import-dynamic 2>&1 | \
+# RUN:   FileCheck %s
+
+.functype external_func () -> ()
+
+use_undefined_function:
+    .functype use_undefined_function () -> ()
+    i64.const external_func@TBREL
+    # CHECK: error: {{.*}}.o: relocation R_WASM_TABLE_INDEX_REL_SLEB64 is not supported against an undefined symbol `external_func`
+    drop
+    end_function
+
+use_undefined_data:
+    .functype use_undefined_data () -> ()
+    i64.const external_data@MBREL
+    # CHECK: error: {{.*}}.o: relocation R_WASM_MEMORY_ADDR_REL_SLEB64 is not supported against an undefined symbol `external_data`
+    drop
+    end_function
+
+.globl _start
+_start:
+    .functype _start () -> ()
+    call use_undefined_function
+    call use_undefined_data
+    end_function
diff --git a/lld/wasm/Relocations.cpp b/lld/wasm/Relocations.cpp
index 6f33a4f28a9d09..2dbfe335494711 100644
--- a/lld/wasm/Relocations.cpp
+++ b/lld/wasm/Relocations.cpp
@@ -173,6 +173,22 @@ void scanRelocations(InputChunk *chunk) {
     }
   }
 
+  if (sym->isUndefined()) {
+    switch (reloc.Type) {
+    case R_WASM_TABLE_INDEX_REL_SLEB:
+    case R_WASM_TABLE_INDEX_REL_SLEB64:
+    case R_WASM_MEMORY_ADDR_REL_SLEB:
+    case R_WASM_MEMORY_ADDR_REL_SLEB64:
+      // These relocation types are for symbols that exists relative to
+      // `__memory_base` or `__table_base` and as such only make sense for
+      // defined symbols.
+      error(toString(file) + ": relocation " + relocTypeToString(reloc.Type) +
+            " is not supported against an undefined symbol `" +
+            toString(*sym) + "`");
+      break;
+    }
+  }
+
   if (sym->isUndefined() && !config->relocatable && !sym->isWeak()) {
     // Report undefined symbols
     reportUndefined(file, sym);

From afce1b10144d006b7f171cd532ad663295e79ec4 Mon Sep 17 00:00:00 2001
From: Jordan Rupprecht
Date: Wed, 18 Sep 2024 10:42:03 -0500
Subject: [PATCH 073/321] [bazel] Remove empty Rename tests for now-deleted clang-rename (#109162)

Removed in #108988; the tool is fine, but the glob for tests is now empty
because all the tests were deleted.
---
 .../clang/unittests/BUILD.bazel               | 26 -------------------
 1 file changed, 26 deletions(-)

diff --git a/utils/bazel/llvm-project-overlay/clang/unittests/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang/unittests/BUILD.bazel
index 884a6055cf4e0c..e8c7106b287516 100644
--- a/utils/bazel/llvm-project-overlay/clang/unittests/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/clang/unittests/BUILD.bazel
@@ -298,32 +298,6 @@ cc_library(
     ],
 )
 
-cc_test(
-    name = "rename_tests",
-    size = "small",
-    timeout = "moderate",
-    srcs = glob(
-        [
-            "Rename/*.cpp",
-            "Rename/*.h",
-        ],
-        allow_empty = False,
-    ),
-    shard_count = 20,
-    deps = [
-        ":rename_tests_tooling_hdrs",
-        "//clang:ast_matchers",
-        "//clang:basic",
-        "//clang:format",
-        "//clang:frontend",
-        "//clang:tooling",
-        "//clang:tooling_refactoring",
-        "//llvm:Support",
-        "//third-party/unittest:gtest",
-        "//third-party/unittest:gtest_main",
-    ],
-)
-
 cc_test(
     name = "rewrite_tests",
     size = "small",

From 4b524088a80757a204424a1f172721ee997519d9 Mon Sep 17 00:00:00 2001
From: Lei Huang
Date: Wed, 18 Sep 2024 11:43:49 -0400
Subject: [PATCH 074/321] [NFC] Update function names in MCTargetAsmParser.h (#108643)

Update function names to adhere to the LLVM coding standards.
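For an out-of-tree target, the update is mechanical: the hook signatures are
unchanged and only the casing moves to lowerCamelCase. A schematic sketch
under that assumption (FooAsmParser is hypothetical; the three signatures are
copied from the header diff that follows; the class only declares overrides,
so nothing is instantiated):

    #include "llvm/MC/MCParser/MCTargetAsmParser.h"

    using namespace llvm;

    // Hypothetical out-of-tree parser showing the renamed virtual hooks.
    class FooAsmParser : public MCTargetAsmParser {
      // Was ParseInstruction().
      bool parseInstruction(ParseInstructionInfo &Info, StringRef Name,
                            SMLoc NameLoc, OperandVector &Operands) override;
      // Was MatchAndEmitInstruction().
      bool matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
                                   OperandVector &Operands, MCStreamer &Out,
                                   uint64_t &ErrorInfo,
                                   bool MatchingInlineAsm) override;
      // Was OmitRegisterFromClobberLists(); the default still returns false.
      bool omitRegisterFromClobberLists(unsigned RegNo) override { return false; }
    };
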
--- .../llvm/MC/MCParser/MCTargetAsmParser.h | 14 +- llvm/lib/MC/MCParser/AsmParser.cpp | 6 +- llvm/lib/MC/MCParser/MasmParser.cpp | 6 +- .../AArch64/AsmParser/AArch64AsmParser.cpp | 15 +-- .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 14 +- .../lib/Target/ARM/AsmParser/ARMAsmParser.cpp | 12 +- .../lib/Target/AVR/AsmParser/AVRAsmParser.cpp | 8 +- .../lib/Target/BPF/AsmParser/BPFAsmParser.cpp | 11 +- .../Hexagon/AsmParser/HexagonAsmParser.cpp | 10 +- .../Target/Lanai/AsmParser/LanaiAsmParser.cpp | 8 +- .../AsmParser/LoongArchAsmParser.cpp | 10 +- .../MSP430/AsmParser/MSP430AsmParser.cpp | 8 +- .../Target/Mips/AsmParser/MipsAsmParser.cpp | 10 +- .../Target/PowerPC/AsmParser/PPCAsmParser.cpp | 123 +++++++++--------- .../Target/RISCV/AsmParser/RISCVAsmParser.cpp | 14 +- .../Target/Sparc/AsmParser/SparcAsmParser.cpp | 10 +- .../SystemZ/AsmParser/SystemZAsmParser.cpp | 8 +- llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp | 8 +- .../AsmParser/WebAssemblyAsmParser.cpp | 4 +- .../lib/Target/X86/AsmParser/X86AsmParser.cpp | 12 +- 20 files changed, 151 insertions(+), 160 deletions(-) diff --git a/llvm/include/llvm/MC/MCParser/MCTargetAsmParser.h b/llvm/include/llvm/MC/MCParser/MCTargetAsmParser.h index 49ce417e6fbb20..54ae436d90b283 100644 --- a/llvm/include/llvm/MC/MCParser/MCTargetAsmParser.h +++ b/llvm/include/llvm/MC/MCParser/MCTargetAsmParser.h @@ -426,7 +426,7 @@ class MCTargetAsmParser : public MCAsmParserExtension { virtual ParseStatus tryParseRegister(MCRegister &Reg, SMLoc &StartLoc, SMLoc &EndLoc) = 0; - /// ParseInstruction - Parse one assembly instruction. + /// Parse one assembly instruction. /// /// The parser is positioned following the instruction name. The target /// specific instruction parser should parse the entire instruction and @@ -439,11 +439,11 @@ class MCTargetAsmParser : public MCAsmParserExtension { /// \param Operands [out] - The list of parsed operands, this returns /// ownership of them to the caller. /// \return True on failure. - virtual bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, + virtual bool parseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, OperandVector &Operands) = 0; - virtual bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, + virtual bool parseInstruction(ParseInstructionInfo &Info, StringRef Name, AsmToken Token, OperandVector &Operands) { - return ParseInstruction(Info, Name, Token.getLoc(), Operands); + return parseInstruction(Info, Name, Token.getLoc(), Operands); } /// ParseDirective - Parse a target specific assembler directive @@ -471,19 +471,19 @@ class MCTargetAsmParser : public MCAsmParserExtension { /// \param DirectiveID - The token identifying the directive. virtual ParseStatus parseDirective(AsmToken DirectiveID); - /// MatchAndEmitInstruction - Recognize a series of operands of a parsed + /// Recognize a series of operands of a parsed /// instruction as an actual MCInst and emit it to the specified MCStreamer. /// This returns false on success and returns true on failure to match. /// /// On failure, the target parser is responsible for emitting a diagnostic /// explaining the match failure. - virtual bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, + virtual bool matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, OperandVector &Operands, MCStreamer &Out, uint64_t &ErrorInfo, bool MatchingInlineAsm) = 0; /// Allows targets to let registers opt out of clobber lists. 
- virtual bool OmitRegisterFromClobberLists(unsigned RegNo) { return false; } + virtual bool omitRegisterFromClobberLists(unsigned RegNo) { return false; } /// Allow a target to add special case operand matching for things that /// tblgen doesn't/can't handle effectively. For example, literal diff --git a/llvm/lib/MC/MCParser/AsmParser.cpp b/llvm/lib/MC/MCParser/AsmParser.cpp index 66e52fe2d08f8d..9eff35642c9f39 100644 --- a/llvm/lib/MC/MCParser/AsmParser.cpp +++ b/llvm/lib/MC/MCParser/AsmParser.cpp @@ -2322,7 +2322,7 @@ bool AsmParser::parseAndMatchAndEmitTargetInstruction(ParseStatementInfo &Info, // Canonicalize the opcode to lower case. std::string OpcodeStr = IDVal.lower(); ParseInstructionInfo IInfo(Info.AsmRewrites); - bool ParseHadError = getTargetParser().ParseInstruction(IInfo, OpcodeStr, ID, + bool ParseHadError = getTargetParser().parseInstruction(IInfo, OpcodeStr, ID, Info.ParsedOperands); Info.ParseError = ParseHadError; @@ -2379,7 +2379,7 @@ bool AsmParser::parseAndMatchAndEmitTargetInstruction(ParseStatementInfo &Info, // If parsing succeeded, match the instruction. if (!ParseHadError) { uint64_t ErrorInfo; - if (getTargetParser().MatchAndEmitInstruction( + if (getTargetParser().matchAndEmitInstruction( IDLoc, Info.Opcode, Info.ParsedOperands, Out, ErrorInfo, getTargetParser().isParsingMSInlineAsm())) return true; @@ -6029,7 +6029,7 @@ bool AsmParser::parseMSInlineAsm( // Register operand. if (Operand.isReg() && !Operand.needAddressOf() && - !getTargetParser().OmitRegisterFromClobberLists(Operand.getReg())) { + !getTargetParser().omitRegisterFromClobberLists(Operand.getReg())) { unsigned NumDefs = Desc.getNumDefs(); // Clobber. if (NumDefs && Operand.getMCOperandNum() < NumDefs) diff --git a/llvm/lib/MC/MCParser/MasmParser.cpp b/llvm/lib/MC/MCParser/MasmParser.cpp index 9f619c5018b509..0c64af9e460ea0 100644 --- a/llvm/lib/MC/MCParser/MasmParser.cpp +++ b/llvm/lib/MC/MCParser/MasmParser.cpp @@ -2657,7 +2657,7 @@ bool MasmParser::parseStatement(ParseStatementInfo &Info, // Canonicalize the opcode to lower case. std::string OpcodeStr = IDVal.lower(); ParseInstructionInfo IInfo(Info.AsmRewrites); - bool ParseHadError = getTargetParser().ParseInstruction(IInfo, OpcodeStr, ID, + bool ParseHadError = getTargetParser().parseInstruction(IInfo, OpcodeStr, ID, Info.ParsedOperands); Info.ParseError = ParseHadError; @@ -2714,7 +2714,7 @@ bool MasmParser::parseStatement(ParseStatementInfo &Info, // If parsing succeeded, match the instruction. if (!ParseHadError) { uint64_t ErrorInfo; - if (getTargetParser().MatchAndEmitInstruction( + if (getTargetParser().matchAndEmitInstruction( IDLoc, Info.Opcode, Info.ParsedOperands, Out, ErrorInfo, getTargetParser().isParsingMSInlineAsm())) return true; @@ -7389,7 +7389,7 @@ bool MasmParser::parseMSInlineAsm( // Register operand. if (Operand.isReg() && !Operand.needAddressOf() && - !getTargetParser().OmitRegisterFromClobberLists(Operand.getReg())) { + !getTargetParser().omitRegisterFromClobberLists(Operand.getReg())) { unsigned NumDefs = Desc.getNumDefs(); // Clobber. 
if (NumDefs && Operand.getMCOperandNum() < NumDefs) diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index 13a7eef4788524..4f6131fd835577 100644 --- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -231,12 +231,12 @@ class AArch64AsmParser : public MCTargetAsmParser { bool validateInstruction(MCInst &Inst, SMLoc &IDLoc, SmallVectorImpl &Loc); unsigned getNumRegsForRegKind(RegKind K); - bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, + bool matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, OperandVector &Operands, MCStreamer &Out, uint64_t &ErrorInfo, bool MatchingInlineAsm) override; -/// @name Auto-generated Match Functions -/// { + /// @name Auto-generated Match Functions + /// { #define GET_ASSEMBLER_HEADER #include "AArch64GenAsmMatcher.inc" @@ -321,7 +321,7 @@ class AArch64AsmParser : public MCTargetAsmParser { bool areEqualRegs(const MCParsedAsmOperand &Op1, const MCParsedAsmOperand &Op2) const override; - bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, + bool parseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, OperandVector &Operands) override; bool parseRegister(MCRegister &Reg, SMLoc &StartLoc, SMLoc &EndLoc) override; ParseStatus tryParseRegister(MCRegister &Reg, SMLoc &StartLoc, @@ -5086,9 +5086,8 @@ bool AArch64AsmParser::areEqualRegs(const MCParsedAsmOperand &Op1, return false; } -/// ParseInstruction - Parse an AArch64 instruction mnemonic followed by its -/// operands. -bool AArch64AsmParser::ParseInstruction(ParseInstructionInfo &Info, +/// Parse an AArch64 instruction mnemonic followed by its operands. +bool AArch64AsmParser::parseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, OperandVector &Operands) { Name = StringSwitch(Name.lower()) @@ -6205,7 +6204,7 @@ bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode, static const char *getSubtargetFeatureName(uint64_t Val); -bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, +bool AArch64AsmParser::matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, OperandVector &Operands, MCStreamer &Out, uint64_t &ErrorInfo, diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 5db6c52d189e37..bab3f8a08781da 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -1256,7 +1256,7 @@ class KernelScopeInfo { } void usesAgprAt(int i) { - // Instruction will error in AMDGPUAsmParser::MatchAndEmitInstruction + // Instruction will error in AMDGPUAsmParser::matchAndEmitInstruction if (!hasMAIInsts(*MSTI)) return; @@ -1597,7 +1597,7 @@ class AMDGPUAsmParser : public MCTargetAsmParser { unsigned checkTargetMatchPredicate(MCInst &Inst) override; unsigned validateTargetOperandClass(MCParsedAsmOperand &Op, unsigned Kind) override; - bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, + bool matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, OperandVector &Operands, MCStreamer &Out, uint64_t &ErrorInfo, bool MatchingInlineAsm) override; @@ -1605,7 +1605,7 @@ class AMDGPUAsmParser : public MCTargetAsmParser { ParseStatus parseOperand(OperandVector &Operands, StringRef Mnemonic, OperandMode Mode = OperandMode_Default); StringRef parseMnemonicSuffix(StringRef Name); - bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, + 
bool parseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, OperandVector &Operands) override; //bool ProcessInstruction(MCInst &Inst); @@ -5288,7 +5288,7 @@ static bool isInvalidVOPDY(const OperandVector &Operands, return false; } -bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, +bool AMDGPUAsmParser::matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, OperandVector &Operands, MCStreamer &Out, uint64_t &ErrorInfo, @@ -6393,9 +6393,9 @@ static void applyMnemonicAliases(StringRef &Mnemonic, const FeatureBitset &Features, unsigned VariantID); -bool AMDGPUAsmParser::ParseInstruction(ParseInstructionInfo &Info, - StringRef Name, - SMLoc NameLoc, OperandVector &Operands) { +bool AMDGPUAsmParser::parseInstruction(ParseInstructionInfo &Info, + StringRef Name, SMLoc NameLoc, + OperandVector &Operands) { // Add the instruction mnemonic Name = parseMnemonicSuffix(Name); diff --git a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp index 3e3f134d347016..7d74f86c164fc7 100644 --- a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -712,7 +712,7 @@ class ARMAsmParser : public MCTargetAsmParser { bool parseRegister(MCRegister &Reg, SMLoc &StartLoc, SMLoc &EndLoc) override; ParseStatus tryParseRegister(MCRegister &Reg, SMLoc &StartLoc, SMLoc &EndLoc) override; - bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, + bool parseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, OperandVector &Operands) override; bool ParseDirective(AsmToken DirectiveID) override; @@ -723,7 +723,7 @@ class ARMAsmParser : public MCTargetAsmParser { checkEarlyTargetMatchPredicate(MCInst &Inst, const OperandVector &Operands) override; - bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, + bool matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, OperandVector &Operands, MCStreamer &Out, uint64_t &ErrorInfo, bool MatchingInlineAsm) override; @@ -7051,7 +7051,7 @@ void removeVPTCondCode(OperandVector &Operands, unsigned &MnemonicOpsEndInd) { } /// Parse an arm instruction mnemonic followed by its operands. 
-bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, +bool ARMAsmParser::parseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, OperandVector &Operands) { MCAsmParser &Parser = getParser(); @@ -11350,7 +11350,7 @@ static std::string ARMMnemonicSpellCheck(StringRef S, const FeatureBitset &FBS, unsigned VariantID = 0); static const char *getSubtargetFeatureName(uint64_t Val); -bool ARMAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, +bool ARMAsmParser::matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, OperandVector &Operands, MCStreamer &Out, uint64_t &ErrorInfo, bool MatchingInlineAsm) { @@ -11427,7 +11427,7 @@ bool ARMAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, llvm_unreachable("Implement any new match types added!"); } -/// parseDirective parses the arm specific directives +/// ParseDirective parses the arm specific directives bool ARMAsmParser::ParseDirective(AsmToken DirectiveID) { const MCContext::Environment Format = getContext().getObjectFileType(); bool IsMachO = Format == MCContext::IsMachO; @@ -12120,7 +12120,7 @@ bool ARMAsmParser::parseDirectiveSetFP(SMLoc L) { return false; } -/// parseDirective +/// parseDirectivePad /// ::= .pad offset bool ARMAsmParser::parseDirectivePad(SMLoc L) { MCAsmParser &Parser = getParser(); diff --git a/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp b/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp index 193722fa356114..b4971e43b48ebf 100644 --- a/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp +++ b/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp @@ -51,7 +51,7 @@ class AVRAsmParser : public MCTargetAsmParser { #define GET_ASSEMBLER_HEADER #include "AVRGenAsmMatcher.inc" - bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, + bool matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, OperandVector &Operands, MCStreamer &Out, uint64_t &ErrorInfo, bool MatchingInlineAsm) override; @@ -60,7 +60,7 @@ class AVRAsmParser : public MCTargetAsmParser { ParseStatus tryParseRegister(MCRegister &Reg, SMLoc &StartLoc, SMLoc &EndLoc) override; - bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, + bool parseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, OperandVector &Operands) override; ParseStatus parseDirective(AsmToken DirectiveID) override; @@ -320,7 +320,7 @@ bool AVRAsmParser::emit(MCInst &Inst, SMLoc const &Loc, MCStreamer &Out) const { return false; } -bool AVRAsmParser::MatchAndEmitInstruction(SMLoc Loc, unsigned &Opcode, +bool AVRAsmParser::matchAndEmitInstruction(SMLoc Loc, unsigned &Opcode, OperandVector &Operands, MCStreamer &Out, uint64_t &ErrorInfo, bool MatchingInlineAsm) { @@ -623,7 +623,7 @@ void AVRAsmParser::eatComma() { } } -bool AVRAsmParser::ParseInstruction(ParseInstructionInfo &Info, +bool AVRAsmParser::parseInstruction(ParseInstructionInfo &Info, StringRef Mnemonic, SMLoc NameLoc, OperandVector &Operands) { Operands.push_back(AVROperand::CreateToken(Mnemonic, NameLoc)); diff --git a/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp b/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp index 9672ed009e9be1..06b7743e0cd310 100644 --- a/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp +++ b/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp @@ -34,7 +34,7 @@ class BPFAsmParser : public MCTargetAsmParser { bool PreMatchCheck(OperandVector &Operands); - bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, + bool matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, OperandVector &Operands, MCStreamer &Out, 
uint64_t &ErrorInfo, bool MatchingInlineAsm) override; @@ -43,7 +43,7 @@ class BPFAsmParser : public MCTargetAsmParser { ParseStatus tryParseRegister(MCRegister &Reg, SMLoc &StartLoc, SMLoc &EndLoc) override; - bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, + bool parseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, OperandVector &Operands) override; // "=" is used as assignment operator for assembly statment, so can't be used @@ -304,7 +304,7 @@ bool BPFAsmParser::PreMatchCheck(OperandVector &Operands) { return false; } -bool BPFAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, +bool BPFAsmParser::matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, OperandVector &Operands, MCStreamer &Out, uint64_t &ErrorInfo, bool MatchingInlineAsm) { @@ -483,9 +483,8 @@ ParseStatus BPFAsmParser::parseImmediate(OperandVector &Operands) { return ParseStatus::Success; } -/// ParseInstruction - Parse an BPF instruction which is in BPF verifier -/// format. -bool BPFAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, +/// Parse an BPF instruction which is in BPF verifier format. +bool BPFAsmParser::parseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, OperandVector &Operands) { // The first operand could be either register or actually an operator. unsigned RegNo = MatchRegisterName(Name); diff --git a/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp b/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp index 2fb1c484fc8a14..62f188957cccf5 100644 --- a/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp +++ b/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp @@ -134,7 +134,7 @@ class HexagonAsmParser : public MCTargetAsmParser { OperandVector &InstOperands, uint64_t &ErrorInfo, bool MatchingInlineAsm); void eatToEndOfPacket(); - bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, + bool matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, OperandVector &Operands, MCStreamer &Out, uint64_t &ErrorInfo, bool MatchingInlineAsm) override; @@ -180,12 +180,12 @@ class HexagonAsmParser : public MCTargetAsmParser { bool parseExpressionOrOperand(OperandVector &Operands); bool parseExpression(MCExpr const *&Expr); - bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, + bool parseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, OperandVector &Operands) override { llvm_unreachable("Unimplemented"); } - bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, AsmToken ID, + bool parseInstruction(ParseInstructionInfo &Info, StringRef Name, AsmToken ID, OperandVector &Operands) override; bool ParseDirective(AsmToken DirectiveID) override; @@ -614,7 +614,7 @@ void HexagonAsmParser::eatToEndOfPacket() { InBrackets = false; } -bool HexagonAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, +bool HexagonAsmParser::matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, OperandVector &Operands, MCStreamer &Out, uint64_t &ErrorInfo, @@ -1278,7 +1278,7 @@ bool HexagonAsmParser::parseInstruction(OperandVector &Operands) { } } -bool HexagonAsmParser::ParseInstruction(ParseInstructionInfo &Info, +bool HexagonAsmParser::parseInstruction(ParseInstructionInfo &Info, StringRef Name, AsmToken ID, OperandVector &Operands) { getLexer().UnLex(ID); diff --git a/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp b/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp index 6ab1375b974ec6..280f1f3ddbb69f 100644 --- 
a/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp +++ b/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp @@ -62,14 +62,14 @@ class LanaiAsmParser : public MCTargetAsmParser { bool parsePrePost(StringRef Type, int *OffsetValue); - bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, + bool parseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, OperandVector &Operands) override; bool parseRegister(MCRegister &Reg, SMLoc &StartLoc, SMLoc &EndLoc) override; ParseStatus tryParseRegister(MCRegister &Reg, SMLoc &StartLoc, SMLoc &EndLoc) override; - bool MatchAndEmitInstruction(SMLoc IdLoc, unsigned &Opcode, + bool matchAndEmitInstruction(SMLoc IdLoc, unsigned &Opcode, OperandVector &Operands, MCStreamer &Out, uint64_t &ErrorInfo, bool MatchingInlineAsm) override; @@ -645,7 +645,7 @@ struct LanaiOperand : public MCParsedAsmOperand { } // end anonymous namespace -bool LanaiAsmParser::MatchAndEmitInstruction(SMLoc IdLoc, unsigned &Opcode, +bool LanaiAsmParser::matchAndEmitInstruction(SMLoc IdLoc, unsigned &Opcode, OperandVector &Operands, MCStreamer &Out, uint64_t &ErrorInfo, @@ -1161,7 +1161,7 @@ static bool MaybePredicatedInst(const OperandVector &Operands) { .Default(false); } -bool LanaiAsmParser::ParseInstruction(ParseInstructionInfo & /*Info*/, +bool LanaiAsmParser::parseInstruction(ParseInstructionInfo & /*Info*/, StringRef Name, SMLoc NameLoc, OperandVector &Operands) { // First operand is token for instruction diff --git a/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp b/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp index b8f1cdfd2cb354..57c42024b4d2b2 100644 --- a/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp +++ b/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp @@ -47,10 +47,10 @@ class LoongArchAsmParser : public MCTargetAsmParser { ParseStatus tryParseRegister(MCRegister &Reg, SMLoc &StartLoc, SMLoc &EndLoc) override; - bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, + bool parseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, OperandVector &Operands) override; - bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, + bool matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, OperandVector &Operands, MCStreamer &Out, uint64_t &ErrorInfo, bool MatchingInlineAsm) override; @@ -65,7 +65,7 @@ class LoongArchAsmParser : public MCTargetAsmParser { const Twine &Msg); /// Helper for processing MC instructions that have been successfully matched - /// by MatchAndEmitInstruction. + /// by matchAndEmitInstruction. bool processInstruction(MCInst &Inst, SMLoc IDLoc, OperandVector &Operands, MCStreamer &Out); @@ -793,7 +793,7 @@ bool LoongArchAsmParser::parseOperand(OperandVector &Operands, return Error(getLoc(), "unknown operand"); } -bool LoongArchAsmParser::ParseInstruction(ParseInstructionInfo &Info, +bool LoongArchAsmParser::parseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, OperandVector &Operands) { // First operand in MCInst is instruction mnemonic. 
@@ -1506,7 +1506,7 @@ bool LoongArchAsmParser::generateImmOutOfRangeError( return Error(ErrorLoc, Msg + " [" + Twine(Lower) + ", " + Twine(Upper) + "]"); } -bool LoongArchAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, +bool LoongArchAsmParser::matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, OperandVector &Operands, MCStreamer &Out, uint64_t &ErrorInfo, diff --git a/llvm/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp b/llvm/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp index 2bc1a89ef59cf0..34ae80669f2c3c 100644 --- a/llvm/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp +++ b/llvm/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp @@ -40,7 +40,7 @@ class MSP430AsmParser : public MCTargetAsmParser { MCAsmParser &Parser; const MCRegisterInfo *MRI; - bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, + bool matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, OperandVector &Operands, MCStreamer &Out, uint64_t &ErrorInfo, bool MatchingInlineAsm) override; @@ -49,7 +49,7 @@ class MSP430AsmParser : public MCTargetAsmParser { ParseStatus tryParseRegister(MCRegister &Reg, SMLoc &StartLoc, SMLoc &EndLoc) override; - bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, + bool parseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, OperandVector &Operands) override; ParseStatus parseDirective(AsmToken DirectiveID) override; @@ -252,7 +252,7 @@ class MSP430Operand : public MCParsedAsmOperand { }; } // end anonymous namespace -bool MSP430AsmParser::MatchAndEmitInstruction(SMLoc Loc, unsigned &Opcode, +bool MSP430AsmParser::matchAndEmitInstruction(SMLoc Loc, unsigned &Opcode, OperandVector &Operands, MCStreamer &Out, uint64_t &ErrorInfo, @@ -385,7 +385,7 @@ bool MSP430AsmParser::parseJccInstruction(ParseInstructionInfo &Info, return false; } -bool MSP430AsmParser::ParseInstruction(ParseInstructionInfo &Info, +bool MSP430AsmParser::parseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, OperandVector &Operands) { // Drop .w suffix diff --git a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp index 8ab435c6c6fd18..7888c57363ed33 100644 --- a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp +++ b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp @@ -174,7 +174,7 @@ class MipsAsmParser : public MCTargetAsmParser { const OperandVector &Operands) override; unsigned checkTargetMatchPredicate(MCInst &Inst) override; - bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, + bool matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, OperandVector &Operands, MCStreamer &Out, uint64_t &ErrorInfo, bool MatchingInlineAsm) override; @@ -190,7 +190,7 @@ class MipsAsmParser : public MCTargetAsmParser { bool mnemonicIsValid(StringRef Mnemonic, unsigned VariantID); - bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, + bool parseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, OperandVector &Operands) override; bool ParseDirective(AsmToken DirectiveID) override; @@ -5992,7 +5992,7 @@ static SMLoc RefineErrorLoc(const SMLoc Loc, const OperandVector &Operands, return Loc; } -bool MipsAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, +bool MipsAsmParser::matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, OperandVector &Operands, MCStreamer &Out, uint64_t &ErrorInfo, @@ -6997,10 +6997,10 @@ bool MipsAsmParser::areEqualRegs(const MCParsedAsmOperand &Op1, return Op1.getReg() == Op2.getReg(); } -bool 
MipsAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, +bool MipsAsmParser::parseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, OperandVector &Operands) { MCAsmParser &Parser = getParser(); - LLVM_DEBUG(dbgs() << "ParseInstruction\n"); + LLVM_DEBUG(dbgs() << "parseInstruction\n"); // We have reached first instruction, module directive are now forbidden. getTargetStreamer().forbidModuleDirective(); diff --git a/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp b/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp index 59ad995b44b04a..597a976b076a52 100644 --- a/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp +++ b/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp @@ -103,32 +103,32 @@ class PPCAsmParser : public MCTargetAsmParser { bool isPPC64() const { return IsPPC64; } - bool MatchRegisterName(MCRegister &RegNo, int64_t &IntVal); + bool matchRegisterName(MCRegister &RegNo, int64_t &IntVal); bool parseRegister(MCRegister &Reg, SMLoc &StartLoc, SMLoc &EndLoc) override; ParseStatus tryParseRegister(MCRegister &Reg, SMLoc &StartLoc, SMLoc &EndLoc) override; - const MCExpr *ExtractModifierFromExpr(const MCExpr *E, + const MCExpr *extractModifierFromExpr(const MCExpr *E, PPCMCExpr::VariantKind &Variant); - const MCExpr *FixupVariantKind(const MCExpr *E); - bool ParseExpression(const MCExpr *&EVal); + const MCExpr *fixupVariantKind(const MCExpr *E); + bool parseExpression(const MCExpr *&EVal); - bool ParseOperand(OperandVector &Operands); + bool parseOperand(OperandVector &Operands); - bool ParseDirectiveWord(unsigned Size, AsmToken ID); - bool ParseDirectiveTC(unsigned Size, AsmToken ID); - bool ParseDirectiveMachine(SMLoc L); - bool ParseDirectiveAbiVersion(SMLoc L); - bool ParseDirectiveLocalEntry(SMLoc L); - bool ParseGNUAttribute(SMLoc L); + bool parseDirectiveWord(unsigned Size, AsmToken ID); + bool parseDirectiveTC(unsigned Size, AsmToken ID); + bool parseDirectiveMachine(SMLoc L); + bool parseDirectiveAbiVersion(SMLoc L); + bool parseDirectiveLocalEntry(SMLoc L); + bool parseGNUAttribute(SMLoc L); - bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, + bool matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, OperandVector &Operands, MCStreamer &Out, uint64_t &ErrorInfo, bool MatchingInlineAsm) override; - void ProcessInstruction(MCInst &Inst, const OperandVector &Ops); + void processInstruction(MCInst &Inst, const OperandVector &Ops); /// @name Auto-generated Match Functions /// { @@ -150,7 +150,7 @@ class PPCAsmParser : public MCTargetAsmParser { setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits())); } - bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, + bool parseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, OperandVector &Operands) override; bool ParseDirective(AsmToken DirectiveID) override; @@ -818,7 +818,7 @@ addNegOperand(MCInst &Inst, MCOperand &Op, MCContext &Ctx) { Inst.addOperand(MCOperand::createExpr(MCUnaryExpr::createMinus(Expr, Ctx))); } -void PPCAsmParser::ProcessInstruction(MCInst &Inst, +void PPCAsmParser::processInstruction(MCInst &Inst, const OperandVector &Operands) { int Opcode = Inst.getOpcode(); switch (Opcode) { @@ -1252,7 +1252,7 @@ void PPCAsmParser::ProcessInstruction(MCInst &Inst, static std::string PPCMnemonicSpellCheck(StringRef S, const FeatureBitset &FBS, unsigned VariantID = 0); -bool PPCAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, +bool PPCAsmParser::matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, 
OperandVector &Operands, MCStreamer &Out, uint64_t &ErrorInfo, bool MatchingInlineAsm) { @@ -1261,7 +1261,7 @@ bool PPCAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, switch (MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm)) { case Match_Success: // Post-process instructions (typically extended mnemonics) - ProcessInstruction(Inst, Operands); + processInstruction(Inst, Operands); Inst.setLoc(IDLoc); Out.emitInstruction(Inst, getSTI()); return false; @@ -1291,7 +1291,7 @@ bool PPCAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, llvm_unreachable("Implement any new match types added!"); } -bool PPCAsmParser::MatchRegisterName(MCRegister &RegNo, int64_t &IntVal) { +bool PPCAsmParser::matchRegisterName(MCRegister &RegNo, int64_t &IntVal) { if (getParser().getTok().is(AsmToken::Percent)) getParser().Lex(); // Eat the '%'. @@ -1364,7 +1364,7 @@ ParseStatus PPCAsmParser::tryParseRegister(MCRegister &Reg, SMLoc &StartLoc, EndLoc = Tok.getEndLoc(); Reg = PPC::NoRegister; int64_t IntVal; - if (MatchRegisterName(Reg, IntVal)) + if (matchRegisterName(Reg, IntVal)) return ParseStatus::NoMatch; return ParseStatus::Success; } @@ -1375,9 +1375,9 @@ ParseStatus PPCAsmParser::tryParseRegister(MCRegister &Reg, SMLoc &StartLoc, /// variant, return the corresponding PPCMCExpr::VariantKind, /// and a modified expression using the default symbol variant. /// Otherwise, return NULL. -const MCExpr *PPCAsmParser:: -ExtractModifierFromExpr(const MCExpr *E, - PPCMCExpr::VariantKind &Variant) { +const MCExpr * +PPCAsmParser::extractModifierFromExpr(const MCExpr *E, + PPCMCExpr::VariantKind &Variant) { MCContext &Context = getParser().getContext(); Variant = PPCMCExpr::VK_PPC_None; @@ -1426,7 +1426,7 @@ ExtractModifierFromExpr(const MCExpr *E, case MCExpr::Unary: { const MCUnaryExpr *UE = cast<MCUnaryExpr>(E); - const MCExpr *Sub = ExtractModifierFromExpr(UE->getSubExpr(), Variant); + const MCExpr *Sub = extractModifierFromExpr(UE->getSubExpr(), Variant); if (!Sub) return nullptr; return MCUnaryExpr::create(UE->getOpcode(), Sub, Context); @@ -1435,8 +1435,8 @@ ExtractModifierFromExpr(const MCExpr *E, case MCExpr::Binary: { const MCBinaryExpr *BE = cast<MCBinaryExpr>(E); PPCMCExpr::VariantKind LHSVariant, RHSVariant; - const MCExpr *LHS = ExtractModifierFromExpr(BE->getLHS(), LHSVariant); - const MCExpr *RHS = ExtractModifierFromExpr(BE->getRHS(), RHSVariant); + const MCExpr *LHS = extractModifierFromExpr(BE->getLHS(), LHSVariant); + const MCExpr *RHS = extractModifierFromExpr(BE->getRHS(), RHSVariant); if (!LHS && !RHS) return nullptr; @@ -1464,8 +1464,7 @@ ExtractModifierFromExpr(const MCExpr *E, /// them by VK_PPC_TLSGD/VK_PPC_TLSLD. This is necessary to avoid having /// _GLOBAL_OFFSET_TABLE_ created via ELFObjectWriter::RelocNeedsGOT. /// FIXME: This is a hack.
-const MCExpr *PPCAsmParser:: -FixupVariantKind(const MCExpr *E) { +const MCExpr *PPCAsmParser::fixupVariantKind(const MCExpr *E) { MCContext &Context = getParser().getContext(); switch (E->getKind()) { @@ -1492,7 +1491,7 @@ FixupVariantKind(const MCExpr *E) { case MCExpr::Unary: { const MCUnaryExpr *UE = cast<MCUnaryExpr>(E); - const MCExpr *Sub = FixupVariantKind(UE->getSubExpr()); + const MCExpr *Sub = fixupVariantKind(UE->getSubExpr()); if (Sub == UE->getSubExpr()) return E; return MCUnaryExpr::create(UE->getOpcode(), Sub, Context); @@ -1500,8 +1499,8 @@ FixupVariantKind(const MCExpr *E) { case MCExpr::Binary: { const MCBinaryExpr *BE = cast<MCBinaryExpr>(E); - const MCExpr *LHS = FixupVariantKind(BE->getLHS()); - const MCExpr *RHS = FixupVariantKind(BE->getRHS()); + const MCExpr *LHS = fixupVariantKind(BE->getLHS()); + const MCExpr *RHS = fixupVariantKind(BE->getRHS()); if (LHS == BE->getLHS() && RHS == BE->getRHS()) return E; return MCBinaryExpr::create(BE->getOpcode(), LHS, RHS, Context); @@ -1511,29 +1510,27 @@ FixupVariantKind(const MCExpr *E) { llvm_unreachable("Invalid expression kind!"); } -/// ParseExpression. This differs from the default "parseExpression" in that -/// it handles modifiers. -bool PPCAsmParser:: -ParseExpression(const MCExpr *&EVal) { +/// This differs from the default "parseExpression" in that it handles +/// modifiers. +bool PPCAsmParser::parseExpression(const MCExpr *&EVal) { // (ELF Platforms) // Handle \code @l/@ha \endcode if (getParser().parseExpression(EVal)) return true; - EVal = FixupVariantKind(EVal); + EVal = fixupVariantKind(EVal); PPCMCExpr::VariantKind Variant; - const MCExpr *E = ExtractModifierFromExpr(EVal, Variant); + const MCExpr *E = extractModifierFromExpr(EVal, Variant); if (E) EVal = PPCMCExpr::create(Variant, E, getParser().getContext()); return false; } -/// ParseOperand /// This handles registers in the form 'NN', '%rNN' for ELF platforms and /// rNN for MachO.
-bool PPCAsmParser::ParseOperand(OperandVector &Operands) { +bool PPCAsmParser::parseOperand(OperandVector &Operands) { MCAsmParser &Parser = getParser(); SMLoc S = Parser.getTok().getLoc(); SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1); @@ -1546,7 +1543,7 @@ bool PPCAsmParser::ParseOperand(OperandVector &Operands) { case AsmToken::Percent: { MCRegister RegNo; int64_t IntVal; - if (MatchRegisterName(RegNo, IntVal)) + if (matchRegisterName(RegNo, IntVal)) return Error(S, "invalid register name"); Operands.push_back(PPCOperand::CreateImm(IntVal, S, E, isPPC64())); @@ -1561,7 +1558,7 @@ bool PPCAsmParser::ParseOperand(OperandVector &Operands) { case AsmToken::Dollar: case AsmToken::Exclaim: case AsmToken::Tilde: - if (!ParseExpression(EVal)) + if (!parseExpression(EVal)) break; // Fall-through [[fallthrough]]; @@ -1589,7 +1586,7 @@ bool PPCAsmParser::ParseOperand(OperandVector &Operands) { if (TlsCall && parseOptionalToken(AsmToken::LParen)) { const MCExpr *TLSSym; const SMLoc S2 = Parser.getTok().getLoc(); - if (ParseExpression(TLSSym)) + if (parseExpression(TLSSym)) return Error(S2, "invalid TLS call expression"); E = Parser.getTok().getLoc(); if (parseToken(AsmToken::RParen, "expected ')'")) @@ -1631,7 +1628,7 @@ bool PPCAsmParser::ParseOperand(OperandVector &Operands) { switch (getLexer().getKind()) { case AsmToken::Percent: { MCRegister RegNo; - if (MatchRegisterName(RegNo, IntVal)) + if (matchRegisterName(RegNo, IntVal)) return Error(S, "invalid register name"); break; } @@ -1655,7 +1652,7 @@ bool PPCAsmParser::ParseOperand(OperandVector &Operands) { } /// Parse an instruction mnemonic followed by its operands. -bool PPCAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, +bool PPCAsmParser::parseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, OperandVector &Operands) { // The first operand is the token for the instruction name. // If the next character is a '+' or '-', we need to add it to the @@ -1695,11 +1692,11 @@ bool PPCAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, return false; // Parse the first operand - if (ParseOperand(Operands)) + if (parseOperand(Operands)) return true; while (!parseOptionalToken(AsmToken::EndOfStatement)) { - if (parseToken(AsmToken::Comma) || ParseOperand(Operands)) + if (parseToken(AsmToken::Comma) || parseOperand(Operands)) return true; } @@ -1731,31 +1728,30 @@ bool PPCAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, return false; } -/// ParseDirective parses the PPC specific directives +/// Parses the PPC specific directives bool PPCAsmParser::ParseDirective(AsmToken DirectiveID) { StringRef IDVal = DirectiveID.getIdentifier(); if (IDVal == ".word") - ParseDirectiveWord(2, DirectiveID); + parseDirectiveWord(2, DirectiveID); else if (IDVal == ".llong") - ParseDirectiveWord(8, DirectiveID); + parseDirectiveWord(8, DirectiveID); else if (IDVal == ".tc") - ParseDirectiveTC(isPPC64() ? 8 : 4, DirectiveID); + parseDirectiveTC(isPPC64() ? 
8 : 4, DirectiveID); else if (IDVal == ".machine") - ParseDirectiveMachine(DirectiveID.getLoc()); + parseDirectiveMachine(DirectiveID.getLoc()); else if (IDVal == ".abiversion") - ParseDirectiveAbiVersion(DirectiveID.getLoc()); + parseDirectiveAbiVersion(DirectiveID.getLoc()); else if (IDVal == ".localentry") - ParseDirectiveLocalEntry(DirectiveID.getLoc()); + parseDirectiveLocalEntry(DirectiveID.getLoc()); else if (IDVal.starts_with(".gnu_attribute")) - ParseGNUAttribute(DirectiveID.getLoc()); + parseGNUAttribute(DirectiveID.getLoc()); else return true; return false; } -/// ParseDirectiveWord /// ::= .word [ expression (, expression)* ] -bool PPCAsmParser::ParseDirectiveWord(unsigned Size, AsmToken ID) { +bool PPCAsmParser::parseDirectiveWord(unsigned Size, AsmToken ID) { auto parseOp = [&]() -> bool { const MCExpr *Value; SMLoc ExprLoc = getParser().getTok().getLoc(); @@ -1778,9 +1774,8 @@ bool PPCAsmParser::ParseDirectiveWord(unsigned Size, AsmToken ID) { return false; } -/// ParseDirectiveTC /// ::= .tc [ symbol (, expression)* ] -bool PPCAsmParser::ParseDirectiveTC(unsigned Size, AsmToken ID) { +bool PPCAsmParser::parseDirectiveTC(unsigned Size, AsmToken ID) { MCAsmParser &Parser = getParser(); // Skip TC symbol, which is only used with XCOFF. while (getLexer().isNot(AsmToken::EndOfStatement) @@ -1793,12 +1788,12 @@ bool PPCAsmParser::ParseDirectiveTC(unsigned Size, AsmToken ID) { getParser().getStreamer().emitValueToAlignment(Align(Size)); // Emit expressions. - return ParseDirectiveWord(Size, ID); + return parseDirectiveWord(Size, ID); } -/// ParseDirectiveMachine (ELF platforms) +/// ELF platforms. /// ::= .machine [ cpu | "push" | "pop" ] -bool PPCAsmParser::ParseDirectiveMachine(SMLoc L) { +bool PPCAsmParser::parseDirectiveMachine(SMLoc L) { MCAsmParser &Parser = getParser(); if (Parser.getTok().isNot(AsmToken::Identifier) && Parser.getTok().isNot(AsmToken::String)) @@ -1823,9 +1818,8 @@ bool PPCAsmParser::ParseDirectiveMachine(SMLoc L) { return false; } -/// ParseDirectiveAbiVersion /// ::= .abiversion constant-expression -bool PPCAsmParser::ParseDirectiveAbiVersion(SMLoc L) { +bool PPCAsmParser::parseDirectiveAbiVersion(SMLoc L) { int64_t AbiVersion; if (check(getParser().parseAbsoluteExpression(AbiVersion), L, "expected constant expression") || @@ -1840,9 +1834,8 @@ bool PPCAsmParser::ParseDirectiveAbiVersion(SMLoc L) { return false; } -/// ParseDirectiveLocalEntry /// ::= .localentry symbol, expression -bool PPCAsmParser::ParseDirectiveLocalEntry(SMLoc L) { +bool PPCAsmParser::parseDirectiveLocalEntry(SMLoc L) { StringRef Name; if (getParser().parseIdentifier(Name)) return Error(L, "expected identifier in '.localentry' directive"); @@ -1863,7 +1856,7 @@ bool PPCAsmParser::ParseDirectiveLocalEntry(SMLoc L) { return false; } -bool PPCAsmParser::ParseGNUAttribute(SMLoc L) { +bool PPCAsmParser::parseGNUAttribute(SMLoc L) { int64_t Tag; int64_t IntegerValue; if (!getParser().parseGNUAttribute(L, Tag, IntegerValue)) diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp index 6eb2058107610e..9600293d3da71d 100644 --- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp +++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp @@ -104,7 +104,7 @@ class RISCVAsmParser : public MCTargetAsmParser { bool generateImmOutOfRangeError(SMLoc ErrorLoc, int64_t Lower, int64_t Upper, const Twine &Msg); - bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, + bool matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, 
OperandVector &Operands, MCStreamer &Out, uint64_t &ErrorInfo, bool MatchingInlineAsm) override; @@ -114,7 +114,7 @@ class RISCVAsmParser : public MCTargetAsmParser { ParseStatus tryParseRegister(MCRegister &Reg, SMLoc &StartLoc, SMLoc &EndLoc) override; - bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, + bool parseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, OperandVector &Operands) override; ParseStatus parseDirective(AsmToken DirectiveID) override; @@ -182,7 +182,7 @@ class RISCVAsmParser : public MCTargetAsmParser { bool validateInstruction(MCInst &Inst, OperandVector &Operands); /// Helper for processing MC instructions that have been successfully matched - /// by MatchAndEmitInstruction. Modifications to the emitted instructions, + /// by matchAndEmitInstruction. Modifications to the emitted instructions, /// like the expansion of pseudo instructions (e.g., "li"), can be performed /// in this method. bool processInstruction(MCInst &Inst, SMLoc IDLoc, OperandVector &Operands, @@ -1376,7 +1376,7 @@ bool RISCVAsmParser::generateImmOutOfRangeError( return generateImmOutOfRangeError(ErrorLoc, Lower, Upper, Msg); } -bool RISCVAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, +bool RISCVAsmParser::matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, OperandVector &Operands, MCStreamer &Out, uint64_t &ErrorInfo, @@ -2732,7 +2732,7 @@ bool RISCVAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) { return true; } -bool RISCVAsmParser::ParseInstruction(ParseInstructionInfo &Info, +bool RISCVAsmParser::parseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, OperandVector &Operands) { // Ensure that if the instruction occurs when relaxation is enabled, @@ -3186,12 +3186,12 @@ bool RISCVAsmParser::parseDirectiveInsn(SMLoc L) { ParseInstructionInfo Info; SmallVector<std::unique_ptr<MCParsedAsmOperand>, 8> Operands; - if (ParseInstruction(Info, FormatName, L, Operands)) + if (parseInstruction(Info, FormatName, L, Operands)) return true; unsigned Opcode; uint64_t ErrorInfo; - return MatchAndEmitInstruction(L, Opcode, Operands, Parser.getStreamer(), + return matchAndEmitInstruction(L, Opcode, Operands, Parser.getStreamer(), ErrorInfo, /*MatchingInlineAsm=*/false); } diff --git a/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp b/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp index c1200df5d44dd6..c7a0bebea96943 100644 --- a/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp +++ b/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp @@ -70,14 +70,14 @@ class SparcAsmParser : public MCTargetAsmParser { /// } // public interface of the MCTargetAsmParser.
- bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, + bool matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, OperandVector &Operands, MCStreamer &Out, uint64_t &ErrorInfo, bool MatchingInlineAsm) override; bool parseRegister(MCRegister &Reg, SMLoc &StartLoc, SMLoc &EndLoc) override; ParseStatus tryParseRegister(MCRegister &Reg, SMLoc &StartLoc, SMLoc &EndLoc) override; - bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, + bool parseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, OperandVector &Operands) override; ParseStatus parseDirective(AsmToken DirectiveID) override; @@ -789,7 +789,7 @@ bool SparcAsmParser::expandSETX(MCInst &Inst, SMLoc IDLoc, return false; } -bool SparcAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, +bool SparcAsmParser::matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, OperandVector &Operands, MCStreamer &Out, uint64_t &ErrorInfo, @@ -871,14 +871,14 @@ ParseStatus SparcAsmParser::tryParseRegister(MCRegister &Reg, SMLoc &StartLoc, return ParseStatus::NoMatch; } -bool SparcAsmParser::ParseInstruction(ParseInstructionInfo &Info, +bool SparcAsmParser::parseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, OperandVector &Operands) { // Validate and reject unavailable mnemonics early before // running any operand parsing. // This is needed because some operands (mainly memory ones) // differ between V8 and V9 ISA and so any operand parsing errors - // will cause IAS to bail out before it reaches MatchAndEmitInstruction + // will cause IAS to bail out before it reaches matchAndEmitInstruction // (where the instruction as a whole, including the mnemonic, is validated // once again just before emission). // As a nice side effect this also allows us to reject unknown diff --git a/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp b/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp index 7c3898ac67312d..5b26ba08dbdb6b 100644 --- a/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp +++ b/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp @@ -499,9 +499,9 @@ class SystemZAsmParser : public MCTargetAsmParser { bool RequirePercent, bool RestoreOnFailure); ParseStatus tryParseRegister(MCRegister &Reg, SMLoc &StartLoc, SMLoc &EndLoc) override; - bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, + bool parseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, OperandVector &Operands) override; - bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, + bool matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, OperandVector &Operands, MCStreamer &Out, uint64_t &ErrorInfo, bool MatchingInlineAsm) override; @@ -1401,7 +1401,7 @@ ParseStatus SystemZAsmParser::tryParseRegister(MCRegister &Reg, SMLoc &StartLoc, return ParseStatus::Success; } -bool SystemZAsmParser::ParseInstruction(ParseInstructionInfo &Info, +bool SystemZAsmParser::parseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, OperandVector &Operands) { @@ -1526,7 +1526,7 @@ bool SystemZAsmParser::parseOperand(OperandVector &Operands, return false; } -bool SystemZAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, +bool SystemZAsmParser::matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, OperandVector &Operands, MCStreamer &Out, uint64_t &ErrorInfo, diff --git a/llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp b/llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp index 691fe8fe3aa446..5073894cc7fbe1 100644 --- 
a/llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp +++ b/llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp @@ -51,7 +51,7 @@ class VEAsmParser : public MCTargetAsmParser { /// } // public interface of the MCTargetAsmParser. - bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, + bool matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, OperandVector &Operands, MCStreamer &Out, uint64_t &ErrorInfo, bool MatchingInlineAsm) override; @@ -59,7 +59,7 @@ class VEAsmParser : public MCTargetAsmParser { int parseRegisterName(MCRegister (*matchFn)(StringRef)); ParseStatus tryParseRegister(MCRegister &Reg, SMLoc &StartLoc, SMLoc &EndLoc) override; - bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, + bool parseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, OperandVector &Operands) override; ParseStatus parseDirective(AsmToken DirectiveID) override; @@ -760,7 +760,7 @@ class VEOperand : public MCParsedAsmOperand { } // end anonymous namespace -bool VEAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, +bool VEAsmParser::matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, OperandVector &Operands, MCStreamer &Out, uint64_t &ErrorInfo, bool MatchingInlineAsm) { @@ -965,7 +965,7 @@ static void applyMnemonicAliases(StringRef &Mnemonic, const FeatureBitset &Features, unsigned VariantID); -bool VEAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, +bool VEAsmParser::parseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, OperandVector &Operands) { // If the target architecture uses MnemonicAlias, call it here to parse // operands correctly. diff --git a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp index 4fef5fa0ef2208..8e8d08f775638b 100644 --- a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp +++ b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp @@ -562,7 +562,7 @@ class WebAssemblyAsmParser final : public MCTargetAsmParser { } } - bool ParseInstruction(ParseInstructionInfo & /*Info*/, StringRef Name, + bool parseInstruction(ParseInstructionInfo & /*Info*/, StringRef Name, SMLoc NameLoc, OperandVector &Operands) override { // Note: Name does NOT point into the sourcecode, but to a local, so // use NameLoc instead. @@ -1127,7 +1127,7 @@ class WebAssemblyAsmParser final : public MCTargetAsmParser { } } - bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned & /*Opcode*/, + bool matchAndEmitInstruction(SMLoc IDLoc, unsigned & /*Opcode*/, OperandVector &Operands, MCStreamer &Out, uint64_t &ErrorInfo, bool MatchingInlineAsm) override { diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp index 735f9dcefb97f7..97079cba143ab3 100644 --- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -1195,7 +1195,7 @@ class X86AsmParser : public MCTargetAsmParser { /// instrumentation around Inst. 
void emitInstruction(MCInst &Inst, OperandVector &Operands, MCStreamer &Out); - bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, + bool matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, OperandVector &Operands, MCStreamer &Out, uint64_t &ErrorInfo, bool MatchingInlineAsm) override; @@ -1215,7 +1215,7 @@ class X86AsmParser : public MCTargetAsmParser { uint64_t &ErrorInfo, bool MatchingInlineAsm); - bool OmitRegisterFromClobberLists(unsigned RegNo) override; + bool omitRegisterFromClobberLists(unsigned RegNo) override; /// Parses AVX512 specific operand primitives: masked registers ({%k}, {z}) /// and memory broadcasting ({1to}) primitives, updating Operands vector if required. @@ -1290,7 +1290,7 @@ class X86AsmParser : public MCTargetAsmParser { bool parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) override; - bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, + bool parseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, OperandVector &Operands) override; bool ParseDirective(AsmToken DirectiveID) override; @@ -3186,7 +3186,7 @@ bool X86AsmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) { return Parser.parsePrimaryExpr(Res, EndLoc, nullptr); } -bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, +bool X86AsmParser::parseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, OperandVector &Operands) { MCAsmParser &Parser = getParser(); InstInfo = &Info; @@ -4121,7 +4121,7 @@ static unsigned getPrefixes(OperandVector &Operands) { return Result; } -bool X86AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, +bool X86AsmParser::matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, OperandVector &Operands, MCStreamer &Out, uint64_t &ErrorInfo, bool MatchingInlineAsm) { @@ -4659,7 +4659,7 @@ bool X86AsmParser::matchAndEmitIntelInstruction( MatchingInlineAsm); } -bool X86AsmParser::OmitRegisterFromClobberLists(unsigned RegNo) { +bool X86AsmParser::omitRegisterFromClobberLists(unsigned RegNo) { return X86MCRegisterClasses[X86::SEGMENT_REGRegClassID].contains(RegNo); } From 2c85fe96893c9c67a96e5b37f1cd79ded3a03344 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Wed, 18 Sep 2024 16:44:58 +0100 Subject: [PATCH 075/321] [AMDGPU] Remove miscellaneous unused code. NFC. 
--- llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 6 +----- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 2 -- 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index f2c9619cb8276a..bc771d4ef6c080 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -3076,11 +3076,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl( case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR: case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR: case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC: - case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: { - applyDefaultMapping(OpdMapper); - executeInWaterfallLoop(B, MI, {2, 5}); - return; - } + case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN: case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: { diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 10108866a7005a..b197f38d054fc0 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -10013,8 +10013,6 @@ SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer, if (!MaybePointer.getValueType().isScalarInteger()) return MaybePointer; - SDLoc DL(MaybePointer); - SDValue Rsrc = DAG.getBitcast(MVT::v4i32, MaybePointer); return Rsrc; } From eda72fac548f317cec997967494763e9a7bafa27 Mon Sep 17 00:00:00 2001 From: Vakhurin Sergei Date: Wed, 18 Sep 2024 18:46:25 +0300 Subject: [PATCH 076/321] Fix OOM in FormatDiagnostic (2nd attempt) (#108866) Resolves: #70930 (and probably the latest comments from clangd/clangd#251) by fixing a race on the shared DiagStorage value. The race corrupted the args held in the storage, so a message such as the following could be formatted with getArgSInt(1) == 2: def err_module_odr_violation_function : Error< "%q0 has different definitions in different modules; " "%select{definition in module '%2'|defined here}1 " "first difference is " which made HandleSelectModifier read past ArgumentLen, so the recursive call to FormatDiagnostic ran with DiagStr > DiagEnd, leading to an infinite while (DiagStr != DiagEnd) loop. The main idea: reuse the existing DiagStorageAllocator logic so that every DiagnosticBuilder has independent state, and encapsulate the rest of the per-diagnostic state (e.g. ID and Loc) in DiagnosticBuilder.
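To make the fix easier to picture: previously every DiagnosticBuilder aliased the single DiagnosticStorage embedded in the DiagnosticsEngine, so two in-flight diagnostics could interleave their arguments; with this patch each builder draws private storage from the engine's DiagStorageAllocator free-list cache. Below is a minimal standalone sketch of that free-list pattern, using illustrative names (Storage, StorageAllocator) rather than the actual clang types:

// Standalone sketch (not clang code): the free-list cache pattern that
// DiagStorageAllocator uses so each in-flight diagnostic owns its storage.
#include <cassert>
#include <vector>

struct Storage {
  std::vector<int> Args; // stand-in for the diagnostic argument arrays
};

class StorageAllocator {
  static const unsigned NumCached = 16;
  Storage Cached[NumCached];    // fixed-size cache, no heap traffic when hot
  Storage *FreeList[NumCached];
  unsigned NumFreeListEntries = NumCached;

public:
  StorageAllocator() {
    for (unsigned I = 0; I != NumCached; ++I)
      FreeList[I] = Cached + I;
  }

  Storage *Allocate() {
    if (NumFreeListEntries == 0)
      return new Storage; // cache exhausted: fall back to the heap
    Storage *Result = FreeList[--NumFreeListEntries];
    Result->Args.clear(); // recycle: reset any stale state
    return Result;
  }

  void Deallocate(Storage *S) {
    if (S >= Cached && S < Cached + NumCached) {
      FreeList[NumFreeListEntries++] = S; // cached object: back on the list
      return;
    }
    delete S; // heap fallback: really free it
  }
};

int main() {
  StorageAllocator Alloc;
  Storage *A = Alloc.Allocate();
  Storage *B = Alloc.Allocate();
  assert(A != B && "two in-flight diagnostics must not share storage");
  Alloc.Deallocate(B);
  Alloc.Deallocate(A);
}

Because every builder gets a distinct storage object, formatting one diagnostic can no longer clobber the arguments of another, which is exactly the interleaving that the ODR-violation message above was hitting.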
The last attempt failed (https://github.com/llvm/llvm-project/pull/108187#issuecomment-2353122096) and was reverted in #108838. --- .../ClangTidyDiagnosticConsumer.cpp | 2 - .../clangd/unittests/ConfigCompileTests.cpp | 12 +- clang/include/clang/Basic/Diagnostic.h | 270 ++++++------------ clang/include/clang/Basic/DiagnosticIDs.h | 7 +- clang/include/clang/Basic/PartialDiagnostic.h | 5 +- clang/include/clang/Sema/Sema.h | 6 +- clang/lib/Basic/Diagnostic.cpp | 85 +++--- clang/lib/Basic/DiagnosticIDs.cpp | 21 +- clang/lib/Basic/SourceManager.cpp | 23 +- clang/lib/Frontend/Rewrite/FixItRewriter.cpp | 4 +- clang/lib/Frontend/TextDiagnosticPrinter.cpp | 2 +- clang/lib/Sema/Sema.cpp | 19 +- clang/lib/Sema/SemaBase.cpp | 2 +- clang/lib/Serialization/ASTReader.cpp | 15 +- clang/test/PCH/race-condition.cpp | 41 +++ clang/unittests/Basic/DiagnosticTest.cpp | 19 +- clang/unittests/Driver/DXCModeTest.cpp | 5 - flang/lib/Frontend/TextDiagnosticPrinter.cpp | 2 +- 18 files changed, 233 insertions(+), 307 deletions(-) create mode 100644 clang/test/PCH/race-condition.cpp diff --git a/clang-tools-extra/clang-tidy/ClangTidyDiagnosticConsumer.cpp b/clang-tools-extra/clang-tidy/ClangTidyDiagnosticConsumer.cpp index 200bb87a5ac3cb..4c75b422701148 100644 --- a/clang-tools-extra/clang-tidy/ClangTidyDiagnosticConsumer.cpp +++ b/clang-tools-extra/clang-tidy/ClangTidyDiagnosticConsumer.cpp @@ -380,7 +380,6 @@ void ClangTidyDiagnosticConsumer::HandleDiagnostic( ++Context.Stats.ErrorsIgnoredNOLINT; // Ignored a warning, should ignore related notes as well LastErrorWasIgnored = true; - Context.DiagEngine->Clear(); for (const auto &Error : SuppressionErrors) Context.diag(Error); return; @@ -457,7 +456,6 @@ void ClangTidyDiagnosticConsumer::HandleDiagnostic( if (Info.hasSourceManager()) checkFilters(Info.getLocation(), Info.getSourceManager()); - Context.DiagEngine->Clear(); for (const auto &Error : SuppressionErrors) Context.diag(Error); } diff --git a/clang-tools-extra/clangd/unittests/ConfigCompileTests.cpp b/clang-tools-extra/clangd/unittests/ConfigCompileTests.cpp index 021d731f8f1768..cf9b42828568da 100644 --- a/clang-tools-extra/clangd/unittests/ConfigCompileTests.cpp +++ b/clang-tools-extra/clangd/unittests/ConfigCompileTests.cpp @@ -305,33 +305,33 @@ TEST_F(ConfigCompileTests, DiagnosticSuppression) { { auto D = DiagEngine.Report(diag::warn_unreachable); EXPECT_TRUE(isDiagnosticSuppressed( - Diag{&DiagEngine}, Conf.Diagnostics.Suppress, LangOptions())); + Diag{&DiagEngine, D}, Conf.Diagnostics.Suppress, LangOptions())); } // Subcategory not respected/suppressed.
{ auto D = DiagEngine.Report(diag::warn_unreachable_break); EXPECT_FALSE(isDiagnosticSuppressed( - Diag{&DiagEngine}, Conf.Diagnostics.Suppress, LangOptions())); + Diag{&DiagEngine, D}, Conf.Diagnostics.Suppress, LangOptions())); } { auto D = DiagEngine.Report(diag::warn_unused_variable); EXPECT_TRUE(isDiagnosticSuppressed( - Diag{&DiagEngine}, Conf.Diagnostics.Suppress, LangOptions())); + Diag{&DiagEngine, D}, Conf.Diagnostics.Suppress, LangOptions())); } { auto D = DiagEngine.Report(diag::err_typecheck_bool_condition); EXPECT_TRUE(isDiagnosticSuppressed( - Diag{&DiagEngine}, Conf.Diagnostics.Suppress, LangOptions())); + Diag{&DiagEngine, D}, Conf.Diagnostics.Suppress, LangOptions())); } { auto D = DiagEngine.Report(diag::err_unexpected_friend); EXPECT_TRUE(isDiagnosticSuppressed( - Diag{&DiagEngine}, Conf.Diagnostics.Suppress, LangOptions())); + Diag{&DiagEngine, D}, Conf.Diagnostics.Suppress, LangOptions())); } { auto D = DiagEngine.Report(diag::warn_alloca); EXPECT_TRUE(isDiagnosticSuppressed( - Diag{&DiagEngine}, Conf.Diagnostics.Suppress, LangOptions())); + Diag{&DiagEngine, D}, Conf.Diagnostics.Suppress, LangOptions())); } Frag.Diagnostics.Suppress.emplace_back("*"); diff --git a/clang/include/clang/Basic/Diagnostic.h b/clang/include/clang/Basic/Diagnostic.h index 54b69e98540239..e17ed8f98afa9a 100644 --- a/clang/include/clang/Basic/Diagnostic.h +++ b/clang/include/clang/Basic/Diagnostic.h @@ -183,6 +183,41 @@ struct DiagnosticStorage { DiagnosticStorage() = default; }; +/// An allocator for DiagnosticStorage objects, which uses a small cache to +/// objects, used to reduce malloc()/free() traffic for partial diagnostics. +class DiagStorageAllocator { + static const unsigned NumCached = 16; + DiagnosticStorage Cached[NumCached]; + DiagnosticStorage *FreeList[NumCached]; + unsigned NumFreeListEntries; + +public: + DiagStorageAllocator(); + ~DiagStorageAllocator(); + + /// Allocate new storage. + DiagnosticStorage *Allocate() { + if (NumFreeListEntries == 0) + return new DiagnosticStorage; + + DiagnosticStorage *Result = FreeList[--NumFreeListEntries]; + Result->NumDiagArgs = 0; + Result->DiagRanges.clear(); + Result->FixItHints.clear(); + return Result; + } + + /// Free the given storage object. + void Deallocate(DiagnosticStorage *S) { + if (S >= Cached && S <= Cached + NumCached) { + FreeList[NumFreeListEntries++] = S; + return; + } + + delete S; + } +}; + /// Concrete class used by the front-end to report problems and issues. /// /// This massages the diagnostics (e.g. handling things like "report warnings @@ -522,27 +557,6 @@ class DiagnosticsEngine : public RefCountedBase<DiagnosticsEngine> { void *ArgToStringCookie = nullptr; ArgToStringFnTy ArgToStringFn; - /// ID of the "delayed" diagnostic, which is a (typically - /// fatal) diagnostic that had to be delayed because it was found - /// while emitting another diagnostic. - unsigned DelayedDiagID; - - /// First string argument for the delayed diagnostic. - std::string DelayedDiagArg1; - - /// Second string argument for the delayed diagnostic. - std::string DelayedDiagArg2; - - /// Third string argument for the delayed diagnostic. - std::string DelayedDiagArg3; - - /// Optional flag value. - /// - /// Some flags accept values, for instance: -Wframe-larger-than= and - /// -Rpass=. The content of this string is emitted after the flag name - /// and '='.
- std::string FlagValue; - public: explicit DiagnosticsEngine(IntrusiveRefCntPtr<DiagnosticIDs> Diags, IntrusiveRefCntPtr<DiagnosticOptions> DiagOpts, @@ -949,70 +963,18 @@ class DiagnosticsEngine : public RefCountedBase<DiagnosticsEngine> { void Report(const StoredDiagnostic &storedDiag); - /// Determine whethere there is already a diagnostic in flight. - bool isDiagnosticInFlight() const { - return CurDiagID != std::numeric_limits<unsigned>::max(); - } - - /// Set the "delayed" diagnostic that will be emitted once - /// the current diagnostic completes. - /// - /// If a diagnostic is already in-flight but the front end must - /// report a problem (e.g., with an inconsistent file system - /// state), this routine sets a "delayed" diagnostic that will be - /// emitted after the current diagnostic completes. This should - /// only be used for fatal errors detected at inconvenient - /// times. If emitting a delayed diagnostic causes a second delayed - /// diagnostic to be introduced, that second delayed diagnostic - /// will be ignored. - /// - /// \param DiagID The ID of the diagnostic being delayed. - /// - /// \param Arg1 A string argument that will be provided to the - /// diagnostic. A copy of this string will be stored in the - /// DiagnosticsEngine object itself. - /// - /// \param Arg2 A string argument that will be provided to the - /// diagnostic. A copy of this string will be stored in the - /// DiagnosticsEngine object itself. - /// - /// \param Arg3 A string argument that will be provided to the - /// diagnostic. A copy of this string will be stored in the - /// DiagnosticsEngine object itself. - void SetDelayedDiagnostic(unsigned DiagID, StringRef Arg1 = "", - StringRef Arg2 = "", StringRef Arg3 = ""); - - /// Clear out the current diagnostic. - void Clear() { CurDiagID = std::numeric_limits<unsigned>::max(); } - - /// Return the value associated with this diagnostic flag. - StringRef getFlagValue() const { return FlagValue; } - private: // This is private state used by DiagnosticBuilder. We put it here instead of // in DiagnosticBuilder in order to keep DiagnosticBuilder a small lightweight - // object. This implementation choice means that we can only have one - // diagnostic "in flight" at a time, but this seems to be a reasonable - // tradeoff to keep these objects small. Assertions verify that only one - // diagnostic is in flight at a time. + // object. This implementation choice means that we can only have a few + // diagnostics "in flight" at a time, but this seems to be a reasonable + // tradeoff to keep these objects small. friend class Diagnostic; friend class DiagnosticBuilder; friend class DiagnosticErrorTrap; friend class DiagnosticIDs; friend class PartialDiagnostic; - /// Report the delayed diagnostic. - void ReportDelayed(); - - /// The location of the current diagnostic that is in flight. - SourceLocation CurDiagLoc; - - /// The ID of the current diagnostic that is in flight. - /// - /// This is set to std::numeric_limits<unsigned>::max() when there is no - /// diagnostic in flight. - unsigned CurDiagID; - enum { /// The maximum number of arguments we can hold. /// @@ -1022,7 +984,7 @@ class DiagnosticsEngine : public RefCountedBase<DiagnosticsEngine> { MaxArguments = DiagnosticStorage::MaxArguments, }; - DiagnosticStorage DiagStorage; + DiagStorageAllocator DiagAllocator; DiagnosticMapping makeUserMapping(diag::Severity Map, SourceLocation L) { bool isPragma = L.isValid(); @@ -1042,8 +1004,8 @@ class DiagnosticsEngine : public RefCountedBase<DiagnosticsEngine> { /// Used to report a diagnostic that is finally fully formed.
/// /// \returns true if the diagnostic was emitted, false if it was suppressed. - bool ProcessDiag() { - return Diags->ProcessDiag(*this); + bool ProcessDiag(const DiagnosticBuilder &DiagBuilder) { + return Diags->ProcessDiag(*this, DiagBuilder); } /// @name Diagnostic Emission @@ -1058,14 +1020,10 @@ class DiagnosticsEngine : public RefCountedBase<DiagnosticsEngine> { // Sema::Diag() patterns. friend class Sema; - /// Emit the current diagnostic and clear the diagnostic state. + /// Emit the diagnostic /// /// \param Force Emit the diagnostic regardless of suppression settings. - bool EmitCurrentDiagnostic(bool Force = false); - - unsigned getCurrentDiagID() const { return CurDiagID; } - - SourceLocation getCurrentDiagLoc() const { return CurDiagLoc; } + bool EmitDiagnostic(const DiagnosticBuilder &DB, bool Force = false); /// @} }; @@ -1118,40 +1076,7 @@ class DiagnosticErrorTrap { /// class StreamingDiagnostic { public: - /// An allocator for DiagnosticStorage objects, which uses a small cache to - /// objects, used to reduce malloc()/free() traffic for partial diagnostics. - class DiagStorageAllocator { - static const unsigned NumCached = 16; - DiagnosticStorage Cached[NumCached]; - DiagnosticStorage *FreeList[NumCached]; - unsigned NumFreeListEntries; - - public: - DiagStorageAllocator(); - ~DiagStorageAllocator(); - - /// Allocate new storage. - DiagnosticStorage *Allocate() { - if (NumFreeListEntries == 0) - return new DiagnosticStorage; - - DiagnosticStorage *Result = FreeList[--NumFreeListEntries]; - Result->NumDiagArgs = 0; - Result->DiagRanges.clear(); - Result->FixItHints.clear(); - return Result; - } - - /// Free the given storage object. - void Deallocate(DiagnosticStorage *S) { - if (S >= Cached && S <= Cached + NumCached) { - FreeList[NumFreeListEntries++] = S; - return; - } - - delete S; - } - }; + using DiagStorageAllocator = clang::DiagStorageAllocator; protected: mutable DiagnosticStorage *DiagStorage = nullptr; @@ -1240,11 +1165,6 @@ class StreamingDiagnostic { protected: StreamingDiagnostic() = default; - /// Construct with an external storage not owned by itself. The allocator - /// is a null pointer in this case. - explicit StreamingDiagnostic(DiagnosticStorage *Storage) - : DiagStorage(Storage) {} - /// Construct with a storage allocator which will manage the storage. The /// allocator is not a null pointer in this case. explicit StreamingDiagnostic(DiagStorageAllocator &Alloc) @@ -1275,9 +1195,20 @@ class DiagnosticBuilder : public StreamingDiagnostic { friend class DiagnosticsEngine; friend class PartialDiagnostic; + friend class Diagnostic; mutable DiagnosticsEngine *DiagObj = nullptr; + SourceLocation DiagLoc; + unsigned DiagID; + + /// Optional flag value. + /// + /// Some flags accept values, for instance: -Wframe-larger-than= and + /// -Rpass=. The content of this string is emitted after the flag name + /// and '='. + mutable std::string FlagValue; + /// Status variable indicating if this diagnostic is still active.
/// // NOTE: This field is redundant with DiagObj (IsActive iff (DiagObj == 0)), @@ -1291,16 +1222,8 @@ class DiagnosticBuilder : public StreamingDiagnostic { DiagnosticBuilder() = default; - explicit DiagnosticBuilder(DiagnosticsEngine *diagObj) - : StreamingDiagnostic(&diagObj->DiagStorage), DiagObj(diagObj), - IsActive(true) { - assert(diagObj && "DiagnosticBuilder requires a valid DiagnosticsEngine!"); - assert(DiagStorage && - "DiagnosticBuilder requires a valid DiagnosticStorage!"); - DiagStorage->NumDiagArgs = 0; - DiagStorage->DiagRanges.clear(); - DiagStorage->FixItHints.clear(); - } + DiagnosticBuilder(DiagnosticsEngine *DiagObj, SourceLocation DiagLoc, + unsigned DiagID); protected: /// Clear out the current diagnostic. @@ -1326,7 +1249,7 @@ class DiagnosticBuilder : public StreamingDiagnostic { if (!isActive()) return false; // Process the diagnostic. - bool Result = DiagObj->EmitCurrentDiagnostic(IsForceEmit); + bool Result = DiagObj->EmitDiagnostic(*this, IsForceEmit); // This diagnostic is dead. Clear(); @@ -1337,13 +1260,7 @@ class DiagnosticBuilder : public StreamingDiagnostic { public: /// Copy constructor. When copied, this "takes" the diagnostic info from the /// input and neuters it. - DiagnosticBuilder(const DiagnosticBuilder &D) : StreamingDiagnostic() { - DiagObj = D.DiagObj; - DiagStorage = D.DiagStorage; - IsActive = D.IsActive; - IsForceEmit = D.IsForceEmit; - D.Clear(); - } + DiagnosticBuilder(const DiagnosticBuilder &D); template <typename T> const DiagnosticBuilder &operator<<(const T &V) const { assert(isActive() && "Clients must not add to cleared diagnostic!"); @@ -1375,7 +1292,7 @@ class DiagnosticBuilder : public StreamingDiagnostic { return *this; } - void addFlagValue(StringRef V) const { DiagObj->FlagValue = std::string(V); } + void addFlagValue(StringRef V) const { FlagValue = std::string(V); } }; struct AddFlagValue { @@ -1550,12 +1467,7 @@ const StreamingDiagnostic &operator<<(const StreamingDiagnostic &DB, inline DiagnosticBuilder DiagnosticsEngine::Report(SourceLocation Loc, unsigned DiagID) { - assert(CurDiagID == std::numeric_limits<unsigned>::max() && - "Multiple diagnostics in flight at once!"); - CurDiagLoc = Loc; - CurDiagID = DiagID; - FlagValue.clear(); - return DiagnosticBuilder(this); + return DiagnosticBuilder(this, Loc, DiagID); } const StreamingDiagnostic &operator<<(const StreamingDiagnostic &DB, @@ -1570,24 +1482,29 @@ inline DiagnosticBuilder DiagnosticsEngine::Report(unsigned DiagID) { //===----------------------------------------------------------------------===// /// A little helper class (which is basically a smart pointer that forwards -/// info from DiagnosticsEngine) that allows clients to enquire about the -/// currently in-flight diagnostic. +/// info from DiagnosticsEngine and DiagnosticStorage) that allows clients to +/// enquire about the diagnostic.
class Diagnostic { const DiagnosticsEngine *DiagObj; + SourceLocation DiagLoc; + unsigned DiagID; + std::string FlagValue; + const DiagnosticStorage &DiagStorage; std::optional<std::string> StoredDiagMessage; public: - explicit Diagnostic(const DiagnosticsEngine *DO) : DiagObj(DO) {} - Diagnostic(const DiagnosticsEngine *DO, StringRef storedDiagMessage) - : DiagObj(DO), StoredDiagMessage(storedDiagMessage) {} + Diagnostic(const DiagnosticsEngine *DO, const DiagnosticBuilder &DiagBuilder); + Diagnostic(const DiagnosticsEngine *DO, SourceLocation DiagLoc, + unsigned DiagID, const DiagnosticStorage &DiagStorage, + StringRef StoredDiagMessage); const DiagnosticsEngine *getDiags() const { return DiagObj; } - unsigned getID() const { return DiagObj->CurDiagID; } - const SourceLocation &getLocation() const { return DiagObj->CurDiagLoc; } + unsigned getID() const { return DiagID; } + const SourceLocation &getLocation() const { return DiagLoc; } bool hasSourceManager() const { return DiagObj->hasSourceManager(); } SourceManager &getSourceManager() const { return DiagObj->getSourceManager();} - unsigned getNumArgs() const { return DiagObj->DiagStorage.NumDiagArgs; } + unsigned getNumArgs() const { return DiagStorage.NumDiagArgs; } /// Return the kind of the specified index. /// @@ -1597,8 +1514,7 @@ class Diagnostic { /// \pre Idx < getNumArgs() DiagnosticsEngine::ArgumentKind getArgKind(unsigned Idx) const { assert(Idx < getNumArgs() && "Argument index out of range!"); - return (DiagnosticsEngine::ArgumentKind) - DiagObj->DiagStorage.DiagArgumentsKind[Idx]; + return (DiagnosticsEngine::ArgumentKind)DiagStorage.DiagArgumentsKind[Idx]; } /// Return the provided argument string specified by \p Idx. @@ -1606,7 +1522,7 @@ class Diagnostic { const std::string &getArgStdStr(unsigned Idx) const { assert(getArgKind(Idx) == DiagnosticsEngine::ak_std_string && "invalid argument accessor!"); - return DiagObj->DiagStorage.DiagArgumentsStr[Idx]; + return DiagStorage.DiagArgumentsStr[Idx]; } /// Return the specified C string argument. @@ -1614,8 +1530,7 @@ class Diagnostic { const char *getArgCStr(unsigned Idx) const { assert(getArgKind(Idx) == DiagnosticsEngine::ak_c_string && "invalid argument accessor!"); - return reinterpret_cast<const char *>( - DiagObj->DiagStorage.DiagArgumentsVal[Idx]); + return reinterpret_cast<const char *>(DiagStorage.DiagArgumentsVal[Idx]); } /// Return the specified signed integer argument. @@ -1623,7 +1538,7 @@ class Diagnostic { int64_t getArgSInt(unsigned Idx) const { assert(getArgKind(Idx) == DiagnosticsEngine::ak_sint && "invalid argument accessor!"); - return (int64_t)DiagObj->DiagStorage.DiagArgumentsVal[Idx]; + return (int64_t)DiagStorage.DiagArgumentsVal[Idx]; } /// Return the specified unsigned integer argument. @@ -1631,7 +1546,7 @@ class Diagnostic { uint64_t getArgUInt(unsigned Idx) const { assert(getArgKind(Idx) == DiagnosticsEngine::ak_uint && "invalid argument accessor!"); - return DiagObj->DiagStorage.DiagArgumentsVal[Idx]; + return DiagStorage.DiagArgumentsVal[Idx]; } /// Return the specified IdentifierInfo argument. @@ -1640,7 +1555,7 @@ class Diagnostic { assert(getArgKind(Idx) == DiagnosticsEngine::ak_identifierinfo && "invalid argument accessor!"); return reinterpret_cast<IdentifierInfo *>( - DiagObj->DiagStorage.DiagArgumentsVal[Idx]); + DiagStorage.DiagArgumentsVal[Idx]); } /// Return the specified non-string argument in an opaque form.
@@ -1648,37 +1563,32 @@ class Diagnostic { uint64_t getRawArg(unsigned Idx) const { assert(getArgKind(Idx) != DiagnosticsEngine::ak_std_string && "invalid argument accessor!"); - return DiagObj->DiagStorage.DiagArgumentsVal[Idx]; + return DiagStorage.DiagArgumentsVal[Idx]; } /// Return the number of source ranges associated with this diagnostic. - unsigned getNumRanges() const { - return DiagObj->DiagStorage.DiagRanges.size(); - } + unsigned getNumRanges() const { return DiagStorage.DiagRanges.size(); } /// \pre Idx < getNumRanges() const CharSourceRange &getRange(unsigned Idx) const { assert(Idx < getNumRanges() && "Invalid diagnostic range index!"); - return DiagObj->DiagStorage.DiagRanges[Idx]; + return DiagStorage.DiagRanges[Idx]; } /// Return an array reference for this diagnostic's ranges. - ArrayRef<CharSourceRange> getRanges() const { - return DiagObj->DiagStorage.DiagRanges; - } + ArrayRef<CharSourceRange> getRanges() const { return DiagStorage.DiagRanges; } - unsigned getNumFixItHints() const { - return DiagObj->DiagStorage.FixItHints.size(); - } + unsigned getNumFixItHints() const { return DiagStorage.FixItHints.size(); } const FixItHint &getFixItHint(unsigned Idx) const { assert(Idx < getNumFixItHints() && "Invalid index!"); - return DiagObj->DiagStorage.FixItHints[Idx]; + return DiagStorage.FixItHints[Idx]; } - ArrayRef<FixItHint> getFixItHints() const { - return DiagObj->DiagStorage.FixItHints; - } + ArrayRef<FixItHint> getFixItHints() const { return DiagStorage.FixItHints; } + + /// Return the value associated with this diagnostic flag. + StringRef getFlagValue() const { return FlagValue; } /// Format this diagnostic into a string, substituting the /// formal arguments into the %0 slots. diff --git a/clang/include/clang/Basic/DiagnosticIDs.h b/clang/include/clang/Basic/DiagnosticIDs.h index daad66f499538f..1fa38ed6066e26 100644 --- a/clang/include/clang/Basic/DiagnosticIDs.h +++ b/clang/include/clang/Basic/DiagnosticIDs.h @@ -24,6 +24,7 @@ namespace clang { class DiagnosticsEngine; + class DiagnosticBuilder; class SourceLocation; // Import the diagnostic enums themselves. @@ -486,11 +487,13 @@ class DiagnosticIDs : public RefCountedBase<DiagnosticIDs> { /// /// \returns \c true if the diagnostic was emitted, \c false if it was /// suppressed. - bool ProcessDiag(DiagnosticsEngine &Diag) const; + bool ProcessDiag(DiagnosticsEngine &Diag, + const DiagnosticBuilder &DiagBuilder) const; /// Used to emit a diagnostic that is finally fully formed, /// ignoring suppression. - void EmitDiag(DiagnosticsEngine &Diag, Level DiagLevel) const; + void EmitDiag(DiagnosticsEngine &Diag, const DiagnosticBuilder &DiagBuilder, + Level DiagLevel) const; /// Whether the diagnostic may leave the AST in a state where some /// invariants can break. diff --git a/clang/include/clang/Basic/PartialDiagnostic.h b/clang/include/clang/Basic/PartialDiagnostic.h index 507d789c54ff9b..4bf6049d08fdb4 100644 --- a/clang/include/clang/Basic/PartialDiagnostic.h +++ b/clang/include/clang/Basic/PartialDiagnostic.h @@ -166,13 +166,10 @@ class PartialDiagnostic : public StreamingDiagnostic { void EmitToString(DiagnosticsEngine &Diags, SmallVectorImpl<char> &Buf) const { - // FIXME: It should be possible to render a diagnostic to a string without - // messing with the state of the diagnostics engine.
DiagnosticBuilder DB(Diags.Report(getDiagID())); Emit(DB); - Diagnostic(&Diags).FormatDiagnostic(Buf); + Diagnostic(&Diags, DB).FormatDiagnostic(Buf); DB.Clear(); - Diags.Clear(); } /// Clear out this partial diagnostic, giving it a new diagnostic ID diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 0809ac1b144ef6..e1c3a99cfa167e 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -626,10 +626,10 @@ class Sema final : public SemaBase { const llvm::MapVector<FieldDecl *, DeleteLocs> & getMismatchingDeleteExpressions() const; - /// Cause the active diagnostic on the DiagosticsEngine to be - /// emitted. This is closely coupled to the SemaDiagnosticBuilder class and + /// Cause the built diagnostic to be emitted on the DiagosticsEngine. + /// This is closely coupled to the SemaDiagnosticBuilder class and /// should not be used elsewhere. - void EmitCurrentDiagnostic(unsigned DiagID); + void EmitDiagnostic(unsigned DiagID, const DiagnosticBuilder &DB); void addImplicitTypedef(StringRef Name, QualType T); diff --git a/clang/lib/Basic/Diagnostic.cpp b/clang/lib/Basic/Diagnostic.cpp index ecff80a5090630..0bd6845085b735 100644 --- a/clang/lib/Basic/Diagnostic.cpp +++ b/clang/lib/Basic/Diagnostic.cpp @@ -126,9 +126,7 @@ void DiagnosticsEngine::Reset(bool soft /*=false*/) { TrapNumErrorsOccurred = 0; TrapNumUnrecoverableErrorsOccurred = 0; - CurDiagID = std::numeric_limits<unsigned>::max(); LastDiagLevel = DiagnosticIDs::Ignored; - DelayedDiagID = 0; if (!soft) { // Clear state related to #pragma diagnostic. @@ -143,23 +141,6 @@ void DiagnosticsEngine::Reset(bool soft /*=false*/) { } } -void DiagnosticsEngine::SetDelayedDiagnostic(unsigned DiagID, StringRef Arg1, - StringRef Arg2, StringRef Arg3) { - if (DelayedDiagID) - return; - - DelayedDiagID = DiagID; - DelayedDiagArg1 = Arg1.str(); - DelayedDiagArg2 = Arg2.str(); - DelayedDiagArg3 = Arg3.str(); -} - -void DiagnosticsEngine::ReportDelayed() { - unsigned ID = DelayedDiagID; - DelayedDiagID = 0; - Report(ID) << DelayedDiagArg1 << DelayedDiagArg2 << DelayedDiagArg3; -} - DiagnosticMapping & DiagnosticsEngine::DiagState::getOrAddMapping(diag::kind Diag) { std::pair<iterator, bool> Result = @@ -503,39 +484,31 @@ void DiagnosticsEngine::setSeverityForAll(diag::Flavor Flavor, } void DiagnosticsEngine::Report(const StoredDiagnostic &storedDiag) { - assert(CurDiagID == std::numeric_limits<unsigned>::max() && - "Multiple diagnostics in flight at once!"); - - CurDiagLoc = storedDiag.getLocation(); - CurDiagID = storedDiag.getID(); - DiagStorage.NumDiagArgs = 0; - - DiagStorage.DiagRanges.clear(); + DiagnosticStorage DiagStorage; DiagStorage.DiagRanges.append(storedDiag.range_begin(), storedDiag.range_end()); - DiagStorage.FixItHints.clear(); DiagStorage.FixItHints.append(storedDiag.fixit_begin(), storedDiag.fixit_end()); assert(Client && "DiagnosticConsumer not set!"); Level DiagLevel = storedDiag.getLevel(); - Diagnostic Info(this, storedDiag.getMessage()); + Diagnostic Info(this, storedDiag.getLocation(), storedDiag.getID(), + DiagStorage, storedDiag.getMessage()); Client->HandleDiagnostic(DiagLevel, Info); if (Client->IncludeInDiagnosticCounts()) { if (DiagLevel == DiagnosticsEngine::Warning) ++NumWarnings; } - - CurDiagID = std::numeric_limits<unsigned>::max(); } -bool DiagnosticsEngine::EmitCurrentDiagnostic(bool Force) { +bool DiagnosticsEngine::EmitDiagnostic(const DiagnosticBuilder &DB, + bool Force) { assert(getClient() && "DiagnosticClient not set!"); bool Emitted; if (Force) { - Diagnostic Info(this); + Diagnostic Info(this, DB); // Figure out
the diagnostic level of this message. DiagnosticIDs::Level DiagLevel @@ -544,24 +517,50 @@ bool DiagnosticsEngine::EmitCurrentDiagnostic(bool Force) { Emitted = (DiagLevel != DiagnosticIDs::Ignored); if (Emitted) { // Emit the diagnostic regardless of suppression level. - Diags->EmitDiag(*this, DiagLevel); + Diags->EmitDiag(*this, DB, DiagLevel); } } else { // Process the diagnostic, sending the accumulated information to the // DiagnosticConsumer. - Emitted = ProcessDiag(); + Emitted = ProcessDiag(DB); } - // Clear out the current diagnostic object. - Clear(); + return Emitted; +} + +DiagnosticBuilder::DiagnosticBuilder(DiagnosticsEngine *DiagObj, + SourceLocation DiagLoc, unsigned DiagID) + : StreamingDiagnostic(DiagObj->DiagAllocator), DiagObj(DiagObj), + DiagLoc(DiagLoc), DiagID(DiagID), IsActive(true) { + assert(DiagObj && "DiagnosticBuilder requires a valid DiagnosticsEngine!"); +} - // If there was a delayed diagnostic, emit it now. - if (!Force && DelayedDiagID) - ReportDelayed(); +DiagnosticBuilder::DiagnosticBuilder(const DiagnosticBuilder &D) + : StreamingDiagnostic() { + DiagLoc = D.DiagLoc; + DiagID = D.DiagID; + FlagValue = D.FlagValue; + DiagObj = D.DiagObj; + DiagStorage = D.DiagStorage; + D.DiagStorage = nullptr; + Allocator = D.Allocator; + IsActive = D.IsActive; + IsForceEmit = D.IsForceEmit; + D.Clear(); +} - return Emitted; +Diagnostic::Diagnostic(const DiagnosticsEngine *DO, + const DiagnosticBuilder &DiagBuilder) + : DiagObj(DO), DiagLoc(DiagBuilder.DiagLoc), DiagID(DiagBuilder.DiagID), + FlagValue(DiagBuilder.FlagValue), DiagStorage(*DiagBuilder.getStorage()) { } +Diagnostic::Diagnostic(const DiagnosticsEngine *DO, SourceLocation DiagLoc, + unsigned DiagID, const DiagnosticStorage &DiagStorage, + StringRef StoredDiagMessage) + : DiagObj(DO), DiagLoc(DiagLoc), DiagID(DiagID), DiagStorage(DiagStorage), + StoredDiagMessage(StoredDiagMessage) {} + DiagnosticConsumer::~DiagnosticConsumer() = default; void DiagnosticConsumer::HandleDiagnostic(DiagnosticsEngine::Level DiagLevel, @@ -1216,13 +1215,13 @@ bool ForwardingDiagnosticConsumer::IncludeInDiagnosticCounts() const { return Target.IncludeInDiagnosticCounts(); } -PartialDiagnostic::DiagStorageAllocator::DiagStorageAllocator() { +DiagStorageAllocator::DiagStorageAllocator() { for (unsigned I = 0; I != NumCached; ++I) FreeList[I] = Cached + I; NumFreeListEntries = NumCached; } -PartialDiagnostic::DiagStorageAllocator::~DiagStorageAllocator() { +DiagStorageAllocator::~DiagStorageAllocator() { // Don't assert if we are in a CrashRecovery context, as this invariant may // be invalidated during a crash. assert((NumFreeListEntries == NumCached || diff --git a/clang/lib/Basic/DiagnosticIDs.cpp b/clang/lib/Basic/DiagnosticIDs.cpp index cae6642bd4bd3e..031d9d7817d1f6 100644 --- a/clang/lib/Basic/DiagnosticIDs.cpp +++ b/clang/lib/Basic/DiagnosticIDs.cpp @@ -566,7 +566,7 @@ DiagnosticIDs::getDiagnosticSeverity(unsigned DiagID, SourceLocation Loc, // If explicitly requested, map fatal errors to errors. if (Result == diag::Severity::Fatal && - Diag.CurDiagID != diag::fatal_too_many_errors && Diag.FatalsAsError) + DiagID != diag::fatal_too_many_errors && Diag.FatalsAsError) Result = diag::Severity::Error; bool ShowInSystemHeader = @@ -800,8 +800,9 @@ StringRef DiagnosticIDs::getNearestOption(diag::Flavor Flavor, /// ProcessDiag - This is the method used to report a diagnostic that is /// finally fully formed. 
-bool DiagnosticIDs::ProcessDiag(DiagnosticsEngine &Diag) const { - Diagnostic Info(&Diag); +bool DiagnosticIDs::ProcessDiag(DiagnosticsEngine &Diag, + const DiagnosticBuilder &DiagBuilder) const { + Diagnostic Info(&Diag, DiagBuilder); assert(Diag.getClient() && "DiagnosticClient not set!"); @@ -867,22 +868,24 @@ bool DiagnosticIDs::ProcessDiag(DiagnosticsEngine &Diag) const { // stop a flood of bogus errors. if (Diag.ErrorLimit && Diag.NumErrors > Diag.ErrorLimit && DiagLevel == DiagnosticIDs::Error) { - Diag.SetDelayedDiagnostic(diag::fatal_too_many_errors); + Diag.Report(diag::fatal_too_many_errors); return false; } } // Make sure we set FatalErrorOccurred to ensure that the notes from the // diagnostic that caused `fatal_too_many_errors` won't be emitted. - if (Diag.CurDiagID == diag::fatal_too_many_errors) + if (Info.getID() == diag::fatal_too_many_errors) Diag.FatalErrorOccurred = true; // Finally, report it. - EmitDiag(Diag, DiagLevel); + EmitDiag(Diag, DiagBuilder, DiagLevel); return true; } -void DiagnosticIDs::EmitDiag(DiagnosticsEngine &Diag, Level DiagLevel) const { - Diagnostic Info(&Diag); +void DiagnosticIDs::EmitDiag(DiagnosticsEngine &Diag, + const DiagnosticBuilder &DiagBuilder, + Level DiagLevel) const { + Diagnostic Info(&Diag, DiagBuilder); assert(DiagLevel != DiagnosticIDs::Ignored && "Cannot emit ignored diagnostics!"); Diag.Client->HandleDiagnostic((DiagnosticsEngine::Level)DiagLevel, Info); @@ -890,8 +893,6 @@ void DiagnosticIDs::EmitDiag(DiagnosticsEngine &Diag, Level DiagLevel) const { if (DiagLevel == DiagnosticIDs::Warning) ++Diag.NumWarnings; } - - Diag.CurDiagID = ~0U; } bool DiagnosticIDs::isUnrecoverable(unsigned DiagID) const { diff --git a/clang/lib/Basic/SourceManager.cpp b/clang/lib/Basic/SourceManager.cpp index d6ec26af80aadd..65a8a7253e054f 100644 --- a/clang/lib/Basic/SourceManager.cpp +++ b/clang/lib/Basic/SourceManager.cpp @@ -130,13 +130,8 @@ ContentCache::getBufferOrNone(DiagnosticsEngine &Diag, FileManager &FM, // the file could also have been removed during processing. Since we can't // really deal with this situation, just create an empty buffer. if (!BufferOrError) { - if (Diag.isDiagnosticInFlight()) - Diag.SetDelayedDiagnostic(diag::err_cannot_open_file, - ContentsEntry->getName(), - BufferOrError.getError().message()); - else - Diag.Report(Loc, diag::err_cannot_open_file) - << ContentsEntry->getName() << BufferOrError.getError().message(); + Diag.Report(Loc, diag::err_cannot_open_file) + << ContentsEntry->getName() << BufferOrError.getError().message(); return std::nullopt; } @@ -153,12 +148,7 @@ ContentCache::getBufferOrNone(DiagnosticsEngine &Diag, FileManager &FM, // ContentsEntry::getSize() could have the wrong size. Use // MemoryBuffer::getBufferSize() instead. if (Buffer->getBufferSize() >= std::numeric_limits::max()) { - if (Diag.isDiagnosticInFlight()) - Diag.SetDelayedDiagnostic(diag::err_file_too_large, - ContentsEntry->getName()); - else - Diag.Report(Loc, diag::err_file_too_large) - << ContentsEntry->getName(); + Diag.Report(Loc, diag::err_file_too_large) << ContentsEntry->getName(); return std::nullopt; } @@ -168,12 +158,7 @@ ContentCache::getBufferOrNone(DiagnosticsEngine &Diag, FileManager &FM, // have come from a stat cache). 
if (!ContentsEntry->isNamedPipe() && Buffer->getBufferSize() != (size_t)ContentsEntry->getSize()) { - if (Diag.isDiagnosticInFlight()) - Diag.SetDelayedDiagnostic(diag::err_file_modified, - ContentsEntry->getName()); - else - Diag.Report(Loc, diag::err_file_modified) - << ContentsEntry->getName(); + Diag.Report(Loc, diag::err_file_modified) << ContentsEntry->getName(); return std::nullopt; } diff --git a/clang/lib/Frontend/Rewrite/FixItRewriter.cpp b/clang/lib/Frontend/Rewrite/FixItRewriter.cpp index 44dfaf20eae73f..7309553e3bc0b4 100644 --- a/clang/lib/Frontend/Rewrite/FixItRewriter.cpp +++ b/clang/lib/Frontend/Rewrite/FixItRewriter.cpp @@ -200,10 +200,8 @@ void FixItRewriter::HandleDiagnostic(DiagnosticsEngine::Level DiagLevel, /// Emit a diagnostic via the adapted diagnostic client. void FixItRewriter::Diag(SourceLocation Loc, unsigned DiagID) { // When producing this diagnostic, we temporarily bypass ourselves, - // clear out any current diagnostic, and let the downstream client - // format the diagnostic. + // and let the downstream client format the diagnostic. Diags.setClient(Client, false); - Diags.Clear(); Diags.Report(Loc, DiagID); Diags.setClient(this, false); } diff --git a/clang/lib/Frontend/TextDiagnosticPrinter.cpp b/clang/lib/Frontend/TextDiagnosticPrinter.cpp index c2fea3d03f0c0f..28f7218dc23f54 100644 --- a/clang/lib/Frontend/TextDiagnosticPrinter.cpp +++ b/clang/lib/Frontend/TextDiagnosticPrinter.cpp @@ -84,7 +84,7 @@ static void printDiagnosticOptions(raw_ostream &OS, if (!Opt.empty()) { OS << (Started ? "," : " [") << (Level == DiagnosticsEngine::Remark ? "-R" : "-W") << Opt; - StringRef OptValue = Info.getDiags()->getFlagValue(); + StringRef OptValue = Info.getFlagValue(); if (!OptValue.empty()) OS << "=" << OptValue; Started = true; diff --git a/clang/lib/Sema/Sema.cpp b/clang/lib/Sema/Sema.cpp index 85cbbe7750c2b3..69d72412471809 100644 --- a/clang/lib/Sema/Sema.cpp +++ b/clang/lib/Sema/Sema.cpp @@ -1590,7 +1590,7 @@ LangAS Sema::getDefaultCXXMethodAddrSpace() const { return LangAS::Default; } -void Sema::EmitCurrentDiagnostic(unsigned DiagID) { +void Sema::EmitDiagnostic(unsigned DiagID, const DiagnosticBuilder &DB) { // FIXME: It doesn't make sense to me that DiagID is an incoming argument here // and yet we also use the current diag ID on the DiagnosticsEngine. This has // been made more painfully obvious by the refactor that introduced this @@ -1598,9 +1598,9 @@ void Sema::EmitCurrentDiagnostic(unsigned DiagID) { // eliminated. If it truly cannot be (for example, there is some reentrancy // issue I am not seeing yet), then there should at least be a clarifying // comment somewhere. + Diagnostic DiagInfo(&Diags, DB); if (std::optional Info = isSFINAEContext()) { - switch (DiagnosticIDs::getDiagnosticSFINAEResponse( - Diags.getCurrentDiagID())) { + switch (DiagnosticIDs::getDiagnosticSFINAEResponse(DiagInfo.getID())) { case DiagnosticIDs::SFINAE_Report: // We'll report the diagnostic below. break; @@ -1613,13 +1613,11 @@ void Sema::EmitCurrentDiagnostic(unsigned DiagID) { // Make a copy of this suppressed diagnostic and store it with the // template-deduction information. 
if (*Info && !(*Info)->hasSFINAEDiagnostic()) { - Diagnostic DiagInfo(&Diags); (*Info)->addSFINAEDiagnostic(DiagInfo.getLocation(), PartialDiagnostic(DiagInfo, Context.getDiagAllocator())); } Diags.setLastDiagnosticIgnored(true); - Diags.Clear(); return; case DiagnosticIDs::SFINAE_AccessControl: { @@ -1630,7 +1628,7 @@ void Sema::EmitCurrentDiagnostic(unsigned DiagID) { if (!AccessCheckingSFINAE && !getLangOpts().CPlusPlus11) break; - SourceLocation Loc = Diags.getCurrentDiagLoc(); + SourceLocation Loc = DiagInfo.getLocation(); // Suppress this diagnostic. ++NumSFINAEErrors; @@ -1638,16 +1636,13 @@ void Sema::EmitCurrentDiagnostic(unsigned DiagID) { // Make a copy of this suppressed diagnostic and store it with the // template-deduction information. if (*Info && !(*Info)->hasSFINAEDiagnostic()) { - Diagnostic DiagInfo(&Diags); (*Info)->addSFINAEDiagnostic(DiagInfo.getLocation(), PartialDiagnostic(DiagInfo, Context.getDiagAllocator())); } Diags.setLastDiagnosticIgnored(true); - Diags.Clear(); - // Now the diagnostic state is clear, produce a C++98 compatibility - // warning. + // Now produce a C++98 compatibility warning. Diag(Loc, diag::warn_cxx98_compat_sfinae_access_control); // The last diagnostic which Sema produced was ignored. Suppress any @@ -1660,14 +1655,12 @@ void Sema::EmitCurrentDiagnostic(unsigned DiagID) { // Make a copy of this suppressed diagnostic and store it with the // template-deduction information; if (*Info) { - Diagnostic DiagInfo(&Diags); (*Info)->addSuppressedDiagnostic(DiagInfo.getLocation(), PartialDiagnostic(DiagInfo, Context.getDiagAllocator())); } // Suppress this diagnostic. Diags.setLastDiagnosticIgnored(true); - Diags.Clear(); return; } } @@ -1677,7 +1670,7 @@ void Sema::EmitCurrentDiagnostic(unsigned DiagID) { Context.setPrintingPolicy(getPrintingPolicy()); // Emit the diagnostic. - if (!Diags.EmitCurrentDiagnostic()) + if (!Diags.EmitDiagnostic(DB)) return; // If this is not a note, and we're in a template instantiation diff --git a/clang/lib/Sema/SemaBase.cpp b/clang/lib/Sema/SemaBase.cpp index a2f12d622e8ccc..5c24f21b469b01 100644 --- a/clang/lib/Sema/SemaBase.cpp +++ b/clang/lib/Sema/SemaBase.cpp @@ -26,7 +26,7 @@ SemaBase::ImmediateDiagBuilder::~ImmediateDiagBuilder() { Clear(); // Dispatch to Sema to emit the diagnostic. 
- SemaRef.EmitCurrentDiagnostic(DiagID); + SemaRef.EmitDiagnostic(DiagID, *this); } PartialDiagnostic SemaBase::PDiag(unsigned DiagID) { diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp index 4fae6ff02ea9a3..7efcc81e194d95 100644 --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -1382,7 +1382,7 @@ bool ASTReader::ReadVisibleDeclContextStorage(ModuleFile &M, void ASTReader::Error(StringRef Msg) const { Error(diag::err_fe_pch_malformed, Msg); - if (PP.getLangOpts().Modules && !Diags.isDiagnosticInFlight() && + if (PP.getLangOpts().Modules && !PP.getHeaderSearchInfo().getModuleCachePath().empty()) { Diag(diag::note_module_cache_path) << PP.getHeaderSearchInfo().getModuleCachePath(); @@ -1391,10 +1391,7 @@ void ASTReader::Error(StringRef Msg) const { void ASTReader::Error(unsigned DiagID, StringRef Arg1, StringRef Arg2, StringRef Arg3) const { - if (Diags.isDiagnosticInFlight()) - Diags.SetDelayedDiagnostic(DiagID, Arg1, Arg2, Arg3); - else - Diag(DiagID) << Arg1 << Arg2 << Arg3; + Diag(DiagID) << Arg1 << Arg2 << Arg3; } void ASTReader::Error(llvm::Error &&Err) const { @@ -2713,7 +2710,7 @@ InputFile ASTReader::getInputFile(ModuleFile &F, unsigned ID, bool Complain) { // For an overridden file, there is nothing to validate. if (!Overridden && FileChange.Kind != Change::None) { - if (Complain && !Diags.isDiagnosticInFlight()) { + if (Complain) { // Build a list of the PCH imports that got us here (in reverse). SmallVector ImportStack(1, &F); while (!ImportStack.back()->ImportedBy.empty()) @@ -3689,10 +3686,8 @@ llvm::Error ASTReader::ReadASTBlock(ModuleFile &F, SourceMgr.AllocateLoadedSLocEntries(F.LocalNumSLocEntries, SLocSpaceSize); if (!F.SLocEntryBaseID) { - if (!Diags.isDiagnosticInFlight()) { - Diags.Report(SourceLocation(), diag::remark_sloc_usage); - SourceMgr.noteSLocAddressSpaceUsage(Diags); - } + Diags.Report(SourceLocation(), diag::remark_sloc_usage); + SourceMgr.noteSLocAddressSpaceUsage(Diags); return llvm::createStringError(std::errc::invalid_argument, "ran out of source locations"); } diff --git a/clang/test/PCH/race-condition.cpp b/clang/test/PCH/race-condition.cpp new file mode 100644 index 00000000000000..752b0cc3ff6286 --- /dev/null +++ b/clang/test/PCH/race-condition.cpp @@ -0,0 +1,41 @@ +// RUN: %clang_cc1 -fallow-pch-with-compiler-errors -std=c++20 -x c++-header -emit-pch %s -o %t -verify +// RUN: %clang_cc1 -fallow-pch-with-compiler-errors -std=c++20 -include-pch %t %s -verify +#ifndef HEADER_H +#define HEADER_H + +#include "bad_include.h" +// expected-error@6{{'bad_include.h' file not found}} + +template struct enable_if {}; +template struct enable_if { typedef T type; }; +template using enable_if_t = typename enable_if::type; + +template struct meta { static constexpr int value = 0; }; +template <> struct meta { static constexpr int value = 1; }; +template <> struct meta { static constexpr int value = 2; }; + +namespace N { +inline namespace inner { + +template +constexpr enable_if_t::value == 0, void> midpoint(T) {} + +template +constexpr enable_if_t::value == 1, void> midpoint(U) {} + +template +constexpr enable_if_t::value == 2, void> midpoint(F) {} + +} // namespace inner +} // namespace N + +#else + +// expected-error@27{{'N::midpoint' has different definitions in different modules; defined here first difference is 1st parameter with type 'F'}} +// expected-error@24{{'N::midpoint' has different definitions in different modules; defined here first difference is 1st parameter with 
type 'U'}} +// expected-note@21{{but in '' found 1st parameter with type 'T'}} +int x = N::something; +// expected-error@37{{no member named 'something' in namespace 'N'}} +// expected-note@21{{but in '' found 1st parameter with type 'T'}} + +#endif diff --git a/clang/unittests/Basic/DiagnosticTest.cpp b/clang/unittests/Basic/DiagnosticTest.cpp index 74690193917165..691d74f697f278 100644 --- a/clang/unittests/Basic/DiagnosticTest.cpp +++ b/clang/unittests/Basic/DiagnosticTest.cpp @@ -17,9 +17,6 @@ using namespace llvm; using namespace clang; void clang::DiagnosticsTestHelper(DiagnosticsEngine &diag) { - unsigned delayedDiagID = 0U; - - EXPECT_EQ(diag.DelayedDiagID, delayedDiagID); EXPECT_FALSE(diag.DiagStates.empty()); EXPECT_TRUE(diag.DiagStatesByLoc.empty()); EXPECT_TRUE(diag.DiagStateOnPushStack.empty()); @@ -83,6 +80,21 @@ TEST(DiagnosticTest, fatalsAsError) { } } +TEST(DiagnosticTest, tooManyErrorsIsAlwaysFatal) { + DiagnosticsEngine Diags(new DiagnosticIDs(), new DiagnosticOptions, + new IgnoringDiagConsumer()); + Diags.setFatalsAsError(true); + + // Report a fatal_too_many_errors diagnostic to ensure that still + // acts as a fatal error despite downgrading fatal errors to errors. + Diags.Report(diag::fatal_too_many_errors); + EXPECT_TRUE(Diags.hasFatalErrorOccurred()); + + // Ensure that the severity of that diagnostic is really "fatal". + EXPECT_EQ(Diags.getDiagnosticLevel(diag::fatal_too_many_errors, {}), + DiagnosticsEngine::Level::Fatal); +} + // Check that soft RESET works as intended TEST(DiagnosticTest, softReset) { DiagnosticsEngine Diags(new DiagnosticIDs(), new DiagnosticOptions, @@ -104,7 +116,6 @@ TEST(DiagnosticTest, softReset) { // Check for private variables of DiagnosticsEngine differentiating soft reset DiagnosticsTestHelper(Diags); - EXPECT_FALSE(Diags.isDiagnosticInFlight()); EXPECT_TRUE(Diags.isLastDiagnosticIgnored()); } diff --git a/clang/unittests/Driver/DXCModeTest.cpp b/clang/unittests/Driver/DXCModeTest.cpp index 41ab30bc81d5f9..2a079a62f1bc13 100644 --- a/clang/unittests/Driver/DXCModeTest.cpp +++ b/clang/unittests/Driver/DXCModeTest.cpp @@ -51,7 +51,6 @@ static void validateTargetProfile( EXPECT_TRUE(C); EXPECT_EQ(Diags.getNumErrors(), NumOfErrors); EXPECT_STREQ(DiagConsumer->Errors.back().c_str(), ExpectError.data()); - Diags.Clear(); DiagConsumer->clear(); } @@ -160,7 +159,6 @@ TEST(DxcModeTest, ValidatorVersionValidation) { DiagConsumer->Errors.back().c_str(), "invalid validator version : 0.1; if validator major version is 0, " "minor version must also be 0"); - Diags.Clear(); DiagConsumer->clear(); Args = TheDriver.ParseArgStrings({"-validator-version", "1"}, false, @@ -176,7 +174,6 @@ TEST(DxcModeTest, ValidatorVersionValidation) { EXPECT_STREQ(DiagConsumer->Errors.back().c_str(), "invalid validator version : 1; format of validator version is " "\".\" (ex:\"1.4\")"); - Diags.Clear(); DiagConsumer->clear(); Args = TheDriver.ParseArgStrings({"-validator-version", "-Tlib_6_7"}, false, @@ -193,7 +190,6 @@ TEST(DxcModeTest, ValidatorVersionValidation) { DiagConsumer->Errors.back().c_str(), "invalid validator version : -Tlib_6_7; format of validator version is " "\".\" (ex:\"1.4\")"); - Diags.Clear(); DiagConsumer->clear(); Args = TheDriver.ParseArgStrings({"-validator-version", "foo"}, false, @@ -210,7 +206,6 @@ TEST(DxcModeTest, ValidatorVersionValidation) { DiagConsumer->Errors.back().c_str(), "invalid validator version : foo; format of validator version is " "\".\" (ex:\"1.4\")"); - Diags.Clear(); DiagConsumer->clear(); } diff --git 
a/flang/lib/Frontend/TextDiagnosticPrinter.cpp b/flang/lib/Frontend/TextDiagnosticPrinter.cpp index 8b00fb69b3cefb..dc182d68a1a979 100644 --- a/flang/lib/Frontend/TextDiagnosticPrinter.cpp +++ b/flang/lib/Frontend/TextDiagnosticPrinter.cpp @@ -45,7 +45,7 @@ static void printRemarkOption(llvm::raw_ostream &os, // warning could be printed i.e. [-Wunknown-warning-option] os << " [" << (level == clang::DiagnosticsEngine::Remark ? "-R" : "-W") << opt; - llvm::StringRef optValue = info.getDiags()->getFlagValue(); + llvm::StringRef optValue = info.getFlagValue(); if (!optValue.empty()) os << "=" << optValue; os << ']'; From 2731be7ac505f9ef2e90b77b84ef0fbe411bf9f5 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Wed, 18 Sep 2024 08:54:11 -0700 Subject: [PATCH 077/321] [Support] Add helper struct `indent` for adding indentation (#108966) Add helper struct indent() for adding indentation to raw_ostream. --- llvm/include/llvm/Support/raw_ostream.h | 19 +++++++++++++++++++ llvm/unittests/Support/raw_ostream_test.cpp | 13 +++++++++++++ llvm/utils/yaml-bench/YAMLBench.cpp | 11 ----------- 3 files changed, 32 insertions(+), 11 deletions(-) diff --git a/llvm/include/llvm/Support/raw_ostream.h b/llvm/include/llvm/Support/raw_ostream.h index 2570c826502e7c..34f91cbe9551f4 100644 --- a/llvm/include/llvm/Support/raw_ostream.h +++ b/llvm/include/llvm/Support/raw_ostream.h @@ -769,6 +769,25 @@ class buffer_unique_ostream : public raw_svector_ostream { ~buffer_unique_ostream() override { *OS << str(); } }; +// Helper struct to add indentation to raw_ostream. Instead of +// OS.indent(6) << "more stuff"; +// you can use +// OS << indent(6) << "more stuff"; +// which has better ergonomics (and clang-formats better as well). +struct indent { + unsigned NumSpaces; + + explicit indent(unsigned NumSpaces) : NumSpaces(NumSpaces) {} + void operator+=(unsigned N) { NumSpaces += N; } + void operator-=(unsigned N) { NumSpaces -= N; } + indent operator+(unsigned N) const { return indent(NumSpaces + N); } + indent operator-(unsigned N) const { return indent(NumSpaces - N); } +}; + +inline raw_ostream &operator<<(raw_ostream &OS, const indent &Indent) { + return OS.indent(Indent.NumSpaces); +} + class Error; /// This helper creates an output stream and then passes it to \p Write. 
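As a usage sketch only (not part of the patch; `printNode`, `Name`, and `Depth` are invented names), the new helper lets an indentation level travel through an ordinary `<<` chain and be adjusted arithmetically along the way:

// Hypothetical caller, illustrating the ergonomics of the new helper only.
// printNode, Name, and Depth are invented for this sketch, not from the patch.
#include "llvm/Support/raw_ostream.h"

static void printNode(llvm::raw_ostream &OS, llvm::StringRef Name,
                      unsigned Depth) {
  llvm::indent Ind(2 * Depth);  // two spaces per nesting level
  OS << Ind << Name << " {\n";  // same effect as OS.indent(2 * Depth)
  OS << Ind + 2 << "...\n";     // operator+ yields a deeper indent
  OS << Ind << "}\n";
}

Note that `operator+` and `operator-` return a fresh `indent` rather than mutating in place, so a temporary like `Ind + 2` can be streamed without disturbing the caller's level.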
diff --git a/llvm/unittests/Support/raw_ostream_test.cpp b/llvm/unittests/Support/raw_ostream_test.cpp index 1c6dfb6260cc04..99aa350adad71d 100644 --- a/llvm/unittests/Support/raw_ostream_test.cpp +++ b/llvm/unittests/Support/raw_ostream_test.cpp @@ -177,6 +177,19 @@ TEST(raw_ostreamTest, Justify) { EXPECT_EQ("none", printToString(center_justify("none", 1), 1)); } +TEST(raw_ostreamTest, Indent) { + indent Indent(4); + auto Spaces = [](int N) { return std::string(N, ' '); }; + EXPECT_EQ(Spaces(4), printToString(Indent)); + EXPECT_EQ("", printToString(indent(0))); + EXPECT_EQ(Spaces(5), printToString(Indent + 1)); + EXPECT_EQ(Spaces(3), printToString(Indent - 1)); + Indent += 1; + EXPECT_EQ(Spaces(5), printToString(Indent)); + Indent -= 1; + EXPECT_EQ(Spaces(4), printToString(Indent)); +} + TEST(raw_ostreamTest, FormatHex) { EXPECT_EQ("0x1234", printToString(format_hex(0x1234, 6), 6)); EXPECT_EQ("0x001234", printToString(format_hex(0x1234, 8), 8)); diff --git a/llvm/utils/yaml-bench/YAMLBench.cpp b/llvm/utils/yaml-bench/YAMLBench.cpp index 50e55538a011cb..4dc6caeb6fdbfc 100644 --- a/llvm/utils/yaml-bench/YAMLBench.cpp +++ b/llvm/utils/yaml-bench/YAMLBench.cpp @@ -56,17 +56,6 @@ cl::opt UseColor("use-color", cl::desc("Emit colored output (default=autodetect)"), cl::init(cl::BOU_UNSET)); -struct indent { - unsigned distance; - indent(unsigned d) : distance(d) {} -}; - -static raw_ostream &operator <<(raw_ostream &os, const indent &in) { - for (unsigned i = 0; i < in.distance; ++i) - os << " "; - return os; -} - /// Pretty print a tag by replacing tag:yaml.org,2002: with !!. static std::string prettyTag(yaml::Node *N) { std::string Tag = N->getVerbatimTag(); From 6b3c9e5c20e36d7c8a0dbabd9e71be8522d2dde4 Mon Sep 17 00:00:00 2001 From: weiguozhi <57237827+weiguozhi@users.noreply.github.com> Date: Wed, 18 Sep 2024 09:12:04 -0700 Subject: [PATCH 078/321] [X86] Speed up X86 Domain Reassignment pass by early return (#108108) The current implementation of the X86 Domain Reassignment pass finds the complete closure of a general-purpose register and only then checks whether the domain can be changed. This causes compile-time issues when compiling large functions. This patch checks whether the domain can be changed while the closure is being constructed; if changing the domain is illegal, we can return immediately. For one of our large files, it reduced the X86 Domain Reassignment pass time from 200+ seconds to less than 1s. --- llvm/lib/Target/X86/X86DomainReassignment.cpp | 57 ++++++++++++------- 1 file changed, 36 insertions(+), 21 deletions(-) diff --git a/llvm/lib/Target/X86/X86DomainReassignment.cpp b/llvm/lib/Target/X86/X86DomainReassignment.cpp index 4823183113989a..9c667f5036dd56 100644 --- a/llvm/lib/Target/X86/X86DomainReassignment.cpp +++ b/llvm/lib/Target/X86/X86DomainReassignment.cpp @@ -367,7 +367,7 @@ class X86DomainReassignment : public MachineFunctionPass { const X86InstrInfo *TII = nullptr; /// All edges that are included in some closure - BitVector EnclosedEdges{8, false}; + DenseMap EnclosedEdges; /// All instructions that are included in some closure. DenseMap EnclosedInstrs; @@ -399,14 +399,16 @@ class X86DomainReassignment : public MachineFunctionPass { void buildClosure(Closure &, Register Reg); /// Enqueue \p Reg to be considered for addition to the closure. /// Return false if the closure becomes invalid.
+ bool visitRegister(Closure &, Register Reg, RegDomain &Domain, SmallVectorImpl &Worklist); /// Reassign the closure to \p Domain. void reassign(const Closure &C, RegDomain Domain) const; /// Add \p MI to the closure. - void encloseInstr(Closure &C, MachineInstr *MI); + /// Return false if the closure becomes invalid. + bool encloseInstr(Closure &C, MachineInstr *MI); /// /returns true if it is profitable to reassign the closure to \p Domain. bool isReassignmentProfitable(const Closure &C, RegDomain Domain) const; @@ -419,17 +421,23 @@ char X86DomainReassignment::ID = 0; } // End anonymous namespace. -void X86DomainReassignment::visitRegister(Closure &C, Register Reg, +bool X86DomainReassignment::visitRegister(Closure &C, Register Reg, RegDomain &Domain, SmallVectorImpl &Worklist) { if (!Reg.isVirtual()) - return; + return true; - if (EnclosedEdges.test(Register::virtReg2Index(Reg))) - return; + auto I = EnclosedEdges.find(Reg); + if (I != EnclosedEdges.end()) { + if (I->second != C.getID()) { + C.setAllIllegal(); + return false; + } + return true; + } if (!MRI->hasOneDef(Reg)) - return; + return true; RegDomain RD = getDomain(MRI->getRegClass(Reg), MRI->getTargetRegisterInfo()); // First edge in closure sets the domain. @@ -437,19 +445,22 @@ void X86DomainReassignment::visitRegister(Closure &C, Register Reg, Domain = RD; if (Domain != RD) - return; + return true; Worklist.push_back(Reg); + return true; } -void X86DomainReassignment::encloseInstr(Closure &C, MachineInstr *MI) { +bool X86DomainReassignment::encloseInstr(Closure &C, MachineInstr *MI) { auto I = EnclosedInstrs.find(MI); if (I != EnclosedInstrs.end()) { - if (I->second != C.getID()) + if (I->second != C.getID()) { // Instruction already belongs to another closure, avoid conflicts between // closure and mark this closure as illegal. C.setAllIllegal(); - return; + return false; + } + return true; } EnclosedInstrs[MI] = C.getID(); @@ -465,6 +476,7 @@ void X86DomainReassignment::encloseInstr(Closure &C, MachineInstr *MI) { C.setIllegal((RegDomain)i); } } + return C.hasLegalDstDomain(); } double X86DomainReassignment::calculateCost(const Closure &C, @@ -543,10 +555,11 @@ void X86DomainReassignment::buildClosure(Closure &C, Register Reg) { // Register already in this closure. if (!C.insertEdge(CurReg)) continue; - EnclosedEdges.set(Register::virtReg2Index(Reg)); + EnclosedEdges[Reg] = C.getID(); MachineInstr *DefMI = MRI->getVRegDef(CurReg); - encloseInstr(C, DefMI); + if (!encloseInstr(C, DefMI)) + return; // Add register used by the defining MI to the worklist. // Do not add registers which are used in address calculation, they will be @@ -565,7 +578,8 @@ void X86DomainReassignment::buildClosure(Closure &C, Register Reg) { auto &Op = DefMI->getOperand(OpIdx); if (!Op.isReg() || !Op.isUse()) continue; - visitRegister(C, Op.getReg(), Domain, Worklist); + if (!visitRegister(C, Op.getReg(), Domain, Worklist)) + return; } // Expand closure through register uses. @@ -574,9 +588,10 @@ void X86DomainReassignment::buildClosure(Closure &C, Register Reg) { // as this should remain in GPRs. 
if (usedAsAddr(UseMI, CurReg, TII)) { C.setAllIllegal(); - continue; + return; } - encloseInstr(C, &UseMI); + if (!encloseInstr(C, &UseMI)) + return; for (auto &DefOp : UseMI.defs()) { if (!DefOp.isReg()) @@ -585,9 +600,10 @@ void X86DomainReassignment::buildClosure(Closure &C, Register Reg) { Register DefReg = DefOp.getReg(); if (!DefReg.isVirtual()) { C.setAllIllegal(); - continue; + return; } - visitRegister(C, DefReg, Domain, Worklist); + if (!visitRegister(C, DefReg, Domain, Worklist)) + return; } } } @@ -775,7 +791,6 @@ bool X86DomainReassignment::runOnMachineFunction(MachineFunction &MF) { bool Changed = false; EnclosedEdges.clear(); - EnclosedEdges.resize(MRI->getNumVirtRegs()); EnclosedInstrs.clear(); std::vector Closures; @@ -795,7 +810,7 @@ bool X86DomainReassignment::runOnMachineFunction(MachineFunction &MF) { continue; // Register already in closure. - if (EnclosedEdges.test(Idx)) + if (EnclosedEdges.contains(Reg)) continue; // Calculate closure starting with Reg. From 292ee93a87018bfef519ceff7de676e4792aa8d9 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 18 Sep 2024 09:43:21 -0700 Subject: [PATCH 079/321] [CodeGen] Use Register in SwitchLoweringUtils. NFC (#109092) Use an empty Register() instead of -1U. --- llvm/include/llvm/CodeGen/SwitchLoweringUtils.h | 8 ++++---- llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp | 2 +- llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp | 7 +++---- llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h | 2 +- llvm/lib/CodeGen/SwitchLoweringUtils.cpp | 4 ++-- 5 files changed, 11 insertions(+), 12 deletions(-) diff --git a/llvm/include/llvm/CodeGen/SwitchLoweringUtils.h b/llvm/include/llvm/CodeGen/SwitchLoweringUtils.h index 9282c4a771afb2..9f1d6f7b4f9524 100644 --- a/llvm/include/llvm/CodeGen/SwitchLoweringUtils.h +++ b/llvm/include/llvm/CodeGen/SwitchLoweringUtils.h @@ -170,7 +170,7 @@ struct CaseBlock { struct JumpTable { /// The virtual register containing the index of the jump table entry /// to jump to. - unsigned Reg; + Register Reg; /// The JumpTableIndex for this jump table in the function. unsigned JTI; /// The MBB into which to emit the code for the indirect jump. @@ -182,7 +182,7 @@ struct JumpTable { /// The debug location of the instruction this JumpTable was produced from. 
std::optional SL; // For SelectionDAG - JumpTable(unsigned R, unsigned J, MachineBasicBlock *M, MachineBasicBlock *D, + JumpTable(Register R, unsigned J, MachineBasicBlock *M, MachineBasicBlock *D, std::optional SL) : Reg(R), JTI(J), MBB(M), Default(D), SL(SL) {} }; @@ -218,7 +218,7 @@ struct BitTestBlock { APInt First; APInt Range; const Value *SValue; - unsigned Reg; + Register Reg; MVT RegVT; bool Emitted; bool ContiguousRange; @@ -229,7 +229,7 @@ struct BitTestBlock { BranchProbability DefaultProb; bool FallthroughUnreachable = false; - BitTestBlock(APInt F, APInt R, const Value *SV, unsigned Rg, MVT RgVT, bool E, + BitTestBlock(APInt F, APInt R, const Value *SV, Register Rg, MVT RgVT, bool E, bool CR, MachineBasicBlock *P, MachineBasicBlock *D, BitTestInfo C, BranchProbability Pr) : First(std::move(F)), Range(std::move(R)), SValue(SV), Reg(Rg), diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index 5710bda2b2cf86..07c189344c6429 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -838,7 +838,7 @@ void IRTranslator::splitWorkItem(SwitchCG::SwitchWorkList &WorkList, void IRTranslator::emitJumpTable(SwitchCG::JumpTable &JT, MachineBasicBlock *MBB) { // Emit the code for the jump table - assert(JT.Reg != -1U && "Should lower JT Header first!"); + assert(JT.Reg && "Should lower JT Header first!"); MachineIRBuilder MIB(*MBB->getParent()); MIB.setMBB(*MBB); MIB.setDebugLoc(CurBuilder->getDebugLoc()); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index a719ff859e778e..eec89f04c6356d 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -2981,7 +2981,7 @@ void SelectionDAGBuilder::visitSwitchCase(CaseBlock &CB, void SelectionDAGBuilder::visitJumpTable(SwitchCG::JumpTable &JT) { // Emit the code for the jump table assert(JT.SL && "Should set SDLoc for SelectionDAG!"); - assert(JT.Reg != -1U && "Should lower JT Header first!"); + assert(JT.Reg && "Should lower JT Header first!"); EVT PTy = DAG.getTargetLoweringInfo().getJumpTableRegTy(DAG.getDataLayout()); SDValue Index = DAG.getCopyFromReg(getControlRoot(), *JT.SL, JT.Reg, PTy); SDValue Table = DAG.getJumpTable(JT.JTI, PTy); @@ -3261,10 +3261,9 @@ void SelectionDAGBuilder::visitBitTestHeader(BitTestBlock &B, /// visitBitTestCase - this function produces one "bit test" void SelectionDAGBuilder::visitBitTestCase(BitTestBlock &BB, - MachineBasicBlock* NextMBB, + MachineBasicBlock *NextMBB, BranchProbability BranchProbToNext, - unsigned Reg, - BitTestCase &B, + Register Reg, BitTestCase &B, MachineBasicBlock *SwitchBB) { SDLoc dl = getCurSDLoc(); MVT VT = BB.RegVT; diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h index 9544f02b9a4808..3f8a3e7ffb65bb 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h @@ -526,7 +526,7 @@ class SelectionDAGBuilder { void visitBitTestHeader(SwitchCG::BitTestBlock &B, MachineBasicBlock *SwitchBB); void visitBitTestCase(SwitchCG::BitTestBlock &BB, MachineBasicBlock *NextMBB, - BranchProbability BranchProbToNext, unsigned Reg, + BranchProbability BranchProbToNext, Register Reg, SwitchCG::BitTestCase &B, MachineBasicBlock *SwitchBB); void visitJumpTable(SwitchCG::JumpTable &JT); void 
visitJumpTableHeader(SwitchCG::JumpTable &JT, diff --git a/llvm/lib/CodeGen/SwitchLoweringUtils.cpp b/llvm/lib/CodeGen/SwitchLoweringUtils.cpp index e741a0fc49fb3d..038c499fe236e0 100644 --- a/llvm/lib/CodeGen/SwitchLoweringUtils.cpp +++ b/llvm/lib/CodeGen/SwitchLoweringUtils.cpp @@ -254,7 +254,7 @@ bool SwitchCG::SwitchLowering::buildJumpTable(const CaseClusterVector &Clusters, ->createJumpTableIndex(Table); // Set up the jump table info. - JumpTable JT(-1U, JTI, JumpTableMBB, nullptr, SL); + JumpTable JT(Register(), JTI, JumpTableMBB, nullptr, SL); JumpTableHeader JTH(Clusters[First].Low->getValue(), Clusters[Last].High->getValue(), SI->getCondition(), nullptr, false); @@ -455,7 +455,7 @@ bool SwitchCG::SwitchLowering::buildBitTests(CaseClusterVector &Clusters, BTI.push_back(BitTestCase(CB.Mask, BitTestBB, CB.BB, CB.ExtraProb)); } BitTestCases.emplace_back(std::move(LowBound), std::move(CmpRange), - SI->getCondition(), -1U, MVT::Other, false, + SI->getCondition(), Register(), MVT::Other, false, ContiguousRange, nullptr, nullptr, std::move(BTI), TotalProb); From 8e4909aa198d8beaf32ee0abc59a06e2e54dc3bd Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 18 Sep 2024 09:43:48 -0700 Subject: [PATCH 080/321] [RISCV] Remove unnecessary vand.vi from vXi1 and nvXvi1 VECTOR_REVERSE codegen. (#109071) Use a setne with 0 instead of a trunc. We know we zero extended the node so we can get by with a non-zero check only. The truncate lowering doesn't know that we zero extended so has to mask the lsb. I don't think DAG combine sees the trunc before we lower it to RISCVISD nodes so we don't get a chance to use computeKnownBits to remove the AND. --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 3 +- .../RISCV/rvv/named-vector-shuffle-reverse.ll | 84 ++++++------------- 2 files changed, 26 insertions(+), 61 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 0f76ad6c5e9288..189fb741f34cd1 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -10775,7 +10775,8 @@ SDValue RISCVTargetLowering::lowerVECTOR_REVERSE(SDValue Op, MVT WidenVT = MVT::getVectorVT(MVT::i8, VecVT.getVectorElementCount()); SDValue Op1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WidenVT, Op.getOperand(0)); SDValue Op2 = DAG.getNode(ISD::VECTOR_REVERSE, DL, WidenVT, Op1); - return DAG.getNode(ISD::TRUNCATE, DL, VecVT, Op2); + return DAG.getSetCC(DL, VecVT, Op2, + DAG.getConstant(0, DL, Op2.getValueType()), ISD::SETNE); } MVT ContainerVT = VecVT; diff --git a/llvm/test/CodeGen/RISCV/rvv/named-vector-shuffle-reverse.ll b/llvm/test/CodeGen/RISCV/rvv/named-vector-shuffle-reverse.ll index 2a915529e61dca..9d0cb22eb5f475 100644 --- a/llvm/test/CodeGen/RISCV/rvv/named-vector-shuffle-reverse.ll +++ b/llvm/test/CodeGen/RISCV/rvv/named-vector-shuffle-reverse.ll @@ -24,8 +24,7 @@ define @reverse_nxv2i1( %a) { ; RV32-BITS-UNKNOWN-NEXT: vrsub.vx v9, v9, a0 ; RV32-BITS-UNKNOWN-NEXT: vsetvli zero, zero, e8, mf4, ta, ma ; RV32-BITS-UNKNOWN-NEXT: vrgatherei16.vv v10, v8, v9 -; RV32-BITS-UNKNOWN-NEXT: vand.vi v8, v10, 1 -; RV32-BITS-UNKNOWN-NEXT: vmsne.vi v0, v8, 0 +; RV32-BITS-UNKNOWN-NEXT: vmsne.vi v0, v10, 0 ; RV32-BITS-UNKNOWN-NEXT: ret ; ; RV32-BITS-256-LABEL: reverse_nxv2i1: @@ -39,8 +38,7 @@ define @reverse_nxv2i1( %a) { ; RV32-BITS-256-NEXT: vid.v v9 ; RV32-BITS-256-NEXT: vrsub.vx v9, v9, a0 ; RV32-BITS-256-NEXT: vrgather.vv v10, v8, v9 -; RV32-BITS-256-NEXT: vand.vi v8, v10, 1 -; RV32-BITS-256-NEXT: vmsne.vi v0, v8, 0 +; 
RV32-BITS-256-NEXT: vmsne.vi v0, v10, 0 ; RV32-BITS-256-NEXT: ret ; ; RV32-BITS-512-LABEL: reverse_nxv2i1: @@ -54,8 +52,7 @@ define @reverse_nxv2i1( %a) { ; RV32-BITS-512-NEXT: vid.v v9 ; RV32-BITS-512-NEXT: vrsub.vx v9, v9, a0 ; RV32-BITS-512-NEXT: vrgather.vv v10, v8, v9 -; RV32-BITS-512-NEXT: vand.vi v8, v10, 1 -; RV32-BITS-512-NEXT: vmsne.vi v0, v8, 0 +; RV32-BITS-512-NEXT: vmsne.vi v0, v10, 0 ; RV32-BITS-512-NEXT: ret ; ; RV64-BITS-UNKNOWN-LABEL: reverse_nxv2i1: @@ -71,8 +68,7 @@ define @reverse_nxv2i1( %a) { ; RV64-BITS-UNKNOWN-NEXT: vrsub.vx v9, v9, a0 ; RV64-BITS-UNKNOWN-NEXT: vsetvli zero, zero, e8, mf4, ta, ma ; RV64-BITS-UNKNOWN-NEXT: vrgatherei16.vv v10, v8, v9 -; RV64-BITS-UNKNOWN-NEXT: vand.vi v8, v10, 1 -; RV64-BITS-UNKNOWN-NEXT: vmsne.vi v0, v8, 0 +; RV64-BITS-UNKNOWN-NEXT: vmsne.vi v0, v10, 0 ; RV64-BITS-UNKNOWN-NEXT: ret ; ; RV64-BITS-256-LABEL: reverse_nxv2i1: @@ -86,8 +82,7 @@ define @reverse_nxv2i1( %a) { ; RV64-BITS-256-NEXT: vid.v v9 ; RV64-BITS-256-NEXT: vrsub.vx v9, v9, a0 ; RV64-BITS-256-NEXT: vrgather.vv v10, v8, v9 -; RV64-BITS-256-NEXT: vand.vi v8, v10, 1 -; RV64-BITS-256-NEXT: vmsne.vi v0, v8, 0 +; RV64-BITS-256-NEXT: vmsne.vi v0, v10, 0 ; RV64-BITS-256-NEXT: ret ; ; RV64-BITS-512-LABEL: reverse_nxv2i1: @@ -101,8 +96,7 @@ define @reverse_nxv2i1( %a) { ; RV64-BITS-512-NEXT: vid.v v9 ; RV64-BITS-512-NEXT: vrsub.vx v9, v9, a0 ; RV64-BITS-512-NEXT: vrgather.vv v10, v8, v9 -; RV64-BITS-512-NEXT: vand.vi v8, v10, 1 -; RV64-BITS-512-NEXT: vmsne.vi v0, v8, 0 +; RV64-BITS-512-NEXT: vmsne.vi v0, v10, 0 ; RV64-BITS-512-NEXT: ret %res = call @llvm.vector.reverse.nxv2i1( %a) ret %res @@ -122,8 +116,7 @@ define @reverse_nxv4i1( %a) { ; RV32-BITS-UNKNOWN-NEXT: vrsub.vx v9, v9, a0 ; RV32-BITS-UNKNOWN-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; RV32-BITS-UNKNOWN-NEXT: vrgatherei16.vv v10, v8, v9 -; RV32-BITS-UNKNOWN-NEXT: vand.vi v8, v10, 1 -; RV32-BITS-UNKNOWN-NEXT: vmsne.vi v0, v8, 0 +; RV32-BITS-UNKNOWN-NEXT: vmsne.vi v0, v10, 0 ; RV32-BITS-UNKNOWN-NEXT: ret ; ; RV32-BITS-256-LABEL: reverse_nxv4i1: @@ -137,8 +130,7 @@ define @reverse_nxv4i1( %a) { ; RV32-BITS-256-NEXT: vid.v v9 ; RV32-BITS-256-NEXT: vrsub.vx v9, v9, a0 ; RV32-BITS-256-NEXT: vrgather.vv v10, v8, v9 -; RV32-BITS-256-NEXT: vand.vi v8, v10, 1 -; RV32-BITS-256-NEXT: vmsne.vi v0, v8, 0 +; RV32-BITS-256-NEXT: vmsne.vi v0, v10, 0 ; RV32-BITS-256-NEXT: ret ; ; RV32-BITS-512-LABEL: reverse_nxv4i1: @@ -152,8 +144,7 @@ define @reverse_nxv4i1( %a) { ; RV32-BITS-512-NEXT: vid.v v9 ; RV32-BITS-512-NEXT: vrsub.vx v9, v9, a0 ; RV32-BITS-512-NEXT: vrgather.vv v10, v8, v9 -; RV32-BITS-512-NEXT: vand.vi v8, v10, 1 -; RV32-BITS-512-NEXT: vmsne.vi v0, v8, 0 +; RV32-BITS-512-NEXT: vmsne.vi v0, v10, 0 ; RV32-BITS-512-NEXT: ret ; ; RV64-BITS-UNKNOWN-LABEL: reverse_nxv4i1: @@ -169,8 +160,7 @@ define @reverse_nxv4i1( %a) { ; RV64-BITS-UNKNOWN-NEXT: vrsub.vx v9, v9, a0 ; RV64-BITS-UNKNOWN-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; RV64-BITS-UNKNOWN-NEXT: vrgatherei16.vv v10, v8, v9 -; RV64-BITS-UNKNOWN-NEXT: vand.vi v8, v10, 1 -; RV64-BITS-UNKNOWN-NEXT: vmsne.vi v0, v8, 0 +; RV64-BITS-UNKNOWN-NEXT: vmsne.vi v0, v10, 0 ; RV64-BITS-UNKNOWN-NEXT: ret ; ; RV64-BITS-256-LABEL: reverse_nxv4i1: @@ -184,8 +174,7 @@ define @reverse_nxv4i1( %a) { ; RV64-BITS-256-NEXT: vid.v v9 ; RV64-BITS-256-NEXT: vrsub.vx v9, v9, a0 ; RV64-BITS-256-NEXT: vrgather.vv v10, v8, v9 -; RV64-BITS-256-NEXT: vand.vi v8, v10, 1 -; RV64-BITS-256-NEXT: vmsne.vi v0, v8, 0 +; RV64-BITS-256-NEXT: vmsne.vi v0, v10, 0 ; RV64-BITS-256-NEXT: ret ; ; RV64-BITS-512-LABEL: 
reverse_nxv4i1: @@ -199,8 +188,7 @@ define @reverse_nxv4i1( %a) { ; RV64-BITS-512-NEXT: vid.v v9 ; RV64-BITS-512-NEXT: vrsub.vx v9, v9, a0 ; RV64-BITS-512-NEXT: vrgather.vv v10, v8, v9 -; RV64-BITS-512-NEXT: vand.vi v8, v10, 1 -; RV64-BITS-512-NEXT: vmsne.vi v0, v8, 0 +; RV64-BITS-512-NEXT: vmsne.vi v0, v10, 0 ; RV64-BITS-512-NEXT: ret %res = call @llvm.vector.reverse.nxv4i1( %a) ret %res @@ -219,8 +207,7 @@ define @reverse_nxv8i1( %a) { ; RV32-BITS-UNKNOWN-NEXT: vrsub.vx v10, v10, a0 ; RV32-BITS-UNKNOWN-NEXT: vsetvli zero, zero, e8, m1, ta, ma ; RV32-BITS-UNKNOWN-NEXT: vrgatherei16.vv v9, v8, v10 -; RV32-BITS-UNKNOWN-NEXT: vand.vi v8, v9, 1 -; RV32-BITS-UNKNOWN-NEXT: vmsne.vi v0, v8, 0 +; RV32-BITS-UNKNOWN-NEXT: vmsne.vi v0, v9, 0 ; RV32-BITS-UNKNOWN-NEXT: ret ; ; RV32-BITS-256-LABEL: reverse_nxv8i1: @@ -233,8 +220,7 @@ define @reverse_nxv8i1( %a) { ; RV32-BITS-256-NEXT: vid.v v9 ; RV32-BITS-256-NEXT: vrsub.vx v9, v9, a0 ; RV32-BITS-256-NEXT: vrgather.vv v10, v8, v9 -; RV32-BITS-256-NEXT: vand.vi v8, v10, 1 -; RV32-BITS-256-NEXT: vmsne.vi v0, v8, 0 +; RV32-BITS-256-NEXT: vmsne.vi v0, v10, 0 ; RV32-BITS-256-NEXT: ret ; ; RV32-BITS-512-LABEL: reverse_nxv8i1: @@ -247,8 +233,7 @@ define @reverse_nxv8i1( %a) { ; RV32-BITS-512-NEXT: vid.v v9 ; RV32-BITS-512-NEXT: vrsub.vx v9, v9, a0 ; RV32-BITS-512-NEXT: vrgather.vv v10, v8, v9 -; RV32-BITS-512-NEXT: vand.vi v8, v10, 1 -; RV32-BITS-512-NEXT: vmsne.vi v0, v8, 0 +; RV32-BITS-512-NEXT: vmsne.vi v0, v10, 0 ; RV32-BITS-512-NEXT: ret ; ; RV64-BITS-UNKNOWN-LABEL: reverse_nxv8i1: @@ -263,8 +248,7 @@ define @reverse_nxv8i1( %a) { ; RV64-BITS-UNKNOWN-NEXT: vrsub.vx v10, v10, a0 ; RV64-BITS-UNKNOWN-NEXT: vsetvli zero, zero, e8, m1, ta, ma ; RV64-BITS-UNKNOWN-NEXT: vrgatherei16.vv v9, v8, v10 -; RV64-BITS-UNKNOWN-NEXT: vand.vi v8, v9, 1 -; RV64-BITS-UNKNOWN-NEXT: vmsne.vi v0, v8, 0 +; RV64-BITS-UNKNOWN-NEXT: vmsne.vi v0, v9, 0 ; RV64-BITS-UNKNOWN-NEXT: ret ; ; RV64-BITS-256-LABEL: reverse_nxv8i1: @@ -277,8 +261,7 @@ define @reverse_nxv8i1( %a) { ; RV64-BITS-256-NEXT: vid.v v9 ; RV64-BITS-256-NEXT: vrsub.vx v9, v9, a0 ; RV64-BITS-256-NEXT: vrgather.vv v10, v8, v9 -; RV64-BITS-256-NEXT: vand.vi v8, v10, 1 -; RV64-BITS-256-NEXT: vmsne.vi v0, v8, 0 +; RV64-BITS-256-NEXT: vmsne.vi v0, v10, 0 ; RV64-BITS-256-NEXT: ret ; ; RV64-BITS-512-LABEL: reverse_nxv8i1: @@ -291,8 +274,7 @@ define @reverse_nxv8i1( %a) { ; RV64-BITS-512-NEXT: vid.v v9 ; RV64-BITS-512-NEXT: vrsub.vx v9, v9, a0 ; RV64-BITS-512-NEXT: vrgather.vv v10, v8, v9 -; RV64-BITS-512-NEXT: vand.vi v8, v10, 1 -; RV64-BITS-512-NEXT: vmsne.vi v0, v8, 0 +; RV64-BITS-512-NEXT: vmsne.vi v0, v10, 0 ; RV64-BITS-512-NEXT: ret %res = call @llvm.vector.reverse.nxv8i1( %a) ret %res @@ -313,8 +295,7 @@ define @reverse_nxv16i1( %a) { ; RV32-BITS-UNKNOWN-NEXT: vrgatherei16.vv v13, v10, v8 ; RV32-BITS-UNKNOWN-NEXT: vrgatherei16.vv v12, v11, v8 ; RV32-BITS-UNKNOWN-NEXT: vsetvli a0, zero, e8, m2, ta, ma -; RV32-BITS-UNKNOWN-NEXT: vand.vi v8, v12, 1 -; RV32-BITS-UNKNOWN-NEXT: vmsne.vi v0, v8, 0 +; RV32-BITS-UNKNOWN-NEXT: vmsne.vi v0, v12, 0 ; RV32-BITS-UNKNOWN-NEXT: ret ; ; RV32-BITS-256-LABEL: reverse_nxv16i1: @@ -331,8 +312,7 @@ define @reverse_nxv16i1( %a) { ; RV32-BITS-256-NEXT: vrgather.vv v13, v10, v8 ; RV32-BITS-256-NEXT: vrgather.vv v12, v11, v8 ; RV32-BITS-256-NEXT: vsetvli a0, zero, e8, m2, ta, ma -; RV32-BITS-256-NEXT: vand.vi v8, v12, 1 -; RV32-BITS-256-NEXT: vmsne.vi v0, v8, 0 +; RV32-BITS-256-NEXT: vmsne.vi v0, v12, 0 ; RV32-BITS-256-NEXT: ret ; ; RV32-BITS-512-LABEL: reverse_nxv16i1: @@ -349,8 +329,7 @@ 
define @reverse_nxv16i1( %a) { ; RV32-BITS-512-NEXT: vrgather.vv v13, v10, v8 ; RV32-BITS-512-NEXT: vrgather.vv v12, v11, v8 ; RV32-BITS-512-NEXT: vsetvli a0, zero, e8, m2, ta, ma -; RV32-BITS-512-NEXT: vand.vi v8, v12, 1 -; RV32-BITS-512-NEXT: vmsne.vi v0, v8, 0 +; RV32-BITS-512-NEXT: vmsne.vi v0, v12, 0 ; RV32-BITS-512-NEXT: ret ; ; RV64-BITS-UNKNOWN-LABEL: reverse_nxv16i1: @@ -367,8 +346,7 @@ define @reverse_nxv16i1( %a) { ; RV64-BITS-UNKNOWN-NEXT: vrgatherei16.vv v13, v10, v8 ; RV64-BITS-UNKNOWN-NEXT: vrgatherei16.vv v12, v11, v8 ; RV64-BITS-UNKNOWN-NEXT: vsetvli a0, zero, e8, m2, ta, ma -; RV64-BITS-UNKNOWN-NEXT: vand.vi v8, v12, 1 -; RV64-BITS-UNKNOWN-NEXT: vmsne.vi v0, v8, 0 +; RV64-BITS-UNKNOWN-NEXT: vmsne.vi v0, v12, 0 ; RV64-BITS-UNKNOWN-NEXT: ret ; ; RV64-BITS-256-LABEL: reverse_nxv16i1: @@ -385,8 +363,7 @@ define @reverse_nxv16i1( %a) { ; RV64-BITS-256-NEXT: vrgather.vv v13, v10, v8 ; RV64-BITS-256-NEXT: vrgather.vv v12, v11, v8 ; RV64-BITS-256-NEXT: vsetvli a0, zero, e8, m2, ta, ma -; RV64-BITS-256-NEXT: vand.vi v8, v12, 1 -; RV64-BITS-256-NEXT: vmsne.vi v0, v8, 0 +; RV64-BITS-256-NEXT: vmsne.vi v0, v12, 0 ; RV64-BITS-256-NEXT: ret ; ; RV64-BITS-512-LABEL: reverse_nxv16i1: @@ -403,8 +380,7 @@ define @reverse_nxv16i1( %a) { ; RV64-BITS-512-NEXT: vrgather.vv v13, v10, v8 ; RV64-BITS-512-NEXT: vrgather.vv v12, v11, v8 ; RV64-BITS-512-NEXT: vsetvli a0, zero, e8, m2, ta, ma -; RV64-BITS-512-NEXT: vand.vi v8, v12, 1 -; RV64-BITS-512-NEXT: vmsne.vi v0, v8, 0 +; RV64-BITS-512-NEXT: vmsne.vi v0, v12, 0 ; RV64-BITS-512-NEXT: ret %res = call @llvm.vector.reverse.nxv16i1( %a) ret %res @@ -427,7 +403,6 @@ define @reverse_nxv32i1( %a) { ; RV32-BITS-UNKNOWN-NEXT: vrgatherei16.vv v9, v18, v12 ; RV32-BITS-UNKNOWN-NEXT: vrgatherei16.vv v8, v19, v12 ; RV32-BITS-UNKNOWN-NEXT: vsetvli a0, zero, e8, m4, ta, ma -; RV32-BITS-UNKNOWN-NEXT: vand.vi v8, v8, 1 ; RV32-BITS-UNKNOWN-NEXT: vmsne.vi v0, v8, 0 ; RV32-BITS-UNKNOWN-NEXT: ret ; @@ -447,7 +422,6 @@ define @reverse_nxv32i1( %a) { ; RV32-BITS-256-NEXT: vrgather.vv v9, v18, v12 ; RV32-BITS-256-NEXT: vrgather.vv v8, v19, v12 ; RV32-BITS-256-NEXT: vsetvli a0, zero, e8, m4, ta, ma -; RV32-BITS-256-NEXT: vand.vi v8, v8, 1 ; RV32-BITS-256-NEXT: vmsne.vi v0, v8, 0 ; RV32-BITS-256-NEXT: ret ; @@ -467,7 +441,6 @@ define @reverse_nxv32i1( %a) { ; RV32-BITS-512-NEXT: vrgather.vv v9, v18, v12 ; RV32-BITS-512-NEXT: vrgather.vv v8, v19, v12 ; RV32-BITS-512-NEXT: vsetvli a0, zero, e8, m4, ta, ma -; RV32-BITS-512-NEXT: vand.vi v8, v8, 1 ; RV32-BITS-512-NEXT: vmsne.vi v0, v8, 0 ; RV32-BITS-512-NEXT: ret ; @@ -487,7 +460,6 @@ define @reverse_nxv32i1( %a) { ; RV64-BITS-UNKNOWN-NEXT: vrgatherei16.vv v9, v18, v12 ; RV64-BITS-UNKNOWN-NEXT: vrgatherei16.vv v8, v19, v12 ; RV64-BITS-UNKNOWN-NEXT: vsetvli a0, zero, e8, m4, ta, ma -; RV64-BITS-UNKNOWN-NEXT: vand.vi v8, v8, 1 ; RV64-BITS-UNKNOWN-NEXT: vmsne.vi v0, v8, 0 ; RV64-BITS-UNKNOWN-NEXT: ret ; @@ -507,7 +479,6 @@ define @reverse_nxv32i1( %a) { ; RV64-BITS-256-NEXT: vrgather.vv v9, v18, v12 ; RV64-BITS-256-NEXT: vrgather.vv v8, v19, v12 ; RV64-BITS-256-NEXT: vsetvli a0, zero, e8, m4, ta, ma -; RV64-BITS-256-NEXT: vand.vi v8, v8, 1 ; RV64-BITS-256-NEXT: vmsne.vi v0, v8, 0 ; RV64-BITS-256-NEXT: ret ; @@ -527,7 +498,6 @@ define @reverse_nxv32i1( %a) { ; RV64-BITS-512-NEXT: vrgather.vv v9, v18, v12 ; RV64-BITS-512-NEXT: vrgather.vv v8, v19, v12 ; RV64-BITS-512-NEXT: vsetvli a0, zero, e8, m4, ta, ma -; RV64-BITS-512-NEXT: vand.vi v8, v8, 1 ; RV64-BITS-512-NEXT: vmsne.vi v0, v8, 0 ; RV64-BITS-512-NEXT: ret %res = call 
@llvm.vector.reverse.nxv32i1( %a) @@ -555,7 +525,6 @@ define @reverse_nxv64i1( %a) { ; RV32-BITS-UNKNOWN-NEXT: vrgatherei16.vv v9, v30, v16 ; RV32-BITS-UNKNOWN-NEXT: vrgatherei16.vv v8, v31, v16 ; RV32-BITS-UNKNOWN-NEXT: vsetvli a0, zero, e8, m8, ta, ma -; RV32-BITS-UNKNOWN-NEXT: vand.vi v8, v8, 1 ; RV32-BITS-UNKNOWN-NEXT: vmsne.vi v0, v8, 0 ; RV32-BITS-UNKNOWN-NEXT: ret ; @@ -579,7 +548,6 @@ define @reverse_nxv64i1( %a) { ; RV32-BITS-256-NEXT: vrgather.vv v9, v22, v24 ; RV32-BITS-256-NEXT: vrgather.vv v8, v23, v24 ; RV32-BITS-256-NEXT: vsetvli a0, zero, e8, m8, ta, ma -; RV32-BITS-256-NEXT: vand.vi v8, v8, 1 ; RV32-BITS-256-NEXT: vmsne.vi v0, v8, 0 ; RV32-BITS-256-NEXT: ret ; @@ -603,7 +571,6 @@ define @reverse_nxv64i1( %a) { ; RV32-BITS-512-NEXT: vrgather.vv v9, v22, v24 ; RV32-BITS-512-NEXT: vrgather.vv v8, v23, v24 ; RV32-BITS-512-NEXT: vsetvli a0, zero, e8, m8, ta, ma -; RV32-BITS-512-NEXT: vand.vi v8, v8, 1 ; RV32-BITS-512-NEXT: vmsne.vi v0, v8, 0 ; RV32-BITS-512-NEXT: ret ; @@ -627,7 +594,6 @@ define @reverse_nxv64i1( %a) { ; RV64-BITS-UNKNOWN-NEXT: vrgatherei16.vv v9, v30, v16 ; RV64-BITS-UNKNOWN-NEXT: vrgatherei16.vv v8, v31, v16 ; RV64-BITS-UNKNOWN-NEXT: vsetvli a0, zero, e8, m8, ta, ma -; RV64-BITS-UNKNOWN-NEXT: vand.vi v8, v8, 1 ; RV64-BITS-UNKNOWN-NEXT: vmsne.vi v0, v8, 0 ; RV64-BITS-UNKNOWN-NEXT: ret ; @@ -651,7 +617,6 @@ define @reverse_nxv64i1( %a) { ; RV64-BITS-256-NEXT: vrgather.vv v9, v22, v24 ; RV64-BITS-256-NEXT: vrgather.vv v8, v23, v24 ; RV64-BITS-256-NEXT: vsetvli a0, zero, e8, m8, ta, ma -; RV64-BITS-256-NEXT: vand.vi v8, v8, 1 ; RV64-BITS-256-NEXT: vmsne.vi v0, v8, 0 ; RV64-BITS-256-NEXT: ret ; @@ -675,7 +640,6 @@ define @reverse_nxv64i1( %a) { ; RV64-BITS-512-NEXT: vrgather.vv v9, v22, v24 ; RV64-BITS-512-NEXT: vrgather.vv v8, v23, v24 ; RV64-BITS-512-NEXT: vsetvli a0, zero, e8, m8, ta, ma -; RV64-BITS-512-NEXT: vand.vi v8, v8, 1 ; RV64-BITS-512-NEXT: vmsne.vi v0, v8, 0 ; RV64-BITS-512-NEXT: ret %res = call @llvm.vector.reverse.nxv64i1( %a) From 87da9e2fac04a97fb35f8546c582d4cd4c06ec14 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Wed, 18 Sep 2024 09:44:57 -0700 Subject: [PATCH 081/321] [LLVM][TableGen] Change CodeEmitterGen to use const RecordKeeper (#109025) Change CodeEmitterGen to use const RecordKeeper. 
This is part of an effort to have better const correctness in TableGen backends: https://discourse.llvm.org/t/psa-planned-changes-to-tablegen-getallderiveddefinitions-api-potential-downstream-breakages/81089 --- llvm/utils/TableGen/CodeEmitterGen.cpp | 75 +++++++++++++------------- 1 file changed, 38 insertions(+), 37 deletions(-) diff --git a/llvm/utils/TableGen/CodeEmitterGen.cpp b/llvm/utils/TableGen/CodeEmitterGen.cpp index 69ca9a84953a30..4d356774f98dcc 100644 --- a/llvm/utils/TableGen/CodeEmitterGen.cpp +++ b/llvm/utils/TableGen/CodeEmitterGen.cpp @@ -47,28 +47,30 @@ using namespace llvm; namespace { class CodeEmitterGen { - RecordKeeper &Records; + const RecordKeeper &Records; public: - CodeEmitterGen(RecordKeeper &R) : Records(R) {} + CodeEmitterGen(const RecordKeeper &R) : Records(R) {} void run(raw_ostream &o); private: - int getVariableBit(const std::string &VarName, BitsInit *BI, int bit); + int getVariableBit(const std::string &VarName, const BitsInit *BI, int bit); std::pair - getInstructionCases(Record *R, CodeGenTarget &Target); - void addInstructionCasesForEncoding(Record *R, const Record *EncodingDef, - CodeGenTarget &Target, std::string &Case, + getInstructionCases(const Record *R, const CodeGenTarget &Target); + void addInstructionCasesForEncoding(const Record *R, + const Record *EncodingDef, + const CodeGenTarget &Target, + std::string &Case, std::string &BitOffsetCase); - bool addCodeToMergeInOperand(Record *R, BitsInit *BI, + bool addCodeToMergeInOperand(const Record *R, const BitsInit *BI, const std::string &VarName, std::string &Case, std::string &BitOffsetCase, - CodeGenTarget &Target); + const CodeGenTarget &Target); void emitInstructionBaseValues( raw_ostream &o, ArrayRef NumberedInstructions, - CodeGenTarget &Target, unsigned HwMode = DefaultMode); + const CodeGenTarget &Target, unsigned HwMode = DefaultMode); void emitCaseMap(raw_ostream &o, const std::map> &CaseMap); @@ -78,13 +80,13 @@ class CodeEmitterGen { // If the VarBitInit at position 'bit' matches the specified variable then // return the variable bit position. Otherwise return -1. -int CodeEmitterGen::getVariableBit(const std::string &VarName, BitsInit *BI, - int bit) { - if (VarBitInit *VBI = dyn_cast(BI->getBit(bit))) { - if (VarInit *VI = dyn_cast(VBI->getBitVar())) +int CodeEmitterGen::getVariableBit(const std::string &VarName, + const BitsInit *BI, int bit) { + if (const VarBitInit *VBI = dyn_cast(BI->getBit(bit))) { + if (const VarInit *VI = dyn_cast(VBI->getBitVar())) if (VI->getName() == VarName) return VBI->getBitNum(); - } else if (VarInit *VI = dyn_cast(BI->getBit(bit))) { + } else if (const VarInit *VI = dyn_cast(BI->getBit(bit))) { if (VI->getName() == VarName) return 0; } @@ -93,11 +95,12 @@ int CodeEmitterGen::getVariableBit(const std::string &VarName, BitsInit *BI, } // Returns true if it succeeds, false if an error. -bool CodeEmitterGen::addCodeToMergeInOperand(Record *R, BitsInit *BI, +bool CodeEmitterGen::addCodeToMergeInOperand(const Record *R, + const BitsInit *BI, const std::string &VarName, std::string &Case, std::string &BitOffsetCase, - CodeGenTarget &Target) { + const CodeGenTarget &Target) { CodeGenInstruction &CGI = Target.getInstruction(R); // Determine if VarName actually contributes to the Inst encoding.
@@ -278,7 +281,8 @@ bool CodeEmitterGen::addCodeToMergeInOperand(Record *R, BitsInit *BI, } std::pair -CodeEmitterGen::getInstructionCases(Record *R, CodeGenTarget &Target) { +CodeEmitterGen::getInstructionCases(const Record *R, + const CodeGenTarget &Target) { std::string Case, BitOffsetCase; auto append = [&](const std::string &S) { @@ -287,7 +291,7 @@ CodeEmitterGen::getInstructionCases(Record *R, CodeGenTarget &Target) { }; if (const RecordVal *RV = R->getValue("EncodingInfos")) { - if (auto *DI = dyn_cast_or_null(RV->getValue())) { + if (const auto *DI = dyn_cast_or_null(RV->getValue())) { const CodeGenHwModes &HWM = Target.getHwModes(); EncodingInfoByHwMode EBM(DI->getDef(), HWM); @@ -342,7 +346,7 @@ CodeEmitterGen::getInstructionCases(Record *R, CodeGenTarget &Target) { } void CodeEmitterGen::addInstructionCasesForEncoding( - Record *R, const Record *EncodingDef, CodeGenTarget &Target, + const Record *R, const Record *EncodingDef, const CodeGenTarget &Target, std::string &Case, std::string &BitOffsetCase) { BitsInit *BI = EncodingDef->getValueAsBitsInit("Inst"); @@ -394,7 +398,7 @@ static void emitInstBits(raw_ostream &OS, const APInt &Bits) { void CodeEmitterGen::emitInstructionBaseValues( raw_ostream &o, ArrayRef NumberedInstructions, - CodeGenTarget &Target, unsigned HwMode) { + const CodeGenTarget &Target, unsigned HwMode) { const CodeGenHwModes &HWM = Target.getHwModes(); if (HwMode == DefaultMode) o << " static const uint64_t InstBits[] = {\n"; @@ -430,12 +434,12 @@ void CodeEmitterGen::emitInstructionBaseValues( } } } - BitsInit *BI = EncodingDef->getValueAsBitsInit("Inst"); + const BitsInit *BI = EncodingDef->getValueAsBitsInit("Inst"); // Start by filling in fixed values. APInt Value(BitWidth, 0); for (unsigned i = 0, e = BI->getNumBits(); i != e; ++i) { - if (auto *B = dyn_cast(BI->getBit(i)); B && B->getValue()) + if (const auto *B = dyn_cast(BI->getBit(i)); B && B->getValue()) Value.setBit(i); } o << " "; @@ -448,15 +452,13 @@ void CodeEmitterGen::emitInstructionBaseValues( void CodeEmitterGen::emitCaseMap( raw_ostream &o, const std::map> &CaseMap) { - std::map>::const_iterator IE, EE; - for (IE = CaseMap.begin(), EE = CaseMap.end(); IE != EE; ++IE) { - const std::string &Case = IE->first; - const std::vector &InstList = IE->second; - - for (int i = 0, N = InstList.size(); i < N; i++) { - if (i) + for (const auto &[Case, InstList] : CaseMap) { + bool First = true; + for (const auto &Inst : InstList) { + if (!First) o << "\n"; - o << " case " << InstList[i] << ":"; + o << " case " << Inst << ":"; + First = false; } o << " {\n"; o << Case; @@ -469,7 +471,6 @@ void CodeEmitterGen::run(raw_ostream &o) { emitSourceFileHeader("Machine Code Emitter", o); CodeGenTarget Target(Records); - std::vector Insts = Records.getAllDerivedDefinitions("Instruction"); // For little-endian instruction bit encodings, reverse the bit order Target.reverseBitsForLittleEndianEncoding(); @@ -491,17 +492,17 @@ void CodeEmitterGen::run(raw_ostream &o) { continue; if (const RecordVal *RV = R->getValue("EncodingInfos")) { - if (DefInit *DI = dyn_cast_or_null(RV->getValue())) { + if (const DefInit *DI = dyn_cast_or_null(RV->getValue())) { EncodingInfoByHwMode EBM(DI->getDef(), HWM); - for (auto &KV : EBM) { - BitsInit *BI = KV.second->getValueAsBitsInit("Inst"); + for (const auto &[Key, Value] : EBM) { + const BitsInit *BI = Value->getValueAsBitsInit("Inst"); BitWidth = std::max(BitWidth, BI->getNumBits()); - HwModes.insert(KV.first); + HwModes.insert(Key); } continue; } } - BitsInit *BI = 
R->getValueAsBitsInit("Inst"); + const BitsInit *BI = R->getValueAsBitsInit("Inst"); BitWidth = std::max(BitWidth, BI->getNumBits()); } UseAPInt = BitWidth > 64; @@ -540,7 +541,7 @@ void CodeEmitterGen::run(raw_ostream &o) { std::map> BitOffsetCaseMap; // Construct all cases statement for each opcode - for (Record *R : Insts) { + for (const Record *R : Records.getAllDerivedDefinitions("Instruction")) { if (R->getValueAsString("Namespace") == "TargetOpcode" || R->getValueAsBit("isPseudo")) continue; From 0545e9f5b6e0cb3743ca27ce88c24974e6f29f56 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Wed, 18 Sep 2024 09:52:50 -0700 Subject: [PATCH 082/321] [LLVM][TableGen] Change DFAEmitter to use const Record pointers (#109042) Change DFAEmitter to use const Record pointers. This is part of an effort to have better const correctness in TableGen backends: https://discourse.llvm.org/t/psa-planned-changes-to-tablegen-getallderiveddefinitions-api-potential-downstream-breakages/81089 --- llvm/utils/TableGen/DFAEmitter.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/utils/TableGen/DFAEmitter.cpp b/llvm/utils/TableGen/DFAEmitter.cpp index 18620b2a073f19..7d274a1cf632e5 100644 --- a/llvm/utils/TableGen/DFAEmitter.cpp +++ b/llvm/utils/TableGen/DFAEmitter.cpp @@ -170,7 +170,7 @@ void DfaEmitter::printActionValue(action_type A, raw_ostream &OS) { OS << A; } namespace { -using Action = std::variant; +using Action = std::variant; using ActionTuple = std::vector; class Automaton; @@ -356,7 +356,7 @@ void CustomDfaEmitter::printActionValue(action_type A, raw_ostream &OS) { ListSeparator LS; for (const auto &SingleAction : AT) { OS << LS; - if (const auto *R = std::get_if(&SingleAction)) + if (const auto *R = std::get_if(&SingleAction)) OS << (*R)->getName(); else if (const auto *S = std::get_if(&SingleAction)) OS << '"' << *S << '"'; From 06048aaa73cac62b95fab4cca3ce9d19f596898f Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 18 Sep 2024 09:54:21 -0700 Subject: [PATCH 083/321] [Target] Use 'unsigned' as the underlying type for the tablegened physical register enums. (#109086) Otherwise, the enum defaults to 'int'. Update a few places that used 'int' for registers, which now need to change to avoid a signed/unsigned compare warning. I was hoping this would allow us to remove the 'int' comparison operators in Register.h and MCRegister.h, but compares with literal 0 still need them. --- llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp | 22 +++++++++++--------- llvm/utils/TableGen/RegisterInfoEmitter.cpp | 2 +- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp index 42440bc36f2498..fe26d6c2dd090f 100644 --- a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp +++ b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -1589,7 +1589,7 @@ void ARMExpandPseudo::CMSESaveClearFPRegsV81(MachineBasicBlock &MBB, BuildMI(MBB, MBBI, DL, TII->get(ARM::VSTMSDB_UPD), ARM::SP) .addReg(ARM::SP) .add(predOps(ARMCC::AL)); - for (int Reg = ARM::S16; Reg <= ARM::S31; ++Reg) + for (unsigned Reg = ARM::S16; Reg <= ARM::S31; ++Reg) VPUSH.addReg(Reg); // Clear FP registers with a VSCCLRM.
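To see why the loop counters above move from `int` to `unsigned`, here is a minimal standalone sketch; the enumerators are invented stand-ins, not the generated register enum. With a fixed underlying type of `unsigned`, comparing an `int` counter against an enumerator becomes a comparison of different signedness, which -Wsign-compare flags:

// Hypothetical stand-in for a tablegened register enum; R0..R2 are invented.
enum : unsigned { R0 = 500, R1, R2 };

int countRegs() {
  int N = 0;
  // for (int Reg = R0; Reg <= R2; ++Reg)   // warns: signed/unsigned compare
  for (unsigned Reg = R0; Reg <= R2; ++Reg) // OK: both operands are unsigned
    ++N;
  return N;
}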
@@ -1794,7 +1794,7 @@ void ARMExpandPseudo::CMSERestoreFPRegsV81( BuildMI(MBB, MBBI, DL, TII->get(ARM::VLDMSIA_UPD), ARM::SP) .addReg(ARM::SP) .add(predOps(ARMCC::AL)); - for (int Reg = ARM::S16; Reg <= ARM::S31; ++Reg) + for (unsigned Reg = ARM::S16; Reg <= ARM::S31; ++Reg) VPOP.addReg(Reg, RegState::Define); } } @@ -2044,13 +2044,14 @@ bool ARMExpandPseudo::ExpandCMP_SWAP_64(MachineBasicBlock &MBB, static void CMSEPushCalleeSaves(const TargetInstrInfo &TII, MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, int JumpReg, - const LivePhysRegs &LiveRegs, bool Thumb1Only) { + MachineBasicBlock::iterator MBBI, + Register JumpReg, const LivePhysRegs &LiveRegs, + bool Thumb1Only) { const DebugLoc &DL = MBBI->getDebugLoc(); if (Thumb1Only) { // push Lo and Hi regs separately MachineInstrBuilder PushMIB = BuildMI(MBB, MBBI, DL, TII.get(ARM::tPUSH)).add(predOps(ARMCC::AL)); - for (int Reg = ARM::R4; Reg < ARM::R8; ++Reg) { + for (unsigned Reg = ARM::R4; Reg < ARM::R8; ++Reg) { PushMIB.addReg( Reg, Reg == JumpReg || LiveRegs.contains(Reg) ? 0 : RegState::Undef); } @@ -2062,7 +2063,8 @@ static void CMSEPushCalleeSaves(const TargetInstrInfo &TII, // memory, and allow us to later pop them with a single instructions. // FIXME: Could also use any of r0-r3 that are free (including in the // first PUSH above). - for (int LoReg = ARM::R7, HiReg = ARM::R11; LoReg >= ARM::R4; --LoReg) { + for (unsigned LoReg = ARM::R7, HiReg = ARM::R11; LoReg >= ARM::R4; + --LoReg) { if (JumpReg == LoReg) continue; BuildMI(MBB, MBBI, DL, TII.get(ARM::tMOVr), LoReg) @@ -2072,7 +2074,7 @@ static void CMSEPushCalleeSaves(const TargetInstrInfo &TII, } MachineInstrBuilder PushMIB2 = BuildMI(MBB, MBBI, DL, TII.get(ARM::tPUSH)).add(predOps(ARMCC::AL)); - for (int Reg = ARM::R4; Reg < ARM::R8; ++Reg) { + for (unsigned Reg = ARM::R4; Reg < ARM::R8; ++Reg) { if (Reg == JumpReg) continue; PushMIB2.addReg(Reg, RegState::Kill); @@ -2082,7 +2084,7 @@ static void CMSEPushCalleeSaves(const TargetInstrInfo &TII, // the JumpReg), use r4 or r5, whichever is not JumpReg. It has already been // saved. if (JumpReg >= ARM::R4 && JumpReg <= ARM::R7) { - int LoReg = JumpReg == ARM::R4 ? ARM::R5 : ARM::R4; + Register LoReg = JumpReg == ARM::R4 ? ARM::R5 : ARM::R4; BuildMI(MBB, MBBI, DL, TII.get(ARM::tMOVr), LoReg) .addReg(ARM::R8, LiveRegs.contains(ARM::R8) ? 0 : RegState::Undef) .add(predOps(ARMCC::AL)); @@ -2095,7 +2097,7 @@ static void CMSEPushCalleeSaves(const TargetInstrInfo &TII, BuildMI(MBB, MBBI, DL, TII.get(ARM::t2STMDB_UPD), ARM::SP) .addReg(ARM::SP) .add(predOps(ARMCC::AL)); - for (int Reg = ARM::R4; Reg < ARM::R12; ++Reg) { + for (unsigned Reg = ARM::R4; Reg < ARM::R12; ++Reg) { PushMIB.addReg( Reg, Reg == JumpReg || LiveRegs.contains(Reg) ? 
0 : RegState::Undef); } @@ -2125,7 +2127,7 @@ static void CMSEPopCalleeSaves(const TargetInstrInfo &TII, BuildMI(MBB, MBBI, DL, TII.get(ARM::t2LDMIA_UPD), ARM::SP) .addReg(ARM::SP) .add(predOps(ARMCC::AL)); - for (int Reg = ARM::R4; Reg < ARM::R12; ++Reg) + for (unsigned Reg = ARM::R4; Reg < ARM::R12; ++Reg) PopMIB.addReg(Reg, RegState::Define); } } diff --git a/llvm/utils/TableGen/RegisterInfoEmitter.cpp b/llvm/utils/TableGen/RegisterInfoEmitter.cpp index e076832674bde2..63e70698d7cd6f 100644 --- a/llvm/utils/TableGen/RegisterInfoEmitter.cpp +++ b/llvm/utils/TableGen/RegisterInfoEmitter.cpp @@ -123,7 +123,7 @@ void RegisterInfoEmitter::runEnums(raw_ostream &OS, CodeGenTarget &Target, if (!Namespace.empty()) OS << "namespace " << Namespace << " {\n"; - OS << "enum {\n NoRegister,\n"; + OS << "enum : unsigned {\n NoRegister,\n"; for (const auto &Reg : Registers) OS << " " << Reg.getName() << " = " << Reg.EnumValue << ",\n"; From 4fbac52841e967033f9f783e9223798232dca4dd Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Wed, 18 Sep 2024 09:56:56 -0700 Subject: [PATCH 084/321] [LLVM][TableGen] Change DXILEmitter to use const RecordKeeper (#109045) Change DXILEmitter to use const RecordKeeper. This is a part of effort to have better const correctness in TableGen backends: https://discourse.llvm.org/t/psa-planned-changes-to-tablegen-getallderiveddefinitions-api-potential-downstream-breakages/81089 --- llvm/utils/TableGen/DXILEmitter.cpp | 52 +++++++++++++---------------- 1 file changed, 24 insertions(+), 28 deletions(-) diff --git a/llvm/utils/TableGen/DXILEmitter.cpp b/llvm/utils/TableGen/DXILEmitter.cpp index 20164e1368ee9c..a4b54950928677 100644 --- a/llvm/utils/TableGen/DXILEmitter.cpp +++ b/llvm/utils/TableGen/DXILEmitter.cpp @@ -325,8 +325,7 @@ static std::string getAttributeMaskString(const SmallVector Recs) { } /// Emit a mapping of DXIL opcode to opname -static void emitDXILOpCodes(std::vector &Ops, - raw_ostream &OS) { +static void emitDXILOpCodes(ArrayRef Ops, raw_ostream &OS) { OS << "#ifdef DXIL_OPCODE\n"; for (const DXILOperationDesc &Op : Ops) OS << "DXIL_OPCODE(" << Op.OpCode << ", " << Op.OpName << ")\n"; @@ -336,23 +335,20 @@ static void emitDXILOpCodes(std::vector &Ops, } /// Emit a list of DXIL op classes -static void emitDXILOpClasses(RecordKeeper &Records, raw_ostream &OS) { +static void emitDXILOpClasses(const RecordKeeper &Records, raw_ostream &OS) { OS << "#ifdef DXIL_OPCLASS\n"; - std::vector OpClasses = - Records.getAllDerivedDefinitions("DXILOpClass"); - for (Record *OpClass : OpClasses) + for (const Record *OpClass : Records.getAllDerivedDefinitions("DXILOpClass")) OS << "DXIL_OPCLASS(" << OpClass->getName() << ")\n"; OS << "#undef DXIL_OPCLASS\n"; OS << "#endif\n\n"; } /// Emit a list of DXIL op parameter types -static void emitDXILOpParamTypes(RecordKeeper &Records, raw_ostream &OS) { +static void emitDXILOpParamTypes(const RecordKeeper &Records, raw_ostream &OS) { OS << "#ifdef DXIL_OP_PARAM_TYPE\n"; - std::vector OpClasses = - Records.getAllDerivedDefinitions("DXILOpParamType"); - for (Record *OpClass : OpClasses) - OS << "DXIL_OP_PARAM_TYPE(" << OpClass->getName() << ")\n"; + for (const Record *OpParamType : + Records.getAllDerivedDefinitions("DXILOpParamType")) + OS << "DXIL_OP_PARAM_TYPE(" << OpParamType->getName() << ")\n"; OS << "#undef DXIL_OP_PARAM_TYPE\n"; OS << "#endif\n\n"; } @@ -378,7 +374,7 @@ static void emitDXILOpFunctionTypes(ArrayRef Ops, /// Emit map of DXIL operation to LLVM or DirectX intrinsic /// \param A vector of DXIL Ops /// \param Output 
stream -static void emitDXILIntrinsicMap(std::vector &Ops, +static void emitDXILIntrinsicMap(ArrayRef Ops, raw_ostream &OS) { OS << "#ifdef DXIL_OP_INTRINSIC\n"; OS << "\n"; @@ -396,14 +392,14 @@ static void emitDXILIntrinsicMap(std::vector &Ops, /// Emit DXIL operation table /// \param A vector of DXIL Ops /// \param Output stream -static void emitDXILOperationTable(std::vector &Ops, +static void emitDXILOperationTable(ArrayRef Ops, raw_ostream &OS) { // Collect Names. SequenceToOffsetTable OpClassStrings; SequenceToOffsetTable OpStrings; StringSet<> ClassSet; - for (auto &Op : Ops) { + for (const auto &Op : Ops) { OpStrings.add(Op.OpName); if (ClassSet.insert(Op.OpClass).second) @@ -421,7 +417,7 @@ static void emitDXILOperationTable(std::vector &Ops, OS << " static const OpCodeProperty OpCodeProps[] = {\n"; std::string Prefix = ""; - for (auto &Op : Ops) { + for (const auto &Op : Ops) { OS << Prefix << " { dxil::OpCode::" << Op.OpName << ", " << OpStrings.get(Op.OpName) << ", OpCodeClass::" << Op.OpClass << ", " << OpClassStrings.get(Op.OpClass.data()) << ", " @@ -469,14 +465,15 @@ static void emitDXILOperationTable(std::vector &Ops, OS << "}\n\n"; } -static void emitDXILOperationTableDataStructs(RecordKeeper &Records, +static void emitDXILOperationTableDataStructs(const RecordKeeper &Records, raw_ostream &OS) { // Get Shader stage records - std::vector ShaderKindRecs = + std::vector ShaderKindRecs = Records.getAllDerivedDefinitions("DXILShaderStage"); // Sort records by name - llvm::sort(ShaderKindRecs, - [](Record *A, Record *B) { return A->getName() < B->getName(); }); + llvm::sort(ShaderKindRecs, [](const Record *A, const Record *B) { + return A->getName() < B->getName(); + }); OS << "// Valid shader kinds\n\n"; // Choose the type of enum ShaderKind based on the number of stages declared. @@ -508,22 +505,21 @@ static void emitDXILOperationTableDataStructs(RecordKeeper &Records, /// Entry function call that invokes the functionality of this TableGen backend /// \param Records TableGen records of DXIL Operations defined in DXIL.td /// \param OS output stream -static void EmitDXILOperation(RecordKeeper &Records, raw_ostream &OS) { +static void EmitDXILOperation(const RecordKeeper &Records, raw_ostream &OS) { OS << "// Generated code, do not edit.\n"; OS << "\n"; // Get all DXIL Ops property records - std::vector OpIntrProps = - Records.getAllDerivedDefinitions("DXILOp"); std::vector DXILOps; - for (auto *Record : OpIntrProps) { - DXILOps.emplace_back(DXILOperationDesc(Record)); + for (const Record *R : Records.getAllDerivedDefinitions("DXILOp")) { + DXILOps.emplace_back(DXILOperationDesc(R)); } // Sort by opcode. - llvm::sort(DXILOps, [](DXILOperationDesc &A, DXILOperationDesc &B) { - return A.OpCode < B.OpCode; - }); + llvm::sort(DXILOps, + [](const DXILOperationDesc &A, const DXILOperationDesc &B) { + return A.OpCode < B.OpCode; + }); int PrevOp = -1; - for (DXILOperationDesc &Desc : DXILOps) { + for (const DXILOperationDesc &Desc : DXILOps) { if (Desc.OpCode == PrevOp) PrintFatalError(Twine("Duplicate opcode: ") + Twine(Desc.OpCode)); PrevOp = Desc.OpCode; From 74335fb7ae7731a57a163524aeffd93911b50a46 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Wed, 18 Sep 2024 09:57:31 -0700 Subject: [PATCH 085/321] [LLVM][TableGen] Change X86MnemonicTables to use const RecordKeeper (#109053) Change X86MnemonicTables to use const RecordKeeper. 
This is a part of effort to have better const correctness in TableGen backends: https://discourse.llvm.org/t/psa-planned-changes-to-tablegen-getallderiveddefinitions-api-potential-downstream-breakages/81089 --- llvm/utils/TableGen/X86MnemonicTables.cpp | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/llvm/utils/TableGen/X86MnemonicTables.cpp b/llvm/utils/TableGen/X86MnemonicTables.cpp index d9ceed40f7c70d..ddbfb2af9869f4 100644 --- a/llvm/utils/TableGen/X86MnemonicTables.cpp +++ b/llvm/utils/TableGen/X86MnemonicTables.cpp @@ -22,10 +22,10 @@ using namespace llvm; namespace { class X86MnemonicTablesEmitter { - CodeGenTarget Target; + const CodeGenTarget Target; public: - X86MnemonicTablesEmitter(RecordKeeper &R) : Target(R) {} + X86MnemonicTablesEmitter(const RecordKeeper &R) : Target(R) {} // Output X86 mnemonic tables. void run(raw_ostream &OS); @@ -34,15 +34,13 @@ class X86MnemonicTablesEmitter { void X86MnemonicTablesEmitter::run(raw_ostream &OS) { emitSourceFileHeader("X86 Mnemonic tables", OS); OS << "namespace llvm {\nnamespace X86 {\n\n"; - Record *AsmWriter = Target.getAsmWriter(); + const Record *AsmWriter = Target.getAsmWriter(); unsigned Variant = AsmWriter->getValueAsInt("Variant"); // Hold all instructions grouped by mnemonic StringMap> MnemonicToCGInstrMap; - ArrayRef NumberedInstructions = - Target.getInstructionsByEnumValue(); - for (const CodeGenInstruction *I : NumberedInstructions) { + for (const CodeGenInstruction *I : Target.getInstructionsByEnumValue()) { const Record *Def = I->TheDef; // Filter non-X86 instructions. if (!Def->isSubClassOf("X86Inst")) From ab2b333f0db50c4124f422343db3cc8c9e075787 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Wed, 18 Sep 2024 09:58:04 -0700 Subject: [PATCH 086/321] [LLVM][TableGen] Change FastISelEmitter to use const RecordKeeper (#109060) Change FastISelEmitter to use const RecordKeeper. This is a part of effort to have better const correctness in TableGen backends: https://discourse.llvm.org/t/psa-planned-changes-to-tablegen-getallderiveddefinitions-api-potential-downstream-breakages/81089 --- llvm/utils/TableGen/FastISelEmitter.cpp | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/llvm/utils/TableGen/FastISelEmitter.cpp b/llvm/utils/TableGen/FastISelEmitter.cpp index 01df873ece1fcf..af05496a7b6ab9 100644 --- a/llvm/utils/TableGen/FastISelEmitter.cpp +++ b/llvm/utils/TableGen/FastISelEmitter.cpp @@ -272,7 +272,7 @@ struct OperandsSignature { DefInit *OpDI = dyn_cast(Op.getLeafValue()); if (!OpDI) return false; - Record *OpLeafRec = OpDI->getDef(); + const Record *OpLeafRec = OpDI->getDef(); // For now, the only other thing we accept is register operands. 
const CodeGenRegisterClass *RC = nullptr; @@ -407,7 +407,7 @@ class FastISelMap { public: explicit FastISelMap(StringRef InstNS); - void collectPatterns(CodeGenDAGPatterns &CGP); + void collectPatterns(const CodeGenDAGPatterns &CGP); void printImmediatePredicates(raw_ostream &OS); void printFunctionDefinitions(raw_ostream &OS); @@ -417,7 +417,8 @@ class FastISelMap { }; } // End anonymous namespace -static std::string getOpcodeName(const Record *Op, CodeGenDAGPatterns &CGP) { +static std::string getOpcodeName(const Record *Op, + const CodeGenDAGPatterns &CGP) { return std::string(CGP.getSDNodeInfo(Op).getEnumName()); } @@ -437,7 +438,7 @@ static std::string PhyRegForNode(TreePatternNode &Op, if (!Op.isLeaf()) return PhysReg; - Record *OpLeafRec = cast(Op.getLeafValue())->getDef(); + const Record *OpLeafRec = cast(Op.getLeafValue())->getDef(); if (!OpLeafRec->isSubClassOf("Register")) return PhysReg; @@ -448,7 +449,7 @@ static std::string PhyRegForNode(TreePatternNode &Op, return PhysReg; } -void FastISelMap::collectPatterns(CodeGenDAGPatterns &CGP) { +void FastISelMap::collectPatterns(const CodeGenDAGPatterns &CGP) { const CodeGenTarget &Target = CGP.getTargetInfo(); // Scan through all the patterns and record the simple ones. @@ -864,8 +865,8 @@ void FastISelMap::printFunctionDefinitions(raw_ostream &OS) { // TODO: SignaturesWithConstantForms should be empty here. } -static void EmitFastISel(RecordKeeper &RK, raw_ostream &OS) { - CodeGenDAGPatterns CGP(RK); +static void EmitFastISel(const RecordKeeper &RK, raw_ostream &OS) { + const CodeGenDAGPatterns CGP(RK); const CodeGenTarget &Target = CGP.getTargetInfo(); emitSourceFileHeader("\"Fast\" Instruction Selector for the " + Target.getName().str() + " target", From 47d76a9910bad0f3db7bc887c5e769bb0f219107 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Wed, 18 Sep 2024 09:58:36 -0700 Subject: [PATCH 087/321] [LLVM][TableGen] Change InstrDocsEmitter to use const RecordKeeper (#109061) Change InstrDocsEmitter to use const RecordKeeper. This is a part of effort to have better const correctness in TableGen backends: https://discourse.llvm.org/t/psa-planned-changes-to-tablegen-getallderiveddefinitions-api-potential-downstream-breakages/81089 --- llvm/utils/TableGen/InstrDocsEmitter.cpp | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/llvm/utils/TableGen/InstrDocsEmitter.cpp b/llvm/utils/TableGen/InstrDocsEmitter.cpp index f53428ecdffede..d32cfa23545454 100644 --- a/llvm/utils/TableGen/InstrDocsEmitter.cpp +++ b/llvm/utils/TableGen/InstrDocsEmitter.cpp @@ -61,9 +61,9 @@ static std::string escapeForRST(StringRef Str) { return Result; } -static void EmitInstrDocs(RecordKeeper &RK, raw_ostream &OS) { - CodeGenDAGPatterns CDP(RK); - CodeGenTarget &Target = CDP.getTargetInfo(); +static void EmitInstrDocs(const RecordKeeper &RK, raw_ostream &OS) { + const CodeGenDAGPatterns CDP(RK); + const CodeGenTarget &Target = CDP.getTargetInfo(); unsigned VariantCount = Target.getAsmParserVariantCount(); // Page title. @@ -86,7 +86,7 @@ static void EmitInstrDocs(RecordKeeper &RK, raw_ostream &OS) { // Assembly string(s). 
if (!II->AsmString.empty()) { for (unsigned VarNum = 0; VarNum < VariantCount; ++VarNum) { - Record *AsmVariant = Target.getAsmParserVariant(VarNum); + const Record *AsmVariant = Target.getAsmParserVariant(VarNum); OS << "Assembly string"; if (VariantCount != 1) OS << " (" << AsmVariant->getValueAsString("Name") << ")"; @@ -167,7 +167,7 @@ static void EmitInstrDocs(RecordKeeper &RK, raw_ostream &OS) { // names of both the compound operand and the basic operands it // contains. for (unsigned SubOpIdx = 0; SubOpIdx < Op.MINumOperands; ++SubOpIdx) { - Record *SubRec = + const Record *SubRec = cast(Op.MIOperandInfo->getArg(SubOpIdx))->getDef(); StringRef SubOpName = Op.MIOperandInfo->getArgNameStr(SubOpIdx); StringRef SubOpTypeName = SubRec->getName(); @@ -198,7 +198,7 @@ static void EmitInstrDocs(RecordKeeper &RK, raw_ostream &OS) { if (!II->ImplicitDefs.empty()) { OS << "Implicit defs: "; ListSeparator LS; - for (Record *Def : II->ImplicitDefs) + for (const Record *Def : II->ImplicitDefs) OS << LS << "``" << Def->getName() << "``"; OS << "\n\n"; } @@ -207,18 +207,18 @@ static void EmitInstrDocs(RecordKeeper &RK, raw_ostream &OS) { if (!II->ImplicitUses.empty()) { OS << "Implicit uses: "; ListSeparator LS; - for (Record *Use : II->ImplicitUses) + for (const Record *Use : II->ImplicitUses) OS << LS << "``" << Use->getName() << "``"; OS << "\n\n"; } // Predicates. - std::vector Predicates = - II->TheDef->getValueAsListOfDefs("Predicates"); + std::vector Predicates = + II->TheDef->getValueAsListOfConstDefs("Predicates"); if (!Predicates.empty()) { OS << "Predicates: "; ListSeparator LS; - for (Record *P : Predicates) + for (const Record *P : Predicates) OS << LS << "``" << P->getName() << "``"; OS << "\n\n"; } From b18190ebfc4bc724eca07fb8432c3a3e899098b3 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Wed, 18 Sep 2024 09:59:24 -0700 Subject: [PATCH 088/321] [LLVM][TableGen] Change MacroFusionPredicator to use const RecordKeeper (#109064) Change MacroFusionPredicator to use const RecordKeeper. 
This is a part of effort to have better const correctness in TableGen backends: https://discourse.llvm.org/t/psa-planned-changes-to-tablegen-getallderiveddefinitions-api-potential-downstream-breakages/81089 --- .../TableGen/MacroFusionPredicatorEmitter.cpp | 58 +++++++++---------- 1 file changed, 28 insertions(+), 30 deletions(-) diff --git a/llvm/utils/TableGen/MacroFusionPredicatorEmitter.cpp b/llvm/utils/TableGen/MacroFusionPredicatorEmitter.cpp index f61a05861981c5..c4f238b67476a7 100644 --- a/llvm/utils/TableGen/MacroFusionPredicatorEmitter.cpp +++ b/llvm/utils/TableGen/MacroFusionPredicatorEmitter.cpp @@ -52,36 +52,37 @@ using namespace llvm; namespace { class MacroFusionPredicatorEmitter { - RecordKeeper &Records; - CodeGenTarget Target; + const RecordKeeper &Records; + const CodeGenTarget Target; - void emitMacroFusionDecl(ArrayRef Fusions, PredicateExpander &PE, - raw_ostream &OS); - void emitMacroFusionImpl(ArrayRef Fusions, PredicateExpander &PE, - raw_ostream &OS); - void emitPredicates(ArrayRef FirstPredicate, bool IsCommutable, - PredicateExpander &PE, raw_ostream &OS); - void emitFirstPredicate(Record *SecondPredicate, bool IsCommutable, + void emitMacroFusionDecl(ArrayRef Fusions, + PredicateExpander &PE, raw_ostream &OS); + void emitMacroFusionImpl(ArrayRef Fusions, + PredicateExpander &PE, raw_ostream &OS); + void emitPredicates(ArrayRef FirstPredicate, + bool IsCommutable, PredicateExpander &PE, + raw_ostream &OS); + void emitFirstPredicate(const Record *SecondPredicate, bool IsCommutable, PredicateExpander &PE, raw_ostream &OS); - void emitSecondPredicate(Record *SecondPredicate, bool IsCommutable, + void emitSecondPredicate(const Record *SecondPredicate, bool IsCommutable, PredicateExpander &PE, raw_ostream &OS); - void emitBothPredicate(Record *Predicates, bool IsCommutable, + void emitBothPredicate(const Record *Predicates, bool IsCommutable, PredicateExpander &PE, raw_ostream &OS); public: - MacroFusionPredicatorEmitter(RecordKeeper &R) : Records(R), Target(R) {} + MacroFusionPredicatorEmitter(const RecordKeeper &R) : Records(R), Target(R) {} void run(raw_ostream &OS); }; } // End anonymous namespace. 
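// Illustrative aside, not from any single patch: the const-correct shape this
// series of TableGen backend changes converges on -- take the RecordKeeper by
// const reference and hold records only as `const Record *`. It assumes the
// RecordKeeper API used throughout these diffs (getAllDerivedDefinitions
// callable through a const reference); the emitter function itself is a
// hypothetical minimal sketch.
//
//   static void emitFusionNames(const RecordKeeper &Records, raw_ostream &OS) {
//     for (const Record *R : Records.getAllDerivedDefinitions("Fusion"))
//       OS << R->getName() << "\n"; // read-only traversal, no mutation
//   }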
void MacroFusionPredicatorEmitter::emitMacroFusionDecl( - ArrayRef Fusions, PredicateExpander &PE, raw_ostream &OS) { + ArrayRef Fusions, PredicateExpander &PE, raw_ostream &OS) { OS << "#ifdef GET_" << Target.getName() << "_MACRO_FUSION_PRED_DECL\n"; OS << "#undef GET_" << Target.getName() << "_MACRO_FUSION_PRED_DECL\n\n"; OS << "namespace llvm {\n"; - for (Record *Fusion : Fusions) { + for (const Record *Fusion : Fusions) { OS << "bool is" << Fusion->getName() << "(const TargetInstrInfo &, " << "const TargetSubtargetInfo &, " << "const MachineInstr *, " @@ -93,14 +94,14 @@ void MacroFusionPredicatorEmitter::emitMacroFusionDecl( } void MacroFusionPredicatorEmitter::emitMacroFusionImpl( - ArrayRef Fusions, PredicateExpander &PE, raw_ostream &OS) { + ArrayRef Fusions, PredicateExpander &PE, raw_ostream &OS) { OS << "#ifdef GET_" << Target.getName() << "_MACRO_FUSION_PRED_IMPL\n"; OS << "#undef GET_" << Target.getName() << "_MACRO_FUSION_PRED_IMPL\n\n"; OS << "namespace llvm {\n"; - for (Record *Fusion : Fusions) { - std::vector Predicates = - Fusion->getValueAsListOfDefs("Predicates"); + for (const Record *Fusion : Fusions) { + std::vector Predicates = + Fusion->getValueAsListOfConstDefs("Predicates"); bool IsCommutable = Fusion->getValueAsBit("IsCommutable"); OS << "bool is" << Fusion->getName() << "(\n"; @@ -121,12 +122,11 @@ void MacroFusionPredicatorEmitter::emitMacroFusionImpl( OS << "\n#endif\n"; } -void MacroFusionPredicatorEmitter::emitPredicates(ArrayRef Predicates, - bool IsCommutable, - PredicateExpander &PE, - raw_ostream &OS) { - for (Record *Predicate : Predicates) { - Record *Target = Predicate->getValueAsDef("Target"); +void MacroFusionPredicatorEmitter::emitPredicates( + ArrayRef Predicates, bool IsCommutable, + PredicateExpander &PE, raw_ostream &OS) { + for (const Record *Predicate : Predicates) { + const Record *Target = Predicate->getValueAsDef("Target"); if (Target->getName() == "first_fusion_target") emitFirstPredicate(Predicate, IsCommutable, PE, OS); else if (Target->getName() == "second_fusion_target") @@ -139,7 +139,7 @@ void MacroFusionPredicatorEmitter::emitPredicates(ArrayRef Predicates, } } -void MacroFusionPredicatorEmitter::emitFirstPredicate(Record *Predicate, +void MacroFusionPredicatorEmitter::emitFirstPredicate(const Record *Predicate, bool IsCommutable, PredicateExpander &PE, raw_ostream &OS) { @@ -172,7 +172,7 @@ void MacroFusionPredicatorEmitter::emitFirstPredicate(Record *Predicate, } } -void MacroFusionPredicatorEmitter::emitSecondPredicate(Record *Predicate, +void MacroFusionPredicatorEmitter::emitSecondPredicate(const Record *Predicate, bool IsCommutable, PredicateExpander &PE, raw_ostream &OS) { @@ -223,7 +223,7 @@ void MacroFusionPredicatorEmitter::emitSecondPredicate(Record *Predicate, } } -void MacroFusionPredicatorEmitter::emitBothPredicate(Record *Predicate, +void MacroFusionPredicatorEmitter::emitBothPredicate(const Record *Predicate, bool IsCommutable, PredicateExpander &PE, raw_ostream &OS) { @@ -277,9 +277,7 @@ void MacroFusionPredicatorEmitter::run(raw_ostream &OS) { PE.setByRef(false); PE.setExpandForMC(false); - std::vector Fusions = Records.getAllDerivedDefinitions("Fusion"); - // Sort macro fusions by name. 
- sort(Fusions, LessRecord()); + ArrayRef Fusions = Records.getAllDerivedDefinitions("Fusion"); emitMacroFusionDecl(Fusions, PE, OS); OS << "\n"; emitMacroFusionImpl(Fusions, PE, OS); From 5f02558d820fdc9aa8ac7d3d887e72526574e1d9 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Wed, 18 Sep 2024 10:02:16 -0700 Subject: [PATCH 089/321] [OpenMP] Fix not linking C libraries when enabled (#109168) Summary: We used to do this automatically, add it back in to do it manually. --- offload/test/lit.cfg | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/offload/test/lit.cfg b/offload/test/lit.cfg index 9ddef42cf90370..514bb89e0b644e 100644 --- a/offload/test/lit.cfg +++ b/offload/test/lit.cfg @@ -182,7 +182,12 @@ def remove_suffix_if_present(name): return name def add_libraries(source): - return source + " " + config.llvm_library_intdir + "/libomptarget.devicertl.a" + if config.libomptarget_has_libc: + return source + " -Xoffload-linker " + "-lc " + \ + "-Xoffload-linker " + "-lm " + \ + config.llvm_library_intdir + "/libomptarget.devicertl.a" + else: + return source + " " + config.llvm_library_intdir + "/libomptarget.devicertl.a" # Add platform targets host_targets = [ From 1a793a8ca024a5b6e6a659cc4e1a8c4ab45e3cee Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Wed, 18 Sep 2024 10:41:21 -0700 Subject: [PATCH 090/321] [LLVM][TableGen] Change X86InstrMapping to use const RecordKeeper (#109066) Change X86InstrMappingEmitter to use const RecordKeeper. This is a part of effort to have better const correctness in TableGen backends: https://discourse.llvm.org/t/psa-planned-changes-to-tablegen-getallderiveddefinitions-api-potential-downstream-breakages/81089 --- .../utils/TableGen/X86InstrMappingEmitter.cpp | 29 +++++++++---------- llvm/utils/TableGen/X86RecognizableInstr.cpp | 9 +++--- 2 files changed, 18 insertions(+), 20 deletions(-) diff --git a/llvm/utils/TableGen/X86InstrMappingEmitter.cpp b/llvm/utils/TableGen/X86InstrMappingEmitter.cpp index 0abe353a9a579b..f68c727cbe9230 100644 --- a/llvm/utils/TableGen/X86InstrMappingEmitter.cpp +++ b/llvm/utils/TableGen/X86InstrMappingEmitter.cpp @@ -26,8 +26,8 @@ using namespace X86Disassembler; namespace { class X86InstrMappingEmitter { - RecordKeeper &Records; - CodeGenTarget Target; + const RecordKeeper &Records; + const CodeGenTarget Target; // Hold all pontentially compressible EVEX instructions std::vector PreCompressionInsts; @@ -44,7 +44,7 @@ class X86InstrMappingEmitter { PredicateInstMap PredicateInsts; public: - X86InstrMappingEmitter(RecordKeeper &R) : Records(R), Target(R) {} + X86InstrMappingEmitter(const RecordKeeper &R) : Records(R), Target(R) {} // run - Output X86 EVEX compression tables. void run(raw_ostream &OS); @@ -63,8 +63,8 @@ class X86InstrMappingEmitter { void printClassDef(raw_ostream &OS); // Prints the given table as a C++ array of type X86TableEntry under the guard // \p Macro. 
- void printTable(const std::vector &Table, StringRef Name, - StringRef Macro, raw_ostream &OS); + void printTable(ArrayRef Table, StringRef Name, StringRef Macro, + raw_ostream &OS); }; void X86InstrMappingEmitter::printClassDef(raw_ostream &OS) { @@ -90,9 +90,8 @@ static void printMacroEnd(StringRef Macro, raw_ostream &OS) { OS << "#endif // " << Macro << "\n\n"; } -void X86InstrMappingEmitter::printTable(const std::vector &Table, - StringRef Name, StringRef Macro, - raw_ostream &OS) { +void X86InstrMappingEmitter::printTable(ArrayRef Table, StringRef Name, + StringRef Macro, raw_ostream &OS) { printMacroBegin(Macro, OS); OS << "static const X86TableEntry " << Name << "[] = {\n"; @@ -220,7 +219,7 @@ void X86InstrMappingEmitter::emitCompressEVEXTable( assert(NewRec && "Instruction not found!"); NewInst = &Target.getInstruction(NewRec); } else if (Name.ends_with("_EVEX")) { - if (auto *NewRec = Records.getDef(Name.drop_back(5))) + if (const auto *NewRec = Records.getDef(Name.drop_back(5))) NewInst = &Target.getInstruction(NewRec); } else if (Name.ends_with("_ND")) // Leave it to ND2NONND table. @@ -319,7 +318,7 @@ void X86InstrMappingEmitter::emitND2NonNDTable( if (!isInteresting(Rec) || NoCompressSet.find(Name) != NoCompressSet.end()) continue; if (ManualMap.find(Name) != ManualMap.end()) { - auto *NewRec = Records.getDef(ManualMap.at(Rec->getName())); + const auto *NewRec = Records.getDef(ManualMap.at(Rec->getName())); assert(NewRec && "Instruction not found!"); auto &NewInst = Target.getInstruction(NewRec); Table.push_back(std::pair(Inst, &NewInst)); @@ -328,10 +327,10 @@ void X86InstrMappingEmitter::emitND2NonNDTable( if (!Name.ends_with("_ND")) continue; - auto *NewRec = Records.getDef(Name.drop_back(3)); + const auto *NewRec = Records.getDef(Name.drop_back(3)); if (!NewRec) continue; - auto &NewInst = Target.getInstruction(NewRec); + const auto &NewInst = Target.getInstruction(NewRec); if (isRegisterOperand(NewInst.Operands[0].Rec)) Table.push_back(std::pair(Inst, &NewInst)); } @@ -353,15 +352,15 @@ void X86InstrMappingEmitter::emitSSE2AVXTable( if (!isInteresting(Rec)) continue; if (ManualMap.find(Name) != ManualMap.end()) { - auto *NewRec = Records.getDef(ManualMap.at(Rec->getName())); + const auto *NewRec = Records.getDef(ManualMap.at(Rec->getName())); assert(NewRec && "Instruction not found!"); - auto &NewInst = Target.getInstruction(NewRec); + const auto &NewInst = Target.getInstruction(NewRec); Table.push_back(std::pair(Inst, &NewInst)); continue; } std::string NewName = ("V" + Name).str(); - auto *AVXRec = Records.getDef(NewName); + const auto *AVXRec = Records.getDef(NewName); if (!AVXRec) continue; auto &AVXInst = Target.getInstruction(AVXRec); diff --git a/llvm/utils/TableGen/X86RecognizableInstr.cpp b/llvm/utils/TableGen/X86RecognizableInstr.cpp index a1e67e3ea692a3..4386e8361712b8 100644 --- a/llvm/utils/TableGen/X86RecognizableInstr.cpp +++ b/llvm/utils/TableGen/X86RecognizableInstr.cpp @@ -154,14 +154,13 @@ RecognizableInstr::RecognizableInstr(DisassemblerTables &tables, UID(uid), Spec(&tables.specForUID(uid)) { // Check for 64-bit inst which does not require REX // FIXME: Is there some better way to check for In64BitMode? 
- std::vector Predicates = Rec->getValueAsListOfDefs("Predicates"); - for (unsigned i = 0, e = Predicates.size(); i != e; ++i) { - if (Predicates[i]->getName().contains("Not64Bit") || - Predicates[i]->getName().contains("In32Bit")) { + for (const Record *Predicate : Rec->getValueAsListOfConstDefs("Predicates")) { + if (Predicate->getName().contains("Not64Bit") || + Predicate->getName().contains("In32Bit")) { Is32Bit = true; break; } - if (Predicates[i]->getName().contains("In64Bit")) { + if (Predicate->getName().contains("In64Bit")) { Is64Bit = true; break; } From 80aa4dab1e0fc6ede382efd1e7fddb063fecac09 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Wed, 18 Sep 2024 10:41:57 -0700 Subject: [PATCH 091/321] [LLVM][TableGen] Change X86FoldTablesEmitter to use const RecordKeeper (#109070) Change X86FoldTablesEmitter to use const RecordKeeper. This is a part of effort to have better const correctness in TableGen backends: https://discourse.llvm.org/t/psa-planned-changes-to-tablegen-getallderiveddefinitions-api-potential-downstream-breakages/81089 --- llvm/utils/TableGen/X86FoldTablesEmitter.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/llvm/utils/TableGen/X86FoldTablesEmitter.cpp b/llvm/utils/TableGen/X86FoldTablesEmitter.cpp index 8952c8e0a1c6f1..dfa10f74974c7e 100644 --- a/llvm/utils/TableGen/X86FoldTablesEmitter.cpp +++ b/llvm/utils/TableGen/X86FoldTablesEmitter.cpp @@ -63,8 +63,8 @@ static bool isExplicitUnalign(const CodeGenInstruction *Inst) { } class X86FoldTablesEmitter { - RecordKeeper &Records; - CodeGenTarget Target; + const RecordKeeper &Records; + const CodeGenTarget Target; // Represents an entry in the folding table class X86FoldTableEntry { @@ -196,7 +196,7 @@ class X86FoldTablesEmitter { FoldTable BroadcastTable4; public: - X86FoldTablesEmitter(RecordKeeper &R) : Records(R), Target(R) {} + X86FoldTablesEmitter(const RecordKeeper &R) : Records(R), Target(R) {} // run - Generate the 6 X86 memory fold tables. void run(raw_ostream &OS); @@ -670,7 +670,7 @@ void X86FoldTablesEmitter::run(raw_ostream &OS) { // added into memory fold tables. auto RegInstsForBroadcast = RegInsts; - Record *AsmWriter = Target.getAsmWriter(); + const Record *AsmWriter = Target.getAsmWriter(); unsigned Variant = AsmWriter->getValueAsInt("Variant"); auto FixUp = [&](const CodeGenInstruction *RegInst) { StringRef RegInstName = RegInst->TheDef->getName(); @@ -721,8 +721,8 @@ void X86FoldTablesEmitter::run(raw_ostream &OS) { // Add the manually mapped instructions listed above. for (const ManualMapEntry &Entry : ManualMapSet) { - Record *RegInstIter = Records.getDef(Entry.RegInstStr); - Record *MemInstIter = Records.getDef(Entry.MemInstStr); + const Record *RegInstIter = Records.getDef(Entry.RegInstStr); + const Record *MemInstIter = Records.getDef(Entry.MemInstStr); updateTables(&(Target.getInstruction(RegInstIter)), &(Target.getInstruction(MemInstIter)), Entry.Strategy, true); From 2c966709b7bafb08a97c7bfc3d3252a932a8b1f8 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Wed, 18 Sep 2024 10:42:51 -0700 Subject: [PATCH 092/321] [LLVM][TableGen] Change CompressInstEmitter to use const RecordKeeper (#109035) Change CompressInstEmitter to use const RecordKeeper. 
This is a part of effort to have better const correctness in TableGen backends: https://discourse.llvm.org/t/psa-planned-changes-to-tablegen-getallderiveddefinitions-api-potential-downstream-breakages/81089 --- llvm/utils/TableGen/CompressInstEmitter.cpp | 92 +++++++++++---------- 1 file changed, 47 insertions(+), 45 deletions(-) diff --git a/llvm/utils/TableGen/CompressInstEmitter.cpp b/llvm/utils/TableGen/CompressInstEmitter.cpp index 06801e93f4f403..f46ceb5174229e 100644 --- a/llvm/utils/TableGen/CompressInstEmitter.cpp +++ b/llvm/utils/TableGen/CompressInstEmitter.cpp @@ -92,7 +92,7 @@ class CompressInstEmitter { // Integer immediate value. int64_t Imm; // Physical register. - Record *Reg; + const Record *Reg; } Data; // Tied operand index within the instruction. int TiedOpIdx = -1; @@ -103,7 +103,7 @@ class CompressInstEmitter { // The destination instruction to transform to. CodeGenInstruction Dest; // Required target features to enable pattern. - std::vector PatReqFeatures; + std::vector PatReqFeatures; // Maps operands in the Source Instruction to // the corresponding Dest instruction operand. IndexedMap SourceOperandMap; @@ -112,38 +112,40 @@ class CompressInstEmitter { IndexedMap DestOperandMap; bool IsCompressOnly; - CompressPat(CodeGenInstruction &S, CodeGenInstruction &D, - std::vector RF, IndexedMap &SourceMap, + CompressPat(const CodeGenInstruction &S, const CodeGenInstruction &D, + std::vector RF, IndexedMap &SourceMap, IndexedMap &DestMap, bool IsCompressOnly) : Source(S), Dest(D), PatReqFeatures(RF), SourceOperandMap(SourceMap), DestOperandMap(DestMap), IsCompressOnly(IsCompressOnly) {} }; enum EmitterType { Compress, Uncompress, CheckCompress }; - RecordKeeper &Records; - CodeGenTarget Target; + const RecordKeeper &Records; + const CodeGenTarget Target; SmallVector CompressPatterns; - void addDagOperandMapping(Record *Rec, DagInit *Dag, CodeGenInstruction &Inst, + void addDagOperandMapping(const Record *Rec, const DagInit *Dag, + const CodeGenInstruction &Inst, IndexedMap &OperandMap, bool IsSourceInst); - void evaluateCompressPat(Record *Compress); + void evaluateCompressPat(const Record *Compress); void emitCompressInstEmitter(raw_ostream &OS, EmitterType EType); bool validateTypes(const Record *DagOpType, const Record *InstOpType, bool IsSourceInst); bool validateRegister(const Record *Reg, const Record *RegClass); - void createDagOperandMapping(Record *Rec, StringMap &SourceOperands, + void createDagOperandMapping(const Record *Rec, + StringMap &SourceOperands, StringMap &DestOperands, - DagInit *SourceDag, DagInit *DestDag, + const DagInit *SourceDag, const DagInit *DestDag, IndexedMap &SourceOperandMap); - void createInstOperandMapping(Record *Rec, DagInit *SourceDag, - DagInit *DestDag, + void createInstOperandMapping(const Record *Rec, const DagInit *SourceDag, + const DagInit *DestDag, IndexedMap &SourceOperandMap, IndexedMap &DestOperandMap, StringMap &SourceOperands, - CodeGenInstruction &DestInst); + const CodeGenInstruction &DestInst); public: - CompressInstEmitter(RecordKeeper &R) : Records(R), Target(R) {} + CompressInstEmitter(const RecordKeeper &R) : Records(R), Target(R) {} void run(raw_ostream &OS); }; @@ -156,7 +158,7 @@ bool CompressInstEmitter::validateRegister(const Record *Reg, "RegClass record should be a RegisterClass"); const CodeGenRegisterClass &RC = Target.getRegisterClass(RegClass); const CodeGenRegister *R = Target.getRegisterByName(Reg->getName().lower()); - assert((R != nullptr) && "Register not defined!!"); + assert(R != nullptr && 
"Register not defined!!"); return RC.contains(R); } @@ -199,8 +201,9 @@ bool CompressInstEmitter::validateTypes(const Record *DagOpType, /// operands and fixed registers it expects the Dag operand type to be contained /// in the instantiated instruction operand type. For immediate operands and /// immediates no validation checks are enforced at pattern validation time. -void CompressInstEmitter::addDagOperandMapping(Record *Rec, DagInit *Dag, - CodeGenInstruction &Inst, +void CompressInstEmitter::addDagOperandMapping(const Record *Rec, + const DagInit *Dag, + const CodeGenInstruction &Inst, IndexedMap &OperandMap, bool IsSourceInst) { // TiedCount keeps track of the number of operands skipped in Inst @@ -218,7 +221,7 @@ void CompressInstEmitter::addDagOperandMapping(Record *Rec, DagInit *Dag, TiedCount++; continue; } - if (DefInit *DI = dyn_cast(Dag->getArg(I - TiedCount))) { + if (const DefInit *DI = dyn_cast(Dag->getArg(I - TiedCount))) { if (DI->getDef()->isSubClassOf("Register")) { // Check if the fixed register belongs to the Register class. if (!validateRegister(DI->getDef(), Inst.Operands[I].Rec)) @@ -267,7 +270,7 @@ void CompressInstEmitter::addDagOperandMapping(Record *Rec, DagInit *Dag, } // Verify the Dag operand count is enough to build an instruction. -static bool verifyDagOpCount(CodeGenInstruction &Inst, DagInit *Dag, +static bool verifyDagOpCount(const CodeGenInstruction &Inst, const DagInit *Dag, bool IsSource) { if (Dag->getNumArgs() == Inst.Operands.size()) return true; @@ -297,7 +300,7 @@ static bool verifyDagOpCount(CodeGenInstruction &Inst, DagInit *Dag, return true; } -static bool validateArgsTypes(Init *Arg1, Init *Arg2) { +static bool validateArgsTypes(const Init *Arg1, const Init *Arg2) { return cast(Arg1)->getDef() == cast(Arg2)->getDef(); } @@ -307,9 +310,9 @@ static bool validateArgsTypes(Init *Arg1, Init *Arg2) { // mapping $rs1 --> 0, $rs2 ---> 1. If the operand appears twice in the (tied) // same Dag we use the last occurrence for indexing. void CompressInstEmitter::createDagOperandMapping( - Record *Rec, StringMap &SourceOperands, - StringMap &DestOperands, DagInit *SourceDag, DagInit *DestDag, - IndexedMap &SourceOperandMap) { + const Record *Rec, StringMap &SourceOperands, + StringMap &DestOperands, const DagInit *SourceDag, + const DagInit *DestDag, IndexedMap &SourceOperandMap) { for (unsigned I = 0; I < DestDag->getNumArgs(); ++I) { // Skip fixed immediates and registers, they were handled in // addDagOperandMapping. @@ -354,9 +357,9 @@ void CompressInstEmitter::createDagOperandMapping( /// output instructions. Validate that operands defined in the input are /// used in the output pattern while populating the maps. void CompressInstEmitter::createInstOperandMapping( - Record *Rec, DagInit *SourceDag, DagInit *DestDag, + const Record *Rec, const DagInit *SourceDag, const DagInit *DestDag, IndexedMap &SourceOperandMap, IndexedMap &DestOperandMap, - StringMap &SourceOperands, CodeGenInstruction &DestInst) { + StringMap &SourceOperands, const CodeGenInstruction &DestInst) { // TiedCount keeps track of the number of operands skipped in Inst // operands list to get to the corresponding Dag operand. unsigned TiedCount = 0; @@ -423,14 +426,14 @@ void CompressInstEmitter::createInstOperandMapping( /// and generate warning. /// - Immediate operand type in Dag Input differs from the corresponding Source /// Instruction type and generate a warning. 
-void CompressInstEmitter::evaluateCompressPat(Record *Rec) { +void CompressInstEmitter::evaluateCompressPat(const Record *Rec) { // Validate input Dag operands. DagInit *SourceDag = Rec->getValueAsDag("Input"); assert(SourceDag && "Missing 'Input' in compress pattern!"); LLVM_DEBUG(dbgs() << "Input: " << *SourceDag << "\n"); // Checking we are transforming from compressed to uncompressed instructions. - Record *SourceOperator = SourceDag->getOperatorAsDef(Rec->getLoc()); + const Record *SourceOperator = SourceDag->getOperatorAsDef(Rec->getLoc()); CodeGenInstruction SourceInst(SourceOperator); verifyDagOpCount(SourceInst, SourceDag, true); @@ -439,7 +442,7 @@ void CompressInstEmitter::evaluateCompressPat(Record *Rec) { assert(DestDag && "Missing 'Output' in compress pattern!"); LLVM_DEBUG(dbgs() << "Output: " << *DestDag << "\n"); - Record *DestOperator = DestDag->getOperatorAsDef(Rec->getLoc()); + const Record *DestOperator = DestDag->getOperatorAsDef(Rec->getLoc()); CodeGenInstruction DestInst(DestOperator); verifyDagOpCount(DestInst, DestDag, false); @@ -475,9 +478,9 @@ void CompressInstEmitter::evaluateCompressPat(Record *Rec) { DestOperandMap, SourceOperands, DestInst); // Get the target features for the CompressPat. - std::vector PatReqFeatures; - std::vector RF = Rec->getValueAsListOfDefs("Predicates"); - copy_if(RF, std::back_inserter(PatReqFeatures), [](Record *R) { + std::vector PatReqFeatures; + std::vector RF = Rec->getValueAsListOfConstDefs("Predicates"); + copy_if(RF, std::back_inserter(PatReqFeatures), [](const Record *R) { return R->getValueAsBit("AssemblerMatcherPredicate"); }); @@ -489,8 +492,8 @@ void CompressInstEmitter::evaluateCompressPat(Record *Rec) { static void getReqFeatures(std::set> &FeaturesSet, std::set>> &AnyOfFeatureSets, - const std::vector &ReqFeatures) { - for (auto &R : ReqFeatures) { + ArrayRef ReqFeatures) { + for (const Record *R : ReqFeatures) { const DagInit *D = R->getValueAsDag("AssemblerCondDag"); std::string CombineType = D->getOperator()->getAsString(); if (CombineType != "any_of" && CombineType != "all_of") @@ -542,8 +545,8 @@ static unsigned getPredicates(DenseMap &PredicateMap, return 0; } -static void printPredicates(const std::vector &Predicates, - StringRef Name, raw_ostream &OS) { +static void printPredicates(ArrayRef Predicates, StringRef Name, + raw_ostream &OS) { for (unsigned I = 0; I < Predicates.size(); ++I) { StringRef Pred = Predicates[I]->getValueAsString(Name); OS << " case " << I + 1 << ": {\n" @@ -565,7 +568,7 @@ static void mergeCondAndCode(raw_ostream &CombinedStream, StringRef CondStr, void CompressInstEmitter::emitCompressInstEmitter(raw_ostream &OS, EmitterType EType) { - Record *AsmWriter = Target.getAsmWriter(); + const Record *AsmWriter = Target.getAsmWriter(); if (!AsmWriter->getValueAsInt("PassSubtarget")) PrintFatalError(AsmWriter->getLoc(), "'PassSubtarget' is false. SubTargetInfo object is needed " @@ -683,9 +686,10 @@ void CompressInstEmitter::emitCompressInstEmitter(raw_ostream &OS, getReqFeatures(FeaturesSet, AnyOfFeatureSets, CompressPat.PatReqFeatures); // Add Dest instruction required features. 
- std::vector ReqFeatures; - std::vector RF = Dest.TheDef->getValueAsListOfDefs("Predicates"); - copy_if(RF, std::back_inserter(ReqFeatures), [](Record *R) { + std::vector ReqFeatures; + std::vector RF = + Dest.TheDef->getValueAsListOfConstDefs("Predicates"); + copy_if(RF, std::back_inserter(ReqFeatures), [](const Record *R) { return R->getValueAsBit("AssemblerMatcherPredicate"); }); getReqFeatures(FeaturesSet, AnyOfFeatureSets, ReqFeatures); @@ -738,7 +742,7 @@ void CompressInstEmitter::emitCompressInstEmitter(raw_ostream &OS, << ").getImm() == " << SourceOperandMap[OpNo].Data.Imm << ") &&\n"; break; case OpData::Reg: { - Record *Reg = SourceOperandMap[OpNo].Data.Reg; + const Record *Reg = SourceOperandMap[OpNo].Data.Reg; CondStream.indent(6) << "(MI.getOperand(" << OpNo << ").isReg()) &&\n" << " (MI.getOperand(" << OpNo << ").getReg() == " << TargetName @@ -827,7 +831,7 @@ void CompressInstEmitter::emitCompressInstEmitter(raw_ostream &OS, case OpData::Reg: { if (CompressOrUncompress) { // Fixed register has been validated at pattern validation time. - Record *Reg = DestOperandMap[OpNo].Data.Reg; + const Record *Reg = DestOperandMap[OpNo].Data.Reg; CodeStream.indent(6) << "OutInst.addOperand(MCOperand::createReg(" << TargetName << "::" << Reg->getName() << "));\n"; @@ -891,11 +895,9 @@ void CompressInstEmitter::emitCompressInstEmitter(raw_ostream &OS, } void CompressInstEmitter::run(raw_ostream &OS) { - std::vector Insts = Records.getAllDerivedDefinitions("CompressPat"); - // Process the CompressPat definitions, validating them as we do so. - for (unsigned I = 0, E = Insts.size(); I != E; ++I) - evaluateCompressPat(Insts[I]); + for (const Record *Pat : Records.getAllDerivedDefinitions("CompressPat")) + evaluateCompressPat(Pat); // Emit file header. 
emitSourceFileHeader("Compress instruction Source Fragment", OS, Records); From e06f32114d0bda2ce5f008f18e838aba529d7a58 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Wed, 18 Sep 2024 10:48:45 -0700 Subject: [PATCH 093/321] [CSKY,M68k,Xtensa] Update function names after #108643 --- llvm/lib/Target/CSKY/AsmParser/CSKYAsmParser.cpp | 8 ++++---- llvm/lib/Target/M68k/AsmParser/M68kAsmParser.cpp | 8 ++++---- llvm/lib/Target/Xtensa/AsmParser/XtensaAsmParser.cpp | 8 ++++---- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/llvm/lib/Target/CSKY/AsmParser/CSKYAsmParser.cpp b/llvm/lib/Target/CSKY/AsmParser/CSKYAsmParser.cpp index 30bd3dcefa605a..d923c96bc008e4 100644 --- a/llvm/lib/Target/CSKY/AsmParser/CSKYAsmParser.cpp +++ b/llvm/lib/Target/CSKY/AsmParser/CSKYAsmParser.cpp @@ -67,14 +67,14 @@ class CSKYAsmParser : public MCTargetAsmParser { SMLoc getLoc() const { return getParser().getTok().getLoc(); } - bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, + bool matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, OperandVector &Operands, MCStreamer &Out, uint64_t &ErrorInfo, bool MatchingInlineAsm) override; bool parseRegister(MCRegister &Reg, SMLoc &StartLoc, SMLoc &EndLoc) override; - bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, + bool parseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, OperandVector &Operands) override; ParseStatus parseDirective(AsmToken DirectiveID) override; @@ -656,7 +656,7 @@ bool CSKYAsmParser::generateImmOutOfRangeError( return Error(ErrorLoc, Msg + " [" + Twine(Lower) + ", " + Twine(Upper) + "]"); } -bool CSKYAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, +bool CSKYAsmParser::matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, OperandVector &Operands, MCStreamer &Out, uint64_t &ErrorInfo, @@ -1485,7 +1485,7 @@ ParseStatus CSKYAsmParser::parseRegList(OperandVector &Operands) { return ParseStatus::Success; } -bool CSKYAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, +bool CSKYAsmParser::parseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, OperandVector &Operands) { // First operand is token for instruction. 
Operands.push_back(CSKYOperand::createToken(Name, NameLoc)); diff --git a/llvm/lib/Target/M68k/AsmParser/M68kAsmParser.cpp b/llvm/lib/Target/M68k/AsmParser/M68kAsmParser.cpp index 126176133dc027..3a0d9dd316d824 100644 --- a/llvm/lib/Target/M68k/AsmParser/M68kAsmParser.cpp +++ b/llvm/lib/Target/M68k/AsmParser/M68kAsmParser.cpp @@ -69,9 +69,9 @@ class M68kAsmParser : public MCTargetAsmParser { bool parseRegister(MCRegister &Reg, SMLoc &StartLoc, SMLoc &EndLoc) override; ParseStatus tryParseRegister(MCRegister &Reg, SMLoc &StartLoc, SMLoc &EndLoc) override; - bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, + bool parseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, OperandVector &Operands) override; - bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, + bool matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, OperandVector &Operands, MCStreamer &Out, uint64_t &ErrorInfo, bool MatchingInlineAsm) override; @@ -959,7 +959,7 @@ void M68kAsmParser::eatComma() { } } -bool M68kAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, +bool M68kAsmParser::parseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, OperandVector &Operands) { SMLoc Start = getLexer().getLoc(); Operands.push_back(M68kOperand::createToken(Name, Start, Start)); @@ -1024,7 +1024,7 @@ bool M68kAsmParser::emit(MCInst &Inst, SMLoc const &Loc, return false; } -bool M68kAsmParser::MatchAndEmitInstruction(SMLoc Loc, unsigned &Opcode, +bool M68kAsmParser::matchAndEmitInstruction(SMLoc Loc, unsigned &Opcode, OperandVector &Operands, MCStreamer &Out, uint64_t &ErrorInfo, diff --git a/llvm/lib/Target/Xtensa/AsmParser/XtensaAsmParser.cpp b/llvm/lib/Target/Xtensa/AsmParser/XtensaAsmParser.cpp index b0ce624a495fd5..83b1cfca529bf3 100644 --- a/llvm/lib/Target/Xtensa/AsmParser/XtensaAsmParser.cpp +++ b/llvm/lib/Target/Xtensa/AsmParser/XtensaAsmParser.cpp @@ -45,9 +45,9 @@ class XtensaAsmParser : public MCTargetAsmParser { ParseStatus parseDirective(AsmToken DirectiveID) override; bool parseRegister(MCRegister &Reg, SMLoc &StartLoc, SMLoc &EndLoc) override; - bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, + bool parseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, OperandVector &Operands) override; - bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, + bool matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, OperandVector &Operands, MCStreamer &Out, uint64_t &ErrorInfo, bool MatchingInlineAsm) override; @@ -425,7 +425,7 @@ bool XtensaAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, return true; } -bool XtensaAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, +bool XtensaAsmParser::matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, OperandVector &Operands, MCStreamer &Out, uint64_t &ErrorInfo, @@ -730,7 +730,7 @@ bool XtensaAsmParser::ParseInstructionWithSR(ParseInstructionInfo &Info, return false; } -bool XtensaAsmParser::ParseInstruction(ParseInstructionInfo &Info, +bool XtensaAsmParser::parseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, OperandVector &Operands) { if (Name.starts_with("wsr") || Name.starts_with("rsr") || From 86d2abefcb8b6b2d3f57b3ec8650f11861c226ff Mon Sep 17 00:00:00 2001 From: Jacek Caban Date: Wed, 18 Sep 2024 19:49:06 +0200 Subject: [PATCH 094/321] [LLD][COFF] Store __imp_ symbols as Defined in InputFile (#109115) --- lld/COFF/InputFiles.h | 4 ++-- lld/COFF/MapFile.cpp | 3 +-- lld/COFF/SymbolTable.cpp | 6 +++--- 
lld/COFF/SymbolTable.h | 4 ++-- 4 files changed, 8 insertions(+), 9 deletions(-) diff --git a/lld/COFF/InputFiles.h b/lld/COFF/InputFiles.h index 5fa93f57ef9e3a..a20b097cbe04af 100644 --- a/lld/COFF/InputFiles.h +++ b/lld/COFF/InputFiles.h @@ -349,7 +349,7 @@ class ImportFile : public InputFile { MachineTypes getMachineType() const override; DefinedImportData *impSym = nullptr; - Symbol *thunkSym = nullptr; + Defined *thunkSym = nullptr; ImportThunkChunkARM64EC *impchkThunk = nullptr; std::string dllName; @@ -365,7 +365,7 @@ class ImportFile : public InputFile { // Auxiliary IAT symbols and chunks on ARM64EC. DefinedImportData *impECSym = nullptr; Chunk *auxLocation = nullptr; - Symbol *auxThunkSym = nullptr; + Defined *auxThunkSym = nullptr; DefinedImportData *auxImpCopySym = nullptr; Chunk *auxCopyLocation = nullptr; diff --git a/lld/COFF/MapFile.cpp b/lld/COFF/MapFile.cpp index 52e9ce996f2390..751a2238e701f7 100644 --- a/lld/COFF/MapFile.cpp +++ b/lld/COFF/MapFile.cpp @@ -128,8 +128,7 @@ static void getSymbols(const COFFLinkerContext &ctx, if (!file->thunkSym->isLive()) continue; - if (auto *thunkSym = dyn_cast(file->thunkSym)) - syms.push_back(thunkSym); + syms.push_back(file->thunkSym); if (auto *impSym = dyn_cast_or_null(file->impSym)) syms.push_back(impSym); diff --git a/lld/COFF/SymbolTable.cpp b/lld/COFF/SymbolTable.cpp index 1488ad95d0da62..0ef58910151cf0 100644 --- a/lld/COFF/SymbolTable.cpp +++ b/lld/COFF/SymbolTable.cpp @@ -823,13 +823,13 @@ DefinedImportData *SymbolTable::addImportData(StringRef n, ImportFile *f, return nullptr; } -Symbol *SymbolTable::addImportThunk(StringRef name, DefinedImportData *id, - ImportThunkChunk *chunk) { +Defined *SymbolTable::addImportThunk(StringRef name, DefinedImportData *id, + ImportThunkChunk *chunk) { auto [s, wasInserted] = insert(name, nullptr); s->isUsedInRegularObj = true; if (wasInserted || isa(s) || s->isLazy()) { replaceSymbol(s, ctx, name, id, chunk); - return s; + return cast(s); } reportDuplicate(s, id->file); diff --git a/lld/COFF/SymbolTable.h b/lld/COFF/SymbolTable.h index bf97cf442039e0..e3f674b8098f8b 100644 --- a/lld/COFF/SymbolTable.h +++ b/lld/COFF/SymbolTable.h @@ -105,8 +105,8 @@ class SymbolTable { CommonChunk *c = nullptr); DefinedImportData *addImportData(StringRef n, ImportFile *f, Chunk *&location); - Symbol *addImportThunk(StringRef name, DefinedImportData *s, - ImportThunkChunk *chunk); + Defined *addImportThunk(StringRef name, DefinedImportData *s, + ImportThunkChunk *chunk); void addLibcall(StringRef name); void addEntryThunk(Symbol *from, Symbol *to); void addExitThunk(Symbol *from, Symbol *to); From f2128267c26e548bef59209e7a351ff94d343bf3 Mon Sep 17 00:00:00 2001 From: Helena Kotas Date: Wed, 18 Sep 2024 10:51:30 -0700 Subject: [PATCH 095/321] [HLSL][NFC] Remove RegisterBindingFlags struct (#108924) When diagnosing register bindings we just need to make sure there is a resource that matches the provided register type. We can emit the diagnostics right away instead of collecting flags in the RegisterBindingFlags struct. That also enables early exit when scanning user defined types because we can return as soon as we find a matching resource for the given register type. 
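For illustration, the early-exit walk described above has roughly this shape
(a simplified sketch, not the exact code -- the real logic is the
ContainsResourceForRegisterType helper in the diff below, and the worklist
bookkeeping is elided here):

  static bool containsResourceForRegisterType(const Type *Root,
                                              RegisterType RegType) {
    llvm::SmallVector<const Type *, 8> Worklist = {Root};
    while (!Worklist.empty()) {
      const Type *T = Worklist.pop_back_val();
      if (const auto *Res = dyn_cast<HLSLAttributedResourceType>(T))
        if (getRegisterType(Res->getAttrs().ResourceClass) == RegType)
          return true; // first matching resource ends the scan early
      // ...otherwise queue the field types of any record members of T...
    }
    return false;
  }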
--- clang/lib/Sema/SemaHLSL.cpp | 308 ++++++++++++++---------------------- 1 file changed, 119 insertions(+), 189 deletions(-) diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp index a303f211501348..03b7c2edb605fe 100644 --- a/clang/lib/Sema/SemaHLSL.cpp +++ b/clang/lib/Sema/SemaHLSL.cpp @@ -40,6 +40,48 @@ #include using namespace clang; +using llvm::dxil::ResourceClass; + +enum class RegisterType { SRV, UAV, CBuffer, Sampler, C, I, Invalid }; + +static RegisterType getRegisterType(ResourceClass RC) { + switch (RC) { + case ResourceClass::SRV: + return RegisterType::SRV; + case ResourceClass::UAV: + return RegisterType::UAV; + case ResourceClass::CBuffer: + return RegisterType::CBuffer; + case ResourceClass::Sampler: + return RegisterType::Sampler; + } + llvm_unreachable("unexpected ResourceClass value"); +} + +static RegisterType getRegisterType(StringRef Slot) { + switch (Slot[0]) { + case 't': + case 'T': + return RegisterType::SRV; + case 'u': + case 'U': + return RegisterType::UAV; + case 'b': + case 'B': + return RegisterType::CBuffer; + case 's': + case 'S': + return RegisterType::Sampler; + case 'c': + case 'C': + return RegisterType::C; + case 'i': + case 'I': + return RegisterType::I; + default: + return RegisterType::Invalid; + } +} SemaHLSL::SemaHLSL(Sema &S) : SemaBase(S) {} @@ -586,8 +628,7 @@ bool clang::CreateHLSLAttributedResourceType( LocEnd = A->getRange().getEnd(); switch (A->getKind()) { case attr::HLSLResourceClass: { - llvm::dxil::ResourceClass RC = - cast(A)->getResourceClass(); + ResourceClass RC = cast(A)->getResourceClass(); if (HasResourceClass) { S.Diag(A->getLocation(), ResAttrs.ResourceClass == RC ? diag::warn_duplicate_attribute_exact @@ -672,7 +713,7 @@ bool SemaHLSL::handleResourceTypeAttr(const ParsedAttr &AL) { SourceLocation ArgLoc = Loc->Loc; // Validate resource class value - llvm::dxil::ResourceClass RC; + ResourceClass RC; if (!HLSLResourceClassAttr::ConvertStrToResourceClass(Identifier, RC)) { Diag(ArgLoc, diag::warn_attribute_type_not_supported) << "ResourceClass" << Identifier; @@ -750,28 +791,6 @@ SemaHLSL::TakeLocForHLSLAttribute(const HLSLAttributedResourceType *RT) { return LocInfo; } -struct RegisterBindingFlags { - bool Resource = false; - bool UDT = false; - bool Other = false; - bool Basic = false; - - bool SRV = false; - bool UAV = false; - bool CBV = false; - bool Sampler = false; - - bool ContainsNumeric = false; - bool DefaultGlobals = false; - - // used only when Resource == true - std::optional ResourceClass; -}; - -static bool isDeclaredWithinCOrTBuffer(const Decl *TheDecl) { - return TheDecl && isa(TheDecl->getDeclContext()); -} - // get the record decl from a var decl that we expect // represents a resource static CXXRecordDecl *getRecordDeclFromVarDecl(VarDecl *VD) { @@ -786,24 +805,6 @@ static CXXRecordDecl *getRecordDeclFromVarDecl(VarDecl *VD) { return TheRecordDecl; } -static void updateResourceClassFlagsFromDeclResourceClass( - RegisterBindingFlags &Flags, llvm::hlsl::ResourceClass DeclResourceClass) { - switch (DeclResourceClass) { - case llvm::hlsl::ResourceClass::SRV: - Flags.SRV = true; - break; - case llvm::hlsl::ResourceClass::UAV: - Flags.UAV = true; - break; - case llvm::hlsl::ResourceClass::CBuffer: - Flags.CBV = true; - break; - case llvm::hlsl::ResourceClass::Sampler: - Flags.Sampler = true; - break; - } -} - const HLSLAttributedResourceType * findAttributedResourceTypeOnField(VarDecl *VD) { assert(VD != nullptr && "expected VarDecl"); @@ -817,8 +818,10 @@ 
findAttributedResourceTypeOnField(VarDecl *VD) { return nullptr; } -static void updateResourceClassFlagsFromRecordType(RegisterBindingFlags &Flags, - const RecordType *RT) { +// Iterate over RecordType fields and return true if any of them matched the +// register type +static bool ContainsResourceForRegisterType(Sema &S, const RecordType *RT, + RegisterType RegType) { llvm::SmallVector TypesToScan; TypesToScan.emplace_back(RT); @@ -827,8 +830,8 @@ static void updateResourceClassFlagsFromRecordType(RegisterBindingFlags &Flags, while (T->isArrayType()) T = T->getArrayElementTypeNoTypeQual(); if (T->isIntegralOrEnumerationType() || T->isFloatingType()) { - Flags.ContainsNumeric = true; - continue; + if (RegType == RegisterType::C) + return true; } const RecordType *RT = T->getAs(); if (!RT) @@ -839,100 +842,84 @@ static void updateResourceClassFlagsFromRecordType(RegisterBindingFlags &Flags, const Type *FieldTy = FD->getType().getTypePtr(); if (const HLSLAttributedResourceType *AttrResType = dyn_cast(FieldTy)) { - updateResourceClassFlagsFromDeclResourceClass( - Flags, AttrResType->getAttrs().ResourceClass); - continue; + ResourceClass RC = AttrResType->getAttrs().ResourceClass; + if (getRegisterType(RC) == RegType) + return true; + } else { + TypesToScan.emplace_back(FD->getType().getTypePtr()); } - TypesToScan.emplace_back(FD->getType().getTypePtr()); } } + return false; } -static RegisterBindingFlags HLSLFillRegisterBindingFlags(Sema &S, - Decl *TheDecl) { - RegisterBindingFlags Flags; +static void CheckContainsResourceForRegisterType(Sema &S, + SourceLocation &ArgLoc, + Decl *D, RegisterType RegType, + bool SpecifiedSpace) { + int RegTypeNum = static_cast(RegType); // check if the decl type is groupshared - if (TheDecl->hasAttr()) { - Flags.Other = true; - return Flags; + if (D->hasAttr()) { + S.Diag(ArgLoc, diag::err_hlsl_binding_type_mismatch) << RegTypeNum; + return; } // Cbuffers and Tbuffers are HLSLBufferDecl types - if (HLSLBufferDecl *CBufferOrTBuffer = dyn_cast(TheDecl)) { - Flags.Resource = true; - Flags.ResourceClass = CBufferOrTBuffer->isCBuffer() - ? llvm::dxil::ResourceClass::CBuffer - : llvm::dxil::ResourceClass::SRV; + if (HLSLBufferDecl *CBufferOrTBuffer = dyn_cast(D)) { + ResourceClass RC = CBufferOrTBuffer->isCBuffer() ? 
+    ResourceClass RC = CBufferOrTBuffer->isCBuffer() ? ResourceClass::CBuffer
+                                                     : ResourceClass::SRV;
+    if (RegType != getRegisterType(RC))
+      S.Diag(D->getLocation(), diag::err_hlsl_binding_type_mismatch)
+          << RegTypeNum;
+    return;
   }
+
   // Samplers, UAVs, and SRVs are VarDecl types
-  else if (VarDecl *TheVarDecl = dyn_cast<VarDecl>(TheDecl)) {
-    if (const HLSLAttributedResourceType *AttrResType =
-            findAttributedResourceTypeOnField(TheVarDecl)) {
-      Flags.Resource = true;
-      Flags.ResourceClass = AttrResType->getAttrs().ResourceClass;
-    } else {
-      const clang::Type *TheBaseType = TheVarDecl->getType().getTypePtr();
-      while (TheBaseType->isArrayType())
-        TheBaseType = TheBaseType->getArrayElementTypeNoTypeQual();
-
-      if (TheBaseType->isArithmeticType()) {
-        Flags.Basic = true;
-        if (!isDeclaredWithinCOrTBuffer(TheDecl) &&
-            (TheBaseType->isIntegralType(S.getASTContext()) ||
-             TheBaseType->isFloatingType()))
-          Flags.DefaultGlobals = true;
-      } else if (TheBaseType->isRecordType()) {
-        Flags.UDT = true;
-        const RecordType *TheRecordTy = TheBaseType->getAs<RecordType>();
-        updateResourceClassFlagsFromRecordType(Flags, TheRecordTy);
-      } else
-        Flags.Other = true;
-    }
-  } else {
-    llvm_unreachable("expected be VarDecl or HLSLBufferDecl");
+  assert(isa<VarDecl>(D) && "D is expected to be VarDecl or HLSLBufferDecl");
+  VarDecl *VD = cast<VarDecl>(D);
+
+  // Resource
+  if (const HLSLAttributedResourceType *AttrResType =
+          findAttributedResourceTypeOnField(VD)) {
+    if (RegType != getRegisterType(AttrResType->getAttrs().ResourceClass))
+      S.Diag(D->getLocation(), diag::err_hlsl_binding_type_mismatch)
+          << RegTypeNum;
+    return;
   }
-  return Flags;
-}
 
-enum class RegisterType { SRV, UAV, CBuffer, Sampler, C, I, Invalid };
+  const clang::Type *Ty = VD->getType().getTypePtr();
+  while (Ty->isArrayType())
+    Ty = Ty->getArrayElementTypeNoTypeQual();
 
-static RegisterType getRegisterType(llvm::dxil::ResourceClass RC) {
-  switch (RC) {
-  case llvm::dxil::ResourceClass::SRV:
-    return RegisterType::SRV;
-  case llvm::dxil::ResourceClass::UAV:
-    return RegisterType::UAV;
-  case llvm::dxil::ResourceClass::CBuffer:
-    return RegisterType::CBuffer;
-  case llvm::dxil::ResourceClass::Sampler:
-    return RegisterType::Sampler;
-  }
-  llvm_unreachable("unexpected ResourceClass value");
-}
+  // Basic types
+  if (Ty->isArithmeticType()) {
+    bool DeclaredInCOrTBuffer = isa<HLSLBufferDecl>(D->getDeclContext());
+    if (SpecifiedSpace && !DeclaredInCOrTBuffer)
+      S.Diag(ArgLoc, diag::err_hlsl_space_on_global_constant);
 
-static RegisterType getRegisterType(StringRef Slot) {
-  switch (Slot[0]) {
-  case 't':
-  case 'T':
-    return RegisterType::SRV;
-  case 'u':
-  case 'U':
-    return RegisterType::UAV;
-  case 'b':
-  case 'B':
-    return RegisterType::CBuffer;
-  case 's':
-  case 'S':
-    return RegisterType::Sampler;
-  case 'c':
-  case 'C':
-    return RegisterType::C;
-  case 'i':
-  case 'I':
-    return RegisterType::I;
-  default:
-    return RegisterType::Invalid;
+    if (!DeclaredInCOrTBuffer &&
+        (Ty->isIntegralType(S.getASTContext()) || Ty->isFloatingType())) {
+      // Default Globals
+      if (RegType == RegisterType::CBuffer)
+        S.Diag(ArgLoc, diag::warn_hlsl_deprecated_register_type_b);
+      else if (RegType != RegisterType::C)
+        S.Diag(ArgLoc, diag::err_hlsl_binding_type_mismatch) << RegTypeNum;
+    } else {
+      if (RegType == RegisterType::C)
+        S.Diag(ArgLoc, diag::warn_hlsl_register_type_c_packoffset);
+      else
+        S.Diag(ArgLoc, diag::err_hlsl_binding_type_mismatch) << RegTypeNum;
+    }
+  } else if (Ty->isRecordType()) {
+    // Class/struct types - walk the declaration and check each field and
+    // subclass
+    if (!ContainsResourceForRegisterType(S, Ty->getAs<RecordType>(), RegType))
+      S.Diag(D->getLocation(),
+             diag::warn_hlsl_user_defined_type_missing_member)
+          << RegTypeNum;
+  } else {
+    // Anything else is an error
+    S.Diag(ArgLoc, diag::err_hlsl_binding_type_mismatch) << RegTypeNum;
   }
 }
 
@@ -969,76 +956,19 @@ static void ValidateMultipleRegisterAnnotations(Sema &S, Decl *TheDecl,
 }
 
 static void DiagnoseHLSLRegisterAttribute(Sema &S, SourceLocation &ArgLoc,
-                                          Decl *TheDecl, RegisterType RegType,
-                                          const bool SpecifiedSpace) {
+                                          Decl *D, RegisterType RegType,
+                                          bool SpecifiedSpace) {
   // exactly one of these two types should be set
-  assert(((isa<VarDecl>(TheDecl) && !isa<HLSLBufferDecl>(TheDecl)) ||
-          (!isa<VarDecl>(TheDecl) && isa<HLSLBufferDecl>(TheDecl))) &&
+  assert(((isa<VarDecl>(D) && !isa<HLSLBufferDecl>(D)) ||
+          (!isa<VarDecl>(D) && isa<HLSLBufferDecl>(D))) &&
          "expecting VarDecl or HLSLBufferDecl");
 
-  RegisterBindingFlags Flags = HLSLFillRegisterBindingFlags(S, TheDecl);
-  assert((int)Flags.Other + (int)Flags.Resource + (int)Flags.Basic +
-                 (int)Flags.UDT ==
-             1 &&
-         "only one resource analysis result should be expected");
-
-  int RegTypeNum = static_cast<int>(RegType);
-
-  // first, if "other" is set, emit an error
-  if (Flags.Other) {
-    S.Diag(ArgLoc, diag::err_hlsl_binding_type_mismatch) << RegTypeNum;
-    return;
-  }
+  // check if the declaration contains resource matching the register type
+  CheckContainsResourceForRegisterType(S, ArgLoc, D, RegType, SpecifiedSpace);
 
   // next, if multiple register annotations exist, check that none conflict.
-  ValidateMultipleRegisterAnnotations(S, TheDecl, RegType);
-
-  // next, if resource is set, make sure the register type in the register
-  // annotation is compatible with the variable's resource type.
-  if (Flags.Resource) {
-    RegisterType ExpRegType = getRegisterType(Flags.ResourceClass.value());
-    if (RegType != ExpRegType) {
-      S.Diag(TheDecl->getLocation(), diag::err_hlsl_binding_type_mismatch)
-          << RegTypeNum;
-    }
-
-    return;
-  }
-
-  // next, handle diagnostics for when the "basic" flag is set
-  if (Flags.Basic) {
-    if (SpecifiedSpace && !isDeclaredWithinCOrTBuffer(TheDecl))
-      S.Diag(ArgLoc, diag::err_hlsl_space_on_global_constant);
-
-    if (Flags.DefaultGlobals) {
-      if (RegType == RegisterType::CBuffer)
-        S.Diag(ArgLoc, diag::warn_hlsl_deprecated_register_type_b);
-      else if (RegType != RegisterType::C)
-        S.Diag(ArgLoc, diag::err_hlsl_binding_type_mismatch) << RegTypeNum;
-      return;
-    }
-
-    if (RegType == RegisterType::C)
-      S.Diag(ArgLoc, diag::warn_hlsl_register_type_c_packoffset);
-    else
-      S.Diag(ArgLoc, diag::err_hlsl_binding_type_mismatch) << RegTypeNum;
-
-    return;
-  }
-
-  // finally, we handle the udt case
-  if (Flags.UDT) {
-    const bool ExpectedRegisterTypesForUDT[] = {
-        Flags.SRV, Flags.UAV, Flags.CBV, Flags.Sampler, Flags.ContainsNumeric};
-    assert((size_t)RegTypeNum < std::size(ExpectedRegisterTypesForUDT) &&
-           "regType has unexpected value");
-
-    if (!ExpectedRegisterTypesForUDT[RegTypeNum])
-      S.Diag(TheDecl->getLocation(),
-             diag::warn_hlsl_user_defined_type_missing_member)
-          << RegTypeNum;
-  }
+  ValidateMultipleRegisterAnnotations(S, D, RegType);
 }
 
 void SemaHLSL::handleResourceBindingAttr(Decl *TheDecl, const ParsedAttr &AL) {

From 13502c7f2c053b2a060f7b9926328cfda46404de Mon Sep 17 00:00:00 2001
From: "Henrik G. Olsson"
Date: Wed, 18 Sep 2024 10:51:53 -0700
Subject: [PATCH 096/321] Revert update-verify-tests.py (#109171)

This reverts commits c96ee0ffaf5ee7afa1f4b0be0662852f57b47244 and
9ceb9676678ad979a0b767450855d7852ce6a553.

Discussion in GitHub PR #108658.
--- .../Inputs/duplicate-diag.c | 8 - .../Inputs/duplicate-diag.c.expected | 8 - .../Inputs/infer-indentation.c | 8 - .../Inputs/infer-indentation.c.expected | 11 - .../Inputs/leave-existing-diags.c | 11 - .../Inputs/leave-existing-diags.c.expected | 12 - .../Inputs/multiple-errors.c | 6 - .../Inputs/multiple-errors.c.expected | 9 - .../multiple-missing-errors-same-line.c | 8 - ...ltiple-missing-errors-same-line.c.expected | 13 - .../update-verify-tests/Inputs/no-checks.c | 3 - .../Inputs/no-checks.c.expected | 4 - .../update-verify-tests/Inputs/no-diags.c | 5 - .../Inputs/no-diags.c.expected | 5 - .../Inputs/no-expected-diags.c | 4 - .../Inputs/no-expected-diags.c.expected | 4 - .../Inputs/non-default-prefix.c | 5 - .../Inputs/non-default-prefix.c.expected | 5 - .../Inputs/update-same-line.c | 4 - .../Inputs/update-same-line.c.expected | 4 - .../Inputs/update-single-check.c | 4 - .../Inputs/update-single-check.c.expected | 4 - .../update-verify-tests/duplicate-diag.test | 4 - .../infer-indentation.test | 3 - .../leave-existing-diags.test | 4 - .../utils/update-verify-tests/lit.local.cfg | 28 -- .../update-verify-tests/multiple-errors.test | 3 - .../multiple-missing-errors-same-line.test | 3 - .../utils/update-verify-tests/no-checks.test | 3 - .../utils/update-verify-tests/no-diags.test | 4 - .../no-expected-diags.test | 4 - .../non-default-prefix.test | 4 - .../update-verify-tests/update-same-line.test | 4 - .../update-single-check.test | 3 - clang/utils/UpdateVerifyTests/core.py | 452 ------------------ clang/utils/update-verify-tests.py | 38 -- 36 files changed, 702 deletions(-) delete mode 100644 clang/test/utils/update-verify-tests/Inputs/duplicate-diag.c delete mode 100644 clang/test/utils/update-verify-tests/Inputs/duplicate-diag.c.expected delete mode 100644 clang/test/utils/update-verify-tests/Inputs/infer-indentation.c delete mode 100644 clang/test/utils/update-verify-tests/Inputs/infer-indentation.c.expected delete mode 100644 clang/test/utils/update-verify-tests/Inputs/leave-existing-diags.c delete mode 100644 clang/test/utils/update-verify-tests/Inputs/leave-existing-diags.c.expected delete mode 100644 clang/test/utils/update-verify-tests/Inputs/multiple-errors.c delete mode 100644 clang/test/utils/update-verify-tests/Inputs/multiple-errors.c.expected delete mode 100644 clang/test/utils/update-verify-tests/Inputs/multiple-missing-errors-same-line.c delete mode 100644 clang/test/utils/update-verify-tests/Inputs/multiple-missing-errors-same-line.c.expected delete mode 100644 clang/test/utils/update-verify-tests/Inputs/no-checks.c delete mode 100644 clang/test/utils/update-verify-tests/Inputs/no-checks.c.expected delete mode 100644 clang/test/utils/update-verify-tests/Inputs/no-diags.c delete mode 100644 clang/test/utils/update-verify-tests/Inputs/no-diags.c.expected delete mode 100644 clang/test/utils/update-verify-tests/Inputs/no-expected-diags.c delete mode 100644 clang/test/utils/update-verify-tests/Inputs/no-expected-diags.c.expected delete mode 100644 clang/test/utils/update-verify-tests/Inputs/non-default-prefix.c delete mode 100644 clang/test/utils/update-verify-tests/Inputs/non-default-prefix.c.expected delete mode 100644 clang/test/utils/update-verify-tests/Inputs/update-same-line.c delete mode 100644 clang/test/utils/update-verify-tests/Inputs/update-same-line.c.expected delete mode 100644 clang/test/utils/update-verify-tests/Inputs/update-single-check.c delete mode 100644 clang/test/utils/update-verify-tests/Inputs/update-single-check.c.expected delete mode 100644 
clang/test/utils/update-verify-tests/duplicate-diag.test delete mode 100644 clang/test/utils/update-verify-tests/infer-indentation.test delete mode 100644 clang/test/utils/update-verify-tests/leave-existing-diags.test delete mode 100644 clang/test/utils/update-verify-tests/lit.local.cfg delete mode 100644 clang/test/utils/update-verify-tests/multiple-errors.test delete mode 100644 clang/test/utils/update-verify-tests/multiple-missing-errors-same-line.test delete mode 100644 clang/test/utils/update-verify-tests/no-checks.test delete mode 100644 clang/test/utils/update-verify-tests/no-diags.test delete mode 100644 clang/test/utils/update-verify-tests/no-expected-diags.test delete mode 100644 clang/test/utils/update-verify-tests/non-default-prefix.test delete mode 100644 clang/test/utils/update-verify-tests/update-same-line.test delete mode 100644 clang/test/utils/update-verify-tests/update-single-check.test delete mode 100644 clang/utils/UpdateVerifyTests/core.py delete mode 100644 clang/utils/update-verify-tests.py diff --git a/clang/test/utils/update-verify-tests/Inputs/duplicate-diag.c b/clang/test/utils/update-verify-tests/Inputs/duplicate-diag.c deleted file mode 100644 index 8c7e46c6eca9c1..00000000000000 --- a/clang/test/utils/update-verify-tests/Inputs/duplicate-diag.c +++ /dev/null @@ -1,8 +0,0 @@ -void foo() { - // expected-error@+1{{use of undeclared identifier 'a'}} - a = 2; a = 2; - b = 2; b = 2; - // expected-error@+1 3{{use of undeclared identifier 'c'}} - c = 2; c = 2; - // expected-error 2{{asdf}} -} diff --git a/clang/test/utils/update-verify-tests/Inputs/duplicate-diag.c.expected b/clang/test/utils/update-verify-tests/Inputs/duplicate-diag.c.expected deleted file mode 100644 index 6214ff382f4495..00000000000000 --- a/clang/test/utils/update-verify-tests/Inputs/duplicate-diag.c.expected +++ /dev/null @@ -1,8 +0,0 @@ -void foo() { - // expected-error@+1 2{{use of undeclared identifier 'a'}} - a = 2; a = 2; - // expected-error@+1 2{{use of undeclared identifier 'b'}} - b = 2; b = 2; - // expected-error@+1 2{{use of undeclared identifier 'c'}} - c = 2; c = 2; -} diff --git a/clang/test/utils/update-verify-tests/Inputs/infer-indentation.c b/clang/test/utils/update-verify-tests/Inputs/infer-indentation.c deleted file mode 100644 index 0210ac35fd5cd1..00000000000000 --- a/clang/test/utils/update-verify-tests/Inputs/infer-indentation.c +++ /dev/null @@ -1,8 +0,0 @@ -void foo() { - // expected-error@+1 2 {{use of undeclared identifier 'a'}} - a = 2; a = 2; b = 2; b = 2; c = 2; - // expected-error@+1 2 {{asdf}} - d = 2; - e = 2; f = 2; // expected-error 2 {{use of undeclared identifier 'e'}} -} - diff --git a/clang/test/utils/update-verify-tests/Inputs/infer-indentation.c.expected b/clang/test/utils/update-verify-tests/Inputs/infer-indentation.c.expected deleted file mode 100644 index 5c5aaeeef97acf..00000000000000 --- a/clang/test/utils/update-verify-tests/Inputs/infer-indentation.c.expected +++ /dev/null @@ -1,11 +0,0 @@ -void foo() { - // expected-error@+3 {{use of undeclared identifier 'c'}} - // expected-error@+2 2 {{use of undeclared identifier 'b'}} - // expected-error@+1 2 {{use of undeclared identifier 'a'}} - a = 2; a = 2; b = 2; b = 2; c = 2; - // expected-error@+1 {{use of undeclared identifier 'd'}} - d = 2; - // expected-error@+1 {{use of undeclared identifier 'f'}} - e = 2; f = 2; // expected-error {{use of undeclared identifier 'e'}} -} - diff --git a/clang/test/utils/update-verify-tests/Inputs/leave-existing-diags.c 
b/clang/test/utils/update-verify-tests/Inputs/leave-existing-diags.c deleted file mode 100644 index 1aa8d088e97273..00000000000000 --- a/clang/test/utils/update-verify-tests/Inputs/leave-existing-diags.c +++ /dev/null @@ -1,11 +0,0 @@ -void foo() { - a = 2; - // expected-error@-1{{use of undeclared identifier 'a'}} - b = 2;// expected-error{{use of undeclared identifier 'b'}} - c = 2; - // expected-error@5{{use of undeclared identifier 'c'}} - d = 2; // expected-error-re{{use of {{.*}} identifier 'd'}} - - e = 2; // error to trigger mismatch -} - diff --git a/clang/test/utils/update-verify-tests/Inputs/leave-existing-diags.c.expected b/clang/test/utils/update-verify-tests/Inputs/leave-existing-diags.c.expected deleted file mode 100644 index 6b621061bbfbbd..00000000000000 --- a/clang/test/utils/update-verify-tests/Inputs/leave-existing-diags.c.expected +++ /dev/null @@ -1,12 +0,0 @@ -void foo() { - a = 2; - // expected-error@-1{{use of undeclared identifier 'a'}} - b = 2;// expected-error{{use of undeclared identifier 'b'}} - c = 2; - // expected-error@5{{use of undeclared identifier 'c'}} - d = 2; // expected-error-re{{use of {{.*}} identifier 'd'}} - - // expected-error@+1{{use of undeclared identifier 'e'}} - e = 2; // error to trigger mismatch -} - diff --git a/clang/test/utils/update-verify-tests/Inputs/multiple-errors.c b/clang/test/utils/update-verify-tests/Inputs/multiple-errors.c deleted file mode 100644 index e230e0a337bf49..00000000000000 --- a/clang/test/utils/update-verify-tests/Inputs/multiple-errors.c +++ /dev/null @@ -1,6 +0,0 @@ -void foo() { - a = 2; - b = 2; - - c = 2; -} diff --git a/clang/test/utils/update-verify-tests/Inputs/multiple-errors.c.expected b/clang/test/utils/update-verify-tests/Inputs/multiple-errors.c.expected deleted file mode 100644 index 27dc1f30a26faf..00000000000000 --- a/clang/test/utils/update-verify-tests/Inputs/multiple-errors.c.expected +++ /dev/null @@ -1,9 +0,0 @@ -void foo() { - // expected-error@+1{{use of undeclared identifier 'a'}} - a = 2; - // expected-error@+1{{use of undeclared identifier 'b'}} - b = 2; - - // expected-error@+1{{use of undeclared identifier 'c'}} - c = 2; -} diff --git a/clang/test/utils/update-verify-tests/Inputs/multiple-missing-errors-same-line.c b/clang/test/utils/update-verify-tests/Inputs/multiple-missing-errors-same-line.c deleted file mode 100644 index 03f723d44bbe82..00000000000000 --- a/clang/test/utils/update-verify-tests/Inputs/multiple-missing-errors-same-line.c +++ /dev/null @@ -1,8 +0,0 @@ -void foo() { - a = 2; b = 2; c = 2; -} - -void bar() { - x = 2; y = 2; z = 2; - // expected-error@-1{{use of undeclared identifier 'x'}} -} diff --git a/clang/test/utils/update-verify-tests/Inputs/multiple-missing-errors-same-line.c.expected b/clang/test/utils/update-verify-tests/Inputs/multiple-missing-errors-same-line.c.expected deleted file mode 100644 index 24b57f4353d95d..00000000000000 --- a/clang/test/utils/update-verify-tests/Inputs/multiple-missing-errors-same-line.c.expected +++ /dev/null @@ -1,13 +0,0 @@ -void foo() { - // expected-error@+3{{use of undeclared identifier 'c'}} - // expected-error@+2{{use of undeclared identifier 'b'}} - // expected-error@+1{{use of undeclared identifier 'a'}} - a = 2; b = 2; c = 2; -} - -void bar() { - x = 2; y = 2; z = 2; - // expected-error@-1{{use of undeclared identifier 'x'}} - // expected-error@-2{{use of undeclared identifier 'y'}} - // expected-error@-3{{use of undeclared identifier 'z'}} -} diff --git a/clang/test/utils/update-verify-tests/Inputs/no-checks.c 
b/clang/test/utils/update-verify-tests/Inputs/no-checks.c deleted file mode 100644 index 8fd1f7cd333705..00000000000000 --- a/clang/test/utils/update-verify-tests/Inputs/no-checks.c +++ /dev/null @@ -1,3 +0,0 @@ -void foo() { - bar = 2; -} diff --git a/clang/test/utils/update-verify-tests/Inputs/no-checks.c.expected b/clang/test/utils/update-verify-tests/Inputs/no-checks.c.expected deleted file mode 100644 index e80548fbe50f2c..00000000000000 --- a/clang/test/utils/update-verify-tests/Inputs/no-checks.c.expected +++ /dev/null @@ -1,4 +0,0 @@ -void foo() { - // expected-error@+1{{use of undeclared identifier 'bar'}} - bar = 2; -} diff --git a/clang/test/utils/update-verify-tests/Inputs/no-diags.c b/clang/test/utils/update-verify-tests/Inputs/no-diags.c deleted file mode 100644 index 66d169be439402..00000000000000 --- a/clang/test/utils/update-verify-tests/Inputs/no-diags.c +++ /dev/null @@ -1,5 +0,0 @@ -void foo() { - // expected-error@+1{{asdf}} - int a = 2; -} - diff --git a/clang/test/utils/update-verify-tests/Inputs/no-diags.c.expected b/clang/test/utils/update-verify-tests/Inputs/no-diags.c.expected deleted file mode 100644 index 05230284945702..00000000000000 --- a/clang/test/utils/update-verify-tests/Inputs/no-diags.c.expected +++ /dev/null @@ -1,5 +0,0 @@ -// expected-no-diagnostics -void foo() { - int a = 2; -} - diff --git a/clang/test/utils/update-verify-tests/Inputs/no-expected-diags.c b/clang/test/utils/update-verify-tests/Inputs/no-expected-diags.c deleted file mode 100644 index 78b72e1357da76..00000000000000 --- a/clang/test/utils/update-verify-tests/Inputs/no-expected-diags.c +++ /dev/null @@ -1,4 +0,0 @@ -// expected-no-diagnostics -void foo() { - a = 2; -} diff --git a/clang/test/utils/update-verify-tests/Inputs/no-expected-diags.c.expected b/clang/test/utils/update-verify-tests/Inputs/no-expected-diags.c.expected deleted file mode 100644 index d948ffce56189a..00000000000000 --- a/clang/test/utils/update-verify-tests/Inputs/no-expected-diags.c.expected +++ /dev/null @@ -1,4 +0,0 @@ -void foo() { - // expected-error@+1{{use of undeclared identifier 'a'}} - a = 2; -} diff --git a/clang/test/utils/update-verify-tests/Inputs/non-default-prefix.c b/clang/test/utils/update-verify-tests/Inputs/non-default-prefix.c deleted file mode 100644 index 3d63eaf0f1b878..00000000000000 --- a/clang/test/utils/update-verify-tests/Inputs/non-default-prefix.c +++ /dev/null @@ -1,5 +0,0 @@ -void foo() { - a = 2; // check-error{{asdf}} - // expected-error@-1{ignored}} -} - diff --git a/clang/test/utils/update-verify-tests/Inputs/non-default-prefix.c.expected b/clang/test/utils/update-verify-tests/Inputs/non-default-prefix.c.expected deleted file mode 100644 index a877f86922123d..00000000000000 --- a/clang/test/utils/update-verify-tests/Inputs/non-default-prefix.c.expected +++ /dev/null @@ -1,5 +0,0 @@ -void foo() { - a = 2; // check-error{{use of undeclared identifier 'a'}} - // expected-error@-1{ignored}} -} - diff --git a/clang/test/utils/update-verify-tests/Inputs/update-same-line.c b/clang/test/utils/update-verify-tests/Inputs/update-same-line.c deleted file mode 100644 index 5278ce0c57c319..00000000000000 --- a/clang/test/utils/update-verify-tests/Inputs/update-same-line.c +++ /dev/null @@ -1,4 +0,0 @@ -void foo() { - bar = 2; // expected-error {{asdf}} -} - diff --git a/clang/test/utils/update-verify-tests/Inputs/update-same-line.c.expected b/clang/test/utils/update-verify-tests/Inputs/update-same-line.c.expected deleted file mode 100644 index 8ba47f788319b1..00000000000000 --- 
a/clang/test/utils/update-verify-tests/Inputs/update-same-line.c.expected +++ /dev/null @@ -1,4 +0,0 @@ -void foo() { - bar = 2; // expected-error {{use of undeclared identifier 'bar'}} -} - diff --git a/clang/test/utils/update-verify-tests/Inputs/update-single-check.c b/clang/test/utils/update-verify-tests/Inputs/update-single-check.c deleted file mode 100644 index 20b011bfc3d77e..00000000000000 --- a/clang/test/utils/update-verify-tests/Inputs/update-single-check.c +++ /dev/null @@ -1,4 +0,0 @@ -void foo() { - // expected-error@+1{{asdf}} - bar = 2; -} diff --git a/clang/test/utils/update-verify-tests/Inputs/update-single-check.c.expected b/clang/test/utils/update-verify-tests/Inputs/update-single-check.c.expected deleted file mode 100644 index e80548fbe50f2c..00000000000000 --- a/clang/test/utils/update-verify-tests/Inputs/update-single-check.c.expected +++ /dev/null @@ -1,4 +0,0 @@ -void foo() { - // expected-error@+1{{use of undeclared identifier 'bar'}} - bar = 2; -} diff --git a/clang/test/utils/update-verify-tests/duplicate-diag.test b/clang/test/utils/update-verify-tests/duplicate-diag.test deleted file mode 100644 index db4b0fd86f0817..00000000000000 --- a/clang/test/utils/update-verify-tests/duplicate-diag.test +++ /dev/null @@ -1,4 +0,0 @@ -# RUN: cp %S/Inputs/duplicate-diag.c %t.c && not %clang_cc1 -verify %t.c 2>&1 | %update-verify-tests -# RUN: diff --strip-trailing-cr %S/Inputs/duplicate-diag.c.expected %t.c -# RUN: %clang_cc1 -verify %t.c - diff --git a/clang/test/utils/update-verify-tests/infer-indentation.test b/clang/test/utils/update-verify-tests/infer-indentation.test deleted file mode 100644 index bd94dce4844ebf..00000000000000 --- a/clang/test/utils/update-verify-tests/infer-indentation.test +++ /dev/null @@ -1,3 +0,0 @@ -# RUN: cp %S/Inputs/infer-indentation.c %t.c && not %clang_cc1 -verify %t.c 2>&1 | %update-verify-tests -# RUN: diff --strip-trailing-cr %S/Inputs/infer-indentation.c.expected %t.c -# RUN: %clang_cc1 -verify %t.c diff --git a/clang/test/utils/update-verify-tests/leave-existing-diags.test b/clang/test/utils/update-verify-tests/leave-existing-diags.test deleted file mode 100644 index 8a723f157bf84a..00000000000000 --- a/clang/test/utils/update-verify-tests/leave-existing-diags.test +++ /dev/null @@ -1,4 +0,0 @@ -# RUN: cp %S/Inputs/leave-existing-diags.c %t.c && not %clang_cc1 -verify %t.c 2>&1 | %update-verify-tests -# RUN: diff --strip-trailing-cr %S/Inputs/leave-existing-diags.c.expected %t.c -# RUN: %clang_cc1 -verify %t.c - diff --git a/clang/test/utils/update-verify-tests/lit.local.cfg b/clang/test/utils/update-verify-tests/lit.local.cfg deleted file mode 100644 index b0eebf337da5c9..00000000000000 --- a/clang/test/utils/update-verify-tests/lit.local.cfg +++ /dev/null @@ -1,28 +0,0 @@ -import lit.util - -# python 2.7 backwards compatibility -try: - from shlex import quote as shell_quote -except ImportError: - from pipes import quote as shell_quote - -if config.standalone_build: - # These tests require the update-verify-tests.py script from the clang - # source tree, so skip these tests if we are doing standalone builds. 
- config.unsupported = True -else: - config.suffixes = [".test"] - - script_path = os.path.join( - config.clang_src_dir, "utils", "update-verify-tests.py" - ) - python = shell_quote(config.python_executable) - config.substitutions.append( - ( - "%update-verify-tests", - "%s %s" % (python, shell_quote(script_path)), - ) - ) - # AIX 'diff' command doesn't support --strip-trailing-cr, but the internal - # python implementation does, so use that for cross platform compatibility - config.test_format = lit.formats.ShTest() diff --git a/clang/test/utils/update-verify-tests/multiple-errors.test b/clang/test/utils/update-verify-tests/multiple-errors.test deleted file mode 100644 index 1fcb6b7f2ca096..00000000000000 --- a/clang/test/utils/update-verify-tests/multiple-errors.test +++ /dev/null @@ -1,3 +0,0 @@ -# RUN: cp %S/Inputs/multiple-errors.c %t.c && not %clang_cc1 -verify %t.c 2>&1 | %update-verify-tests -# RUN: diff --strip-trailing-cr %S/Inputs/multiple-errors.c.expected %t.c -# RUN: %clang_cc1 -verify %t.c diff --git a/clang/test/utils/update-verify-tests/multiple-missing-errors-same-line.test b/clang/test/utils/update-verify-tests/multiple-missing-errors-same-line.test deleted file mode 100644 index 00338d7595cb78..00000000000000 --- a/clang/test/utils/update-verify-tests/multiple-missing-errors-same-line.test +++ /dev/null @@ -1,3 +0,0 @@ -# RUN: cp %S/Inputs/multiple-missing-errors-same-line.c %t.c && not %clang_cc1 -verify %t.c 2>&1 | %update-verify-tests -# RUN: diff --strip-trailing-cr %S/Inputs/multiple-missing-errors-same-line.c.expected %t.c -# RUN: %clang_cc1 -verify %t.c diff --git a/clang/test/utils/update-verify-tests/no-checks.test b/clang/test/utils/update-verify-tests/no-checks.test deleted file mode 100644 index 5fdbdcbac95261..00000000000000 --- a/clang/test/utils/update-verify-tests/no-checks.test +++ /dev/null @@ -1,3 +0,0 @@ -# RUN: cp %S/Inputs/no-checks.c %t.c && not %clang_cc1 -verify %t.c 2>&1 | %update-verify-tests -# RUN: diff --strip-trailing-cr %S/Inputs/no-checks.c.expected %t.c -# RUN: %clang_cc1 -verify %t.c diff --git a/clang/test/utils/update-verify-tests/no-diags.test b/clang/test/utils/update-verify-tests/no-diags.test deleted file mode 100644 index 825fd0219debb3..00000000000000 --- a/clang/test/utils/update-verify-tests/no-diags.test +++ /dev/null @@ -1,4 +0,0 @@ -# RUN: cp %S/Inputs/no-diags.c %t.c && not %clang_cc1 -verify %t.c 2>&1 | %update-verify-tests -# RUN: diff --strip-trailing-cr %S/Inputs/no-diags.c.expected %t.c -# RUN: %clang_cc1 -verify %t.c - diff --git a/clang/test/utils/update-verify-tests/no-expected-diags.test b/clang/test/utils/update-verify-tests/no-expected-diags.test deleted file mode 100644 index be475c190da177..00000000000000 --- a/clang/test/utils/update-verify-tests/no-expected-diags.test +++ /dev/null @@ -1,4 +0,0 @@ -# RUN: cp %S/Inputs/no-expected-diags.c %t.c && not %clang_cc1 -verify %t.c 2>&1 | %update-verify-tests -# RUN: diff --strip-trailing-cr %S/Inputs/no-expected-diags.c.expected %t.c -# RUN: %clang_cc1 -verify %t.c - diff --git a/clang/test/utils/update-verify-tests/non-default-prefix.test b/clang/test/utils/update-verify-tests/non-default-prefix.test deleted file mode 100644 index 594dba4174d2e5..00000000000000 --- a/clang/test/utils/update-verify-tests/non-default-prefix.test +++ /dev/null @@ -1,4 +0,0 @@ -# RUN: cp %S/Inputs/non-default-prefix.c %t.c && not %clang_cc1 -verify=check %t.c 2>&1 | %update-verify-tests --prefix check -# RUN: diff --strip-trailing-cr %S/Inputs/non-default-prefix.c.expected %t.c -# 
RUN: %clang_cc1 -verify=check %t.c - diff --git a/clang/test/utils/update-verify-tests/update-same-line.test b/clang/test/utils/update-verify-tests/update-same-line.test deleted file mode 100644 index b7e5d7a574eca5..00000000000000 --- a/clang/test/utils/update-verify-tests/update-same-line.test +++ /dev/null @@ -1,4 +0,0 @@ -# RUN: cp %S/Inputs/update-same-line.c %t.c && not %clang_cc1 -verify %t.c 2>&1 | %update-verify-tests -# RUN: diff --strip-trailing-cr %S/Inputs/update-same-line.c.expected %t.c -# RUN: %clang_cc1 -verify %t.c - diff --git a/clang/test/utils/update-verify-tests/update-single-check.test b/clang/test/utils/update-verify-tests/update-single-check.test deleted file mode 100644 index b958d66b099db4..00000000000000 --- a/clang/test/utils/update-verify-tests/update-single-check.test +++ /dev/null @@ -1,3 +0,0 @@ -# RUN: cp %S/Inputs/update-single-check.c %t.c && not %clang_cc1 -verify %t.c 2>&1 | %update-verify-tests -# RUN: diff --strip-trailing-cr %S/Inputs/update-single-check.c.expected %t.c -# RUN: %clang_cc1 -verify %t.c diff --git a/clang/utils/UpdateVerifyTests/core.py b/clang/utils/UpdateVerifyTests/core.py deleted file mode 100644 index d1350cdbb698b6..00000000000000 --- a/clang/utils/UpdateVerifyTests/core.py +++ /dev/null @@ -1,452 +0,0 @@ -import sys -import re - -DEBUG = False - - -def dprint(*args): - if DEBUG: - print(*args, file=sys.stderr) - - -class KnownException(Exception): - pass - - -def parse_error_category(s, prefix): - if "no expected directives found" in s: - return None - parts = s.split("diagnostics") - diag_category = parts[0] - category_parts = parts[0].strip().strip("'").split("-") - expected = category_parts[0] - if expected != prefix: - raise Exception( - f"expected prefix '{prefix}', but found '{expected}'. Multiple verify prefixes are not supported." 
- ) - diag_category = category_parts[1] - if "seen but not expected" in parts[1]: - seen = True - elif "expected but not seen" in parts[1]: - seen = False - else: - raise KnownException(f"unexpected category '{parts[1]}'") - return (diag_category, seen) - - -diag_error_re = re.compile(r"File (\S+) Line (\d+): (.+)") -diag_error_re2 = re.compile(r"File \S+ Line \d+ \(directive at (\S+):(\d+)\): (.+)") - - -def parse_diag_error(s): - m = diag_error_re2.match(s) - if not m: - m = diag_error_re.match(s) - if not m: - return None - return (m.group(1), int(m.group(2)), m.group(3)) - - -class Line: - def __init__(self, content, line_n): - self.content = content - self.diag = None - self.line_n = line_n - self.targeting_diags = [] - - def update_line_n(self, n): - self.line_n = n - - def render(self): - if not self.diag: - return self.content - assert "{{DIAG}}" in self.content - res = self.content.replace("{{DIAG}}", self.diag.render()) - if not res.strip(): - return "" - return res - - -class Diag: - def __init__( - self, - prefix, - diag_content, - category, - parsed_target_line_n, - line_is_absolute, - count, - line, - is_re, - whitespace_strings, - is_from_source_file, - ): - self.prefix = prefix - self.diag_content = diag_content - self.category = category - self.parsed_target_line_n = parsed_target_line_n - self.line_is_absolute = line_is_absolute - self.count = count - self.line = line - self.target = None - self.is_re = is_re - self.absolute_target() - self.whitespace_strings = whitespace_strings - self.is_from_source_file = is_from_source_file - - def decrement_count(self): - self.count -= 1 - assert self.count >= 0 - - def increment_count(self): - assert self.count >= 0 - self.count += 1 - - def unset_target(self): - assert self.target is not None - self.target.targeting_diags.remove(self) - self.target = None - - def set_target(self, target): - if self.target: - self.unset_target() - self.target = target - self.target.targeting_diags.append(self) - - def absolute_target(self): - if self.target: - return self.target.line_n - if self.line_is_absolute: - return self.parsed_target_line_n - return self.line.line_n + self.parsed_target_line_n - - def relative_target(self): - return self.absolute_target() - self.line.line_n - - def take(self, other_diag): - assert self.count == 0 - assert other_diag.count > 0 - assert other_diag.target == self.target - assert not other_diag.line_is_absolute - assert not other_diag.is_re and not self.is_re - self.line_is_absolute = False - self.diag_content = other_diag.diag_content - self.count = other_diag.count - self.category = other_diag.category - self.count = other_diag.count - other_diag.count = 0 - - def render(self): - assert self.count >= 0 - if self.count == 0: - return "" - line_location_s = "" - if self.relative_target() != 0: - if self.line_is_absolute: - line_location_s = f"@{self.absolute_target()}" - elif self.relative_target() > 0: - line_location_s = f"@+{self.relative_target()}" - else: - line_location_s = ( - f"@{self.relative_target()}" # the minus sign is implicit - ) - count_s = "" if self.count == 1 else f"{self.count}" - re_s = "-re" if self.is_re else "" - if self.whitespace_strings: - whitespace1_s = self.whitespace_strings[0] - whitespace2_s = self.whitespace_strings[1] - whitespace3_s = self.whitespace_strings[2] - else: - whitespace1_s = " " - whitespace2_s = "" - whitespace3_s = "" - if count_s and not whitespace2_s: - whitespace2_s = " " # required to parse correctly - elif not count_s and whitespace2_s == " ": - """Don't emit 
a weird extra space. - However if the whitespace is something other than the - standard single space, let it be to avoid disrupting manual formatting. - The existence of a non-empty whitespace2_s implies this was parsed with - a count > 1 and then decremented, otherwise this whitespace would have - been parsed as whitespace3_s. - """ - whitespace2_s = "" - return f"//{whitespace1_s}{self.prefix}-{self.category}{re_s}{line_location_s}{whitespace2_s}{count_s}{whitespace3_s}{{{{{self.diag_content}}}}}" - - -expected_diag_re = re.compile( - r"//(\s*)([a-zA-Z]+)-(note|warning|error)(-re)?(@[+-]?\d+)?(?:(\s*)(\d+))?(\s*)\{\{(.*)\}\}" -) - - -def parse_diag(line, filename, lines, prefix): - s = line.content - ms = expected_diag_re.findall(s) - if not ms: - return None - if len(ms) > 1: - raise KnownException( - f"multiple diags on line {filename}:{line.line_n}. Aborting due to missing implementation." - ) - [ - whitespace1_s, - check_prefix, - category_s, - re_s, - target_line_s, - whitespace2_s, - count_s, - whitespace3_s, - diag_s, - ] = ms[0] - if check_prefix != prefix: - return None - if not target_line_s: - target_line_n = 0 - is_absolute = False - elif target_line_s.startswith("@+"): - target_line_n = int(target_line_s[2:]) - is_absolute = False - elif target_line_s.startswith("@-"): - target_line_n = int(target_line_s[1:]) - is_absolute = False - else: - target_line_n = int(target_line_s[1:]) - is_absolute = True - count = int(count_s) if count_s else 1 - line.content = expected_diag_re.sub("{{DIAG}}", s) - - return Diag( - prefix, - diag_s, - category_s, - target_line_n, - is_absolute, - count, - line, - bool(re_s), - [whitespace1_s, whitespace2_s, whitespace3_s], - True, - ) - - -def add_line(new_line, lines): - lines.insert(new_line.line_n - 1, new_line) - for i in range(new_line.line_n, len(lines)): - line = lines[i] - assert line.line_n == i - line.update_line_n(i + 1) - assert all(line.line_n == i + 1 for i, line in enumerate(lines)) - - -def remove_line(old_line, lines): - lines.remove(old_line) - for i in range(old_line.line_n - 1, len(lines)): - line = lines[i] - assert line.line_n == i + 2 - line.update_line_n(i + 1) - assert all(line.line_n == i + 1 for i, line in enumerate(lines)) - - -indent_re = re.compile(r"\s*") - - -def get_indent(s): - return indent_re.match(s).group(0) - - -def orig_line_n_to_new_line_n(line_n, orig_lines): - return orig_lines[line_n - 1].line_n - - -def add_diag(orig_line_n, diag_s, diag_category, lines, orig_lines, prefix): - line_n = orig_line_n_to_new_line_n(orig_line_n, orig_lines) - target = lines[line_n - 1] - for other in target.targeting_diags: - if other.is_re: - raise KnownException( - "mismatching diag on line with regex matcher. 
Skipping due to missing implementation" - ) - reverse = ( - True - if [other for other in target.targeting_diags if other.relative_target() < 0] - else False - ) - - targeting = [ - other for other in target.targeting_diags if not other.line_is_absolute - ] - targeting.sort(reverse=reverse, key=lambda d: d.relative_target()) - prev_offset = 0 - prev_line = target - direction = -1 if reverse else 1 - for d in targeting: - if d.relative_target() != prev_offset + direction: - break - prev_offset = d.relative_target() - prev_line = d.line - total_offset = prev_offset - 1 if reverse else prev_offset + 1 - if reverse: - new_line_n = prev_line.line_n + 1 - else: - new_line_n = prev_line.line_n - assert new_line_n == line_n + (not reverse) - total_offset - - new_line = Line(get_indent(prev_line.content) + "{{DIAG}}\n", new_line_n) - add_line(new_line, lines) - - whitespace_strings = prev_line.diag.whitespace_strings if prev_line.diag else None - new_diag = Diag( - prefix, - diag_s, - diag_category, - total_offset, - False, - 1, - new_line, - False, - whitespace_strings, - False, - ) - new_line.diag = new_diag - new_diag.set_target(target) - - -def remove_dead_diags(lines): - for line in lines: - if not line.diag or line.diag.count != 0: - continue - if line.render() == "": - remove_line(line, lines) - else: - assert line.diag.is_from_source_file - for other_diag in line.targeting_diags: - if ( - other_diag.is_from_source_file - or other_diag.count == 0 - or other_diag.category != line.diag.category - ): - continue - if other_diag.is_re or line.diag.is_re: - continue - line.diag.take(other_diag) - remove_line(other_diag.line, lines) - - -def has_live_diags(lines): - for line in lines: - if line.diag and line.diag.count > 0: - return True - return False - - -def get_expected_no_diags_line_n(lines, prefix): - for line in lines: - if f"{prefix}-no-diagnostics" in line.content: - return line.line_n - return None - - -def update_test_file(filename, diag_errors, prefix, updated_test_files): - dprint(f"updating test file {filename}") - if filename in updated_test_files: - raise KnownException(f"{filename} already updated, but got new output") - else: - updated_test_files.add(filename) - with open(filename, "r") as f: - lines = [Line(line, i + 1) for i, line in enumerate(f.readlines())] - orig_lines = list(lines) - expected_no_diags_line_n = get_expected_no_diags_line_n(orig_lines, prefix) - - for line in lines: - diag = parse_diag(line, filename, lines, prefix) - if diag: - line.diag = diag - diag.set_target(lines[diag.absolute_target() - 1]) - - for line_n, diag_s, diag_category, seen in diag_errors: - if seen: - continue - # this is a diagnostic expected but not seen - assert lines[line_n - 1].diag - if diag_s != lines[line_n - 1].diag.diag_content: - raise KnownException( - f"{filename}:{line_n} - found diag {lines[line_n - 1].diag.diag_content} but expected {diag_s}" - ) - if diag_category != lines[line_n - 1].diag.category: - raise KnownException( - f"{filename}:{line_n} - found {lines[line_n - 1].diag.category} diag but expected {diag_category}" - ) - lines[line_n - 1].diag.decrement_count() - diag_errors_left = [] - diag_errors.sort(reverse=True, key=lambda t: t[0]) - for line_n, diag_s, diag_category, seen in diag_errors: - if not seen: - continue - target = orig_lines[line_n - 1] - other_diags = [ - d - for d in target.targeting_diags - if d.diag_content == diag_s and d.category == diag_category - ] - other_diag = other_diags[0] if other_diags else None - if other_diag: - 
other_diag.increment_count() - else: - add_diag(line_n, diag_s, diag_category, lines, orig_lines, prefix) - remove_dead_diags(lines) - has_diags = has_live_diags(lines) - with open(filename, "w") as f: - if not has_diags and expected_no_diags_line_n is None: - f.write("// expected-no-diagnostics\n") - for line in lines: - if has_diags and line.line_n == expected_no_diags_line_n: - continue - f.write(line.render()) - - -def update_test_files(errors, prefix): - errors_by_file = {} - for (filename, line, diag_s), (diag_category, seen) in errors: - if filename not in errors_by_file: - errors_by_file[filename] = [] - errors_by_file[filename].append((line, diag_s, diag_category, seen)) - updated_test_files = set() - for filename, diag_errors in errors_by_file.items(): - try: - update_test_file(filename, diag_errors, prefix, updated_test_files) - except KnownException as e: - return f"Error in update-verify-tests while updating {filename}: {e}" - updated_files = list(updated_test_files) - assert updated_files - if len(updated_files) == 1: - return f"updated file {updated_files[0]}" - updated_files_s = "\n\t".join(updated_files) - return "updated files:\n\t{updated_files_s}" - - -def check_expectations(tool_output, prefix): - """ - The entry point function. - Called by the stand-alone update-verify-tests.py as well as litplugin.py. - """ - curr = [] - curr_category = None - try: - for line in tool_output: - if line.startswith("error: "): - curr_category = parse_error_category(line[len("error: ") :], prefix) - continue - - diag_error = parse_diag_error(line.strip()) - if diag_error: - curr.append((diag_error, curr_category)) - else: - dprint("no match") - dprint(line.strip()) - except KnownException as e: - return f"Error in update-verify-tests while parsing tool output: {e}" - if curr: - return update_test_files(curr, prefix) - else: - return "no mismatching diagnostics found" diff --git a/clang/utils/update-verify-tests.py b/clang/utils/update-verify-tests.py deleted file mode 100644 index e2874a8c049ef3..00000000000000 --- a/clang/utils/update-verify-tests.py +++ /dev/null @@ -1,38 +0,0 @@ -import sys -import argparse -from UpdateVerifyTests.core import check_expectations - -""" - Pipe output from clang's -verify into this script to have the test case updated to expect the actual diagnostic output. - When inserting new expected-* checks it will place them on the line before the location of the diagnostic, with an @+1, - or @+N for some N if there are multiple diagnostics emitted on the same line. If the current checks are using @-N for - this line, the new check will follow that convention also. - Existing checks will be left untouched as much as possible, including their location and whitespace content, to minimize - diffs. If inaccurate their count will be updated, or the check removed entirely. - - Missing features: - - multiple prefixes on the same line (-verify=my-prefix,my-other-prefix) - - multiple prefixes on separate RUN lines (RUN: -verify=my-prefix\nRUN: -verify my-other-prefix) - - regexes with expected-*-re: existing ones will be left untouched if accurate, but the script will abort if there are any - diagnostic mismatches on the same line. 
-    - multiple checks targeting the same line are supported, but a line may only contain one check
-    - if multiple checks targeting the same line are failing the script is not guaranteed to produce a minimal diff
-
-Example usage:
-    clang -verify [file] | python3 update-verify-tests.py
-    clang -verify=check [file] | python3 update-verify-tests.py --prefix check
-"""
-
-
-def main():
-    parser = argparse.ArgumentParser(description=__doc__)
-    parser.add_argument(
-        "--prefix", default="expected", help="The prefix passed to -verify"
-    )
-    args = parser.parse_args()
-    output = check_expectations(sys.stdin.readlines(), args.prefix)
-    print(output)
-
-
-if __name__ == "__main__":
-    main()

From be187a6812fb6e8984886c28a502ec69bdaa4ad4 Mon Sep 17 00:00:00 2001
From: Slava Zakharin
Date: Wed, 18 Sep 2024 10:59:05 -0700
Subject: [PATCH 097/321] [flang][runtime] Use cuda::std::complex in F18
 runtime CUDA build. (#109078)

`std::complex` operators do not work for the CUDA device compilation
of F18 runtime. This change makes use of `cuda::std::complex` from
`libcudacxx`. `cuda::std::complex` does not have specializations for
`long double`, so the change is accompanied by a cleanup of the
`long double` usage.

---
 flang/include/flang/Common/float80.h          |  43 ++++
 flang/include/flang/Runtime/complex.h         |  31 +++
 flang/include/flang/Runtime/cpp-type.h        |   9 +-
 .../flang/Runtime/matmul-instances.inc        |   6 +-
 flang/include/flang/Runtime/numeric.h         |  32 +--
 flang/include/flang/Runtime/reduce.h          | 214 +++++++++++-------
 flang/include/flang/Runtime/reduction.h       | 112 ++++-----
 .../include/flang/Runtime/transformational.h  |  20 +-
 flang/runtime/complex-powi.cpp                |  23 +-
 flang/runtime/complex-reduction.c             |   8 +-
 flang/runtime/dot-product.cpp                 |  21 +-
 flang/runtime/extrema.cpp                     |  10 +-
 flang/runtime/matmul-transpose.cpp            |  17 --
 flang/runtime/matmul.cpp                      |  34 +--
 flang/runtime/numeric.cpp                     |  36 +--
 flang/runtime/product.cpp                     |  15 +-
 flang/runtime/random.cpp                      |   2 +-
 flang/runtime/reduce.cpp                      | 180 ++++++++--------
 flang/runtime/reduction-templates.h           |   4 +-
 flang/runtime/sum.cpp                         |  22 +-
 flang/runtime/transformational.cpp            |   8 +-
 flang/unittests/Runtime/Numeric.cpp           |   4 +-
 flang/unittests/Runtime/Transformational.cpp  |  10 +-
 23 files changed, 480 insertions(+), 381 deletions(-)
 create mode 100644 flang/include/flang/Common/float80.h
 create mode 100644 flang/include/flang/Runtime/complex.h

diff --git a/flang/include/flang/Common/float80.h b/flang/include/flang/Common/float80.h
new file mode 100644
index 00000000000000..1838f7b13c8bb2
--- /dev/null
+++ b/flang/include/flang/Common/float80.h
@@ -0,0 +1,43 @@
+/*===-- flang/Common/float80.h --------------------------------------*- C -*-===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===----------------------------------------------------------------------===*/
+
+/* This header is usable in both C and C++ code.
+ * Isolates build compiler checks to determine if the 80-bit
+ * floating point format is supported via a particular C type.
+ * It defines CFloat80Type and CppFloat80Type aliases for this
+ * C type.
+ */ + +#ifndef FORTRAN_COMMON_FLOAT80_H_ +#define FORTRAN_COMMON_FLOAT80_H_ + +#include "api-attrs.h" +#include + +#if LDBL_MANT_DIG == 64 +#undef HAS_FLOAT80 +#define HAS_FLOAT80 1 +#endif + +#if defined(RT_DEVICE_COMPILATION) && defined(__CUDACC__) +/* + * 'long double' is treated as 'double' in the CUDA device code, + * and there is no support for 80-bit floating point format. + * This is probably true for most offload devices, so RT_DEVICE_COMPILATION + * check should be enough. For the time being, guard it with __CUDACC__ + * as well. + */ +#undef HAS_FLOAT80 +#endif + +#if HAS_FLOAT80 +typedef long double CFloat80Type; +typedef long double CppFloat80Type; +#endif + +#endif /* FORTRAN_COMMON_FLOAT80_H_ */ diff --git a/flang/include/flang/Runtime/complex.h b/flang/include/flang/Runtime/complex.h new file mode 100644 index 00000000000000..b7ad1376bffbf1 --- /dev/null +++ b/flang/include/flang/Runtime/complex.h @@ -0,0 +1,31 @@ +//===-- include/flang/Runtime/complex.h -------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// A single way to expose C++ complex class in files that can be used +// in F18 runtime build. With inclusion of this file std::complex +// and the related names become available, though, they may correspond +// to alternative definitions (e.g. from cuda::std namespace). + +#ifndef FORTRAN_RUNTIME_COMPLEX_H +#define FORTRAN_RUNTIME_COMPLEX_H + +#if RT_USE_LIBCUDACXX +#include +namespace Fortran::runtime::rtcmplx { +using cuda::std::complex; +using cuda::std::conj; +} // namespace Fortran::runtime::rtcmplx +#else // !RT_USE_LIBCUDACXX +#include +namespace Fortran::runtime::rtcmplx { +using std::complex; +using std::conj; +} // namespace Fortran::runtime::rtcmplx +#endif // !RT_USE_LIBCUDACXX + +#endif // FORTRAN_RUNTIME_COMPLEX_H diff --git a/flang/include/flang/Runtime/cpp-type.h b/flang/include/flang/Runtime/cpp-type.h index fe21dd544cf7d8..aef0fbd7ede586 100644 --- a/flang/include/flang/Runtime/cpp-type.h +++ b/flang/include/flang/Runtime/cpp-type.h @@ -13,8 +13,9 @@ #include "flang/Common/Fortran.h" #include "flang/Common/float128.h" +#include "flang/Common/float80.h" #include "flang/Common/uint128.h" -#include +#include "flang/Runtime/complex.h" #include #if __cplusplus >= 202302 #include @@ -70,9 +71,9 @@ template <> struct CppTypeForHelper { using type = double; #endif }; -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 template <> struct CppTypeForHelper { - using type = long double; + using type = CppFloat80Type; }; #endif #if __STDCPP_FLOAT128_T__ @@ -89,7 +90,7 @@ template <> struct CppTypeForHelper { #endif template struct CppTypeForHelper { - using type = std::complex>; + using type = rtcmplx::complex>; }; template <> struct CppTypeForHelper { diff --git a/flang/include/flang/Runtime/matmul-instances.inc b/flang/include/flang/Runtime/matmul-instances.inc index 32c6ab06d25219..88e3067ca029d4 100644 --- a/flang/include/flang/Runtime/matmul-instances.inc +++ b/flang/include/flang/Runtime/matmul-instances.inc @@ -111,7 +111,7 @@ FOREACH_MATMUL_TYPE_PAIR(MATMUL_DIRECT_INSTANCE) FOREACH_MATMUL_TYPE_PAIR_WITH_INT16(MATMUL_INSTANCE) FOREACH_MATMUL_TYPE_PAIR_WITH_INT16(MATMUL_DIRECT_INSTANCE) -#if MATMUL_FORCE_ALL_TYPES || LDBL_MANT_DIG == 64 +#if MATMUL_FORCE_ALL_TYPES || HAS_FLOAT80 
MATMUL_INSTANCE(Integer, 16, Real, 10) MATMUL_INSTANCE(Integer, 16, Complex, 10) MATMUL_INSTANCE(Real, 10, Integer, 16) @@ -133,7 +133,7 @@ MATMUL_DIRECT_INSTANCE(Complex, 16, Integer, 16) #endif #endif // MATMUL_FORCE_ALL_TYPES || (defined __SIZEOF_INT128__ && !AVOID_NATIVE_UINT128_T) -#if MATMUL_FORCE_ALL_TYPES || LDBL_MANT_DIG == 64 +#if MATMUL_FORCE_ALL_TYPES || HAS_FLOAT80 #define FOREACH_MATMUL_TYPE_PAIR_WITH_REAL10(macro) \ macro(Integer, 1, Real, 10) \ macro(Integer, 1, Complex, 10) \ @@ -193,7 +193,7 @@ MATMUL_DIRECT_INSTANCE(Complex, 10, Complex, 16) MATMUL_DIRECT_INSTANCE(Complex, 16, Real, 10) MATMUL_DIRECT_INSTANCE(Complex, 16, Complex, 10) #endif -#endif // MATMUL_FORCE_ALL_TYPES || LDBL_MANT_DIG == 64 +#endif // MATMUL_FORCE_ALL_TYPES || HAS_FLOAT80 #if MATMUL_FORCE_ALL_TYPES || (LDBL_MANT_DIG == 113 || HAS_FLOAT128) #define FOREACH_MATMUL_TYPE_PAIR_WITH_REAL16(macro) \ diff --git a/flang/include/flang/Runtime/numeric.h b/flang/include/flang/Runtime/numeric.h index 84a5a7cd7a361c..c3923ee2e0d889 100644 --- a/flang/include/flang/Runtime/numeric.h +++ b/flang/include/flang/Runtime/numeric.h @@ -44,7 +44,7 @@ CppTypeFor RTDECL(Ceiling8_8)( CppTypeFor RTDECL(Ceiling8_16)( CppTypeFor); #endif -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 CppTypeFor RTDECL(Ceiling10_1)( CppTypeFor); CppTypeFor RTDECL(Ceiling10_2)( @@ -78,7 +78,7 @@ CppTypeFor RTDECL(ErfcScaled4)( CppTypeFor); CppTypeFor RTDECL(ErfcScaled8)( CppTypeFor); -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 CppTypeFor RTDECL(ErfcScaled10)( CppTypeFor); #endif @@ -96,7 +96,7 @@ CppTypeFor RTDECL(Exponent8_4)( CppTypeFor); CppTypeFor RTDECL(Exponent8_8)( CppTypeFor); -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 CppTypeFor RTDECL(Exponent10_4)( CppTypeFor); CppTypeFor RTDECL(Exponent10_8)( @@ -134,7 +134,7 @@ CppTypeFor RTDECL(Floor8_8)( CppTypeFor RTDECL(Floor8_16)( CppTypeFor); #endif -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 CppTypeFor RTDECL(Floor10_1)( CppTypeFor); CppTypeFor RTDECL(Floor10_2)( @@ -168,7 +168,7 @@ CppTypeFor RTDECL(Fraction4)( CppTypeFor); CppTypeFor RTDECL(Fraction8)( CppTypeFor); -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 CppTypeFor RTDECL(Fraction10)( CppTypeFor); #endif @@ -180,7 +180,7 @@ CppTypeFor RTDECL(Fraction16)( // ISNAN / IEEE_IS_NAN bool RTDECL(IsNaN4)(CppTypeFor); bool RTDECL(IsNaN8)(CppTypeFor); -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 bool RTDECL(IsNaN10)(CppTypeFor); #endif #if LDBL_MANT_DIG == 113 || HAS_FLOAT128 @@ -212,7 +212,7 @@ CppTypeFor RTDECL(ModReal4)( CppTypeFor RTDECL(ModReal8)( CppTypeFor, CppTypeFor, const char *sourceFile = nullptr, int sourceLine = 0); -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 CppTypeFor RTDECL(ModReal10)( CppTypeFor, CppTypeFor, const char *sourceFile = nullptr, int sourceLine = 0); @@ -247,7 +247,7 @@ CppTypeFor RTDECL(ModuloReal4)( CppTypeFor RTDECL(ModuloReal8)( CppTypeFor, CppTypeFor, const char *sourceFile = nullptr, int sourceLine = 0); -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 CppTypeFor RTDECL(ModuloReal10)( CppTypeFor, CppTypeFor, const char *sourceFile = nullptr, int sourceLine = 0); @@ -283,7 +283,7 @@ CppTypeFor RTDECL(Nint8_8)( CppTypeFor RTDECL(Nint8_16)( CppTypeFor); #endif -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 CppTypeFor RTDECL(Nint10_1)( CppTypeFor); CppTypeFor RTDECL(Nint10_2)( @@ -319,7 +319,7 @@ CppTypeFor RTDECL(Nearest4)( CppTypeFor, bool positive); CppTypeFor RTDECL(Nearest8)( CppTypeFor, bool positive); -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 CppTypeFor RTDECL(Nearest10)( CppTypeFor, bool positive); #endif @@ -333,7 +333,7 @@ 
CppTypeFor RTDECL(RRSpacing4)( CppTypeFor); CppTypeFor RTDECL(RRSpacing8)( CppTypeFor); -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 CppTypeFor RTDECL(RRSpacing10)( CppTypeFor); #endif @@ -347,7 +347,7 @@ CppTypeFor RTDECL(SetExponent4)( CppTypeFor, std::int64_t); CppTypeFor RTDECL(SetExponent8)( CppTypeFor, std::int64_t); -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 CppTypeFor RTDECL(SetExponent10)( CppTypeFor, std::int64_t); #endif @@ -361,7 +361,7 @@ CppTypeFor RTDECL(Scale4)( CppTypeFor, std::int64_t); CppTypeFor RTDECL(Scale8)( CppTypeFor, std::int64_t); -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 CppTypeFor RTDECL(Scale10)( CppTypeFor, std::int64_t); #endif @@ -410,7 +410,7 @@ CppTypeFor RTDECL(Spacing4)( CppTypeFor); CppTypeFor RTDECL(Spacing8)( CppTypeFor); -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 CppTypeFor RTDECL(Spacing10)( CppTypeFor); #endif @@ -425,7 +425,7 @@ CppTypeFor RTDECL(FPow4i)( CppTypeFor RTDECL(FPow8i)( CppTypeFor b, CppTypeFor e); -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 CppTypeFor RTDECL(FPow10i)( CppTypeFor b, CppTypeFor e); @@ -442,7 +442,7 @@ CppTypeFor RTDECL(FPow4k)( CppTypeFor RTDECL(FPow8k)( CppTypeFor b, CppTypeFor e); -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 CppTypeFor RTDECL(FPow10k)( CppTypeFor b, CppTypeFor e); diff --git a/flang/include/flang/Runtime/reduce.h b/flang/include/flang/Runtime/reduce.h index 60f54c393b4bbd..c016b37f9592a1 100644 --- a/flang/include/flang/Runtime/reduce.h +++ b/flang/include/flang/Runtime/reduce.h @@ -188,22 +188,26 @@ void RTDECL(ReduceReal8DimValue)(Descriptor &result, const Descriptor &array, ValueReductionOperation, const char *source, int line, int dim, const Descriptor *mask = nullptr, const double *identity = nullptr, bool ordered = true); -#if LDBL_MANT_DIG == 64 -long double RTDECL(ReduceReal10Ref)(const Descriptor &, - ReferenceReductionOperation, const char *source, int line, - int dim = 0, const Descriptor *mask = nullptr, - const long double *identity = nullptr, bool ordered = true); -long double RTDECL(ReduceReal10Value)(const Descriptor &, - ValueReductionOperation, const char *source, int line, - int dim = 0, const Descriptor *mask = nullptr, - const long double *identity = nullptr, bool ordered = true); +#if HAS_FLOAT80 +CppTypeFor RTDECL(ReduceReal10Ref)(const Descriptor &, + ReferenceReductionOperation>, + const char *source, int line, int dim = 0, const Descriptor *mask = nullptr, + const CppTypeFor *identity = nullptr, + bool ordered = true); +CppTypeFor RTDECL(ReduceReal10Value)(const Descriptor &, + ValueReductionOperation>, + const char *source, int line, int dim = 0, const Descriptor *mask = nullptr, + const CppTypeFor *identity = nullptr, + bool ordered = true); void RTDECL(ReduceReal10DimRef)(Descriptor &result, const Descriptor &array, - ReferenceReductionOperation, const char *source, int line, - int dim, const Descriptor *mask = nullptr, - const long double *identity = nullptr, bool ordered = true); + ReferenceReductionOperation>, + const char *source, int line, int dim, const Descriptor *mask = nullptr, + const CppTypeFor *identity = nullptr, + bool ordered = true); void RTDECL(ReduceReal10DimValue)(Descriptor &result, const Descriptor &array, - ValueReductionOperation, const char *source, int line, int dim, - const Descriptor *mask = nullptr, const long double *identity = nullptr, + ValueReductionOperation>, + const char *source, int line, int dim, const Descriptor *mask = nullptr, + const CppTypeFor *identity = nullptr, bool ordered = true); #endif #if LDBL_MANT_DIG == 113 || HAS_FLOAT128 @@ 
-225,112 +229,152 @@ void RTDECL(ReduceReal16DimValue)(Descriptor &result, const Descriptor &array, const CppFloat128Type *identity = nullptr, bool ordered = true); #endif -void RTDECL(CppReduceComplex2Ref)(std::complex &, const Descriptor &, - ReferenceReductionOperation>, const char *source, - int line, int dim = 0, const Descriptor *mask = nullptr, - const std::complex *identity = nullptr, bool ordered = true); -void RTDECL(CppReduceComplex2Value)(std::complex &, const Descriptor &, - ValueReductionOperation>, const char *source, int line, - int dim = 0, const Descriptor *mask = nullptr, - const std::complex *identity = nullptr, bool ordered = true); +void RTDECL(CppReduceComplex2Ref)(CppTypeFor &, + const Descriptor &, + ReferenceReductionOperation>, + const char *source, int line, int dim = 0, const Descriptor *mask = nullptr, + const CppTypeFor *identity = nullptr, + bool ordered = true); +void RTDECL(CppReduceComplex2Value)(CppTypeFor &, + const Descriptor &, + ValueReductionOperation>, + const char *source, int line, int dim = 0, const Descriptor *mask = nullptr, + const CppTypeFor *identity = nullptr, + bool ordered = true); void RTDECL(CppReduceComplex2DimRef)(Descriptor &result, - const Descriptor &array, ReferenceReductionOperation>, + const Descriptor &array, + ReferenceReductionOperation>, const char *source, int line, int dim, const Descriptor *mask = nullptr, - const std::complex *identity = nullptr, bool ordered = true); + const CppTypeFor *identity = nullptr, + bool ordered = true); void RTDECL(CppReduceComplex2DimValue)(Descriptor &result, - const Descriptor &array, ValueReductionOperation>, + const Descriptor &array, + ValueReductionOperation>, const char *source, int line, int dim, const Descriptor *mask = nullptr, - const std::complex *identity = nullptr, bool ordered = true); -void RTDECL(CppReduceComplex3Ref)(std::complex &, const Descriptor &, - ReferenceReductionOperation>, const char *source, - int line, int dim = 0, const Descriptor *mask = nullptr, - const std::complex *identity = nullptr, bool ordered = true); -void RTDECL(CppReduceComplex3Value)(std::complex &, const Descriptor &, - ValueReductionOperation>, const char *source, int line, - int dim = 0, const Descriptor *mask = nullptr, - const std::complex *identity = nullptr, bool ordered = true); + const CppTypeFor *identity = nullptr, + bool ordered = true); +void RTDECL(CppReduceComplex3Ref)(CppTypeFor &, + const Descriptor &, + ReferenceReductionOperation>, + const char *source, int line, int dim = 0, const Descriptor *mask = nullptr, + const CppTypeFor *identity = nullptr, + bool ordered = true); +void RTDECL(CppReduceComplex3Value)(CppTypeFor &, + const Descriptor &, + ValueReductionOperation>, + const char *source, int line, int dim = 0, const Descriptor *mask = nullptr, + const CppTypeFor *identity = nullptr, + bool ordered = true); void RTDECL(CppReduceComplex3DimRef)(Descriptor &result, - const Descriptor &array, ReferenceReductionOperation>, + const Descriptor &array, + ReferenceReductionOperation>, const char *source, int line, int dim, const Descriptor *mask = nullptr, - const std::complex *identity = nullptr, bool ordered = true); + const CppTypeFor *identity = nullptr, + bool ordered = true); void RTDECL(CppReduceComplex3DimValue)(Descriptor &result, - const Descriptor &array, ValueReductionOperation>, + const Descriptor &array, + ValueReductionOperation>, const char *source, int line, int dim, const Descriptor *mask = nullptr, - const std::complex *identity = nullptr, bool ordered = true); 
-void RTDECL(CppReduceComplex4Ref)(std::complex &, const Descriptor &, - ReferenceReductionOperation>, const char *source, - int line, int dim = 0, const Descriptor *mask = nullptr, - const std::complex *identity = nullptr, bool ordered = true); -void RTDECL(CppReduceComplex4Value)(std::complex &, const Descriptor &, - ValueReductionOperation>, const char *source, int line, - int dim = 0, const Descriptor *mask = nullptr, - const std::complex *identity = nullptr, bool ordered = true); + const CppTypeFor *identity = nullptr, + bool ordered = true); +void RTDECL(CppReduceComplex4Ref)(CppTypeFor &, + const Descriptor &, + ReferenceReductionOperation>, + const char *source, int line, int dim = 0, const Descriptor *mask = nullptr, + const CppTypeFor *identity = nullptr, + bool ordered = true); +void RTDECL(CppReduceComplex4Value)(CppTypeFor &, + const Descriptor &, + ValueReductionOperation>, + const char *source, int line, int dim = 0, const Descriptor *mask = nullptr, + const CppTypeFor *identity = nullptr, + bool ordered = true); void RTDECL(CppReduceComplex4DimRef)(Descriptor &result, - const Descriptor &array, ReferenceReductionOperation>, + const Descriptor &array, + ReferenceReductionOperation>, const char *source, int line, int dim, const Descriptor *mask = nullptr, - const std::complex *identity = nullptr, bool ordered = true); + const CppTypeFor *identity = nullptr, + bool ordered = true); void RTDECL(CppReduceComplex4DimValue)(Descriptor &result, - const Descriptor &array, ValueReductionOperation>, + const Descriptor &array, + ValueReductionOperation>, const char *source, int line, int dim, const Descriptor *mask = nullptr, - const std::complex *identity = nullptr, bool ordered = true); -void RTDECL(CppReduceComplex8Ref)(std::complex &, const Descriptor &, - ReferenceReductionOperation>, const char *source, - int line, int dim = 0, const Descriptor *mask = nullptr, - const std::complex *identity = nullptr, bool ordered = true); -void RTDECL(CppReduceComplex8Value)(std::complex &, const Descriptor &, - ValueReductionOperation>, const char *source, int line, - int dim = 0, const Descriptor *mask = nullptr, - const std::complex *identity = nullptr, bool ordered = true); + const CppTypeFor *identity = nullptr, + bool ordered = true); +void RTDECL(CppReduceComplex8Ref)(CppTypeFor &, + const Descriptor &, + ReferenceReductionOperation>, + const char *source, int line, int dim = 0, const Descriptor *mask = nullptr, + const CppTypeFor *identity = nullptr, + bool ordered = true); +void RTDECL(CppReduceComplex8Value)(CppTypeFor &, + const Descriptor &, + ValueReductionOperation>, + const char *source, int line, int dim = 0, const Descriptor *mask = nullptr, + const CppTypeFor *identity = nullptr, + bool ordered = true); void RTDECL(CppReduceComplex8DimRef)(Descriptor &result, - const Descriptor &array, ReferenceReductionOperation>, + const Descriptor &array, + ReferenceReductionOperation>, const char *source, int line, int dim, const Descriptor *mask = nullptr, - const std::complex *identity = nullptr, bool ordered = true); + const CppTypeFor *identity = nullptr, + bool ordered = true); void RTDECL(CppReduceComplex8DimValue)(Descriptor &result, - const Descriptor &array, ValueReductionOperation>, + const Descriptor &array, + ValueReductionOperation>, const char *source, int line, int dim, const Descriptor *mask = nullptr, - const std::complex *identity = nullptr, bool ordered = true); -#if LDBL_MANT_DIG == 64 -void RTDECL(CppReduceComplex10Ref)(std::complex &, - const Descriptor &, 
ReferenceReductionOperation>, + const CppTypeFor *identity = nullptr, + bool ordered = true); +#if HAS_FLOAT80 +void RTDECL(CppReduceComplex10Ref)(CppTypeFor &, + const Descriptor &, + ReferenceReductionOperation>, const char *source, int line, int dim = 0, const Descriptor *mask = nullptr, - const std::complex *identity = nullptr, bool ordered = true); -void RTDECL(CppReduceComplex10Value)(std::complex &, - const Descriptor &, ValueReductionOperation>, + const CppTypeFor *identity = nullptr, + bool ordered = true); +void RTDECL(CppReduceComplex10Value)(CppTypeFor &, + const Descriptor &, + ValueReductionOperation>, const char *source, int line, int dim = 0, const Descriptor *mask = nullptr, - const std::complex *identity = nullptr, bool ordered = true); + const CppTypeFor *identity = nullptr, + bool ordered = true); void RTDECL(CppReduceComplex10DimRef)(Descriptor &result, const Descriptor &array, - ReferenceReductionOperation>, const char *source, - int line, int dim, const Descriptor *mask = nullptr, - const std::complex *identity = nullptr, bool ordered = true); + ReferenceReductionOperation>, + const char *source, int line, int dim, const Descriptor *mask = nullptr, + const CppTypeFor *identity = nullptr, + bool ordered = true); void RTDECL(CppReduceComplex10DimValue)(Descriptor &result, - const Descriptor &array, ValueReductionOperation>, + const Descriptor &array, + ValueReductionOperation>, const char *source, int line, int dim, const Descriptor *mask = nullptr, - const std::complex *identity = nullptr, bool ordered = true); + const CppTypeFor *identity = nullptr, + bool ordered = true); #endif #if LDBL_MANT_DIG == 113 || HAS_FLOAT128 -void RTDECL(CppReduceComplex16Ref)(std::complex &, +void RTDECL(CppReduceComplex16Ref)(CppTypeFor &, const Descriptor &, - ReferenceReductionOperation>, + ReferenceReductionOperation>, const char *source, int line, int dim = 0, const Descriptor *mask = nullptr, - const std::complex *identity = nullptr, + const CppTypeFor *identity = nullptr, bool ordered = true); -void RTDECL(CppReduceComplex16Value)(std::complex &, - const Descriptor &, ValueReductionOperation>, +void RTDECL(CppReduceComplex16Value)(CppTypeFor &, + const Descriptor &, + ValueReductionOperation>, const char *source, int line, int dim = 0, const Descriptor *mask = nullptr, - const std::complex *identity = nullptr, + const CppTypeFor *identity = nullptr, bool ordered = true); void RTDECL(CppReduceComplex16DimRef)(Descriptor &result, const Descriptor &array, - ReferenceReductionOperation>, + ReferenceReductionOperation>, const char *source, int line, int dim, const Descriptor *mask = nullptr, - const std::complex *identity = nullptr, + const CppTypeFor *identity = nullptr, bool ordered = true); void RTDECL(CppReduceComplex16DimValue)(Descriptor &result, const Descriptor &array, - ValueReductionOperation>, const char *source, - int line, int dim, const Descriptor *mask = nullptr, - const std::complex *identity = nullptr, + ValueReductionOperation>, + const char *source, int line, int dim, const Descriptor *mask = nullptr, + const CppTypeFor *identity = nullptr, bool ordered = true); #endif diff --git a/flang/include/flang/Runtime/reduction.h b/flang/include/flang/Runtime/reduction.h index 97986c12e8a10e..7eafacee69d034 100644 --- a/flang/include/flang/Runtime/reduction.h +++ b/flang/include/flang/Runtime/reduction.h @@ -68,34 +68,35 @@ float RTDECL(SumReal4)(const Descriptor &, const char *source, int line, int dim = 0, const Descriptor *mask = nullptr); double RTDECL(SumReal8)(const 
Descriptor &, const char *source, int line, int dim = 0, const Descriptor *mask = nullptr); -#if LDBL_MANT_DIG == 64 -long double RTDECL(SumReal10)(const Descriptor &, const char *source, int line, - int dim = 0, const Descriptor *mask = nullptr); +#if HAS_FLOAT80 +CppTypeFor RTDECL(SumReal10)(const Descriptor &, + const char *source, int line, int dim = 0, + const Descriptor *mask = nullptr); #endif #if LDBL_MANT_DIG == 113 || HAS_FLOAT128 CppFloat128Type RTDECL(SumReal16)(const Descriptor &, const char *source, int line, int dim = 0, const Descriptor *mask = nullptr); #endif -void RTDECL(CppSumComplex2)(std::complex &, const Descriptor &, - const char *source, int line, int dim = 0, +void RTDECL(CppSumComplex2)(CppTypeFor &, + const Descriptor &, const char *source, int line, int dim = 0, const Descriptor *mask = nullptr); -void RTDECL(CppSumComplex3)(std::complex &, const Descriptor &, - const char *source, int line, int dim = 0, +void RTDECL(CppSumComplex3)(CppTypeFor &, + const Descriptor &, const char *source, int line, int dim = 0, const Descriptor *mask = nullptr); -void RTDECL(CppSumComplex4)(std::complex &, const Descriptor &, - const char *source, int line, int dim = 0, +void RTDECL(CppSumComplex4)(CppTypeFor &, + const Descriptor &, const char *source, int line, int dim = 0, const Descriptor *mask = nullptr); -void RTDECL(CppSumComplex8)(std::complex &, const Descriptor &, - const char *source, int line, int dim = 0, +void RTDECL(CppSumComplex8)(CppTypeFor &, + const Descriptor &, const char *source, int line, int dim = 0, const Descriptor *mask = nullptr); -#if LDBL_MANT_DIG == 64 -void RTDECL(CppSumComplex10)(std::complex &, const Descriptor &, - const char *source, int line, int dim = 0, +#if HAS_FLOAT80 +void RTDECL(CppSumComplex10)(CppTypeFor &, + const Descriptor &, const char *source, int line, int dim = 0, const Descriptor *mask = nullptr); #endif #if LDBL_MANT_DIG == 113 || HAS_FLOAT128 -void RTDECL(CppSumComplex16)(std::complex &, +void RTDECL(CppSumComplex16)(CppTypeFor &, const Descriptor &, const char *source, int line, int dim = 0, const Descriptor *mask = nullptr); #endif @@ -128,34 +129,35 @@ float RTDECL(ProductReal4)(const Descriptor &, const char *source, int line, int dim = 0, const Descriptor *mask = nullptr); double RTDECL(ProductReal8)(const Descriptor &, const char *source, int line, int dim = 0, const Descriptor *mask = nullptr); -#if LDBL_MANT_DIG == 64 -long double RTDECL(ProductReal10)(const Descriptor &, const char *source, - int line, int dim = 0, const Descriptor *mask = nullptr); +#if HAS_FLOAT80 +CppTypeFor RTDECL(ProductReal10)(const Descriptor &, + const char *source, int line, int dim = 0, + const Descriptor *mask = nullptr); #endif #if LDBL_MANT_DIG == 113 || HAS_FLOAT128 CppFloat128Type RTDECL(ProductReal16)(const Descriptor &, const char *source, int line, int dim = 0, const Descriptor *mask = nullptr); #endif -void RTDECL(CppProductComplex2)(std::complex &, const Descriptor &, - const char *source, int line, int dim = 0, +void RTDECL(CppProductComplex2)(CppTypeFor &, + const Descriptor &, const char *source, int line, int dim = 0, const Descriptor *mask = nullptr); -void RTDECL(CppProductComplex3)(std::complex &, const Descriptor &, - const char *source, int line, int dim = 0, +void RTDECL(CppProductComplex3)(CppTypeFor &, + const Descriptor &, const char *source, int line, int dim = 0, const Descriptor *mask = nullptr); -void RTDECL(CppProductComplex4)(std::complex &, const Descriptor &, - const char *source, int line, int dim = 0, +void 
RTDECL(CppProductComplex4)(CppTypeFor<TypeCategory::Complex, 4> &,
+    const Descriptor &, const char *source, int line, int dim = 0,
     const Descriptor *mask = nullptr);
-void RTDECL(CppProductComplex8)(std::complex<double> &, const Descriptor &,
-    const char *source, int line, int dim = 0,
+void RTDECL(CppProductComplex8)(CppTypeFor<TypeCategory::Complex, 8> &,
+    const Descriptor &, const char *source, int line, int dim = 0,
     const Descriptor *mask = nullptr);
-#if LDBL_MANT_DIG == 64
-void RTDECL(CppProductComplex10)(std::complex<long double> &,
+#if HAS_FLOAT80
+void RTDECL(CppProductComplex10)(CppTypeFor<TypeCategory::Complex, 10> &,
     const Descriptor &, const char *source, int line, int dim = 0,
     const Descriptor *mask = nullptr);
 #endif
 #if LDBL_MANT_DIG == 113 || HAS_FLOAT128
-void RTDECL(CppProductComplex16)(std::complex<CppFloat128Type> &,
+void RTDECL(CppProductComplex16)(CppTypeFor<TypeCategory::Complex, 16> &,
     const Descriptor &, const char *source, int line, int dim = 0,
     const Descriptor *mask = nullptr);
 #endif
@@ -307,9 +309,10 @@ float RTDECL(MaxvalReal4)(const Descriptor &, const char *source, int line,
     int dim = 0, const Descriptor *mask = nullptr);
 double RTDECL(MaxvalReal8)(const Descriptor &, const char *source, int line,
     int dim = 0, const Descriptor *mask = nullptr);
-#if LDBL_MANT_DIG == 64
-long double RTDECL(MaxvalReal10)(const Descriptor &, const char *source,
-    int line, int dim = 0, const Descriptor *mask = nullptr);
+#if HAS_FLOAT80
+CppTypeFor<TypeCategory::Real, 10> RTDECL(MaxvalReal10)(const Descriptor &,
+    const char *source, int line, int dim = 0,
+    const Descriptor *mask = nullptr);
 #endif
 #if LDBL_MANT_DIG == 113 || HAS_FLOAT128
 CppFloat128Type RTDECL(MaxvalReal16)(const Descriptor &, const char *source,
@@ -338,9 +341,10 @@ float RTDECL(MinvalReal4)(const Descriptor &, const char *source, int line,
     int dim = 0, const Descriptor *mask = nullptr);
 double RTDECL(MinvalReal8)(const Descriptor &, const char *source, int line,
     int dim = 0, const Descriptor *mask = nullptr);
-#if LDBL_MANT_DIG == 64
-long double RTDECL(MinvalReal10)(const Descriptor &, const char *source,
-    int line, int dim = 0, const Descriptor *mask = nullptr);
+#if HAS_FLOAT80
+CppTypeFor<TypeCategory::Real, 10> RTDECL(MinvalReal10)(const Descriptor &,
+    const char *source, int line, int dim = 0,
+    const Descriptor *mask = nullptr);
 #endif
 #if LDBL_MANT_DIG == 113 || HAS_FLOAT128
 CppFloat128Type RTDECL(MinvalReal16)(const Descriptor &, const char *source,
@@ -363,12 +367,12 @@ float RTDECL(Norm2_4)(
     const Descriptor &, const char *source, int line, int dim = 0);
 double RTDECL(Norm2_8)(
     const Descriptor &, const char *source, int line, int dim = 0);
-#if LDBL_MANT_DIG == 64
-long double RTDECL(Norm2_10)(
+#if HAS_FLOAT80
+CppTypeFor<TypeCategory::Real, 10> RTDECL(Norm2_10)(
     const Descriptor &, const char *source, int line, int dim = 0);
 #endif
 #if LDBL_MANT_DIG == 113 || HAS_FLOAT128
-long double RTDECL(Norm2_16)(
+CppFloat128Type RTDECL(Norm2_16)(
     const Descriptor &, const char *source, int line, int dim = 0);
 void RTDECL(Norm2DimReal16)(
     Descriptor &, const Descriptor &, int dim, const char *source, int line);
@@ -413,29 +417,33 @@ float RTDECL(DotProductReal4)(const Descriptor &, const Descriptor &,
     const char *source = nullptr, int line = 0);
 double RTDECL(DotProductReal8)(const Descriptor &, const Descriptor &,
     const char *source = nullptr, int line = 0);
-#if LDBL_MANT_DIG == 64
-long double RTDECL(DotProductReal10)(const Descriptor &, const Descriptor &,
-    const char *source = nullptr, int line = 0);
+#if HAS_FLOAT80
+CppTypeFor<TypeCategory::Real, 10> RTDECL(DotProductReal10)(const Descriptor &,
+    const Descriptor &, const char *source = nullptr, int line = 0);
 #endif
 #if LDBL_MANT_DIG == 113 || HAS_FLOAT128
CppFloat128Type RTDECL(DotProductReal16)(const Descriptor &, const Descriptor &, const char *source = nullptr, int line = 0); #endif -void RTDECL(CppDotProductComplex2)(std::complex &, const Descriptor &, - const Descriptor &, const char *source = nullptr, int line = 0); -void RTDECL(CppDotProductComplex3)(std::complex &, const Descriptor &, - const Descriptor &, const char *source = nullptr, int line = 0); -void RTDECL(CppDotProductComplex4)(std::complex &, const Descriptor &, - const Descriptor &, const char *source = nullptr, int line = 0); -void RTDECL(CppDotProductComplex8)(std::complex &, const Descriptor &, - const Descriptor &, const char *source = nullptr, int line = 0); -#if LDBL_MANT_DIG == 64 -void RTDECL(CppDotProductComplex10)(std::complex &, +void RTDECL(CppDotProductComplex2)(CppTypeFor &, + const Descriptor &, const Descriptor &, const char *source = nullptr, + int line = 0); +void RTDECL(CppDotProductComplex3)(CppTypeFor &, + const Descriptor &, const Descriptor &, const char *source = nullptr, + int line = 0); +void RTDECL(CppDotProductComplex4)(CppTypeFor &, + const Descriptor &, const Descriptor &, const char *source = nullptr, + int line = 0); +void RTDECL(CppDotProductComplex8)(CppTypeFor &, + const Descriptor &, const Descriptor &, const char *source = nullptr, + int line = 0); +#if HAS_FLOAT80 +void RTDECL(CppDotProductComplex10)(CppTypeFor &, const Descriptor &, const Descriptor &, const char *source = nullptr, int line = 0); #endif #if LDBL_MANT_DIG == 113 || HAS_FLOAT128 -void RTDECL(CppDotProductComplex16)(std::complex &, +void RTDECL(CppDotProductComplex16)(CppTypeFor &, const Descriptor &, const Descriptor &, const char *source = nullptr, int line = 0); #endif diff --git a/flang/include/flang/Runtime/transformational.h b/flang/include/flang/Runtime/transformational.h index a39b872f376a69..faeaa1baa39ae2 100644 --- a/flang/include/flang/Runtime/transformational.h +++ b/flang/include/flang/Runtime/transformational.h @@ -45,10 +45,12 @@ void RTDECL(BesselJn_4)(Descriptor &result, int32_t n1, int32_t n2, float x, void RTDECL(BesselJn_8)(Descriptor &result, int32_t n1, int32_t n2, double x, double bn2, double bn2_1, const char *sourceFile = nullptr, int line = 0); -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 void RTDECL(BesselJn_10)(Descriptor &result, int32_t n1, int32_t n2, - long double x, long double bn2, long double bn2_1, - const char *sourceFile = nullptr, int line = 0); + CppTypeFor x, + CppTypeFor bn2, + CppTypeFor bn2_1, const char *sourceFile = nullptr, + int line = 0); #endif #if LDBL_MANT_DIG == 113 || HAS_FLOAT128 @@ -69,7 +71,7 @@ void RTDECL(BesselJnX0_4)(Descriptor &result, int32_t n1, int32_t n2, void RTDECL(BesselJnX0_8)(Descriptor &result, int32_t n1, int32_t n2, const char *sourceFile = nullptr, int line = 0); -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 void RTDECL(BesselJnX0_10)(Descriptor &result, int32_t n1, int32_t n2, const char *sourceFile = nullptr, int line = 0); #endif @@ -91,10 +93,12 @@ void RTDECL(BesselYn_4)(Descriptor &result, int32_t n1, int32_t n2, float x, void RTDECL(BesselYn_8)(Descriptor &result, int32_t n1, int32_t n2, double x, double bn1, double bn1_1, const char *sourceFile = nullptr, int line = 0); -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 void RTDECL(BesselYn_10)(Descriptor &result, int32_t n1, int32_t n2, - long double x, long double bn1, long double bn1_1, - const char *sourceFile = nullptr, int line = 0); + CppTypeFor x, + CppTypeFor bn1, + CppTypeFor bn1_1, const char *sourceFile = nullptr, + int line = 0); #endif #if 
LDBL_MANT_DIG == 113 || HAS_FLOAT128 @@ -115,7 +119,7 @@ void RTDECL(BesselYnX0_4)(Descriptor &result, int32_t n1, int32_t n2, void RTDECL(BesselYnX0_8)(Descriptor &result, int32_t n1, int32_t n2, const char *sourceFile = nullptr, int line = 0); -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 void RTDECL(BesselYnX0_10)(Descriptor &result, int32_t n1, int32_t n2, const char *sourceFile = nullptr, int line = 0); #endif diff --git a/flang/runtime/complex-powi.cpp b/flang/runtime/complex-powi.cpp index 77031e40242791..d7a63724b96c8f 100644 --- a/flang/runtime/complex-powi.cpp +++ b/flang/runtime/complex-powi.cpp @@ -7,11 +7,13 @@ * ===-----------------------------------------------------------------------=== */ #include "flang/Common/float128.h" +#include "flang/Runtime/cpp-type.h" #include "flang/Runtime/entry-names.h" #include #include #include +namespace Fortran::runtime { #ifdef __clang_major__ #pragma clang diagnostic ignored "-Wc99-extensions" #endif @@ -114,35 +116,35 @@ extern "C" Qcomplex RTNAME(cqpowk)(Qcomplex base, std::int64_t exp) { // MSVC doesn't allow including or in C++17 mode to get // the Windows definitions of these structs so just redefine here. struct Fcomplex { - float re; - float im; + CppTypeFor re; + CppTypeFor im; }; struct Dcomplex { - double re; - double im; + CppTypeFor re; + CppTypeFor im; }; extern "C" Fcomplex RTNAME(cpowi)(Fcomplex base, std::int32_t exp) { - auto cppbase = *(std::complex *)(&base); + auto cppbase = *(CppTypeFor *)(&base); auto cppres = tgpowi(cppbase, exp); return *(Fcomplex *)(&cppres); } extern "C" Dcomplex RTNAME(zpowi)(Dcomplex base, std::int32_t exp) { - auto cppbase = *(std::complex *)(&base); + auto cppbase = *(CppTypeFor *)(&base); auto cppres = tgpowi(cppbase, exp); return *(Dcomplex *)(&cppres); } extern "C" Fcomplex RTNAME(cpowk)(Fcomplex base, std::int64_t exp) { - auto cppbase = *(std::complex *)(&base); + auto cppbase = *(CppTypeFor *)(&base); auto cppres = tgpowi(cppbase, exp); return *(Fcomplex *)(&cppres); } extern "C" Dcomplex RTNAME(zpowk)(Dcomplex base, std::int64_t exp) { - auto cppbase = *(std::complex *)(&base); + auto cppbase = *(CppTypeFor *)(&base); auto cppres = tgpowi(cppbase, exp); return *(Dcomplex *)(&cppres); } @@ -154,15 +156,16 @@ struct Qcomplex { }; extern "C" Dcomplex RTNAME(cqpowi)(Qcomplex base, std::int32_t exp) { - auto cppbase = *(std::complex *)(&base); + auto cppbase = *(rtcmplx::complex *)(&base); auto cppres = tgpowi(cppbase, exp); return *(Qcomplex *)(&cppres); } extern "C" Dcomplex RTNAME(cqpowk)(Qcomplex base, std::int64_t exp) { - auto cppbase = *(std::complex *)(&base); + auto cppbase = *(rtcmplx::complex *)(&base); auto cppres = tgpowi(cppbase, exp); return *(Qcomplex *)(&cppres); } #endif #endif +} // namespace Fortran::runtime diff --git a/flang/runtime/complex-reduction.c b/flang/runtime/complex-reduction.c index 37ce3fa410016b..232c5452488f1a 100644 --- a/flang/runtime/complex-reduction.c +++ b/flang/runtime/complex-reduction.c @@ -119,7 +119,7 @@ ADAPT_REDUCTION(SumComplex4, float_Complex_t, CppComplexFloat, CMPLXF, REDUCTION_ARGS, REDUCTION_ARG_NAMES) ADAPT_REDUCTION(SumComplex8, double_Complex_t, CppComplexDouble, CMPLX, REDUCTION_ARGS, REDUCTION_ARG_NAMES) -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 ADAPT_REDUCTION(SumComplex10, long_double_Complex_t, CppComplexLongDouble, CMPLXL, REDUCTION_ARGS, REDUCTION_ARG_NAMES) #endif @@ -133,7 +133,7 @@ ADAPT_REDUCTION(ProductComplex4, float_Complex_t, CppComplexFloat, CMPLXF, REDUCTION_ARGS, REDUCTION_ARG_NAMES) ADAPT_REDUCTION(ProductComplex8, 
double_Complex_t, CppComplexDouble, CMPLX, REDUCTION_ARGS, REDUCTION_ARG_NAMES) -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 ADAPT_REDUCTION(ProductComplex10, long_double_Complex_t, CppComplexLongDouble, CMPLXL, REDUCTION_ARGS, REDUCTION_ARG_NAMES) #endif @@ -147,7 +147,7 @@ ADAPT_REDUCTION(DotProductComplex4, float_Complex_t, CppComplexFloat, CMPLXF, DOT_PRODUCT_ARGS, DOT_PRODUCT_ARG_NAMES) ADAPT_REDUCTION(DotProductComplex8, double_Complex_t, CppComplexDouble, CMPLX, DOT_PRODUCT_ARGS, DOT_PRODUCT_ARG_NAMES) -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 ADAPT_REDUCTION(DotProductComplex10, long_double_Complex_t, CppComplexLongDouble, CMPLXL, DOT_PRODUCT_ARGS, DOT_PRODUCT_ARG_NAMES) #endif @@ -173,7 +173,7 @@ ADAPT_REDUCTION(ReduceComplex8Ref, double_Complex_t, CppComplexDouble, CMPLX, ADAPT_REDUCTION(ReduceComplex8Value, double_Complex_t, CppComplexDouble, CMPLX, RARGS, REDUCE_ARG_NAMES) #undef RARGS -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 #define RARGS REDUCE_ARGS(long_double_Complex_t, long_double_Complex_t_ref_op) ADAPT_REDUCTION(ReduceComplex10Ref, long_double_Complex_t, CppComplexLongDouble, CMPLXL, RARGS, REDUCE_ARG_NAMES) diff --git a/flang/runtime/dot-product.cpp b/flang/runtime/dot-product.cpp index 977698269bcb46..aafef379fad43c 100644 --- a/flang/runtime/dot-product.cpp +++ b/flang/runtime/dot-product.cpp @@ -21,11 +21,6 @@ namespace Fortran::runtime { // Beware: DOT_PRODUCT of COMPLEX data uses the complex conjugate of the first // argument; MATMUL does not. -// Suppress the warnings about calling __host__-only std::complex operators, -// defined in C++ STD header files, from __device__ code. -RT_DIAG_PUSH -RT_DIAG_DISABLE_CALL_HOST_FROM_DEVICE_WARN - // General accumulator for any type and stride; this is not used for // contiguous numeric vectors. template @@ -42,7 +37,7 @@ class Accumulator { const XT &xElement{*x_.Element(&xAt)}; const YT &yElement{*y_.Element(&yAt)}; if constexpr (RCAT == TypeCategory::Complex) { - sum_ += std::conj(static_cast(xElement)) * + sum_ += rtcmplx::conj(static_cast(xElement)) * static_cast(yElement); } else { sum_ += static_cast(xElement) * static_cast(yElement); @@ -77,9 +72,9 @@ static inline RT_API_ATTRS CppTypeFor DoDotProduct( // TODO: call BLAS-1 SDOT or SDSDOT } else if constexpr (std::is_same_v) { // TODO: call BLAS-1 DDOT - } else if constexpr (std::is_same_v>) { + } else if constexpr (std::is_same_v>) { // TODO: call BLAS-1 CDOTC - } else if constexpr (std::is_same_v>) { + } else if constexpr (std::is_same_v>) { // TODO: call BLAS-1 ZDOTC } } @@ -89,12 +84,12 @@ static inline RT_API_ATTRS CppTypeFor DoDotProduct( AccumType accum{}; if constexpr (RCAT == TypeCategory::Complex) { for (SubscriptValue j{0}; j < n; ++j) { - // std::conj() may instantiate its argument twice, + // conj() may instantiate its argument twice, // so xp has to be incremented separately. 
// This is a workaround for an alleged bug in clang, // that shows up as: // warning: multiple unsequenced modifications to 'xp' - accum += std::conj(static_cast(*xp)) * + accum += rtcmplx::conj(static_cast(*xp)) * static_cast(*yp++); xp++; } @@ -117,8 +112,6 @@ static inline RT_API_ATTRS CppTypeFor DoDotProduct( return static_cast(accumulator.GetResult()); } -RT_DIAG_POP - template struct DotProduct { using Result = CppTypeFor; template struct DP1 { @@ -197,7 +190,7 @@ CppTypeFor RTDEF(DotProductReal8)( const Descriptor &x, const Descriptor &y, const char *source, int line) { return DotProduct{}(x, y, source, line); } -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 CppTypeFor RTDEF(DotProductReal10)( const Descriptor &x, const Descriptor &y, const char *source, int line) { return DotProduct{}(x, y, source, line); @@ -218,7 +211,7 @@ void RTDEF(CppDotProductComplex8)(CppTypeFor &result, const Descriptor &x, const Descriptor &y, const char *source, int line) { result = DotProduct{}(x, y, source, line); } -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 void RTDEF(CppDotProductComplex10)( CppTypeFor &result, const Descriptor &x, const Descriptor &y, const char *source, int line) { diff --git a/flang/runtime/extrema.cpp b/flang/runtime/extrema.cpp index d6e9633372f524..2658709b7de86b 100644 --- a/flang/runtime/extrema.cpp +++ b/flang/runtime/extrema.cpp @@ -236,7 +236,7 @@ void RTDEF(MaxlocReal8)(Descriptor &result, const Descriptor &x, int kind, TotalNumericMaxOrMinLoc( "MAXLOC", result, x, kind, source, line, mask, back); } -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 void RTDEF(MaxlocReal10)(Descriptor &result, const Descriptor &x, int kind, const char *source, int line, const Descriptor *mask, bool back) { TotalNumericMaxOrMinLoc( @@ -292,7 +292,7 @@ void RTDEF(MinlocReal8)(Descriptor &result, const Descriptor &x, int kind, TotalNumericMaxOrMinLoc( "MINLOC", result, x, kind, source, line, mask, back); } -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 void RTDEF(MinlocReal10)(Descriptor &result, const Descriptor &x, int kind, const char *source, int line, const Descriptor *mask, bool back) { TotalNumericMaxOrMinLoc( @@ -614,7 +614,7 @@ CppTypeFor RTDEF(MaxvalReal8)(const Descriptor &x, return TotalNumericMaxOrMin( x, source, line, dim, mask, "MAXVAL"); } -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 CppTypeFor RTDEF(MaxvalReal10)(const Descriptor &x, const char *source, int line, int dim, const Descriptor *mask) { return TotalNumericMaxOrMin( @@ -674,7 +674,7 @@ CppTypeFor RTDEF(MinvalReal8)(const Descriptor &x, return TotalNumericMaxOrMin( x, source, line, dim, mask, "MINVAL"); } -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 CppTypeFor RTDEF(MinvalReal10)(const Descriptor &x, const char *source, int line, int dim, const Descriptor *mask) { return TotalNumericMaxOrMin( @@ -730,7 +730,7 @@ CppTypeFor RTDEF(Norm2_8)( return GetTotalReduction( x, source, line, dim, nullptr, Norm2Accumulator<8>{x}, "NORM2"); } -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 CppTypeFor RTDEF(Norm2_10)( const Descriptor &x, const char *source, int line, int dim) { return GetTotalReduction( diff --git a/flang/runtime/matmul-transpose.cpp b/flang/runtime/matmul-transpose.cpp index 283472650a1c69..bafa05056bebc4 100644 --- a/flang/runtime/matmul-transpose.cpp +++ b/flang/runtime/matmul-transpose.cpp @@ -32,11 +32,6 @@ namespace { using namespace Fortran::runtime; -// Suppress the warnings about calling __host__-only std::complex operators, -// defined in C++ STD header files, from __device__ code. 
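The RT_DIAG_PUSH / RT_DIAG_DISABLE_CALL_HOST_FROM_DEVICE_WARN pairs deleted in these files were only ever needed because std::complex arithmetic is host-only under CUDA. A minimal sketch of why a runtime-owned complex type (rtcmplx::complex in these patches) makes the suppression moot; the attribute macro below illustrates the idea behind RT_API_ATTRS and is not its actual definition:

// Annotate the operators for host and device, and device code can call them
// without tripping the host-from-device diagnostic.
#if defined(__CUDACC__) || defined(__HIPCC__)
#define SKETCH_API_ATTRS __host__ __device__
#else
#define SKETCH_API_ATTRS
#endif

template <typename T> struct SketchComplex {
  T re{}, im{};
  SKETCH_API_ATTRS SketchComplex operator*(const SketchComplex &o) const {
    return {re * o.re - im * o.im, re * o.im + im * o.re};
  }
  SKETCH_API_ATTRS SketchComplex &operator+=(const SketchComplex &o) {
    re += o.re;
    im += o.im;
    return *this;
  }
};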
-RT_DIAG_PUSH -RT_DIAG_DISABLE_CALL_HOST_FROM_DEVICE_WARN - // Contiguous numeric TRANSPOSE(matrix)*matrix multiplication // TRANSPOSE(matrix(n, rows)) * matrix(n,cols) -> // matrix(rows, n) * matrix(n,cols) -> matrix(rows,cols) @@ -91,8 +86,6 @@ inline static RT_API_ATTRS void MatrixTransposedTimesMatrix( } } -RT_DIAG_POP - template inline static RT_API_ATTRS void MatrixTransposedTimesMatrixHelper( CppTypeFor *RESTRICT product, SubscriptValue rows, @@ -118,9 +111,6 @@ inline static RT_API_ATTRS void MatrixTransposedTimesMatrixHelper( } } -RT_DIAG_PUSH -RT_DIAG_DISABLE_CALL_HOST_FROM_DEVICE_WARN - // Contiguous numeric matrix*vector multiplication // matrix(rows,n) * column vector(n) -> column vector(rows) // Straightforward algorithm: @@ -158,8 +148,6 @@ inline static RT_API_ATTRS void MatrixTransposedTimesVector( } } -RT_DIAG_POP - template inline static RT_API_ATTRS void MatrixTransposedTimesVectorHelper( CppTypeFor *RESTRICT product, SubscriptValue rows, @@ -174,9 +162,6 @@ inline static RT_API_ATTRS void MatrixTransposedTimesVectorHelper( } } -RT_DIAG_PUSH -RT_DIAG_DISABLE_CALL_HOST_FROM_DEVICE_WARN - // Implements an instance of MATMUL for given argument types. template @@ -341,8 +326,6 @@ inline static RT_API_ATTRS void DoMatmulTranspose( } } -RT_DIAG_POP - template struct MatmulTransposeHelper { diff --git a/flang/runtime/matmul.cpp b/flang/runtime/matmul.cpp index 252557e2f9e7ad..a5737a9bc62075 100644 --- a/flang/runtime/matmul.cpp +++ b/flang/runtime/matmul.cpp @@ -31,11 +31,6 @@ namespace { using namespace Fortran::runtime; -// Suppress the warnings about calling __host__-only std::complex operators, -// defined in C++ STD header files, from __device__ code. -RT_DIAG_PUSH -RT_DIAG_DISABLE_CALL_HOST_FROM_DEVICE_WARN - // General accumulator for any type and stride; this is not used for // contiguous numeric cases. template @@ -112,8 +107,6 @@ inline RT_API_ATTRS void MatrixTimesMatrix( } } -RT_DIAG_POP - template inline RT_API_ATTRS void MatrixTimesMatrixHelper( CppTypeFor *RESTRICT product, SubscriptValue rows, @@ -139,9 +132,6 @@ inline RT_API_ATTRS void MatrixTimesMatrixHelper( } } -RT_DIAG_PUSH -RT_DIAG_DISABLE_CALL_HOST_FROM_DEVICE_WARN - // Contiguous numeric matrix*vector multiplication // matrix(rows,n) * column vector(n) -> column vector(rows) // Straightforward algorithm: @@ -179,8 +169,6 @@ inline RT_API_ATTRS void MatrixTimesVector( } } -RT_DIAG_POP - template inline RT_API_ATTRS void MatrixTimesVectorHelper( CppTypeFor *RESTRICT product, SubscriptValue rows, @@ -194,9 +182,6 @@ inline RT_API_ATTRS void MatrixTimesVectorHelper( } } -RT_DIAG_PUSH -RT_DIAG_DISABLE_CALL_HOST_FROM_DEVICE_WARN - // Contiguous numeric vector*matrix multiplication // row vector(n) * matrix(n,cols) -> row vector(cols) // Straightforward algorithm: @@ -235,8 +220,6 @@ inline RT_API_ATTRS void VectorTimesMatrix( } } -RT_DIAG_POP - template inline RT_API_ATTRS void VectorTimesMatrixHelper( @@ -251,9 +234,6 @@ inline RT_API_ATTRS void VectorTimesMatrixHelper( } } -RT_DIAG_PUSH -RT_DIAG_DISABLE_CALL_HOST_FROM_DEVICE_WARN - // Implements an instance of MATMUL for given argument types. template @@ -344,9 +324,9 @@ static inline RT_API_ATTRS void DoMatmul( // TODO: try using CUTLASS for device. 
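(For context on this dispatch chain: it selects a BLAS entry point purely from the element type, so once the accumulators hold rtcmplx::complex the is_same_v checks must name that type too, which is all the +/- churn below amounts to. A hedged sketch of the pattern, with invented names rather than flang or BLAS symbols:)

#include <complex>
#include <type_traits>

namespace rtcmplx_sketch {
template <typename T> using complex = std::complex<T>; // stand-in for the runtime's complex
}

template <typename T> constexpr const char *SketchGemmName() {
  if constexpr (std::is_same_v<T, float>) {
    return "sgemm";
  } else if constexpr (std::is_same_v<T, double>) {
    return "dgemm";
  } else if constexpr (std::is_same_v<T, rtcmplx_sketch::complex<float>>) {
    return "cgemm";
  } else if constexpr (std::is_same_v<T, rtcmplx_sketch::complex<double>>) {
    return "zgemm";
  } else {
    return nullptr; // no BLAS fast path; the generic accumulator loop runs instead
  }
}

static_assert(SketchGemmName<double>()[0] == 'd');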
} else if constexpr (std::is_same_v) { // TODO: call BLAS-3 DGEMM - } else if constexpr (std::is_same_v>) { + } else if constexpr (std::is_same_v>) { // TODO: call BLAS-3 CGEMM - } else if constexpr (std::is_same_v>) { + } else if constexpr (std::is_same_v>) { // TODO: call BLAS-3 ZGEMM } } @@ -361,9 +341,9 @@ static inline RT_API_ATTRS void DoMatmul( // TODO: call BLAS-2 SGEMV(x,y) } else if constexpr (std::is_same_v) { // TODO: call BLAS-2 DGEMV(x,y) - } else if constexpr (std::is_same_v>) { + } else if constexpr (std::is_same_v>) { // TODO: call BLAS-2 CGEMV(x,y) - } else if constexpr (std::is_same_v>) { + } else if constexpr (std::is_same_v>) { // TODO: call BLAS-2 ZGEMV(x,y) } } @@ -377,9 +357,9 @@ static inline RT_API_ATTRS void DoMatmul( // TODO: call BLAS-2 SGEMV(y,x) } else if constexpr (std::is_same_v) { // TODO: call BLAS-2 DGEMV(y,x) - } else if constexpr (std::is_same_v>) { + } else if constexpr (std::is_same_v>) { // TODO: call BLAS-2 CGEMV(y,x) - } else if constexpr (std::is_same_v>) { + } else if constexpr (std::is_same_v>) { // TODO: call BLAS-2 ZGEMV(y,x) } } @@ -441,8 +421,6 @@ static inline RT_API_ATTRS void DoMatmul( } } -RT_DIAG_POP - template struct MatmulHelper { diff --git a/flang/runtime/numeric.cpp b/flang/runtime/numeric.cpp index 9a8ddc6615564d..23f8da3f81f176 100644 --- a/flang/runtime/numeric.cpp +++ b/flang/runtime/numeric.cpp @@ -144,7 +144,7 @@ inline RT_API_ATTRS CppTypeFor SelectedRealKind( #ifdef FLANG_RUNTIME_NO_REAL_3 mask &= ~(1 << 3); #endif -#if LDBL_MANT_DIG < 64 || defined FLANG_RUNTIME_NO_REAL_10 +#if !HAS_FLOAT80 || defined FLANG_RUNTIME_NO_REAL_10 mask &= ~(1 << 10); #endif #if LDBL_MANT_DIG < 64 || defined FLANG_RUNTIME_NO_REAL_16 @@ -276,7 +276,7 @@ CppTypeFor RTDEF(Ceiling8_16)( return Ceiling>(x); } #endif -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 CppTypeFor RTDEF(Ceiling10_1)( CppTypeFor x) { return Ceiling>(x); @@ -332,7 +332,7 @@ CppTypeFor RTDEF(ErfcScaled8)( CppTypeFor x) { return ErfcScaled(x); } -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 CppTypeFor RTDEF(ErfcScaled10)( CppTypeFor x) { return ErfcScaled(x); @@ -361,7 +361,7 @@ CppTypeFor RTDEF(Exponent8_8)( CppTypeFor x) { return Exponent>(x); } -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 CppTypeFor RTDEF(Exponent10_4)( CppTypeFor x) { return Exponent>(x); @@ -416,7 +416,7 @@ CppTypeFor RTDEF(Floor8_16)( return Floor>(x); } #endif -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 CppTypeFor RTDEF(Floor10_1)( CppTypeFor x) { return Floor>(x); @@ -472,7 +472,7 @@ CppTypeFor RTDEF(Fraction8)( CppTypeFor x) { return Fraction(x); } -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 CppTypeFor RTDEF(Fraction10)( CppTypeFor x) { return Fraction(x); @@ -485,7 +485,7 @@ bool RTDEF(IsFinite4)(CppTypeFor x) { bool RTDEF(IsFinite8)(CppTypeFor x) { return std::isfinite(x); } -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 bool RTDEF(IsFinite10)(CppTypeFor x) { return std::isfinite(x); } @@ -501,7 +501,7 @@ bool RTDEF(IsNaN4)(CppTypeFor x) { bool RTDEF(IsNaN8)(CppTypeFor x) { return std::isnan(x); } -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 bool RTDEF(IsNaN10)(CppTypeFor x) { return std::isnan(x); } @@ -553,7 +553,7 @@ CppTypeFor RTDEF(ModReal8)( const char *sourceFile, int sourceLine) { return RealMod(x, p, sourceFile, sourceLine); } -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 CppTypeFor RTDEF(ModReal10)( CppTypeFor x, CppTypeFor p, const char *sourceFile, int sourceLine) { @@ -603,7 +603,7 @@ CppTypeFor RTDEF(ModuloReal8)( const char *sourceFile, int sourceLine) { return RealMod(x, p, sourceFile, sourceLine); } -#if 
LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 CppTypeFor RTDEF(ModuloReal10)( CppTypeFor x, CppTypeFor p, const char *sourceFile, int sourceLine) { @@ -619,7 +619,7 @@ CppTypeFor RTDEF(Nearest8)( CppTypeFor x, bool positive) { return Nearest<53>(x, positive); } -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 CppTypeFor RTDEF(Nearest10)( CppTypeFor x, bool positive) { return Nearest<64>(x, positive); @@ -670,7 +670,7 @@ CppTypeFor RTDEF(Nint8_16)( return Nint>(x); } #endif -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 CppTypeFor RTDEF(Nint10_1)( CppTypeFor x) { return Nint>(x); @@ -726,7 +726,7 @@ CppTypeFor RTDEF(RRSpacing8)( CppTypeFor x) { return RRSpacing<53>(x); } -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 CppTypeFor RTDEF(RRSpacing10)( CppTypeFor x) { return RRSpacing<64>(x); @@ -741,7 +741,7 @@ CppTypeFor RTDEF(SetExponent8)( CppTypeFor x, std::int64_t p) { return SetExponent(x, p); } -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 CppTypeFor RTDEF(SetExponent10)( CppTypeFor x, std::int64_t p) { return SetExponent(x, p); @@ -756,7 +756,7 @@ CppTypeFor RTDEF(Scale8)( CppTypeFor x, std::int64_t p) { return Scale(x, p); } -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 CppTypeFor RTDEF(Scale10)( CppTypeFor x, std::int64_t p) { return Scale(x, p); @@ -876,7 +876,7 @@ CppTypeFor RTDEF(Spacing8)( CppTypeFor x) { return Spacing<53>(x); } -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 CppTypeFor RTDEF(Spacing10)( CppTypeFor x) { return Spacing<64>(x); @@ -893,7 +893,7 @@ CppTypeFor RTDEF(FPow8i)( CppTypeFor e) { return FPowI(b, e); } -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 CppTypeFor RTDEF(FPow10i)( CppTypeFor b, CppTypeFor e) { @@ -918,7 +918,7 @@ CppTypeFor RTDEF(FPow8k)( CppTypeFor e) { return FPowI(b, e); } -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 CppTypeFor RTDEF(FPow10k)( CppTypeFor b, CppTypeFor e) { diff --git a/flang/runtime/product.cpp b/flang/runtime/product.cpp index 7fc0fcd3b107de..39b40d82b05401 100644 --- a/flang/runtime/product.cpp +++ b/flang/runtime/product.cpp @@ -36,16 +36,11 @@ template class NonComplexProductAccumulator { INTERMEDIATE product_{1}; }; -// Suppress the warnings about calling __host__-only std::complex operators, -// defined in C++ STD header files, from __device__ code. 
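For reference, the shape of the accumulator this hunk rewrites: PRODUCT over a complex array starts from the identity (1, 0) and folds elements in. The names below are illustrative, std::complex stands in for the runtime's complex type, and the real ComplexProductAccumulator additionally walks a Descriptor and can split the result into parts:

#include <complex>

template <typename PART> class SketchProductAccumulator {
public:
  // PRODUCT's identity element is one, not zero.
  void Reinitialize() { product_ = {1, 0}; }
  void Accumulate(const std::complex<PART> &x) { product_ *= x; }
  std::complex<PART> Result() const { return product_; }

private:
  std::complex<PART> product_{1, 0};
};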
-RT_DIAG_PUSH -RT_DIAG_DISABLE_CALL_HOST_FROM_DEVICE_WARN - template class ComplexProductAccumulator { public: explicit RT_API_ATTRS ComplexProductAccumulator(const Descriptor &array) : array_{array} {} - RT_API_ATTRS void Reinitialize() { product_ = std::complex{1, 0}; } + RT_API_ATTRS void Reinitialize() { product_ = rtcmplx::complex{1, 0}; } template RT_API_ATTRS void GetResult(A *p, int /*zeroBasedDim*/ = -1) const { using ResultPart = typename A::value_type; @@ -60,11 +55,9 @@ template class ComplexProductAccumulator { private: const Descriptor &array_; - std::complex product_{1, 0}; + rtcmplx::complex product_{1, 0}; }; -RT_DIAG_POP - extern "C" { RT_EXT_API_GROUP_BEGIN @@ -116,7 +109,7 @@ CppTypeFor RTDEF(ProductReal8)(const Descriptor &x, NonComplexProductAccumulator>{x}, "PRODUCT"); } -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 CppTypeFor RTDEF(ProductReal10)(const Descriptor &x, const char *source, int line, int dim, const Descriptor *mask) { return GetTotalReduction(x, source, line, dim, mask, @@ -147,7 +140,7 @@ void RTDEF(CppProductComplex8)(CppTypeFor &result, mask, ComplexProductAccumulator>{x}, "PRODUCT"); } -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 void RTDEF(CppProductComplex10)(CppTypeFor &result, const Descriptor &x, const char *source, int line, int dim, const Descriptor *mask) { diff --git a/flang/runtime/random.cpp b/flang/runtime/random.cpp index 69de9b8c96fb5d..9ec961fd058745 100644 --- a/flang/runtime/random.cpp +++ b/flang/runtime/random.cpp @@ -66,7 +66,7 @@ void RTNAME(RandomNumber)( return; case 10: if constexpr (HasCppTypeFor) { -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 Generate, 64>(harvest); return; #endif diff --git a/flang/runtime/reduce.cpp b/flang/runtime/reduce.cpp index 2f4bb6ea159cf4..6b62e1cf1e76f1 100644 --- a/flang/runtime/reduce.cpp +++ b/flang/runtime/reduce.cpp @@ -395,45 +395,49 @@ void RTDEF(ReduceReal8DimValue)(Descriptor &result, const Descriptor &array, PartialReduction(result, array, array.ElementBytes(), dim, mask, terminator, "REDUCE", accumulator); } -#if LDBL_MANT_DIG == 64 -long double RTDEF(ReduceReal10Ref)(const Descriptor &array, - ReferenceReductionOperation operation, const char *source, - int line, int dim, const Descriptor *mask, const long double *identity, - bool ordered) { +#if HAS_FLOAT80 +CppTypeFor RTDEF(ReduceReal10Ref)( + const Descriptor &array, + ReferenceReductionOperation> operation, + const char *source, int line, int dim, const Descriptor *mask, + const CppTypeFor *identity, bool ordered) { Terminator terminator{source, line}; return GetTotalReduction(array, source, line, dim, mask, - ReduceAccumulator{ + ReduceAccumulator, false>{ array, operation, identity, terminator}, "REDUCE"); } -long double RTDEF(ReduceReal10Value)(const Descriptor &array, - ValueReductionOperation operation, const char *source, - int line, int dim, const Descriptor *mask, const long double *identity, - bool ordered) { +CppTypeFor RTDEF(ReduceReal10Value)( + const Descriptor &array, + ValueReductionOperation> operation, + const char *source, int line, int dim, const Descriptor *mask, + const CppTypeFor *identity, bool ordered) { Terminator terminator{source, line}; return GetTotalReduction(array, source, line, dim, mask, - ReduceAccumulator{ + ReduceAccumulator, true>{ array, operation, identity, terminator}, "REDUCE"); } void RTDEF(ReduceReal10DimRef)(Descriptor &result, const Descriptor &array, - ReferenceReductionOperation operation, const char *source, - int line, int dim, const Descriptor *mask, const long double *identity, - bool 
ordered) { + ReferenceReductionOperation> operation, + const char *source, int line, int dim, const Descriptor *mask, + const CppTypeFor *identity, bool ordered) { Terminator terminator{source, line}; - using Accumulator = ReduceAccumulator; + using Accumulator = + ReduceAccumulator, false>; Accumulator accumulator{array, operation, identity, terminator}; PartialReduction(result, array, array.ElementBytes(), dim, mask, terminator, "REDUCE", accumulator); } void RTDEF(ReduceReal10DimValue)(Descriptor &result, const Descriptor &array, - ValueReductionOperation operation, const char *source, - int line, int dim, const Descriptor *mask, const long double *identity, - bool ordered) { + ValueReductionOperation> operation, + const char *source, int line, int dim, const Descriptor *mask, + const CppTypeFor *identity, bool ordered) { Terminator terminator{source, line}; - using Accumulator = ReduceAccumulator; + using Accumulator = + ReduceAccumulator, true>; Accumulator accumulator{array, operation, identity, terminator}; PartialReduction(result, array, array.ElementBytes(), dim, mask, terminator, "REDUCE", accumulator); @@ -484,187 +488,199 @@ void RTDEF(ReduceReal16DimValue)(Descriptor &result, const Descriptor &array, } #endif -void RTDEF(CppReduceComplex4Ref)(std::complex &result, +void RTDEF(CppReduceComplex4Ref)(CppTypeFor &result, const Descriptor &array, - ReferenceReductionOperation> operation, + ReferenceReductionOperation> operation, const char *source, int line, int dim, const Descriptor *mask, - const std::complex *identity, bool ordered) { + const CppTypeFor *identity, bool ordered) { Terminator terminator{source, line}; result = GetTotalReduction(array, source, line, dim, mask, - ReduceAccumulator, false>{ + ReduceAccumulator, false>{ array, operation, identity, terminator}, "REDUCE"); } -void RTDEF(CppReduceComplex4Value)(std::complex &result, +void RTDEF(CppReduceComplex4Value)(CppTypeFor &result, const Descriptor &array, - ValueReductionOperation> operation, const char *source, - int line, int dim, const Descriptor *mask, - const std::complex *identity, bool ordered) { + ValueReductionOperation> operation, + const char *source, int line, int dim, const Descriptor *mask, + const CppTypeFor *identity, bool ordered) { Terminator terminator{source, line}; result = GetTotalReduction(array, source, line, dim, mask, - ReduceAccumulator, true>{ + ReduceAccumulator, true>{ array, operation, identity, terminator}, "REDUCE"); } void RTDEF(CppReduceComplex4DimRef)(Descriptor &result, const Descriptor &array, - ReferenceReductionOperation> operation, + ReferenceReductionOperation> operation, const char *source, int line, int dim, const Descriptor *mask, - const std::complex *identity, bool ordered) { + const CppTypeFor *identity, bool ordered) { Terminator terminator{source, line}; - using Accumulator = ReduceAccumulator, false>; + using Accumulator = + ReduceAccumulator, false>; Accumulator accumulator{array, operation, identity, terminator}; PartialReduction(result, array, array.ElementBytes(), dim, mask, terminator, "REDUCE", accumulator); } void RTDEF(CppReduceComplex4DimValue)(Descriptor &result, const Descriptor &array, - ValueReductionOperation> operation, const char *source, - int line, int dim, const Descriptor *mask, - const std::complex *identity, bool ordered) { + ValueReductionOperation> operation, + const char *source, int line, int dim, const Descriptor *mask, + const CppTypeFor *identity, bool ordered) { Terminator terminator{source, line}; - using Accumulator = 
ReduceAccumulator, true>; + using Accumulator = + ReduceAccumulator, true>; Accumulator accumulator{array, operation, identity, terminator}; PartialReduction(result, array, array.ElementBytes(), dim, mask, terminator, "REDUCE", accumulator); } -void RTDEF(CppReduceComplex8Ref)(std::complex &result, +void RTDEF(CppReduceComplex8Ref)(CppTypeFor &result, const Descriptor &array, - ReferenceReductionOperation> operation, + ReferenceReductionOperation> operation, const char *source, int line, int dim, const Descriptor *mask, - const std::complex *identity, bool ordered) { + const CppTypeFor *identity, bool ordered) { Terminator terminator{source, line}; result = GetTotalReduction(array, source, line, dim, mask, - ReduceAccumulator, false>{ + ReduceAccumulator, false>{ array, operation, identity, terminator}, "REDUCE"); } -void RTDEF(CppReduceComplex8Value)(std::complex &result, +void RTDEF(CppReduceComplex8Value)(CppTypeFor &result, const Descriptor &array, - ValueReductionOperation> operation, const char *source, - int line, int dim, const Descriptor *mask, - const std::complex *identity, bool ordered) { + ValueReductionOperation> operation, + const char *source, int line, int dim, const Descriptor *mask, + const CppTypeFor *identity, bool ordered) { Terminator terminator{source, line}; result = GetTotalReduction(array, source, line, dim, mask, - ReduceAccumulator, true>{ + ReduceAccumulator, true>{ array, operation, identity, terminator}, "REDUCE"); } void RTDEF(CppReduceComplex8DimRef)(Descriptor &result, const Descriptor &array, - ReferenceReductionOperation> operation, + ReferenceReductionOperation> operation, const char *source, int line, int dim, const Descriptor *mask, - const std::complex *identity, bool ordered) { + const CppTypeFor *identity, bool ordered) { Terminator terminator{source, line}; - using Accumulator = ReduceAccumulator, false>; + using Accumulator = + ReduceAccumulator, false>; Accumulator accumulator{array, operation, identity, terminator}; PartialReduction(result, array, array.ElementBytes(), dim, mask, terminator, "REDUCE", accumulator); } void RTDEF(CppReduceComplex8DimValue)(Descriptor &result, const Descriptor &array, - ValueReductionOperation> operation, const char *source, - int line, int dim, const Descriptor *mask, - const std::complex *identity, bool ordered) { + ValueReductionOperation> operation, + const char *source, int line, int dim, const Descriptor *mask, + const CppTypeFor *identity, bool ordered) { Terminator terminator{source, line}; - using Accumulator = ReduceAccumulator, true>; + using Accumulator = + ReduceAccumulator, true>; Accumulator accumulator{array, operation, identity, terminator}; PartialReduction(result, array, array.ElementBytes(), dim, mask, terminator, "REDUCE", accumulator); } -#if LDBL_MANT_DIG == 64 -void RTDEF(CppReduceComplex10Ref)(std::complex &result, +#if HAS_FLOAT80 +void RTDEF(CppReduceComplex10Ref)(CppTypeFor &result, const Descriptor &array, - ReferenceReductionOperation> operation, + ReferenceReductionOperation> + operation, const char *source, int line, int dim, const Descriptor *mask, - const std::complex *identity, bool ordered) { + const CppTypeFor *identity, bool ordered) { Terminator terminator{source, line}; result = GetTotalReduction(array, source, line, dim, mask, - ReduceAccumulator, false>{ + ReduceAccumulator, false>{ array, operation, identity, terminator}, "REDUCE"); } -void RTDEF(CppReduceComplex10Value)(std::complex &result, - const Descriptor &array, - ValueReductionOperation> operation, +void 
RTDEF(CppReduceComplex10Value)( + CppTypeFor &result, const Descriptor &array, + ValueReductionOperation> operation, const char *source, int line, int dim, const Descriptor *mask, - const std::complex *identity, bool ordered) { + const CppTypeFor *identity, bool ordered) { Terminator terminator{source, line}; result = GetTotalReduction(array, source, line, dim, mask, - ReduceAccumulator, true>{ + ReduceAccumulator, true>{ array, operation, identity, terminator}, "REDUCE"); } void RTDEF(CppReduceComplex10DimRef)(Descriptor &result, const Descriptor &array, - ReferenceReductionOperation> operation, + ReferenceReductionOperation> + operation, const char *source, int line, int dim, const Descriptor *mask, - const std::complex *identity, bool ordered) { + const CppTypeFor *identity, bool ordered) { Terminator terminator{source, line}; - using Accumulator = ReduceAccumulator, false>; + using Accumulator = + ReduceAccumulator, false>; Accumulator accumulator{array, operation, identity, terminator}; PartialReduction(result, array, array.ElementBytes(), dim, mask, terminator, "REDUCE", accumulator); } void RTDEF(CppReduceComplex10DimValue)(Descriptor &result, const Descriptor &array, - ValueReductionOperation> operation, + ValueReductionOperation> operation, const char *source, int line, int dim, const Descriptor *mask, - const std::complex *identity, bool ordered) { + const CppTypeFor *identity, bool ordered) { Terminator terminator{source, line}; - using Accumulator = ReduceAccumulator, true>; + using Accumulator = + ReduceAccumulator, true>; Accumulator accumulator{array, operation, identity, terminator}; PartialReduction(result, array, array.ElementBytes(), dim, mask, terminator, "REDUCE", accumulator); } #endif #if LDBL_MANT_DIG == 113 || HAS_FLOAT128 -void RTDEF(CppReduceComplex16Ref)(std::complex &result, +void RTDEF(CppReduceComplex16Ref)(CppTypeFor &result, const Descriptor &array, - ReferenceReductionOperation> operation, + ReferenceReductionOperation> + operation, const char *source, int line, int dim, const Descriptor *mask, - const std::complex *identity, bool ordered) { + const CppTypeFor *identity, bool ordered) { Terminator terminator{source, line}; result = GetTotalReduction(array, source, line, dim, mask, - ReduceAccumulator, false>{ + ReduceAccumulator, false>{ array, operation, identity, terminator}, "REDUCE"); } -void RTDEF(CppReduceComplex16Value)(std::complex &result, - const Descriptor &array, - ValueReductionOperation> operation, +void RTDEF(CppReduceComplex16Value)( + CppTypeFor &result, const Descriptor &array, + ValueReductionOperation> operation, const char *source, int line, int dim, const Descriptor *mask, - const std::complex *identity, bool ordered) { + const CppTypeFor *identity, bool ordered) { Terminator terminator{source, line}; result = GetTotalReduction(array, source, line, dim, mask, - ReduceAccumulator, true>{ + ReduceAccumulator, true>{ array, operation, identity, terminator}, "REDUCE"); } void RTDEF(CppReduceComplex16DimRef)(Descriptor &result, const Descriptor &array, - ReferenceReductionOperation> operation, + ReferenceReductionOperation> + operation, const char *source, int line, int dim, const Descriptor *mask, - const std::complex *identity, bool ordered) { + const CppTypeFor *identity, bool ordered) { Terminator terminator{source, line}; - using Accumulator = ReduceAccumulator, false>; + using Accumulator = + ReduceAccumulator, false>; Accumulator accumulator{array, operation, identity, terminator}; PartialReduction(result, array, 
array.ElementBytes(), dim, mask, terminator, "REDUCE", accumulator); } void RTDEF(CppReduceComplex16DimValue)(Descriptor &result, const Descriptor &array, - ValueReductionOperation> operation, + ValueReductionOperation> operation, const char *source, int line, int dim, const Descriptor *mask, - const std::complex *identity, bool ordered) { + const CppTypeFor *identity, bool ordered) { Terminator terminator{source, line}; - using Accumulator = ReduceAccumulator, true>; + using Accumulator = + ReduceAccumulator, true>; Accumulator accumulator{array, operation, identity, terminator}; PartialReduction(result, array, array.ElementBytes(), dim, mask, terminator, "REDUCE", accumulator); diff --git a/flang/runtime/reduction-templates.h b/flang/runtime/reduction-templates.h index a51404c9637620..6b7d57f98384ae 100644 --- a/flang/runtime/reduction-templates.h +++ b/flang/runtime/reduction-templates.h @@ -321,8 +321,8 @@ RT_VAR_GROUP_BEGIN static constexpr RT_CONST_VAR_ATTRS int Norm2LargestLDKind { #if LDBL_MANT_DIG == 113 || HAS_FLOAT128 16 -#elif LDBL_MANT_DIG == 64 - 10 +#elif HAS_FLOAT80 + 10 #else 8 #endif diff --git a/flang/runtime/sum.cpp b/flang/runtime/sum.cpp index 63d8c9029a0ef5..88c6c914e1e243 100644 --- a/flang/runtime/sum.cpp +++ b/flang/runtime/sum.cpp @@ -141,18 +141,18 @@ CppTypeFor RTDEF(SumReal8)(const Descriptor &x, return GetTotalReduction( x, source, line, dim, mask, RealSumAccumulator{x}, "SUM"); } -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 CppTypeFor RTDEF(SumReal10)(const Descriptor &x, const char *source, int line, int dim, const Descriptor *mask) { - return GetTotalReduction( - x, source, line, dim, mask, RealSumAccumulator{x}, "SUM"); + return GetTotalReduction(x, source, line, dim, mask, + RealSumAccumulator>{x}, "SUM"); } #endif #if LDBL_MANT_DIG == 113 || HAS_FLOAT128 CppTypeFor RTDEF(SumReal16)(const Descriptor &x, const char *source, int line, int dim, const Descriptor *mask) { - return GetTotalReduction( - x, source, line, dim, mask, RealSumAccumulator{x}, "SUM"); + return GetTotalReduction(x, source, line, dim, mask, + RealSumAccumulator>{x}, "SUM"); } #endif @@ -168,20 +168,22 @@ void RTDEF(CppSumComplex8)(CppTypeFor &result, result = GetTotalReduction( x, source, line, dim, mask, ComplexSumAccumulator{x}, "SUM"); } -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 void RTDEF(CppSumComplex10)(CppTypeFor &result, const Descriptor &x, const char *source, int line, int dim, const Descriptor *mask) { - result = GetTotalReduction( - x, source, line, dim, mask, ComplexSumAccumulator{x}, "SUM"); + result = + GetTotalReduction(x, source, line, dim, mask, + ComplexSumAccumulator>{x}, "SUM"); } #endif #if LDBL_MANT_DIG == 113 || HAS_FLOAT128 void RTDEF(CppSumComplex16)(CppTypeFor &result, const Descriptor &x, const char *source, int line, int dim, const Descriptor *mask) { - result = GetTotalReduction( - x, source, line, dim, mask, ComplexSumAccumulator{x}, "SUM"); + result = + GetTotalReduction(x, source, line, dim, mask, + ComplexSumAccumulator>{x}, "SUM"); } #endif diff --git a/flang/runtime/transformational.cpp b/flang/runtime/transformational.cpp index b6b204be4418c9..0ce18171274e42 100644 --- a/flang/runtime/transformational.cpp +++ b/flang/runtime/transformational.cpp @@ -342,7 +342,7 @@ void RTDEF(BesselJn_8)(Descriptor &result, int32_t n1, int32_t n2, result, n1, n2, x, bn2, bn2_1, sourceFile, line); } -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 void RTDEF(BesselJn_10)(Descriptor &result, int32_t n1, int32_t n2, CppTypeFor x, CppTypeFor bn2, @@ -375,7 +375,7 @@ void 
RTDEF(BesselJnX0_8)(Descriptor &result, int32_t n1, int32_t n2, DoBesselJnX0(result, n1, n2, sourceFile, line); } -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 void RTDEF(BesselJnX0_10)(Descriptor &result, int32_t n1, int32_t n2, const char *sourceFile, int line) { DoBesselJnX0(result, n1, n2, sourceFile, line); @@ -405,7 +405,7 @@ void RTDEF(BesselYn_8)(Descriptor &result, int32_t n1, int32_t n2, result, n1, n2, x, bn1, bn1_1, sourceFile, line); } -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 void RTDEF(BesselYn_10)(Descriptor &result, int32_t n1, int32_t n2, CppTypeFor x, CppTypeFor bn1, @@ -438,7 +438,7 @@ void RTDEF(BesselYnX0_8)(Descriptor &result, int32_t n1, int32_t n2, DoBesselYnX0(result, n1, n2, sourceFile, line); } -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 void RTDEF(BesselYnX0_10)(Descriptor &result, int32_t n1, int32_t n2, const char *sourceFile, int line) { DoBesselYnX0(result, n1, n2, sourceFile, line); diff --git a/flang/unittests/Runtime/Numeric.cpp b/flang/unittests/Runtime/Numeric.cpp index 799756aab3839a..3e574c06b091e8 100644 --- a/flang/unittests/Runtime/Numeric.cpp +++ b/flang/unittests/Runtime/Numeric.cpp @@ -34,7 +34,7 @@ TEST(Numeric, Floor) { TEST(Numeric, Erfc_scaled) { EXPECT_NEAR(RTNAME(ErfcScaled4)(Real<4>{20.0}), 0.02817434874, 1.0e-8); EXPECT_NEAR(RTNAME(ErfcScaled8)(Real<8>{20.0}), 0.02817434874, 1.0e-11); -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 EXPECT_NEAR(RTNAME(ErfcScaled10)(Real<10>{20.0}), 0.02817434874, 1.0e-8); #endif } @@ -295,7 +295,7 @@ TEST(Numeric, FPowI) { EXPECT_EQ(RTNAME(FPow8k)(Real<8>{-3}, Int<8>{3}), Real<8>{-27}); EXPECT_EQ(RTNAME(FPow8k)(Real<8>{-2}, Int<8>{-3}), Real<8>{-0.125}); -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 EXPECT_EQ(RTNAME(FPow10i)(Real<10>{0}, Int<4>{0}), Real<10>{1}); EXPECT_EQ(RTNAME(FPow10i)(Real<10>{0.3}, Int<4>{0}), Real<10>{1}); EXPECT_EQ(RTNAME(FPow10i)(Real<10>{2}, Int<4>{-1}), Real<10>{0.5}); diff --git a/flang/unittests/Runtime/Transformational.cpp b/flang/unittests/Runtime/Transformational.cpp index 5836e70c740f9a..b36ea0a60c670c 100644 --- a/flang/unittests/Runtime/Transformational.cpp +++ b/flang/unittests/Runtime/Transformational.cpp @@ -108,7 +108,7 @@ template static void testBesselJnX0(BesselX0FuncType rtFunc) { static void testBesselJn() { testBesselJn<4>(RTNAME(BesselJn_4)); testBesselJn<8>(RTNAME(BesselJn_8)); -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 testBesselJn<10>(RTNAME(BesselJn_10)); #endif #if LDBL_MANT_DIG == 113 || HAS_FLOAT128 @@ -117,7 +117,7 @@ static void testBesselJn() { testBesselJnX0<4>(RTNAME(BesselJnX0_4)); testBesselJnX0<8>(RTNAME(BesselJnX0_8)); -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 testBesselJnX0<10>(RTNAME(BesselJnX0_10)); #endif #if LDBL_MANT_DIG == 113 || HAS_FLOAT128 @@ -201,7 +201,7 @@ template static void testBesselYnX0(BesselX0FuncType rtFunc) { static void testBesselYn() { testBesselYn<4>(RTNAME(BesselYn_4)); testBesselYn<8>(RTNAME(BesselYn_8)); -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 testBesselYn<10>(RTNAME(BesselYn_10)); #endif #if LDBL_MANT_DIG == 113 || HAS_FLOAT128 @@ -210,7 +210,7 @@ static void testBesselYn() { testBesselYnX0<4>(RTNAME(BesselYnX0_4)); testBesselYnX0<8>(RTNAME(BesselYnX0_8)); -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 testBesselYnX0<10>(RTNAME(BesselYnX0_10)); #endif #if LDBL_MANT_DIG == 113 || HAS_FLOAT128 @@ -523,7 +523,7 @@ TEST(Transformational, Unpack) { result.Destroy(); } -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 // Make sure the destination descriptor is created by the runtime // with proper element size, when REAL*10 maps to 'long double'. 
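// (Aside, a hedged sketch: the LDBL_MANT_DIG == 64 guards replaced throughout
// this patch fire only when long double itself is the x87 80-bit format,
// whereas a dedicated HAS_FLOAT80 macro can also be driven by compiler hints
// such as GCC/Clang's __SIZEOF_FLOAT80__ on x86. The definition below is an
// illustration, not flang's actual one.)
#include <cfloat>
#if LDBL_MANT_DIG == 64 || defined(__SIZEOF_FLOAT80__)
#define SKETCH_HAS_FLOAT80 1
#else
#define SKETCH_HAS_FLOAT80 0
#endif
// Element size is worth testing because the 80-bit format is stored padded:
// on x86-64 Linux, sizeof(long double) is typically 16 even though only 10
// bytes carry exponent and significand.
#if LDBL_MANT_DIG == 64
static_assert(sizeof(long double) >= 10, "80-bit long double spans >= 10 bytes");
#endif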
#define Real10CppType long double From f5d62d76479f1788be92ee9a588766e1d5c79d8d Mon Sep 17 00:00:00 2001 From: Noah Goldstein Date: Tue, 17 Sep 2024 10:18:29 -0500 Subject: [PATCH 098/321] [SimplifyCFG] Add tests for deducing paths unreachable if they cause div/rem UB; NFC --- .../SimplifyCFG/UnreachableEliminate.ll | 283 ++++++++++++++++++ 1 file changed, 283 insertions(+) diff --git a/llvm/test/Transforms/SimplifyCFG/UnreachableEliminate.ll b/llvm/test/Transforms/SimplifyCFG/UnreachableEliminate.ll index c4602e72ecbce0..8d3b35bfb740aa 100644 --- a/llvm/test/Transforms/SimplifyCFG/UnreachableEliminate.ll +++ b/llvm/test/Transforms/SimplifyCFG/UnreachableEliminate.ll @@ -918,6 +918,289 @@ bb5: ; preds = %bb3, %bb ret i32 %i7 } +declare void @side.effect() +declare i8 @get.i8() + +define i8 @udiv_by_zero(i8 %x, i8 %i, i8 %v) { +; CHECK-LABEL: @udiv_by_zero( +; CHECK-NEXT: entry: +; CHECK-NEXT: switch i8 [[I:%.*]], label [[SW_DEFAULT:%.*]] [ +; CHECK-NEXT: i8 0, label [[RETURN:%.*]] +; CHECK-NEXT: i8 2, label [[SW_BB1:%.*]] +; CHECK-NEXT: i8 9, label [[SW_BB2:%.*]] +; CHECK-NEXT: ] +; CHECK: sw.bb1: +; CHECK-NEXT: br label [[RETURN]] +; CHECK: sw.bb2: +; CHECK-NEXT: br label [[RETURN]] +; CHECK: sw.default: +; CHECK-NEXT: br label [[RETURN]] +; CHECK: return: +; CHECK-NEXT: [[Y:%.*]] = phi i8 [ 2, [[SW_BB1]] ], [ 9, [[SW_BB2]] ], [ [[V:%.*]], [[SW_DEFAULT]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[R:%.*]] = udiv i8 [[X:%.*]], [[Y]] +; CHECK-NEXT: ret i8 [[R]] +; +entry: + switch i8 %i, label %sw.default [ + i8 0, label %sw.bb0 + i8 2, label %sw.bb1 + i8 9, label %sw.bb2 + ] + +sw.bb0: + br label %return + +sw.bb1: + br label %return +sw.bb2: + br label %return +sw.default: + br label %return + +return: + %y = phi i8 [ 0, %sw.bb0 ], [ 2, %sw.bb1 ], [ 9, %sw.bb2 ], [ %v, %sw.default ] + %r = udiv i8 %x, %y + ret i8 %r +} + +define i8 @urem_by_zero(i8 %x, i8 %i, i8 %v) { +; CHECK-LABEL: @urem_by_zero( +; CHECK-NEXT: entry: +; CHECK-NEXT: switch i8 [[I:%.*]], label [[SW_DEFAULT:%.*]] [ +; CHECK-NEXT: i8 0, label [[RETURN:%.*]] +; CHECK-NEXT: i8 2, label [[SW_BB1:%.*]] +; CHECK-NEXT: i8 9, label [[SW_BB2:%.*]] +; CHECK-NEXT: ] +; CHECK: sw.bb1: +; CHECK-NEXT: br label [[RETURN]] +; CHECK: sw.bb2: +; CHECK-NEXT: br label [[RETURN]] +; CHECK: sw.default: +; CHECK-NEXT: br label [[RETURN]] +; CHECK: return: +; CHECK-NEXT: [[Y:%.*]] = phi i8 [ 2, [[SW_BB1]] ], [ 9, [[SW_BB2]] ], [ 0, [[SW_DEFAULT]] ], [ [[V:%.*]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[R:%.*]] = urem i8 [[X:%.*]], [[Y]] +; CHECK-NEXT: ret i8 [[R]] +; +entry: + switch i8 %i, label %sw.default [ + i8 0, label %sw.bb0 + i8 2, label %sw.bb1 + i8 9, label %sw.bb2 + ] + +sw.bb0: + br label %return + +sw.bb1: + br label %return +sw.bb2: + br label %return +sw.default: + br label %return + +return: + %y = phi i8 [ %v, %sw.bb0 ], [ 2, %sw.bb1 ], [ 9, %sw.bb2 ], [ 0, %sw.default ] + %r = urem i8 %x, %y + ret i8 %r +} + +define i8 @udiv_of_zero_okay(i8 %x, i8 %i, i8 %v) { +; CHECK-LABEL: @udiv_of_zero_okay( +; CHECK-NEXT: entry: +; CHECK-NEXT: switch i8 [[I:%.*]], label [[SW_DEFAULT:%.*]] [ +; CHECK-NEXT: i8 0, label [[RETURN:%.*]] +; CHECK-NEXT: i8 2, label [[SW_BB1:%.*]] +; CHECK-NEXT: i8 9, label [[SW_BB2:%.*]] +; CHECK-NEXT: ] +; CHECK: sw.bb1: +; CHECK-NEXT: br label [[RETURN]] +; CHECK: sw.bb2: +; CHECK-NEXT: br label [[RETURN]] +; CHECK: sw.default: +; CHECK-NEXT: br label [[RETURN]] +; CHECK: return: +; CHECK-NEXT: [[Y:%.*]] = phi i8 [ 2, [[SW_BB1]] ], [ 9, [[SW_BB2]] ], [ [[V:%.*]], [[SW_DEFAULT]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: 
[[R:%.*]] = udiv i8 [[Y]], [[X:%.*]] +; CHECK-NEXT: ret i8 [[R]] +; +entry: + switch i8 %i, label %sw.default [ + i8 0, label %sw.bb0 + i8 2, label %sw.bb1 + i8 9, label %sw.bb2 + ] + +sw.bb0: + br label %return + +sw.bb1: + br label %return +sw.bb2: + br label %return +sw.default: + br label %return + +return: + %y = phi i8 [ 0, %sw.bb0 ], [ 2, %sw.bb1 ], [ 9, %sw.bb2 ], [ %v, %sw.default ] + %r = udiv i8 %y, %x + ret i8 %r +} + +define i8 @srem_by_zero(i8 %x, i8 %i) { +; CHECK-LABEL: @srem_by_zero( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i8 [[I:%.*]], 9 +; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] +; CHECK: if.then: +; CHECK-NEXT: call void @side.effect() +; CHECK-NEXT: br label [[IF_END:%.*]] +; CHECK: if.else: +; CHECK-NEXT: [[V:%.*]] = call i8 @get.i8() +; CHECK-NEXT: br label [[IF_END]] +; CHECK: if.end: +; CHECK-NEXT: [[Y:%.*]] = phi i8 [ 0, [[IF_THEN]] ], [ [[V]], [[IF_ELSE]] ] +; CHECK-NEXT: [[R:%.*]] = srem i8 [[X:%.*]], [[Y]] +; CHECK-NEXT: ret i8 [[R]] +; +entry: + %cmp = icmp ult i8 %i, 9 + br i1 %cmp, label %if.then, label %if.else + +if.then: + call void @side.effect() + br label %if.end + +if.else: + %v = call i8 @get.i8() + br label %if.end + +if.end: + %y = phi i8 [ 0, %if.then ], [ %v, %if.else ] + %r = srem i8 %x, %y + ret i8 %r +} + +define i8 @srem_no_overflow_okay(i8 %i) { +; CHECK-LABEL: @srem_no_overflow_okay( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i8 [[I:%.*]], 9 +; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] +; CHECK: if.then: +; CHECK-NEXT: call void @side.effect() +; CHECK-NEXT: br label [[IF_END:%.*]] +; CHECK: if.else: +; CHECK-NEXT: [[V:%.*]] = call i8 @get.i8() +; CHECK-NEXT: br label [[IF_END]] +; CHECK: if.end: +; CHECK-NEXT: [[Y:%.*]] = phi i8 [ -1, [[IF_THEN]] ], [ [[V]], [[IF_ELSE]] ] +; CHECK-NEXT: [[R:%.*]] = srem i8 [[Y]], -128 +; CHECK-NEXT: ret i8 [[R]] +; +entry: + %cmp = icmp ult i8 %i, 9 + br i1 %cmp, label %if.then, label %if.else + +if.then: + call void @side.effect() + br label %if.end + +if.else: + %v = call i8 @get.i8() + br label %if.end + +if.end: + %y = phi i8 [ -1, %if.then ], [ %v, %if.else ] + %r = srem i8 %y, 128 + ret i8 %r +} + +define i8 @sdiv_overflow_ub(i8 %i) { +; CHECK-LABEL: @sdiv_overflow_ub( +; CHECK-NEXT: entry: +; CHECK-NEXT: switch i8 [[I:%.*]], label [[SW_DEFAULT:%.*]] [ +; CHECK-NEXT: i8 0, label [[RETURN:%.*]] +; CHECK-NEXT: i8 2, label [[SW_BB1:%.*]] +; CHECK-NEXT: i8 9, label [[SW_BB2:%.*]] +; CHECK-NEXT: ] +; CHECK: sw.bb1: +; CHECK-NEXT: [[V:%.*]] = call i8 @get.i8() +; CHECK-NEXT: br label [[RETURN]] +; CHECK: sw.bb2: +; CHECK-NEXT: br label [[RETURN]] +; CHECK: sw.default: +; CHECK-NEXT: unreachable +; CHECK: return: +; CHECK-NEXT: [[Y:%.*]] = phi i8 [ [[V]], [[SW_BB1]] ], [ -1, [[SW_BB2]] ], [ 4, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[R:%.*]] = sdiv i8 -128, [[Y]] +; CHECK-NEXT: ret i8 [[R]] +; +entry: + switch i8 %i, label %sw.default [ + i8 0, label %sw.bb0 + i8 2, label %sw.bb1 + i8 9, label %sw.bb2 + ] + +sw.bb0: + br label %return +sw.bb1: + %v = call i8 @get.i8() + br label %return +sw.bb2: + br label %return +sw.default: + unreachable + +return: + %y = phi i8 [ 4, %sw.bb0 ], [ %v, %sw.bb1 ], [ -1, %sw.bb2 ] + %r = sdiv i8 128, %y + ret i8 %r +} + +define i8 @sdiv_overflow_ub_2x(i8 %i) { +; CHECK-LABEL: @sdiv_overflow_ub_2x( +; CHECK-NEXT: entry: +; CHECK-NEXT: switch i8 [[I:%.*]], label [[SW_DEFAULT:%.*]] [ +; CHECK-NEXT: i8 0, label [[RETURN:%.*]] +; CHECK-NEXT: i8 2, label [[SW_BB1:%.*]] +; CHECK-NEXT: i8 9, 
label [[SW_BB2:%.*]]
+; CHECK-NEXT: ]
+; CHECK: sw.bb1:
+; CHECK-NEXT: [[V:%.*]] = call i8 @get.i8()
+; CHECK-NEXT: br label [[RETURN]]
+; CHECK: sw.bb2:
+; CHECK-NEXT: br label [[RETURN]]
+; CHECK: sw.default:
+; CHECK-NEXT: unreachable
+; CHECK: return:
+; CHECK-NEXT: [[Y:%.*]] = phi i8 [ [[V]], [[SW_BB1]] ], [ -1, [[SW_BB2]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[R:%.*]] = sdiv i8 -128, [[Y]]
+; CHECK-NEXT: ret i8 [[R]]
+;
+entry:
+ switch i8 %i, label %sw.default [
+ i8 0, label %sw.bb0
+ i8 2, label %sw.bb1
+ i8 9, label %sw.bb2
+ ]
+
+sw.bb0:
+ br label %return
+sw.bb1:
+ %v = call i8 @get.i8()
+ br label %return
+sw.bb2:
+ br label %return
+sw.default:
+ unreachable
+
+return:
+ %y = phi i8 [ 0, %sw.bb0 ], [ %v, %sw.bb1 ], [ -1, %sw.bb2 ]
+ %r = sdiv i8 128, %y
+ ret i8 %r
+}
+
 attributes #0 = { null_pointer_is_valid }
;.
; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) }

From 37932643abab699e8bb1def08b7eb4eae7ff1448 Mon Sep 17 00:00:00 2001
From: Noah Goldstein
Date: Tue, 17 Sep 2024 09:40:49 -0500
Subject: [PATCH 099/321] [SimplifyCFG] Deduce paths unreachable if they cause
 div/rem UB

The same way we mark a path unreachable if it may cause a nullptr
dereference, we now mark paths unreachable if they would perform a
div/rem by zero or a signed div/rem of INT_MIN by -1, either of which
is immediate UB.

Closes #109008
---
 llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 10 ++++++++
 .../SimplifyCFG/UnreachableEliminate.ll | 25 ++++++-------------
 2 files changed, 18 insertions(+), 17 deletions(-)

diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index c63618d9dd1297..09461e65e2dc21 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -7885,6 +7885,13 @@ static bool passingValueIsAlwaysUndefined(Value *V, Instruction *I, bool PtrValu
 case Instruction::Call:
 case Instruction::CallBr:
 case Instruction::Invoke:
+ case Instruction::UDiv:
+ case Instruction::URem:
+ // Note: signed div/rem of INT_MIN / -1 is also immediate UB, not
+ // implemented to avoid code complexity as it is unclear how useful such
+ // logic is.
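+ // For example (a hypothetical sketch, not part of the original patch):
+ // given source like
+ //   int y = c ? 0 : v;
+ //   return x / y;
+ // the constant 0 reaching the divide makes the c-true path immediate UB,
+ // so the edge feeding 0 into the phi can be deduced unreachable.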
+ case Instruction::SDiv: + case Instruction::SRem: return true; } }); @@ -7986,6 +7993,9 @@ static bool passingValueIsAlwaysUndefined(Value *V, Instruction *I, bool PtrValu } } } + // Div/Rem by zero is immediate UB + if (match(Use, m_BinOp(m_Value(), m_Specific(I))) && Use->isIntDivRem()) + return true; } return false; } diff --git a/llvm/test/Transforms/SimplifyCFG/UnreachableEliminate.ll b/llvm/test/Transforms/SimplifyCFG/UnreachableEliminate.ll index 8d3b35bfb740aa..aae1ab032f36e4 100644 --- a/llvm/test/Transforms/SimplifyCFG/UnreachableEliminate.ll +++ b/llvm/test/Transforms/SimplifyCFG/UnreachableEliminate.ll @@ -925,18 +925,15 @@ define i8 @udiv_by_zero(i8 %x, i8 %i, i8 %v) { ; CHECK-LABEL: @udiv_by_zero( ; CHECK-NEXT: entry: ; CHECK-NEXT: switch i8 [[I:%.*]], label [[SW_DEFAULT:%.*]] [ -; CHECK-NEXT: i8 0, label [[RETURN:%.*]] -; CHECK-NEXT: i8 2, label [[SW_BB1:%.*]] ; CHECK-NEXT: i8 9, label [[SW_BB2:%.*]] +; CHECK-NEXT: i8 2, label [[RETURN:%.*]] ; CHECK-NEXT: ] -; CHECK: sw.bb1: -; CHECK-NEXT: br label [[RETURN]] ; CHECK: sw.bb2: ; CHECK-NEXT: br label [[RETURN]] ; CHECK: sw.default: ; CHECK-NEXT: br label [[RETURN]] ; CHECK: return: -; CHECK-NEXT: [[Y:%.*]] = phi i8 [ 2, [[SW_BB1]] ], [ 9, [[SW_BB2]] ], [ [[V:%.*]], [[SW_DEFAULT]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[Y:%.*]] = phi i8 [ 9, [[SW_BB2]] ], [ [[V:%.*]], [[SW_DEFAULT]] ], [ 2, [[ENTRY:%.*]] ] ; CHECK-NEXT: [[R:%.*]] = udiv i8 [[X:%.*]], [[Y]] ; CHECK-NEXT: ret i8 [[R]] ; @@ -976,9 +973,9 @@ define i8 @urem_by_zero(i8 %x, i8 %i, i8 %v) { ; CHECK: sw.bb2: ; CHECK-NEXT: br label [[RETURN]] ; CHECK: sw.default: -; CHECK-NEXT: br label [[RETURN]] +; CHECK-NEXT: unreachable ; CHECK: return: -; CHECK-NEXT: [[Y:%.*]] = phi i8 [ 2, [[SW_BB1]] ], [ 9, [[SW_BB2]] ], [ 0, [[SW_DEFAULT]] ], [ [[V:%.*]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[Y:%.*]] = phi i8 [ 2, [[SW_BB1]] ], [ 9, [[SW_BB2]] ], [ [[V:%.*]], [[ENTRY:%.*]] ] ; CHECK-NEXT: [[R:%.*]] = urem i8 [[X:%.*]], [[Y]] ; CHECK-NEXT: ret i8 [[R]] ; @@ -1054,13 +1051,10 @@ define i8 @srem_by_zero(i8 %x, i8 %i) { ; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] ; CHECK: if.then: ; CHECK-NEXT: call void @side.effect() -; CHECK-NEXT: br label [[IF_END:%.*]] +; CHECK-NEXT: unreachable ; CHECK: if.else: ; CHECK-NEXT: [[V:%.*]] = call i8 @get.i8() -; CHECK-NEXT: br label [[IF_END]] -; CHECK: if.end: -; CHECK-NEXT: [[Y:%.*]] = phi i8 [ 0, [[IF_THEN]] ], [ [[V]], [[IF_ELSE]] ] -; CHECK-NEXT: [[R:%.*]] = srem i8 [[X:%.*]], [[Y]] +; CHECK-NEXT: [[R:%.*]] = srem i8 [[X:%.*]], [[V]] ; CHECK-NEXT: ret i8 [[R]] ; entry: @@ -1162,19 +1156,16 @@ define i8 @sdiv_overflow_ub_2x(i8 %i) { ; CHECK-LABEL: @sdiv_overflow_ub_2x( ; CHECK-NEXT: entry: ; CHECK-NEXT: switch i8 [[I:%.*]], label [[SW_DEFAULT:%.*]] [ -; CHECK-NEXT: i8 0, label [[RETURN:%.*]] +; CHECK-NEXT: i8 9, label [[RETURN:%.*]] ; CHECK-NEXT: i8 2, label [[SW_BB1:%.*]] -; CHECK-NEXT: i8 9, label [[SW_BB2:%.*]] ; CHECK-NEXT: ] ; CHECK: sw.bb1: ; CHECK-NEXT: [[V:%.*]] = call i8 @get.i8() ; CHECK-NEXT: br label [[RETURN]] -; CHECK: sw.bb2: -; CHECK-NEXT: br label [[RETURN]] ; CHECK: sw.default: ; CHECK-NEXT: unreachable ; CHECK: return: -; CHECK-NEXT: [[Y:%.*]] = phi i8 [ [[V]], [[SW_BB1]] ], [ -1, [[SW_BB2]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[Y:%.*]] = phi i8 [ [[V]], [[SW_BB1]] ], [ -1, [[ENTRY:%.*]] ] ; CHECK-NEXT: [[R:%.*]] = sdiv i8 -128, [[Y]] ; CHECK-NEXT: ret i8 [[R]] ; From 36192fdfb91c64e97702ee431d246600862871d2 Mon Sep 17 00:00:00 2001 From: Slava Zakharin Date: Wed, 18 Sep 2024 11:22:31 -0700 
Subject: [PATCH 100/321] Revert "[flang][runtime] Use cuda::std::complex in F18 runtime CUDA build." (#109173) Reverts llvm/llvm-project#109078 --- flang/include/flang/Common/float80.h | 43 ---- flang/include/flang/Runtime/complex.h | 31 --- flang/include/flang/Runtime/cpp-type.h | 9 +- .../flang/Runtime/matmul-instances.inc | 6 +- flang/include/flang/Runtime/numeric.h | 32 +-- flang/include/flang/Runtime/reduce.h | 214 +++++++----------- flang/include/flang/Runtime/reduction.h | 112 +++++---- .../include/flang/Runtime/transformational.h | 20 +- flang/runtime/complex-powi.cpp | 23 +- flang/runtime/complex-reduction.c | 8 +- flang/runtime/dot-product.cpp | 21 +- flang/runtime/extrema.cpp | 10 +- flang/runtime/matmul-transpose.cpp | 17 ++ flang/runtime/matmul.cpp | 34 ++- flang/runtime/numeric.cpp | 36 +-- flang/runtime/product.cpp | 15 +- flang/runtime/random.cpp | 2 +- flang/runtime/reduce.cpp | 180 +++++++-------- flang/runtime/reduction-templates.h | 4 +- flang/runtime/sum.cpp | 22 +- flang/runtime/transformational.cpp | 8 +- flang/unittests/Runtime/Numeric.cpp | 4 +- flang/unittests/Runtime/Transformational.cpp | 10 +- 23 files changed, 381 insertions(+), 480 deletions(-) delete mode 100644 flang/include/flang/Common/float80.h delete mode 100644 flang/include/flang/Runtime/complex.h diff --git a/flang/include/flang/Common/float80.h b/flang/include/flang/Common/float80.h deleted file mode 100644 index 1838f7b13c8bb2..00000000000000 --- a/flang/include/flang/Common/float80.h +++ /dev/null @@ -1,43 +0,0 @@ -/*===-- flang/Common/float80.h --------------------------------------*- C -*-=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===----------------------------------------------------------------------===*/ - -/* This header is usable in both C and C++ code. - * Isolates build compiler checks to determine if the 80-bit - * floating point format is supported via a particular C type. - * It defines CFloat80Type and CppFloat80Type aliases for this - * C type. - */ - -#ifndef FORTRAN_COMMON_FLOAT80_H_ -#define FORTRAN_COMMON_FLOAT80_H_ - -#include "api-attrs.h" -#include - -#if LDBL_MANT_DIG == 64 -#undef HAS_FLOAT80 -#define HAS_FLOAT80 1 -#endif - -#if defined(RT_DEVICE_COMPILATION) && defined(__CUDACC__) -/* - * 'long double' is treated as 'double' in the CUDA device code, - * and there is no support for 80-bit floating point format. - * This is probably true for most offload devices, so RT_DEVICE_COMPILATION - * check should be enough. For the time being, guard it with __CUDACC__ - * as well. - */ -#undef HAS_FLOAT80 -#endif - -#if HAS_FLOAT80 -typedef long double CFloat80Type; -typedef long double CppFloat80Type; -#endif - -#endif /* FORTRAN_COMMON_FLOAT80_H_ */ diff --git a/flang/include/flang/Runtime/complex.h b/flang/include/flang/Runtime/complex.h deleted file mode 100644 index b7ad1376bffbf1..00000000000000 --- a/flang/include/flang/Runtime/complex.h +++ /dev/null @@ -1,31 +0,0 @@ -//===-- include/flang/Runtime/complex.h -------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -// A single way to expose C++ complex class in files that can be used -// in F18 runtime build. With inclusion of this file std::complex -// and the related names become available, though, they may correspond -// to alternative definitions (e.g. from cuda::std namespace). - -#ifndef FORTRAN_RUNTIME_COMPLEX_H -#define FORTRAN_RUNTIME_COMPLEX_H - -#if RT_USE_LIBCUDACXX -#include -namespace Fortran::runtime::rtcmplx { -using cuda::std::complex; -using cuda::std::conj; -} // namespace Fortran::runtime::rtcmplx -#else // !RT_USE_LIBCUDACXX -#include -namespace Fortran::runtime::rtcmplx { -using std::complex; -using std::conj; -} // namespace Fortran::runtime::rtcmplx -#endif // !RT_USE_LIBCUDACXX - -#endif // FORTRAN_RUNTIME_COMPLEX_H diff --git a/flang/include/flang/Runtime/cpp-type.h b/flang/include/flang/Runtime/cpp-type.h index aef0fbd7ede586..fe21dd544cf7d8 100644 --- a/flang/include/flang/Runtime/cpp-type.h +++ b/flang/include/flang/Runtime/cpp-type.h @@ -13,9 +13,8 @@ #include "flang/Common/Fortran.h" #include "flang/Common/float128.h" -#include "flang/Common/float80.h" #include "flang/Common/uint128.h" -#include "flang/Runtime/complex.h" +#include #include #if __cplusplus >= 202302 #include @@ -71,9 +70,9 @@ template <> struct CppTypeForHelper { using type = double; #endif }; -#if HAS_FLOAT80 +#if LDBL_MANT_DIG == 64 template <> struct CppTypeForHelper { - using type = CppFloat80Type; + using type = long double; }; #endif #if __STDCPP_FLOAT128_T__ @@ -90,7 +89,7 @@ template <> struct CppTypeForHelper { #endif template struct CppTypeForHelper { - using type = rtcmplx::complex>; + using type = std::complex>; }; template <> struct CppTypeForHelper { diff --git a/flang/include/flang/Runtime/matmul-instances.inc b/flang/include/flang/Runtime/matmul-instances.inc index 88e3067ca029d4..32c6ab06d25219 100644 --- a/flang/include/flang/Runtime/matmul-instances.inc +++ b/flang/include/flang/Runtime/matmul-instances.inc @@ -111,7 +111,7 @@ FOREACH_MATMUL_TYPE_PAIR(MATMUL_DIRECT_INSTANCE) FOREACH_MATMUL_TYPE_PAIR_WITH_INT16(MATMUL_INSTANCE) FOREACH_MATMUL_TYPE_PAIR_WITH_INT16(MATMUL_DIRECT_INSTANCE) -#if MATMUL_FORCE_ALL_TYPES || HAS_FLOAT80 +#if MATMUL_FORCE_ALL_TYPES || LDBL_MANT_DIG == 64 MATMUL_INSTANCE(Integer, 16, Real, 10) MATMUL_INSTANCE(Integer, 16, Complex, 10) MATMUL_INSTANCE(Real, 10, Integer, 16) @@ -133,7 +133,7 @@ MATMUL_DIRECT_INSTANCE(Complex, 16, Integer, 16) #endif #endif // MATMUL_FORCE_ALL_TYPES || (defined __SIZEOF_INT128__ && !AVOID_NATIVE_UINT128_T) -#if MATMUL_FORCE_ALL_TYPES || HAS_FLOAT80 +#if MATMUL_FORCE_ALL_TYPES || LDBL_MANT_DIG == 64 #define FOREACH_MATMUL_TYPE_PAIR_WITH_REAL10(macro) \ macro(Integer, 1, Real, 10) \ macro(Integer, 1, Complex, 10) \ @@ -193,7 +193,7 @@ MATMUL_DIRECT_INSTANCE(Complex, 10, Complex, 16) MATMUL_DIRECT_INSTANCE(Complex, 16, Real, 10) MATMUL_DIRECT_INSTANCE(Complex, 16, Complex, 10) #endif -#endif // MATMUL_FORCE_ALL_TYPES || HAS_FLOAT80 +#endif // MATMUL_FORCE_ALL_TYPES || LDBL_MANT_DIG == 64 #if MATMUL_FORCE_ALL_TYPES || (LDBL_MANT_DIG == 113 || HAS_FLOAT128) #define FOREACH_MATMUL_TYPE_PAIR_WITH_REAL16(macro) \ diff --git a/flang/include/flang/Runtime/numeric.h b/flang/include/flang/Runtime/numeric.h index c3923ee2e0d889..84a5a7cd7a361c 100644 --- a/flang/include/flang/Runtime/numeric.h +++ b/flang/include/flang/Runtime/numeric.h @@ -44,7 +44,7 @@ CppTypeFor RTDECL(Ceiling8_8)( CppTypeFor 
RTDECL(Ceiling8_16)( CppTypeFor); #endif -#if HAS_FLOAT80 +#if LDBL_MANT_DIG == 64 CppTypeFor RTDECL(Ceiling10_1)( CppTypeFor); CppTypeFor RTDECL(Ceiling10_2)( @@ -78,7 +78,7 @@ CppTypeFor RTDECL(ErfcScaled4)( CppTypeFor); CppTypeFor RTDECL(ErfcScaled8)( CppTypeFor); -#if HAS_FLOAT80 +#if LDBL_MANT_DIG == 64 CppTypeFor RTDECL(ErfcScaled10)( CppTypeFor); #endif @@ -96,7 +96,7 @@ CppTypeFor RTDECL(Exponent8_4)( CppTypeFor); CppTypeFor RTDECL(Exponent8_8)( CppTypeFor); -#if HAS_FLOAT80 +#if LDBL_MANT_DIG == 64 CppTypeFor RTDECL(Exponent10_4)( CppTypeFor); CppTypeFor RTDECL(Exponent10_8)( @@ -134,7 +134,7 @@ CppTypeFor RTDECL(Floor8_8)( CppTypeFor RTDECL(Floor8_16)( CppTypeFor); #endif -#if HAS_FLOAT80 +#if LDBL_MANT_DIG == 64 CppTypeFor RTDECL(Floor10_1)( CppTypeFor); CppTypeFor RTDECL(Floor10_2)( @@ -168,7 +168,7 @@ CppTypeFor RTDECL(Fraction4)( CppTypeFor); CppTypeFor RTDECL(Fraction8)( CppTypeFor); -#if HAS_FLOAT80 +#if LDBL_MANT_DIG == 64 CppTypeFor RTDECL(Fraction10)( CppTypeFor); #endif @@ -180,7 +180,7 @@ CppTypeFor RTDECL(Fraction16)( // ISNAN / IEEE_IS_NAN bool RTDECL(IsNaN4)(CppTypeFor); bool RTDECL(IsNaN8)(CppTypeFor); -#if HAS_FLOAT80 +#if LDBL_MANT_DIG == 64 bool RTDECL(IsNaN10)(CppTypeFor); #endif #if LDBL_MANT_DIG == 113 || HAS_FLOAT128 @@ -212,7 +212,7 @@ CppTypeFor RTDECL(ModReal4)( CppTypeFor RTDECL(ModReal8)( CppTypeFor, CppTypeFor, const char *sourceFile = nullptr, int sourceLine = 0); -#if HAS_FLOAT80 +#if LDBL_MANT_DIG == 64 CppTypeFor RTDECL(ModReal10)( CppTypeFor, CppTypeFor, const char *sourceFile = nullptr, int sourceLine = 0); @@ -247,7 +247,7 @@ CppTypeFor RTDECL(ModuloReal4)( CppTypeFor RTDECL(ModuloReal8)( CppTypeFor, CppTypeFor, const char *sourceFile = nullptr, int sourceLine = 0); -#if HAS_FLOAT80 +#if LDBL_MANT_DIG == 64 CppTypeFor RTDECL(ModuloReal10)( CppTypeFor, CppTypeFor, const char *sourceFile = nullptr, int sourceLine = 0); @@ -283,7 +283,7 @@ CppTypeFor RTDECL(Nint8_8)( CppTypeFor RTDECL(Nint8_16)( CppTypeFor); #endif -#if HAS_FLOAT80 +#if LDBL_MANT_DIG == 64 CppTypeFor RTDECL(Nint10_1)( CppTypeFor); CppTypeFor RTDECL(Nint10_2)( @@ -319,7 +319,7 @@ CppTypeFor RTDECL(Nearest4)( CppTypeFor, bool positive); CppTypeFor RTDECL(Nearest8)( CppTypeFor, bool positive); -#if HAS_FLOAT80 +#if LDBL_MANT_DIG == 64 CppTypeFor RTDECL(Nearest10)( CppTypeFor, bool positive); #endif @@ -333,7 +333,7 @@ CppTypeFor RTDECL(RRSpacing4)( CppTypeFor); CppTypeFor RTDECL(RRSpacing8)( CppTypeFor); -#if HAS_FLOAT80 +#if LDBL_MANT_DIG == 64 CppTypeFor RTDECL(RRSpacing10)( CppTypeFor); #endif @@ -347,7 +347,7 @@ CppTypeFor RTDECL(SetExponent4)( CppTypeFor, std::int64_t); CppTypeFor RTDECL(SetExponent8)( CppTypeFor, std::int64_t); -#if HAS_FLOAT80 +#if LDBL_MANT_DIG == 64 CppTypeFor RTDECL(SetExponent10)( CppTypeFor, std::int64_t); #endif @@ -361,7 +361,7 @@ CppTypeFor RTDECL(Scale4)( CppTypeFor, std::int64_t); CppTypeFor RTDECL(Scale8)( CppTypeFor, std::int64_t); -#if HAS_FLOAT80 +#if LDBL_MANT_DIG == 64 CppTypeFor RTDECL(Scale10)( CppTypeFor, std::int64_t); #endif @@ -410,7 +410,7 @@ CppTypeFor RTDECL(Spacing4)( CppTypeFor); CppTypeFor RTDECL(Spacing8)( CppTypeFor); -#if HAS_FLOAT80 +#if LDBL_MANT_DIG == 64 CppTypeFor RTDECL(Spacing10)( CppTypeFor); #endif @@ -425,7 +425,7 @@ CppTypeFor RTDECL(FPow4i)( CppTypeFor RTDECL(FPow8i)( CppTypeFor b, CppTypeFor e); -#if HAS_FLOAT80 +#if LDBL_MANT_DIG == 64 CppTypeFor RTDECL(FPow10i)( CppTypeFor b, CppTypeFor e); @@ -442,7 +442,7 @@ CppTypeFor RTDECL(FPow4k)( CppTypeFor RTDECL(FPow8k)( CppTypeFor b, CppTypeFor e); -#if HAS_FLOAT80 +#if 
LDBL_MANT_DIG == 64 CppTypeFor RTDECL(FPow10k)( CppTypeFor b, CppTypeFor e); diff --git a/flang/include/flang/Runtime/reduce.h b/flang/include/flang/Runtime/reduce.h index c016b37f9592a1..60f54c393b4bbd 100644 --- a/flang/include/flang/Runtime/reduce.h +++ b/flang/include/flang/Runtime/reduce.h @@ -188,26 +188,22 @@ void RTDECL(ReduceReal8DimValue)(Descriptor &result, const Descriptor &array, ValueReductionOperation, const char *source, int line, int dim, const Descriptor *mask = nullptr, const double *identity = nullptr, bool ordered = true); -#if HAS_FLOAT80 -CppTypeFor RTDECL(ReduceReal10Ref)(const Descriptor &, - ReferenceReductionOperation>, - const char *source, int line, int dim = 0, const Descriptor *mask = nullptr, - const CppTypeFor *identity = nullptr, - bool ordered = true); -CppTypeFor RTDECL(ReduceReal10Value)(const Descriptor &, - ValueReductionOperation>, - const char *source, int line, int dim = 0, const Descriptor *mask = nullptr, - const CppTypeFor *identity = nullptr, - bool ordered = true); +#if LDBL_MANT_DIG == 64 +long double RTDECL(ReduceReal10Ref)(const Descriptor &, + ReferenceReductionOperation, const char *source, int line, + int dim = 0, const Descriptor *mask = nullptr, + const long double *identity = nullptr, bool ordered = true); +long double RTDECL(ReduceReal10Value)(const Descriptor &, + ValueReductionOperation, const char *source, int line, + int dim = 0, const Descriptor *mask = nullptr, + const long double *identity = nullptr, bool ordered = true); void RTDECL(ReduceReal10DimRef)(Descriptor &result, const Descriptor &array, - ReferenceReductionOperation>, - const char *source, int line, int dim, const Descriptor *mask = nullptr, - const CppTypeFor *identity = nullptr, - bool ordered = true); + ReferenceReductionOperation, const char *source, int line, + int dim, const Descriptor *mask = nullptr, + const long double *identity = nullptr, bool ordered = true); void RTDECL(ReduceReal10DimValue)(Descriptor &result, const Descriptor &array, - ValueReductionOperation>, - const char *source, int line, int dim, const Descriptor *mask = nullptr, - const CppTypeFor *identity = nullptr, + ValueReductionOperation, const char *source, int line, int dim, + const Descriptor *mask = nullptr, const long double *identity = nullptr, bool ordered = true); #endif #if LDBL_MANT_DIG == 113 || HAS_FLOAT128 @@ -229,152 +225,112 @@ void RTDECL(ReduceReal16DimValue)(Descriptor &result, const Descriptor &array, const CppFloat128Type *identity = nullptr, bool ordered = true); #endif -void RTDECL(CppReduceComplex2Ref)(CppTypeFor &, - const Descriptor &, - ReferenceReductionOperation>, - const char *source, int line, int dim = 0, const Descriptor *mask = nullptr, - const CppTypeFor *identity = nullptr, - bool ordered = true); -void RTDECL(CppReduceComplex2Value)(CppTypeFor &, - const Descriptor &, - ValueReductionOperation>, - const char *source, int line, int dim = 0, const Descriptor *mask = nullptr, - const CppTypeFor *identity = nullptr, - bool ordered = true); +void RTDECL(CppReduceComplex2Ref)(std::complex &, const Descriptor &, + ReferenceReductionOperation>, const char *source, + int line, int dim = 0, const Descriptor *mask = nullptr, + const std::complex *identity = nullptr, bool ordered = true); +void RTDECL(CppReduceComplex2Value)(std::complex &, const Descriptor &, + ValueReductionOperation>, const char *source, int line, + int dim = 0, const Descriptor *mask = nullptr, + const std::complex *identity = nullptr, bool ordered = true); void 
RTDECL(CppReduceComplex2DimRef)(Descriptor &result, - const Descriptor &array, - ReferenceReductionOperation>, + const Descriptor &array, ReferenceReductionOperation>, const char *source, int line, int dim, const Descriptor *mask = nullptr, - const CppTypeFor *identity = nullptr, - bool ordered = true); + const std::complex *identity = nullptr, bool ordered = true); void RTDECL(CppReduceComplex2DimValue)(Descriptor &result, - const Descriptor &array, - ValueReductionOperation>, + const Descriptor &array, ValueReductionOperation>, const char *source, int line, int dim, const Descriptor *mask = nullptr, - const CppTypeFor *identity = nullptr, - bool ordered = true); -void RTDECL(CppReduceComplex3Ref)(CppTypeFor &, - const Descriptor &, - ReferenceReductionOperation>, - const char *source, int line, int dim = 0, const Descriptor *mask = nullptr, - const CppTypeFor *identity = nullptr, - bool ordered = true); -void RTDECL(CppReduceComplex3Value)(CppTypeFor &, - const Descriptor &, - ValueReductionOperation>, - const char *source, int line, int dim = 0, const Descriptor *mask = nullptr, - const CppTypeFor *identity = nullptr, - bool ordered = true); + const std::complex *identity = nullptr, bool ordered = true); +void RTDECL(CppReduceComplex3Ref)(std::complex &, const Descriptor &, + ReferenceReductionOperation>, const char *source, + int line, int dim = 0, const Descriptor *mask = nullptr, + const std::complex *identity = nullptr, bool ordered = true); +void RTDECL(CppReduceComplex3Value)(std::complex &, const Descriptor &, + ValueReductionOperation>, const char *source, int line, + int dim = 0, const Descriptor *mask = nullptr, + const std::complex *identity = nullptr, bool ordered = true); void RTDECL(CppReduceComplex3DimRef)(Descriptor &result, - const Descriptor &array, - ReferenceReductionOperation>, + const Descriptor &array, ReferenceReductionOperation>, const char *source, int line, int dim, const Descriptor *mask = nullptr, - const CppTypeFor *identity = nullptr, - bool ordered = true); + const std::complex *identity = nullptr, bool ordered = true); void RTDECL(CppReduceComplex3DimValue)(Descriptor &result, - const Descriptor &array, - ValueReductionOperation>, + const Descriptor &array, ValueReductionOperation>, const char *source, int line, int dim, const Descriptor *mask = nullptr, - const CppTypeFor *identity = nullptr, - bool ordered = true); -void RTDECL(CppReduceComplex4Ref)(CppTypeFor &, - const Descriptor &, - ReferenceReductionOperation>, - const char *source, int line, int dim = 0, const Descriptor *mask = nullptr, - const CppTypeFor *identity = nullptr, - bool ordered = true); -void RTDECL(CppReduceComplex4Value)(CppTypeFor &, - const Descriptor &, - ValueReductionOperation>, - const char *source, int line, int dim = 0, const Descriptor *mask = nullptr, - const CppTypeFor *identity = nullptr, - bool ordered = true); + const std::complex *identity = nullptr, bool ordered = true); +void RTDECL(CppReduceComplex4Ref)(std::complex &, const Descriptor &, + ReferenceReductionOperation>, const char *source, + int line, int dim = 0, const Descriptor *mask = nullptr, + const std::complex *identity = nullptr, bool ordered = true); +void RTDECL(CppReduceComplex4Value)(std::complex &, const Descriptor &, + ValueReductionOperation>, const char *source, int line, + int dim = 0, const Descriptor *mask = nullptr, + const std::complex *identity = nullptr, bool ordered = true); void RTDECL(CppReduceComplex4DimRef)(Descriptor &result, - const Descriptor &array, - ReferenceReductionOperation>, 
+ const Descriptor &array, ReferenceReductionOperation>, const char *source, int line, int dim, const Descriptor *mask = nullptr, - const CppTypeFor *identity = nullptr, - bool ordered = true); + const std::complex *identity = nullptr, bool ordered = true); void RTDECL(CppReduceComplex4DimValue)(Descriptor &result, - const Descriptor &array, - ValueReductionOperation>, + const Descriptor &array, ValueReductionOperation>, const char *source, int line, int dim, const Descriptor *mask = nullptr, - const CppTypeFor *identity = nullptr, - bool ordered = true); -void RTDECL(CppReduceComplex8Ref)(CppTypeFor &, - const Descriptor &, - ReferenceReductionOperation>, - const char *source, int line, int dim = 0, const Descriptor *mask = nullptr, - const CppTypeFor *identity = nullptr, - bool ordered = true); -void RTDECL(CppReduceComplex8Value)(CppTypeFor &, - const Descriptor &, - ValueReductionOperation>, - const char *source, int line, int dim = 0, const Descriptor *mask = nullptr, - const CppTypeFor *identity = nullptr, - bool ordered = true); + const std::complex *identity = nullptr, bool ordered = true); +void RTDECL(CppReduceComplex8Ref)(std::complex &, const Descriptor &, + ReferenceReductionOperation>, const char *source, + int line, int dim = 0, const Descriptor *mask = nullptr, + const std::complex *identity = nullptr, bool ordered = true); +void RTDECL(CppReduceComplex8Value)(std::complex &, const Descriptor &, + ValueReductionOperation>, const char *source, int line, + int dim = 0, const Descriptor *mask = nullptr, + const std::complex *identity = nullptr, bool ordered = true); void RTDECL(CppReduceComplex8DimRef)(Descriptor &result, - const Descriptor &array, - ReferenceReductionOperation>, + const Descriptor &array, ReferenceReductionOperation>, const char *source, int line, int dim, const Descriptor *mask = nullptr, - const CppTypeFor *identity = nullptr, - bool ordered = true); + const std::complex *identity = nullptr, bool ordered = true); void RTDECL(CppReduceComplex8DimValue)(Descriptor &result, - const Descriptor &array, - ValueReductionOperation>, + const Descriptor &array, ValueReductionOperation>, const char *source, int line, int dim, const Descriptor *mask = nullptr, - const CppTypeFor *identity = nullptr, - bool ordered = true); -#if HAS_FLOAT80 -void RTDECL(CppReduceComplex10Ref)(CppTypeFor &, - const Descriptor &, - ReferenceReductionOperation>, + const std::complex *identity = nullptr, bool ordered = true); +#if LDBL_MANT_DIG == 64 +void RTDECL(CppReduceComplex10Ref)(std::complex &, + const Descriptor &, ReferenceReductionOperation>, const char *source, int line, int dim = 0, const Descriptor *mask = nullptr, - const CppTypeFor *identity = nullptr, - bool ordered = true); -void RTDECL(CppReduceComplex10Value)(CppTypeFor &, - const Descriptor &, - ValueReductionOperation>, + const std::complex *identity = nullptr, bool ordered = true); +void RTDECL(CppReduceComplex10Value)(std::complex &, + const Descriptor &, ValueReductionOperation>, const char *source, int line, int dim = 0, const Descriptor *mask = nullptr, - const CppTypeFor *identity = nullptr, - bool ordered = true); + const std::complex *identity = nullptr, bool ordered = true); void RTDECL(CppReduceComplex10DimRef)(Descriptor &result, const Descriptor &array, - ReferenceReductionOperation>, - const char *source, int line, int dim, const Descriptor *mask = nullptr, - const CppTypeFor *identity = nullptr, - bool ordered = true); + ReferenceReductionOperation>, const char *source, + int line, int dim, const 
Descriptor *mask = nullptr, + const std::complex *identity = nullptr, bool ordered = true); void RTDECL(CppReduceComplex10DimValue)(Descriptor &result, - const Descriptor &array, - ValueReductionOperation>, + const Descriptor &array, ValueReductionOperation>, const char *source, int line, int dim, const Descriptor *mask = nullptr, - const CppTypeFor *identity = nullptr, - bool ordered = true); + const std::complex *identity = nullptr, bool ordered = true); #endif #if LDBL_MANT_DIG == 113 || HAS_FLOAT128 -void RTDECL(CppReduceComplex16Ref)(CppTypeFor &, +void RTDECL(CppReduceComplex16Ref)(std::complex &, const Descriptor &, - ReferenceReductionOperation>, + ReferenceReductionOperation>, const char *source, int line, int dim = 0, const Descriptor *mask = nullptr, - const CppTypeFor *identity = nullptr, + const std::complex *identity = nullptr, bool ordered = true); -void RTDECL(CppReduceComplex16Value)(CppTypeFor &, - const Descriptor &, - ValueReductionOperation>, +void RTDECL(CppReduceComplex16Value)(std::complex &, + const Descriptor &, ValueReductionOperation>, const char *source, int line, int dim = 0, const Descriptor *mask = nullptr, - const CppTypeFor *identity = nullptr, + const std::complex *identity = nullptr, bool ordered = true); void RTDECL(CppReduceComplex16DimRef)(Descriptor &result, const Descriptor &array, - ReferenceReductionOperation>, + ReferenceReductionOperation>, const char *source, int line, int dim, const Descriptor *mask = nullptr, - const CppTypeFor *identity = nullptr, + const std::complex *identity = nullptr, bool ordered = true); void RTDECL(CppReduceComplex16DimValue)(Descriptor &result, const Descriptor &array, - ValueReductionOperation>, - const char *source, int line, int dim, const Descriptor *mask = nullptr, - const CppTypeFor *identity = nullptr, + ValueReductionOperation>, const char *source, + int line, int dim, const Descriptor *mask = nullptr, + const std::complex *identity = nullptr, bool ordered = true); #endif diff --git a/flang/include/flang/Runtime/reduction.h b/flang/include/flang/Runtime/reduction.h index 7eafacee69d034..97986c12e8a10e 100644 --- a/flang/include/flang/Runtime/reduction.h +++ b/flang/include/flang/Runtime/reduction.h @@ -68,35 +68,34 @@ float RTDECL(SumReal4)(const Descriptor &, const char *source, int line, int dim = 0, const Descriptor *mask = nullptr); double RTDECL(SumReal8)(const Descriptor &, const char *source, int line, int dim = 0, const Descriptor *mask = nullptr); -#if HAS_FLOAT80 -CppTypeFor RTDECL(SumReal10)(const Descriptor &, - const char *source, int line, int dim = 0, - const Descriptor *mask = nullptr); +#if LDBL_MANT_DIG == 64 +long double RTDECL(SumReal10)(const Descriptor &, const char *source, int line, + int dim = 0, const Descriptor *mask = nullptr); #endif #if LDBL_MANT_DIG == 113 || HAS_FLOAT128 CppFloat128Type RTDECL(SumReal16)(const Descriptor &, const char *source, int line, int dim = 0, const Descriptor *mask = nullptr); #endif -void RTDECL(CppSumComplex2)(CppTypeFor &, - const Descriptor &, const char *source, int line, int dim = 0, +void RTDECL(CppSumComplex2)(std::complex &, const Descriptor &, + const char *source, int line, int dim = 0, const Descriptor *mask = nullptr); -void RTDECL(CppSumComplex3)(CppTypeFor &, - const Descriptor &, const char *source, int line, int dim = 0, +void RTDECL(CppSumComplex3)(std::complex &, const Descriptor &, + const char *source, int line, int dim = 0, const Descriptor *mask = nullptr); -void RTDECL(CppSumComplex4)(CppTypeFor &, - const Descriptor &, const char 
*source, int line, int dim = 0, +void RTDECL(CppSumComplex4)(std::complex &, const Descriptor &, + const char *source, int line, int dim = 0, const Descriptor *mask = nullptr); -void RTDECL(CppSumComplex8)(CppTypeFor &, - const Descriptor &, const char *source, int line, int dim = 0, +void RTDECL(CppSumComplex8)(std::complex &, const Descriptor &, + const char *source, int line, int dim = 0, const Descriptor *mask = nullptr); -#if HAS_FLOAT80 -void RTDECL(CppSumComplex10)(CppTypeFor &, - const Descriptor &, const char *source, int line, int dim = 0, +#if LDBL_MANT_DIG == 64 +void RTDECL(CppSumComplex10)(std::complex &, const Descriptor &, + const char *source, int line, int dim = 0, const Descriptor *mask = nullptr); #endif #if LDBL_MANT_DIG == 113 || HAS_FLOAT128 -void RTDECL(CppSumComplex16)(CppTypeFor &, +void RTDECL(CppSumComplex16)(std::complex &, const Descriptor &, const char *source, int line, int dim = 0, const Descriptor *mask = nullptr); #endif @@ -129,35 +128,34 @@ float RTDECL(ProductReal4)(const Descriptor &, const char *source, int line, int dim = 0, const Descriptor *mask = nullptr); double RTDECL(ProductReal8)(const Descriptor &, const char *source, int line, int dim = 0, const Descriptor *mask = nullptr); -#if HAS_FLOAT80 -CppTypeFor RTDECL(ProductReal10)(const Descriptor &, - const char *source, int line, int dim = 0, - const Descriptor *mask = nullptr); +#if LDBL_MANT_DIG == 64 +long double RTDECL(ProductReal10)(const Descriptor &, const char *source, + int line, int dim = 0, const Descriptor *mask = nullptr); #endif #if LDBL_MANT_DIG == 113 || HAS_FLOAT128 CppFloat128Type RTDECL(ProductReal16)(const Descriptor &, const char *source, int line, int dim = 0, const Descriptor *mask = nullptr); #endif -void RTDECL(CppProductComplex2)(CppTypeFor &, - const Descriptor &, const char *source, int line, int dim = 0, +void RTDECL(CppProductComplex2)(std::complex &, const Descriptor &, + const char *source, int line, int dim = 0, const Descriptor *mask = nullptr); -void RTDECL(CppProductComplex3)(CppTypeFor &, - const Descriptor &, const char *source, int line, int dim = 0, +void RTDECL(CppProductComplex3)(std::complex &, const Descriptor &, + const char *source, int line, int dim = 0, const Descriptor *mask = nullptr); -void RTDECL(CppProductComplex4)(CppTypeFor &, - const Descriptor &, const char *source, int line, int dim = 0, +void RTDECL(CppProductComplex4)(std::complex &, const Descriptor &, + const char *source, int line, int dim = 0, const Descriptor *mask = nullptr); -void RTDECL(CppProductComplex8)(CppTypeFor &, - const Descriptor &, const char *source, int line, int dim = 0, +void RTDECL(CppProductComplex8)(std::complex &, const Descriptor &, + const char *source, int line, int dim = 0, const Descriptor *mask = nullptr); -#if HAS_FLOAT80 -void RTDECL(CppProductComplex10)(CppTypeFor &, +#if LDBL_MANT_DIG == 64 +void RTDECL(CppProductComplex10)(std::complex &, const Descriptor &, const char *source, int line, int dim = 0, const Descriptor *mask = nullptr); #endif #if LDBL_MANT_DIG == 113 || HAS_FLOAT128 -void RTDECL(CppProductComplex16)(CppTypeFor &, +void RTDECL(CppProductComplex16)(std::complex &, const Descriptor &, const char *source, int line, int dim = 0, const Descriptor *mask = nullptr); #endif @@ -309,10 +307,9 @@ float RTDECL(MaxvalReal4)(const Descriptor &, const char *source, int line, int dim = 0, const Descriptor *mask = nullptr); double RTDECL(MaxvalReal8)(const Descriptor &, const char *source, int line, int dim = 0, const Descriptor *mask = nullptr); -#if 
HAS_FLOAT80 -CppTypeFor RTDECL(MaxvalReal10)(const Descriptor &, - const char *source, int line, int dim = 0, - const Descriptor *mask = nullptr); +#if LDBL_MANT_DIG == 64 +long double RTDECL(MaxvalReal10)(const Descriptor &, const char *source, + int line, int dim = 0, const Descriptor *mask = nullptr); #endif #if LDBL_MANT_DIG == 113 || HAS_FLOAT128 CppFloat128Type RTDECL(MaxvalReal16)(const Descriptor &, const char *source, @@ -341,10 +338,9 @@ float RTDECL(MinvalReal4)(const Descriptor &, const char *source, int line, int dim = 0, const Descriptor *mask = nullptr); double RTDECL(MinvalReal8)(const Descriptor &, const char *source, int line, int dim = 0, const Descriptor *mask = nullptr); -#if HAS_FLOAT80 -CppTypeFor RTDECL(MinvalReal10)(const Descriptor &, - const char *source, int line, int dim = 0, - const Descriptor *mask = nullptr); +#if LDBL_MANT_DIG == 64 +long double RTDECL(MinvalReal10)(const Descriptor &, const char *source, + int line, int dim = 0, const Descriptor *mask = nullptr); #endif #if LDBL_MANT_DIG == 113 || HAS_FLOAT128 CppFloat128Type RTDECL(MinvalReal16)(const Descriptor &, const char *source, @@ -367,12 +363,12 @@ float RTDECL(Norm2_4)( const Descriptor &, const char *source, int line, int dim = 0); double RTDECL(Norm2_8)( const Descriptor &, const char *source, int line, int dim = 0); -#if HAS_FLOAT80 -CppTypeFor RTDECL(Norm2_10)( +#if LDBL_MANT_DIG == 64 +long double RTDECL(Norm2_10)( const Descriptor &, const char *source, int line, int dim = 0); #endif #if LDBL_MANT_DIG == 113 || HAS_FLOAT128 -CppFloat128Type RTDECL(Norm2_16)( +long double RTDECL(Norm2_16)( const Descriptor &, const char *source, int line, int dim = 0); void RTDECL(Norm2DimReal16)( Descriptor &, const Descriptor &, int dim, const char *source, int line); @@ -417,33 +413,29 @@ float RTDECL(DotProductReal4)(const Descriptor &, const Descriptor &, const char *source = nullptr, int line = 0); double RTDECL(DotProductReal8)(const Descriptor &, const Descriptor &, const char *source = nullptr, int line = 0); -#if HAS_FLOAT80 -CppTypeFor RTDECL(DotProductReal10)(const Descriptor &, - const Descriptor &, const char *source = nullptr, int line = 0); +#if LDBL_MANT_DIG == 64 +long double RTDECL(DotProductReal10)(const Descriptor &, const Descriptor &, + const char *source = nullptr, int line = 0); #endif #if LDBL_MANT_DIG == 113 || HAS_FLOAT128 CppFloat128Type RTDECL(DotProductReal16)(const Descriptor &, const Descriptor &, const char *source = nullptr, int line = 0); #endif -void RTDECL(CppDotProductComplex2)(CppTypeFor &, - const Descriptor &, const Descriptor &, const char *source = nullptr, - int line = 0); -void RTDECL(CppDotProductComplex3)(CppTypeFor &, - const Descriptor &, const Descriptor &, const char *source = nullptr, - int line = 0); -void RTDECL(CppDotProductComplex4)(CppTypeFor &, - const Descriptor &, const Descriptor &, const char *source = nullptr, - int line = 0); -void RTDECL(CppDotProductComplex8)(CppTypeFor &, - const Descriptor &, const Descriptor &, const char *source = nullptr, - int line = 0); -#if HAS_FLOAT80 -void RTDECL(CppDotProductComplex10)(CppTypeFor &, +void RTDECL(CppDotProductComplex2)(std::complex &, const Descriptor &, + const Descriptor &, const char *source = nullptr, int line = 0); +void RTDECL(CppDotProductComplex3)(std::complex &, const Descriptor &, + const Descriptor &, const char *source = nullptr, int line = 0); +void RTDECL(CppDotProductComplex4)(std::complex &, const Descriptor &, + const Descriptor &, const char *source = nullptr, int line = 0); +void 
RTDECL(CppDotProductComplex8)(std::complex &, const Descriptor &, + const Descriptor &, const char *source = nullptr, int line = 0); +#if LDBL_MANT_DIG == 64 +void RTDECL(CppDotProductComplex10)(std::complex &, const Descriptor &, const Descriptor &, const char *source = nullptr, int line = 0); #endif #if LDBL_MANT_DIG == 113 || HAS_FLOAT128 -void RTDECL(CppDotProductComplex16)(CppTypeFor &, +void RTDECL(CppDotProductComplex16)(std::complex &, const Descriptor &, const Descriptor &, const char *source = nullptr, int line = 0); #endif diff --git a/flang/include/flang/Runtime/transformational.h b/flang/include/flang/Runtime/transformational.h index faeaa1baa39ae2..a39b872f376a69 100644 --- a/flang/include/flang/Runtime/transformational.h +++ b/flang/include/flang/Runtime/transformational.h @@ -45,12 +45,10 @@ void RTDECL(BesselJn_4)(Descriptor &result, int32_t n1, int32_t n2, float x, void RTDECL(BesselJn_8)(Descriptor &result, int32_t n1, int32_t n2, double x, double bn2, double bn2_1, const char *sourceFile = nullptr, int line = 0); -#if HAS_FLOAT80 +#if LDBL_MANT_DIG == 64 void RTDECL(BesselJn_10)(Descriptor &result, int32_t n1, int32_t n2, - CppTypeFor x, - CppTypeFor bn2, - CppTypeFor bn2_1, const char *sourceFile = nullptr, - int line = 0); + long double x, long double bn2, long double bn2_1, + const char *sourceFile = nullptr, int line = 0); #endif #if LDBL_MANT_DIG == 113 || HAS_FLOAT128 @@ -71,7 +69,7 @@ void RTDECL(BesselJnX0_4)(Descriptor &result, int32_t n1, int32_t n2, void RTDECL(BesselJnX0_8)(Descriptor &result, int32_t n1, int32_t n2, const char *sourceFile = nullptr, int line = 0); -#if HAS_FLOAT80 +#if LDBL_MANT_DIG == 64 void RTDECL(BesselJnX0_10)(Descriptor &result, int32_t n1, int32_t n2, const char *sourceFile = nullptr, int line = 0); #endif @@ -93,12 +91,10 @@ void RTDECL(BesselYn_4)(Descriptor &result, int32_t n1, int32_t n2, float x, void RTDECL(BesselYn_8)(Descriptor &result, int32_t n1, int32_t n2, double x, double bn1, double bn1_1, const char *sourceFile = nullptr, int line = 0); -#if HAS_FLOAT80 +#if LDBL_MANT_DIG == 64 void RTDECL(BesselYn_10)(Descriptor &result, int32_t n1, int32_t n2, - CppTypeFor x, - CppTypeFor bn1, - CppTypeFor bn1_1, const char *sourceFile = nullptr, - int line = 0); + long double x, long double bn1, long double bn1_1, + const char *sourceFile = nullptr, int line = 0); #endif #if LDBL_MANT_DIG == 113 || HAS_FLOAT128 @@ -119,7 +115,7 @@ void RTDECL(BesselYnX0_4)(Descriptor &result, int32_t n1, int32_t n2, void RTDECL(BesselYnX0_8)(Descriptor &result, int32_t n1, int32_t n2, const char *sourceFile = nullptr, int line = 0); -#if HAS_FLOAT80 +#if LDBL_MANT_DIG == 64 void RTDECL(BesselYnX0_10)(Descriptor &result, int32_t n1, int32_t n2, const char *sourceFile = nullptr, int line = 0); #endif diff --git a/flang/runtime/complex-powi.cpp b/flang/runtime/complex-powi.cpp index d7a63724b96c8f..77031e40242791 100644 --- a/flang/runtime/complex-powi.cpp +++ b/flang/runtime/complex-powi.cpp @@ -7,13 +7,11 @@ * ===-----------------------------------------------------------------------=== */ #include "flang/Common/float128.h" -#include "flang/Runtime/cpp-type.h" #include "flang/Runtime/entry-names.h" #include #include #include -namespace Fortran::runtime { #ifdef __clang_major__ #pragma clang diagnostic ignored "-Wc99-extensions" #endif @@ -116,35 +114,35 @@ extern "C" Qcomplex RTNAME(cqpowk)(Qcomplex base, std::int64_t exp) { // MSVC doesn't allow including or in C++17 mode to get // the Windows definitions of these structs so just redefine here. 
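 // (Descriptive note: the RTNAME wrappers below reinterpret these C-layout
 // structs as std::complex values; this relies on std::complex<T> being
 // layout-compatible with a struct of two Ts.)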
struct Fcomplex { - CppTypeFor re; - CppTypeFor im; + float re; + float im; }; struct Dcomplex { - CppTypeFor re; - CppTypeFor im; + double re; + double im; }; extern "C" Fcomplex RTNAME(cpowi)(Fcomplex base, std::int32_t exp) { - auto cppbase = *(CppTypeFor *)(&base); + auto cppbase = *(std::complex *)(&base); auto cppres = tgpowi(cppbase, exp); return *(Fcomplex *)(&cppres); } extern "C" Dcomplex RTNAME(zpowi)(Dcomplex base, std::int32_t exp) { - auto cppbase = *(CppTypeFor *)(&base); + auto cppbase = *(std::complex *)(&base); auto cppres = tgpowi(cppbase, exp); return *(Dcomplex *)(&cppres); } extern "C" Fcomplex RTNAME(cpowk)(Fcomplex base, std::int64_t exp) { - auto cppbase = *(CppTypeFor *)(&base); + auto cppbase = *(std::complex *)(&base); auto cppres = tgpowi(cppbase, exp); return *(Fcomplex *)(&cppres); } extern "C" Dcomplex RTNAME(zpowk)(Dcomplex base, std::int64_t exp) { - auto cppbase = *(CppTypeFor *)(&base); + auto cppbase = *(std::complex *)(&base); auto cppres = tgpowi(cppbase, exp); return *(Dcomplex *)(&cppres); } @@ -156,16 +154,15 @@ struct Qcomplex { }; extern "C" Dcomplex RTNAME(cqpowi)(Qcomplex base, std::int32_t exp) { - auto cppbase = *(rtcmplx::complex *)(&base); + auto cppbase = *(std::complex *)(&base); auto cppres = tgpowi(cppbase, exp); return *(Qcomplex *)(&cppres); } extern "C" Dcomplex RTNAME(cqpowk)(Qcomplex base, std::int64_t exp) { - auto cppbase = *(rtcmplx::complex *)(&base); + auto cppbase = *(std::complex *)(&base); auto cppres = tgpowi(cppbase, exp); return *(Qcomplex *)(&cppres); } #endif #endif -} // namespace Fortran::runtime diff --git a/flang/runtime/complex-reduction.c b/flang/runtime/complex-reduction.c index 232c5452488f1a..37ce3fa410016b 100644 --- a/flang/runtime/complex-reduction.c +++ b/flang/runtime/complex-reduction.c @@ -119,7 +119,7 @@ ADAPT_REDUCTION(SumComplex4, float_Complex_t, CppComplexFloat, CMPLXF, REDUCTION_ARGS, REDUCTION_ARG_NAMES) ADAPT_REDUCTION(SumComplex8, double_Complex_t, CppComplexDouble, CMPLX, REDUCTION_ARGS, REDUCTION_ARG_NAMES) -#if HAS_FLOAT80 +#if LDBL_MANT_DIG == 64 ADAPT_REDUCTION(SumComplex10, long_double_Complex_t, CppComplexLongDouble, CMPLXL, REDUCTION_ARGS, REDUCTION_ARG_NAMES) #endif @@ -133,7 +133,7 @@ ADAPT_REDUCTION(ProductComplex4, float_Complex_t, CppComplexFloat, CMPLXF, REDUCTION_ARGS, REDUCTION_ARG_NAMES) ADAPT_REDUCTION(ProductComplex8, double_Complex_t, CppComplexDouble, CMPLX, REDUCTION_ARGS, REDUCTION_ARG_NAMES) -#if HAS_FLOAT80 +#if LDBL_MANT_DIG == 64 ADAPT_REDUCTION(ProductComplex10, long_double_Complex_t, CppComplexLongDouble, CMPLXL, REDUCTION_ARGS, REDUCTION_ARG_NAMES) #endif @@ -147,7 +147,7 @@ ADAPT_REDUCTION(DotProductComplex4, float_Complex_t, CppComplexFloat, CMPLXF, DOT_PRODUCT_ARGS, DOT_PRODUCT_ARG_NAMES) ADAPT_REDUCTION(DotProductComplex8, double_Complex_t, CppComplexDouble, CMPLX, DOT_PRODUCT_ARGS, DOT_PRODUCT_ARG_NAMES) -#if HAS_FLOAT80 +#if LDBL_MANT_DIG == 64 ADAPT_REDUCTION(DotProductComplex10, long_double_Complex_t, CppComplexLongDouble, CMPLXL, DOT_PRODUCT_ARGS, DOT_PRODUCT_ARG_NAMES) #endif @@ -173,7 +173,7 @@ ADAPT_REDUCTION(ReduceComplex8Ref, double_Complex_t, CppComplexDouble, CMPLX, ADAPT_REDUCTION(ReduceComplex8Value, double_Complex_t, CppComplexDouble, CMPLX, RARGS, REDUCE_ARG_NAMES) #undef RARGS -#if HAS_FLOAT80 +#if LDBL_MANT_DIG == 64 #define RARGS REDUCE_ARGS(long_double_Complex_t, long_double_Complex_t_ref_op) ADAPT_REDUCTION(ReduceComplex10Ref, long_double_Complex_t, CppComplexLongDouble, CMPLXL, RARGS, REDUCE_ARG_NAMES) diff --git 
a/flang/runtime/dot-product.cpp b/flang/runtime/dot-product.cpp index aafef379fad43c..977698269bcb46 100644 --- a/flang/runtime/dot-product.cpp +++ b/flang/runtime/dot-product.cpp @@ -21,6 +21,11 @@ namespace Fortran::runtime { // Beware: DOT_PRODUCT of COMPLEX data uses the complex conjugate of the first // argument; MATMUL does not. +// Suppress the warnings about calling __host__-only std::complex operators, +// defined in C++ STD header files, from __device__ code. +RT_DIAG_PUSH +RT_DIAG_DISABLE_CALL_HOST_FROM_DEVICE_WARN + // General accumulator for any type and stride; this is not used for // contiguous numeric vectors. template @@ -37,7 +42,7 @@ class Accumulator { const XT &xElement{*x_.Element(&xAt)}; const YT &yElement{*y_.Element(&yAt)}; if constexpr (RCAT == TypeCategory::Complex) { - sum_ += rtcmplx::conj(static_cast(xElement)) * + sum_ += std::conj(static_cast(xElement)) * static_cast(yElement); } else { sum_ += static_cast(xElement) * static_cast(yElement); @@ -72,9 +77,9 @@ static inline RT_API_ATTRS CppTypeFor DoDotProduct( // TODO: call BLAS-1 SDOT or SDSDOT } else if constexpr (std::is_same_v) { // TODO: call BLAS-1 DDOT - } else if constexpr (std::is_same_v>) { + } else if constexpr (std::is_same_v>) { // TODO: call BLAS-1 CDOTC - } else if constexpr (std::is_same_v>) { + } else if constexpr (std::is_same_v>) { // TODO: call BLAS-1 ZDOTC } } @@ -84,12 +89,12 @@ static inline RT_API_ATTRS CppTypeFor DoDotProduct( AccumType accum{}; if constexpr (RCAT == TypeCategory::Complex) { for (SubscriptValue j{0}; j < n; ++j) { - // conj() may instantiate its argument twice, + // std::conj() may instantiate its argument twice, // so xp has to be incremented separately. // This is a workaround for an alleged bug in clang, // that shows up as: // warning: multiple unsequenced modifications to 'xp' - accum += rtcmplx::conj(static_cast(*xp)) * + accum += std::conj(static_cast(*xp)) * static_cast(*yp++); xp++; } @@ -112,6 +117,8 @@ static inline RT_API_ATTRS CppTypeFor DoDotProduct( return static_cast(accumulator.GetResult()); } +RT_DIAG_POP + template struct DotProduct { using Result = CppTypeFor; template struct DP1 { @@ -190,7 +197,7 @@ CppTypeFor RTDEF(DotProductReal8)( const Descriptor &x, const Descriptor &y, const char *source, int line) { return DotProduct{}(x, y, source, line); } -#if HAS_FLOAT80 +#if LDBL_MANT_DIG == 64 CppTypeFor RTDEF(DotProductReal10)( const Descriptor &x, const Descriptor &y, const char *source, int line) { return DotProduct{}(x, y, source, line); @@ -211,7 +218,7 @@ void RTDEF(CppDotProductComplex8)(CppTypeFor &result, const Descriptor &x, const Descriptor &y, const char *source, int line) { result = DotProduct{}(x, y, source, line); } -#if HAS_FLOAT80 +#if LDBL_MANT_DIG == 64 void RTDEF(CppDotProductComplex10)( CppTypeFor &result, const Descriptor &x, const Descriptor &y, const char *source, int line) { diff --git a/flang/runtime/extrema.cpp b/flang/runtime/extrema.cpp index 2658709b7de86b..d6e9633372f524 100644 --- a/flang/runtime/extrema.cpp +++ b/flang/runtime/extrema.cpp @@ -236,7 +236,7 @@ void RTDEF(MaxlocReal8)(Descriptor &result, const Descriptor &x, int kind, TotalNumericMaxOrMinLoc( "MAXLOC", result, x, kind, source, line, mask, back); } -#if HAS_FLOAT80 +#if LDBL_MANT_DIG == 64 void RTDEF(MaxlocReal10)(Descriptor &result, const Descriptor &x, int kind, const char *source, int line, const Descriptor *mask, bool back) { TotalNumericMaxOrMinLoc( @@ -292,7 +292,7 @@ void RTDEF(MinlocReal8)(Descriptor &result, const Descriptor &x, int kind, 
TotalNumericMaxOrMinLoc( "MINLOC", result, x, kind, source, line, mask, back); } -#if HAS_FLOAT80 +#if LDBL_MANT_DIG == 64 void RTDEF(MinlocReal10)(Descriptor &result, const Descriptor &x, int kind, const char *source, int line, const Descriptor *mask, bool back) { TotalNumericMaxOrMinLoc( @@ -614,7 +614,7 @@ CppTypeFor RTDEF(MaxvalReal8)(const Descriptor &x, return TotalNumericMaxOrMin( x, source, line, dim, mask, "MAXVAL"); } -#if HAS_FLOAT80 +#if LDBL_MANT_DIG == 64 CppTypeFor RTDEF(MaxvalReal10)(const Descriptor &x, const char *source, int line, int dim, const Descriptor *mask) { return TotalNumericMaxOrMin( @@ -674,7 +674,7 @@ CppTypeFor RTDEF(MinvalReal8)(const Descriptor &x, return TotalNumericMaxOrMin( x, source, line, dim, mask, "MINVAL"); } -#if HAS_FLOAT80 +#if LDBL_MANT_DIG == 64 CppTypeFor RTDEF(MinvalReal10)(const Descriptor &x, const char *source, int line, int dim, const Descriptor *mask) { return TotalNumericMaxOrMin( @@ -730,7 +730,7 @@ CppTypeFor RTDEF(Norm2_8)( return GetTotalReduction( x, source, line, dim, nullptr, Norm2Accumulator<8>{x}, "NORM2"); } -#if HAS_FLOAT80 +#if LDBL_MANT_DIG == 64 CppTypeFor RTDEF(Norm2_10)( const Descriptor &x, const char *source, int line, int dim) { return GetTotalReduction( diff --git a/flang/runtime/matmul-transpose.cpp b/flang/runtime/matmul-transpose.cpp index bafa05056bebc4..283472650a1c69 100644 --- a/flang/runtime/matmul-transpose.cpp +++ b/flang/runtime/matmul-transpose.cpp @@ -32,6 +32,11 @@ namespace { using namespace Fortran::runtime; +// Suppress the warnings about calling __host__-only std::complex operators, +// defined in C++ STD header files, from __device__ code. +RT_DIAG_PUSH +RT_DIAG_DISABLE_CALL_HOST_FROM_DEVICE_WARN + // Contiguous numeric TRANSPOSE(matrix)*matrix multiplication // TRANSPOSE(matrix(n, rows)) * matrix(n,cols) -> // matrix(rows, n) * matrix(n,cols) -> matrix(rows,cols) @@ -86,6 +91,8 @@ inline static RT_API_ATTRS void MatrixTransposedTimesMatrix( } } +RT_DIAG_POP + template inline static RT_API_ATTRS void MatrixTransposedTimesMatrixHelper( CppTypeFor *RESTRICT product, SubscriptValue rows, @@ -111,6 +118,9 @@ inline static RT_API_ATTRS void MatrixTransposedTimesMatrixHelper( } } +RT_DIAG_PUSH +RT_DIAG_DISABLE_CALL_HOST_FROM_DEVICE_WARN + // Contiguous numeric matrix*vector multiplication // matrix(rows,n) * column vector(n) -> column vector(rows) // Straightforward algorithm: @@ -148,6 +158,8 @@ inline static RT_API_ATTRS void MatrixTransposedTimesVector( } } +RT_DIAG_POP + template inline static RT_API_ATTRS void MatrixTransposedTimesVectorHelper( CppTypeFor *RESTRICT product, SubscriptValue rows, @@ -162,6 +174,9 @@ inline static RT_API_ATTRS void MatrixTransposedTimesVectorHelper( } } +RT_DIAG_PUSH +RT_DIAG_DISABLE_CALL_HOST_FROM_DEVICE_WARN + // Implements an instance of MATMUL for given argument types. template @@ -326,6 +341,8 @@ inline static RT_API_ATTRS void DoMatmulTranspose( } } +RT_DIAG_POP + template struct MatmulTransposeHelper { diff --git a/flang/runtime/matmul.cpp b/flang/runtime/matmul.cpp index a5737a9bc62075..252557e2f9e7ad 100644 --- a/flang/runtime/matmul.cpp +++ b/flang/runtime/matmul.cpp @@ -31,6 +31,11 @@ namespace { using namespace Fortran::runtime; +// Suppress the warnings about calling __host__-only std::complex operators, +// defined in C++ STD header files, from __device__ code. +RT_DIAG_PUSH +RT_DIAG_DISABLE_CALL_HOST_FROM_DEVICE_WARN + // General accumulator for any type and stride; this is not used for // contiguous numeric cases. 
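 // (Descriptive note: the RT_DIAG_PUSH / RT_DIAG_DISABLE_CALL_HOST_FROM_DEVICE_WARN
 // lines above and each matching RT_DIAG_POP below bracket regions that call
 // std::complex operators, which are host-only in CUDA builds.)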
template @@ -107,6 +112,8 @@ inline RT_API_ATTRS void MatrixTimesMatrix( } } +RT_DIAG_POP + template inline RT_API_ATTRS void MatrixTimesMatrixHelper( CppTypeFor *RESTRICT product, SubscriptValue rows, @@ -132,6 +139,9 @@ inline RT_API_ATTRS void MatrixTimesMatrixHelper( } } +RT_DIAG_PUSH +RT_DIAG_DISABLE_CALL_HOST_FROM_DEVICE_WARN + // Contiguous numeric matrix*vector multiplication // matrix(rows,n) * column vector(n) -> column vector(rows) // Straightforward algorithm: @@ -169,6 +179,8 @@ inline RT_API_ATTRS void MatrixTimesVector( } } +RT_DIAG_POP + template inline RT_API_ATTRS void MatrixTimesVectorHelper( CppTypeFor *RESTRICT product, SubscriptValue rows, @@ -182,6 +194,9 @@ inline RT_API_ATTRS void MatrixTimesVectorHelper( } } +RT_DIAG_PUSH +RT_DIAG_DISABLE_CALL_HOST_FROM_DEVICE_WARN + // Contiguous numeric vector*matrix multiplication // row vector(n) * matrix(n,cols) -> row vector(cols) // Straightforward algorithm: @@ -220,6 +235,8 @@ inline RT_API_ATTRS void VectorTimesMatrix( } } +RT_DIAG_POP + template inline RT_API_ATTRS void VectorTimesMatrixHelper( @@ -234,6 +251,9 @@ inline RT_API_ATTRS void VectorTimesMatrixHelper( } } +RT_DIAG_PUSH +RT_DIAG_DISABLE_CALL_HOST_FROM_DEVICE_WARN + // Implements an instance of MATMUL for given argument types. template @@ -324,9 +344,9 @@ static inline RT_API_ATTRS void DoMatmul( // TODO: try using CUTLASS for device. } else if constexpr (std::is_same_v) { // TODO: call BLAS-3 DGEMM - } else if constexpr (std::is_same_v>) { + } else if constexpr (std::is_same_v>) { // TODO: call BLAS-3 CGEMM - } else if constexpr (std::is_same_v>) { + } else if constexpr (std::is_same_v>) { // TODO: call BLAS-3 ZGEMM } } @@ -341,9 +361,9 @@ static inline RT_API_ATTRS void DoMatmul( // TODO: call BLAS-2 SGEMV(x,y) } else if constexpr (std::is_same_v) { // TODO: call BLAS-2 DGEMV(x,y) - } else if constexpr (std::is_same_v>) { + } else if constexpr (std::is_same_v>) { // TODO: call BLAS-2 CGEMV(x,y) - } else if constexpr (std::is_same_v>) { + } else if constexpr (std::is_same_v>) { // TODO: call BLAS-2 ZGEMV(x,y) } } @@ -357,9 +377,9 @@ static inline RT_API_ATTRS void DoMatmul( // TODO: call BLAS-2 SGEMV(y,x) } else if constexpr (std::is_same_v) { // TODO: call BLAS-2 DGEMV(y,x) - } else if constexpr (std::is_same_v>) { + } else if constexpr (std::is_same_v>) { // TODO: call BLAS-2 CGEMV(y,x) - } else if constexpr (std::is_same_v>) { + } else if constexpr (std::is_same_v>) { // TODO: call BLAS-2 ZGEMV(y,x) } } @@ -421,6 +441,8 @@ static inline RT_API_ATTRS void DoMatmul( } } +RT_DIAG_POP + template struct MatmulHelper { diff --git a/flang/runtime/numeric.cpp b/flang/runtime/numeric.cpp index 23f8da3f81f176..9a8ddc6615564d 100644 --- a/flang/runtime/numeric.cpp +++ b/flang/runtime/numeric.cpp @@ -144,7 +144,7 @@ inline RT_API_ATTRS CppTypeFor SelectedRealKind( #ifdef FLANG_RUNTIME_NO_REAL_3 mask &= ~(1 << 3); #endif -#if !HAS_FLOAT80 || defined FLANG_RUNTIME_NO_REAL_10 +#if LDBL_MANT_DIG < 64 || defined FLANG_RUNTIME_NO_REAL_10 mask &= ~(1 << 10); #endif #if LDBL_MANT_DIG < 64 || defined FLANG_RUNTIME_NO_REAL_16 @@ -276,7 +276,7 @@ CppTypeFor RTDEF(Ceiling8_16)( return Ceiling>(x); } #endif -#if HAS_FLOAT80 +#if LDBL_MANT_DIG == 64 CppTypeFor RTDEF(Ceiling10_1)( CppTypeFor x) { return Ceiling>(x); @@ -332,7 +332,7 @@ CppTypeFor RTDEF(ErfcScaled8)( CppTypeFor x) { return ErfcScaled(x); } -#if HAS_FLOAT80 +#if LDBL_MANT_DIG == 64 CppTypeFor RTDEF(ErfcScaled10)( CppTypeFor x) { return ErfcScaled(x); @@ -361,7 +361,7 @@ CppTypeFor RTDEF(Exponent8_8)( CppTypeFor x) { 
return Exponent>(x); } -#if HAS_FLOAT80 +#if LDBL_MANT_DIG == 64 CppTypeFor RTDEF(Exponent10_4)( CppTypeFor x) { return Exponent>(x); @@ -416,7 +416,7 @@ CppTypeFor RTDEF(Floor8_16)( return Floor>(x); } #endif -#if HAS_FLOAT80 +#if LDBL_MANT_DIG == 64 CppTypeFor RTDEF(Floor10_1)( CppTypeFor x) { return Floor>(x); @@ -472,7 +472,7 @@ CppTypeFor RTDEF(Fraction8)( CppTypeFor x) { return Fraction(x); } -#if HAS_FLOAT80 +#if LDBL_MANT_DIG == 64 CppTypeFor RTDEF(Fraction10)( CppTypeFor x) { return Fraction(x); @@ -485,7 +485,7 @@ bool RTDEF(IsFinite4)(CppTypeFor x) { bool RTDEF(IsFinite8)(CppTypeFor x) { return std::isfinite(x); } -#if HAS_FLOAT80 +#if LDBL_MANT_DIG == 64 bool RTDEF(IsFinite10)(CppTypeFor x) { return std::isfinite(x); } @@ -501,7 +501,7 @@ bool RTDEF(IsNaN4)(CppTypeFor x) { bool RTDEF(IsNaN8)(CppTypeFor x) { return std::isnan(x); } -#if HAS_FLOAT80 +#if LDBL_MANT_DIG == 64 bool RTDEF(IsNaN10)(CppTypeFor x) { return std::isnan(x); } @@ -553,7 +553,7 @@ CppTypeFor RTDEF(ModReal8)( const char *sourceFile, int sourceLine) { return RealMod(x, p, sourceFile, sourceLine); } -#if HAS_FLOAT80 +#if LDBL_MANT_DIG == 64 CppTypeFor RTDEF(ModReal10)( CppTypeFor x, CppTypeFor p, const char *sourceFile, int sourceLine) { @@ -603,7 +603,7 @@ CppTypeFor RTDEF(ModuloReal8)( const char *sourceFile, int sourceLine) { return RealMod(x, p, sourceFile, sourceLine); } -#if HAS_FLOAT80 +#if LDBL_MANT_DIG == 64 CppTypeFor RTDEF(ModuloReal10)( CppTypeFor x, CppTypeFor p, const char *sourceFile, int sourceLine) { @@ -619,7 +619,7 @@ CppTypeFor RTDEF(Nearest8)( CppTypeFor x, bool positive) { return Nearest<53>(x, positive); } -#if HAS_FLOAT80 +#if LDBL_MANT_DIG == 64 CppTypeFor RTDEF(Nearest10)( CppTypeFor x, bool positive) { return Nearest<64>(x, positive); @@ -670,7 +670,7 @@ CppTypeFor RTDEF(Nint8_16)( return Nint>(x); } #endif -#if HAS_FLOAT80 +#if LDBL_MANT_DIG == 64 CppTypeFor RTDEF(Nint10_1)( CppTypeFor x) { return Nint>(x); @@ -726,7 +726,7 @@ CppTypeFor RTDEF(RRSpacing8)( CppTypeFor x) { return RRSpacing<53>(x); } -#if HAS_FLOAT80 +#if LDBL_MANT_DIG == 64 CppTypeFor RTDEF(RRSpacing10)( CppTypeFor x) { return RRSpacing<64>(x); @@ -741,7 +741,7 @@ CppTypeFor RTDEF(SetExponent8)( CppTypeFor x, std::int64_t p) { return SetExponent(x, p); } -#if HAS_FLOAT80 +#if LDBL_MANT_DIG == 64 CppTypeFor RTDEF(SetExponent10)( CppTypeFor x, std::int64_t p) { return SetExponent(x, p); @@ -756,7 +756,7 @@ CppTypeFor RTDEF(Scale8)( CppTypeFor x, std::int64_t p) { return Scale(x, p); } -#if HAS_FLOAT80 +#if LDBL_MANT_DIG == 64 CppTypeFor RTDEF(Scale10)( CppTypeFor x, std::int64_t p) { return Scale(x, p); @@ -876,7 +876,7 @@ CppTypeFor RTDEF(Spacing8)( CppTypeFor x) { return Spacing<53>(x); } -#if HAS_FLOAT80 +#if LDBL_MANT_DIG == 64 CppTypeFor RTDEF(Spacing10)( CppTypeFor x) { return Spacing<64>(x); @@ -893,7 +893,7 @@ CppTypeFor RTDEF(FPow8i)( CppTypeFor e) { return FPowI(b, e); } -#if HAS_FLOAT80 +#if LDBL_MANT_DIG == 64 CppTypeFor RTDEF(FPow10i)( CppTypeFor b, CppTypeFor e) { @@ -918,7 +918,7 @@ CppTypeFor RTDEF(FPow8k)( CppTypeFor e) { return FPowI(b, e); } -#if HAS_FLOAT80 +#if LDBL_MANT_DIG == 64 CppTypeFor RTDEF(FPow10k)( CppTypeFor b, CppTypeFor e) { diff --git a/flang/runtime/product.cpp b/flang/runtime/product.cpp index 39b40d82b05401..7fc0fcd3b107de 100644 --- a/flang/runtime/product.cpp +++ b/flang/runtime/product.cpp @@ -36,11 +36,16 @@ template class NonComplexProductAccumulator { INTERMEDIATE product_{1}; }; +// Suppress the warnings about calling __host__-only std::complex operators, +// defined in C++ 
STD header files, from __device__ code. +RT_DIAG_PUSH +RT_DIAG_DISABLE_CALL_HOST_FROM_DEVICE_WARN + template class ComplexProductAccumulator { public: explicit RT_API_ATTRS ComplexProductAccumulator(const Descriptor &array) : array_{array} {} - RT_API_ATTRS void Reinitialize() { product_ = rtcmplx::complex{1, 0}; } + RT_API_ATTRS void Reinitialize() { product_ = std::complex{1, 0}; } template RT_API_ATTRS void GetResult(A *p, int /*zeroBasedDim*/ = -1) const { using ResultPart = typename A::value_type; @@ -55,9 +60,11 @@ template class ComplexProductAccumulator { private: const Descriptor &array_; - rtcmplx::complex product_{1, 0}; + std::complex product_{1, 0}; }; +RT_DIAG_POP + extern "C" { RT_EXT_API_GROUP_BEGIN @@ -109,7 +116,7 @@ CppTypeFor RTDEF(ProductReal8)(const Descriptor &x, NonComplexProductAccumulator>{x}, "PRODUCT"); } -#if HAS_FLOAT80 +#if LDBL_MANT_DIG == 64 CppTypeFor RTDEF(ProductReal10)(const Descriptor &x, const char *source, int line, int dim, const Descriptor *mask) { return GetTotalReduction(x, source, line, dim, mask, @@ -140,7 +147,7 @@ void RTDEF(CppProductComplex8)(CppTypeFor &result, mask, ComplexProductAccumulator>{x}, "PRODUCT"); } -#if HAS_FLOAT80 +#if LDBL_MANT_DIG == 64 void RTDEF(CppProductComplex10)(CppTypeFor &result, const Descriptor &x, const char *source, int line, int dim, const Descriptor *mask) { diff --git a/flang/runtime/random.cpp b/flang/runtime/random.cpp index 9ec961fd058745..69de9b8c96fb5d 100644 --- a/flang/runtime/random.cpp +++ b/flang/runtime/random.cpp @@ -66,7 +66,7 @@ void RTNAME(RandomNumber)( return; case 10: if constexpr (HasCppTypeFor) { -#if HAS_FLOAT80 +#if LDBL_MANT_DIG == 64 Generate, 64>(harvest); return; #endif diff --git a/flang/runtime/reduce.cpp b/flang/runtime/reduce.cpp index 6b62e1cf1e76f1..2f4bb6ea159cf4 100644 --- a/flang/runtime/reduce.cpp +++ b/flang/runtime/reduce.cpp @@ -395,49 +395,45 @@ void RTDEF(ReduceReal8DimValue)(Descriptor &result, const Descriptor &array, PartialReduction(result, array, array.ElementBytes(), dim, mask, terminator, "REDUCE", accumulator); } -#if HAS_FLOAT80 -CppTypeFor RTDEF(ReduceReal10Ref)( - const Descriptor &array, - ReferenceReductionOperation> operation, - const char *source, int line, int dim, const Descriptor *mask, - const CppTypeFor *identity, bool ordered) { +#if LDBL_MANT_DIG == 64 +long double RTDEF(ReduceReal10Ref)(const Descriptor &array, + ReferenceReductionOperation operation, const char *source, + int line, int dim, const Descriptor *mask, const long double *identity, + bool ordered) { Terminator terminator{source, line}; return GetTotalReduction(array, source, line, dim, mask, - ReduceAccumulator, false>{ + ReduceAccumulator{ array, operation, identity, terminator}, "REDUCE"); } -CppTypeFor RTDEF(ReduceReal10Value)( - const Descriptor &array, - ValueReductionOperation> operation, - const char *source, int line, int dim, const Descriptor *mask, - const CppTypeFor *identity, bool ordered) { +long double RTDEF(ReduceReal10Value)(const Descriptor &array, + ValueReductionOperation operation, const char *source, + int line, int dim, const Descriptor *mask, const long double *identity, + bool ordered) { Terminator terminator{source, line}; return GetTotalReduction(array, source, line, dim, mask, - ReduceAccumulator, true>{ + ReduceAccumulator{ array, operation, identity, terminator}, "REDUCE"); } void RTDEF(ReduceReal10DimRef)(Descriptor &result, const Descriptor &array, - ReferenceReductionOperation> operation, - const char *source, int line, int dim, const Descriptor *mask, 
- const CppTypeFor *identity, bool ordered) { + ReferenceReductionOperation operation, const char *source, + int line, int dim, const Descriptor *mask, const long double *identity, + bool ordered) { Terminator terminator{source, line}; - using Accumulator = - ReduceAccumulator, false>; + using Accumulator = ReduceAccumulator; Accumulator accumulator{array, operation, identity, terminator}; PartialReduction(result, array, array.ElementBytes(), dim, mask, terminator, "REDUCE", accumulator); } void RTDEF(ReduceReal10DimValue)(Descriptor &result, const Descriptor &array, - ValueReductionOperation> operation, - const char *source, int line, int dim, const Descriptor *mask, - const CppTypeFor *identity, bool ordered) { + ValueReductionOperation operation, const char *source, + int line, int dim, const Descriptor *mask, const long double *identity, + bool ordered) { Terminator terminator{source, line}; - using Accumulator = - ReduceAccumulator, true>; + using Accumulator = ReduceAccumulator; Accumulator accumulator{array, operation, identity, terminator}; PartialReduction(result, array, array.ElementBytes(), dim, mask, terminator, "REDUCE", accumulator); @@ -488,199 +484,187 @@ void RTDEF(ReduceReal16DimValue)(Descriptor &result, const Descriptor &array, } #endif -void RTDEF(CppReduceComplex4Ref)(CppTypeFor &result, +void RTDEF(CppReduceComplex4Ref)(std::complex &result, const Descriptor &array, - ReferenceReductionOperation> operation, + ReferenceReductionOperation> operation, const char *source, int line, int dim, const Descriptor *mask, - const CppTypeFor *identity, bool ordered) { + const std::complex *identity, bool ordered) { Terminator terminator{source, line}; result = GetTotalReduction(array, source, line, dim, mask, - ReduceAccumulator, false>{ + ReduceAccumulator, false>{ array, operation, identity, terminator}, "REDUCE"); } -void RTDEF(CppReduceComplex4Value)(CppTypeFor &result, +void RTDEF(CppReduceComplex4Value)(std::complex &result, const Descriptor &array, - ValueReductionOperation> operation, - const char *source, int line, int dim, const Descriptor *mask, - const CppTypeFor *identity, bool ordered) { + ValueReductionOperation> operation, const char *source, + int line, int dim, const Descriptor *mask, + const std::complex *identity, bool ordered) { Terminator terminator{source, line}; result = GetTotalReduction(array, source, line, dim, mask, - ReduceAccumulator, true>{ + ReduceAccumulator, true>{ array, operation, identity, terminator}, "REDUCE"); } void RTDEF(CppReduceComplex4DimRef)(Descriptor &result, const Descriptor &array, - ReferenceReductionOperation> operation, + ReferenceReductionOperation> operation, const char *source, int line, int dim, const Descriptor *mask, - const CppTypeFor *identity, bool ordered) { + const std::complex *identity, bool ordered) { Terminator terminator{source, line}; - using Accumulator = - ReduceAccumulator, false>; + using Accumulator = ReduceAccumulator, false>; Accumulator accumulator{array, operation, identity, terminator}; PartialReduction(result, array, array.ElementBytes(), dim, mask, terminator, "REDUCE", accumulator); } void RTDEF(CppReduceComplex4DimValue)(Descriptor &result, const Descriptor &array, - ValueReductionOperation> operation, - const char *source, int line, int dim, const Descriptor *mask, - const CppTypeFor *identity, bool ordered) { + ValueReductionOperation> operation, const char *source, + int line, int dim, const Descriptor *mask, + const std::complex *identity, bool ordered) { Terminator terminator{source, line}; - 
using Accumulator = - ReduceAccumulator, true>; + using Accumulator = ReduceAccumulator, true>; Accumulator accumulator{array, operation, identity, terminator}; PartialReduction(result, array, array.ElementBytes(), dim, mask, terminator, "REDUCE", accumulator); } -void RTDEF(CppReduceComplex8Ref)(CppTypeFor &result, +void RTDEF(CppReduceComplex8Ref)(std::complex &result, const Descriptor &array, - ReferenceReductionOperation> operation, + ReferenceReductionOperation> operation, const char *source, int line, int dim, const Descriptor *mask, - const CppTypeFor *identity, bool ordered) { + const std::complex *identity, bool ordered) { Terminator terminator{source, line}; result = GetTotalReduction(array, source, line, dim, mask, - ReduceAccumulator, false>{ + ReduceAccumulator, false>{ array, operation, identity, terminator}, "REDUCE"); } -void RTDEF(CppReduceComplex8Value)(CppTypeFor &result, +void RTDEF(CppReduceComplex8Value)(std::complex &result, const Descriptor &array, - ValueReductionOperation> operation, - const char *source, int line, int dim, const Descriptor *mask, - const CppTypeFor *identity, bool ordered) { + ValueReductionOperation> operation, const char *source, + int line, int dim, const Descriptor *mask, + const std::complex *identity, bool ordered) { Terminator terminator{source, line}; result = GetTotalReduction(array, source, line, dim, mask, - ReduceAccumulator, true>{ + ReduceAccumulator, true>{ array, operation, identity, terminator}, "REDUCE"); } void RTDEF(CppReduceComplex8DimRef)(Descriptor &result, const Descriptor &array, - ReferenceReductionOperation> operation, + ReferenceReductionOperation> operation, const char *source, int line, int dim, const Descriptor *mask, - const CppTypeFor *identity, bool ordered) { + const std::complex *identity, bool ordered) { Terminator terminator{source, line}; - using Accumulator = - ReduceAccumulator, false>; + using Accumulator = ReduceAccumulator, false>; Accumulator accumulator{array, operation, identity, terminator}; PartialReduction(result, array, array.ElementBytes(), dim, mask, terminator, "REDUCE", accumulator); } void RTDEF(CppReduceComplex8DimValue)(Descriptor &result, const Descriptor &array, - ValueReductionOperation> operation, - const char *source, int line, int dim, const Descriptor *mask, - const CppTypeFor *identity, bool ordered) { + ValueReductionOperation> operation, const char *source, + int line, int dim, const Descriptor *mask, + const std::complex *identity, bool ordered) { Terminator terminator{source, line}; - using Accumulator = - ReduceAccumulator, true>; + using Accumulator = ReduceAccumulator, true>; Accumulator accumulator{array, operation, identity, terminator}; PartialReduction(result, array, array.ElementBytes(), dim, mask, terminator, "REDUCE", accumulator); } -#if HAS_FLOAT80 -void RTDEF(CppReduceComplex10Ref)(CppTypeFor &result, +#if LDBL_MANT_DIG == 64 +void RTDEF(CppReduceComplex10Ref)(std::complex &result, const Descriptor &array, - ReferenceReductionOperation> - operation, + ReferenceReductionOperation> operation, const char *source, int line, int dim, const Descriptor *mask, - const CppTypeFor *identity, bool ordered) { + const std::complex *identity, bool ordered) { Terminator terminator{source, line}; result = GetTotalReduction(array, source, line, dim, mask, - ReduceAccumulator, false>{ + ReduceAccumulator, false>{ array, operation, identity, terminator}, "REDUCE"); } -void RTDEF(CppReduceComplex10Value)( - CppTypeFor &result, const Descriptor &array, - ValueReductionOperation> 
operation, +void RTDEF(CppReduceComplex10Value)(std::complex &result, + const Descriptor &array, + ValueReductionOperation> operation, const char *source, int line, int dim, const Descriptor *mask, - const CppTypeFor *identity, bool ordered) { + const std::complex *identity, bool ordered) { Terminator terminator{source, line}; result = GetTotalReduction(array, source, line, dim, mask, - ReduceAccumulator, true>{ + ReduceAccumulator, true>{ array, operation, identity, terminator}, "REDUCE"); } void RTDEF(CppReduceComplex10DimRef)(Descriptor &result, const Descriptor &array, - ReferenceReductionOperation> - operation, + ReferenceReductionOperation> operation, const char *source, int line, int dim, const Descriptor *mask, - const CppTypeFor *identity, bool ordered) { + const std::complex *identity, bool ordered) { Terminator terminator{source, line}; - using Accumulator = - ReduceAccumulator, false>; + using Accumulator = ReduceAccumulator, false>; Accumulator accumulator{array, operation, identity, terminator}; PartialReduction(result, array, array.ElementBytes(), dim, mask, terminator, "REDUCE", accumulator); } void RTDEF(CppReduceComplex10DimValue)(Descriptor &result, const Descriptor &array, - ValueReductionOperation> operation, + ValueReductionOperation> operation, const char *source, int line, int dim, const Descriptor *mask, - const CppTypeFor *identity, bool ordered) { + const std::complex *identity, bool ordered) { Terminator terminator{source, line}; - using Accumulator = - ReduceAccumulator, true>; + using Accumulator = ReduceAccumulator, true>; Accumulator accumulator{array, operation, identity, terminator}; PartialReduction(result, array, array.ElementBytes(), dim, mask, terminator, "REDUCE", accumulator); } #endif #if LDBL_MANT_DIG == 113 || HAS_FLOAT128 -void RTDEF(CppReduceComplex16Ref)(CppTypeFor &result, +void RTDEF(CppReduceComplex16Ref)(std::complex &result, const Descriptor &array, - ReferenceReductionOperation> - operation, + ReferenceReductionOperation> operation, const char *source, int line, int dim, const Descriptor *mask, - const CppTypeFor *identity, bool ordered) { + const std::complex *identity, bool ordered) { Terminator terminator{source, line}; result = GetTotalReduction(array, source, line, dim, mask, - ReduceAccumulator, false>{ + ReduceAccumulator, false>{ array, operation, identity, terminator}, "REDUCE"); } -void RTDEF(CppReduceComplex16Value)( - CppTypeFor &result, const Descriptor &array, - ValueReductionOperation> operation, +void RTDEF(CppReduceComplex16Value)(std::complex &result, + const Descriptor &array, + ValueReductionOperation> operation, const char *source, int line, int dim, const Descriptor *mask, - const CppTypeFor *identity, bool ordered) { + const std::complex *identity, bool ordered) { Terminator terminator{source, line}; result = GetTotalReduction(array, source, line, dim, mask, - ReduceAccumulator, true>{ + ReduceAccumulator, true>{ array, operation, identity, terminator}, "REDUCE"); } void RTDEF(CppReduceComplex16DimRef)(Descriptor &result, const Descriptor &array, - ReferenceReductionOperation> - operation, + ReferenceReductionOperation> operation, const char *source, int line, int dim, const Descriptor *mask, - const CppTypeFor *identity, bool ordered) { + const std::complex *identity, bool ordered) { Terminator terminator{source, line}; - using Accumulator = - ReduceAccumulator, false>; + using Accumulator = ReduceAccumulator, false>; Accumulator accumulator{array, operation, identity, terminator}; PartialReduction(result, array, 
array.ElementBytes(), dim, mask, terminator, "REDUCE", accumulator); } void RTDEF(CppReduceComplex16DimValue)(Descriptor &result, const Descriptor &array, - ValueReductionOperation> operation, + ValueReductionOperation> operation, const char *source, int line, int dim, const Descriptor *mask, - const CppTypeFor *identity, bool ordered) { + const std::complex *identity, bool ordered) { Terminator terminator{source, line}; - using Accumulator = - ReduceAccumulator, true>; + using Accumulator = ReduceAccumulator, true>; Accumulator accumulator{array, operation, identity, terminator}; PartialReduction(result, array, array.ElementBytes(), dim, mask, terminator, "REDUCE", accumulator); diff --git a/flang/runtime/reduction-templates.h b/flang/runtime/reduction-templates.h index 6b7d57f98384ae..a51404c9637620 100644 --- a/flang/runtime/reduction-templates.h +++ b/flang/runtime/reduction-templates.h @@ -321,8 +321,8 @@ RT_VAR_GROUP_BEGIN static constexpr RT_CONST_VAR_ATTRS int Norm2LargestLDKind { #if LDBL_MANT_DIG == 113 || HAS_FLOAT128 16 -#elif HAS_FLOAT80 - 10 +#elif LDBL_MANT_DIG == 64 + 10 #else 8 #endif diff --git a/flang/runtime/sum.cpp b/flang/runtime/sum.cpp index 88c6c914e1e243..63d8c9029a0ef5 100644 --- a/flang/runtime/sum.cpp +++ b/flang/runtime/sum.cpp @@ -141,18 +141,18 @@ CppTypeFor RTDEF(SumReal8)(const Descriptor &x, return GetTotalReduction( x, source, line, dim, mask, RealSumAccumulator{x}, "SUM"); } -#if HAS_FLOAT80 +#if LDBL_MANT_DIG == 64 CppTypeFor RTDEF(SumReal10)(const Descriptor &x, const char *source, int line, int dim, const Descriptor *mask) { - return GetTotalReduction(x, source, line, dim, mask, - RealSumAccumulator>{x}, "SUM"); + return GetTotalReduction( + x, source, line, dim, mask, RealSumAccumulator{x}, "SUM"); } #endif #if LDBL_MANT_DIG == 113 || HAS_FLOAT128 CppTypeFor RTDEF(SumReal16)(const Descriptor &x, const char *source, int line, int dim, const Descriptor *mask) { - return GetTotalReduction(x, source, line, dim, mask, - RealSumAccumulator>{x}, "SUM"); + return GetTotalReduction( + x, source, line, dim, mask, RealSumAccumulator{x}, "SUM"); } #endif @@ -168,22 +168,20 @@ void RTDEF(CppSumComplex8)(CppTypeFor &result, result = GetTotalReduction( x, source, line, dim, mask, ComplexSumAccumulator{x}, "SUM"); } -#if HAS_FLOAT80 +#if LDBL_MANT_DIG == 64 void RTDEF(CppSumComplex10)(CppTypeFor &result, const Descriptor &x, const char *source, int line, int dim, const Descriptor *mask) { - result = - GetTotalReduction(x, source, line, dim, mask, - ComplexSumAccumulator>{x}, "SUM"); + result = GetTotalReduction( + x, source, line, dim, mask, ComplexSumAccumulator{x}, "SUM"); } #endif #if LDBL_MANT_DIG == 113 || HAS_FLOAT128 void RTDEF(CppSumComplex16)(CppTypeFor &result, const Descriptor &x, const char *source, int line, int dim, const Descriptor *mask) { - result = - GetTotalReduction(x, source, line, dim, mask, - ComplexSumAccumulator>{x}, "SUM"); + result = GetTotalReduction( + x, source, line, dim, mask, ComplexSumAccumulator{x}, "SUM"); } #endif diff --git a/flang/runtime/transformational.cpp b/flang/runtime/transformational.cpp index 0ce18171274e42..b6b204be4418c9 100644 --- a/flang/runtime/transformational.cpp +++ b/flang/runtime/transformational.cpp @@ -342,7 +342,7 @@ void RTDEF(BesselJn_8)(Descriptor &result, int32_t n1, int32_t n2, result, n1, n2, x, bn2, bn2_1, sourceFile, line); } -#if HAS_FLOAT80 +#if LDBL_MANT_DIG == 64 void RTDEF(BesselJn_10)(Descriptor &result, int32_t n1, int32_t n2, CppTypeFor x, CppTypeFor bn2, @@ -375,7 +375,7 @@ void 
RTDEF(BesselJnX0_8)(Descriptor &result, int32_t n1, int32_t n2, DoBesselJnX0(result, n1, n2, sourceFile, line); } -#if HAS_FLOAT80 +#if LDBL_MANT_DIG == 64 void RTDEF(BesselJnX0_10)(Descriptor &result, int32_t n1, int32_t n2, const char *sourceFile, int line) { DoBesselJnX0(result, n1, n2, sourceFile, line); @@ -405,7 +405,7 @@ void RTDEF(BesselYn_8)(Descriptor &result, int32_t n1, int32_t n2, result, n1, n2, x, bn1, bn1_1, sourceFile, line); } -#if HAS_FLOAT80 +#if LDBL_MANT_DIG == 64 void RTDEF(BesselYn_10)(Descriptor &result, int32_t n1, int32_t n2, CppTypeFor x, CppTypeFor bn1, @@ -438,7 +438,7 @@ void RTDEF(BesselYnX0_8)(Descriptor &result, int32_t n1, int32_t n2, DoBesselYnX0(result, n1, n2, sourceFile, line); } -#if HAS_FLOAT80 +#if LDBL_MANT_DIG == 64 void RTDEF(BesselYnX0_10)(Descriptor &result, int32_t n1, int32_t n2, const char *sourceFile, int line) { DoBesselYnX0(result, n1, n2, sourceFile, line); diff --git a/flang/unittests/Runtime/Numeric.cpp b/flang/unittests/Runtime/Numeric.cpp index 3e574c06b091e8..799756aab3839a 100644 --- a/flang/unittests/Runtime/Numeric.cpp +++ b/flang/unittests/Runtime/Numeric.cpp @@ -34,7 +34,7 @@ TEST(Numeric, Floor) { TEST(Numeric, Erfc_scaled) { EXPECT_NEAR(RTNAME(ErfcScaled4)(Real<4>{20.0}), 0.02817434874, 1.0e-8); EXPECT_NEAR(RTNAME(ErfcScaled8)(Real<8>{20.0}), 0.02817434874, 1.0e-11); -#if HAS_FLOAT80 +#if LDBL_MANT_DIG == 64 EXPECT_NEAR(RTNAME(ErfcScaled10)(Real<10>{20.0}), 0.02817434874, 1.0e-8); #endif } @@ -295,7 +295,7 @@ TEST(Numeric, FPowI) { EXPECT_EQ(RTNAME(FPow8k)(Real<8>{-3}, Int<8>{3}), Real<8>{-27}); EXPECT_EQ(RTNAME(FPow8k)(Real<8>{-2}, Int<8>{-3}), Real<8>{-0.125}); -#if HAS_FLOAT80 +#if LDBL_MANT_DIG == 64 EXPECT_EQ(RTNAME(FPow10i)(Real<10>{0}, Int<4>{0}), Real<10>{1}); EXPECT_EQ(RTNAME(FPow10i)(Real<10>{0.3}, Int<4>{0}), Real<10>{1}); EXPECT_EQ(RTNAME(FPow10i)(Real<10>{2}, Int<4>{-1}), Real<10>{0.5}); diff --git a/flang/unittests/Runtime/Transformational.cpp b/flang/unittests/Runtime/Transformational.cpp index b36ea0a60c670c..5836e70c740f9a 100644 --- a/flang/unittests/Runtime/Transformational.cpp +++ b/flang/unittests/Runtime/Transformational.cpp @@ -108,7 +108,7 @@ template static void testBesselJnX0(BesselX0FuncType rtFunc) { static void testBesselJn() { testBesselJn<4>(RTNAME(BesselJn_4)); testBesselJn<8>(RTNAME(BesselJn_8)); -#if HAS_FLOAT80 +#if LDBL_MANT_DIG == 64 testBesselJn<10>(RTNAME(BesselJn_10)); #endif #if LDBL_MANT_DIG == 113 || HAS_FLOAT128 @@ -117,7 +117,7 @@ static void testBesselJn() { testBesselJnX0<4>(RTNAME(BesselJnX0_4)); testBesselJnX0<8>(RTNAME(BesselJnX0_8)); -#if HAS_FLOAT80 +#if LDBL_MANT_DIG == 64 testBesselJnX0<10>(RTNAME(BesselJnX0_10)); #endif #if LDBL_MANT_DIG == 113 || HAS_FLOAT128 @@ -201,7 +201,7 @@ template static void testBesselYnX0(BesselX0FuncType rtFunc) { static void testBesselYn() { testBesselYn<4>(RTNAME(BesselYn_4)); testBesselYn<8>(RTNAME(BesselYn_8)); -#if HAS_FLOAT80 +#if LDBL_MANT_DIG == 64 testBesselYn<10>(RTNAME(BesselYn_10)); #endif #if LDBL_MANT_DIG == 113 || HAS_FLOAT128 @@ -210,7 +210,7 @@ static void testBesselYn() { testBesselYnX0<4>(RTNAME(BesselYnX0_4)); testBesselYnX0<8>(RTNAME(BesselYnX0_8)); -#if HAS_FLOAT80 +#if LDBL_MANT_DIG == 64 testBesselYnX0<10>(RTNAME(BesselYnX0_10)); #endif #if LDBL_MANT_DIG == 113 || HAS_FLOAT128 @@ -523,7 +523,7 @@ TEST(Transformational, Unpack) { result.Destroy(); } -#if HAS_FLOAT80 +#if LDBL_MANT_DIG == 64 // Make sure the destination descriptor is created by the runtime // with proper element size, when REAL*10 maps to 'long double'. 
#define Real10CppType long double From 84d7f294c485e36947b412cbfa69ad706ce6c9f0 Mon Sep 17 00:00:00 2001 From: Youngsuk Kim Date: Wed, 18 Sep 2024 13:19:27 -0500 Subject: [PATCH 101/321] [flang] Tidy uses of raw_string_ostream (NFC) As specified in the docs, 1) raw_string_ostream is always unbuffered and 2) the underlying buffer may be used directly (see 65b13610a5226b84889b923bae884ba395ad084d for further reference). Avoid unneeded calls to raw_string_ostream::str(), which only add excess indirection. --- flang/lib/Frontend/FrontendActions.cpp | 9 ++++----- flang/lib/Optimizer/Dialect/FIRType.cpp | 5 ++--- flang/lib/Parser/parsing.cpp | 4 ++-- 3 files changed, 8 insertions(+), 10 deletions(-) diff --git a/flang/lib/Frontend/FrontendActions.cpp b/flang/lib/Frontend/FrontendActions.cpp index cda82bcb7ecc71..267c3ceb44f33e 100644 --- a/flang/lib/Frontend/FrontendActions.cpp +++ b/flang/lib/Frontend/FrontendActions.cpp @@ -425,7 +425,7 @@ void PrintPreprocessedAction::executeAction() { // If a pre-defined output stream exists, dump the preprocessed content there if (!ci.isOutputStreamNull()) { // Send the output to the pre-defined output buffer. - ci.writeOutputStream(outForPP.str()); + ci.writeOutputStream(buf); return; } @@ -436,7 +436,7 @@ void PrintPreprocessedAction::executeAction() { return; } - (*os) << outForPP.str(); + (*os) << buf; } void DebugDumpProvenanceAction::executeAction() { @@ -756,7 +756,7 @@ getRISCVVScaleRange(CompilerInstance &ci) { outputErrMsg << errMsg.getMessage(); }); ci.getDiagnostics().Report(clang::diag::err_invalid_feature_combination) - << outputErrMsg.str(); + << buffer; return std::nullopt; } @@ -1091,8 +1091,7 @@ class BackendRemarkConsumer : public llvm::DiagnosticHandler { msgStream << diagInfo.getMsg(); // Emit message. - diags.Report(diagID) << clang::AddFlagValue(diagInfo.getPassName()) - << msgStream.str(); + diags.Report(diagID) << clang::AddFlagValue(diagInfo.getPassName()) << msg; } void optimizationRemarkHandler( diff --git a/flang/lib/Optimizer/Dialect/FIRType.cpp b/flang/lib/Optimizer/Dialect/FIRType.cpp index 05f644654efe1b..7a516298e5ef4f 100644 --- a/flang/lib/Optimizer/Dialect/FIRType.cpp +++ b/flang/lib/Optimizer/Dialect/FIRType.cpp @@ -533,9 +533,8 @@ int getTypeCode(mlir::Type ty, const fir::KindMapping &kindMap) { std::string getTypeAsString(mlir::Type ty, const fir::KindMapping &kindMap, llvm::StringRef prefix) { - std::string buf; + std::string buf = prefix.str(); llvm::raw_string_ostream name{buf}; - name << prefix.str(); if (!prefix.empty()) name << "_"; while (ty) { @@ -606,7 +605,7 @@ std::string getTypeAsString(mlir::Type ty, const fir::KindMapping &kindMap, llvm::report_fatal_error("unsupported type"); } } - return name.str(); + return buf; } mlir::Type changeElementType(mlir::Type type, mlir::Type newElementType, diff --git a/flang/lib/Parser/parsing.cpp b/flang/lib/Parser/parsing.cpp index 43a898ff120c5d..37dc113436aa0e 100644 --- a/flang/lib/Parser/parsing.cpp +++ b/flang/lib/Parser/parsing.cpp @@ -42,9 +42,9 @@ const SourceFile *Parsing::Prescan(const std::string &path, Options options) { sourceFile = allSources.Open(path, fileError, "."s /*prepend to search path*/); } - if (!fileError.str().empty()) { + if (!buf.empty()) { ProvenanceRange range{allSources.AddCompilerInsertion(path)}; - messages_.Say(range, "%s"_err_en_US, fileError.str()); + messages_.Say(range, "%s"_err_en_US, buf); return sourceFile; } CHECK(sourceFile); From 71e434d302802901a59f686f878d105dad646601 Mon Sep 17 00:00:00 2001 From: Jorge Gorbe Moya Date: Wed, 18 Sep
2024 11:36:45 -0700 Subject: [PATCH 102/321] [SandboxVec] Reapply "Add barebones Region class. (#108899)" (#109059) A `#ifndef NDEBUG` in the wrong place caused an error in release builds. --- .../Vectorize/SandboxVectorizer/Region.h | 104 ++++++++++++++++++ llvm/lib/Transforms/Vectorize/CMakeLists.txt | 1 + .../Vectorize/SandboxVectorizer/Region.cpp | 45 ++++++++ .../SandboxVectorizer/CMakeLists.txt | 1 + .../SandboxVectorizer/RegionTest.cpp | 81 ++++++++++++++ 5 files changed, 232 insertions(+) create mode 100644 llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Region.h create mode 100644 llvm/lib/Transforms/Vectorize/SandboxVectorizer/Region.cpp create mode 100644 llvm/unittests/Transforms/Vectorize/SandboxVectorizer/RegionTest.cpp diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Region.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Region.h new file mode 100644 index 00000000000000..2f893bac213a01 --- /dev/null +++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Region.h @@ -0,0 +1,104 @@ +//===- Region.h -------------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_VECTORIZE_SANDBOXVECTORIZER_REGION_H +#define LLVM_TRANSFORMS_VECTORIZE_SANDBOXVECTORIZER_REGION_H + +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/iterator_range.h" +#include "llvm/SandboxIR/SandboxIR.h" +#include "llvm/Support/InstructionCost.h" +#include "llvm/Support/raw_ostream.h" + +namespace llvm::sandboxir { + +/// The main job of the Region is to point to new instructions generated by +/// vectorization passes. It is the unit that RegionPasses operate on with their +/// runOnRegion() function. +/// +/// The region allows us to stack transformations horizontally, meaning that +/// each transformation operates on a single region and the resulting region is +/// the input to the next transformation, as opposed to vertically, which is the +/// common way of applying a transformation across the whole function. This +/// enables us to check for profitability and decide whether we accept or +/// rollback at a region granularity, which is much better than doing this at +/// the function level. +/// +// Traditional approach: transformations applied vertically for the whole +// function +// F +// +----+ +// | | +// | | +// | | -> Transform1 -> ... -> TransformN -> Check Cost +// | | +// | | +// +----+ +// +// Region-based approach: transformations applied horizontally, for each Region +// F +// +----+ +// |Rgn1| -> Transform1 -> ... -> TransformN -> Check Cost +// | | +// |Rgn2| -> Transform1 -> ... -> TransformN -> Check Cost +// | | +// |Rgn3| -> Transform1 -> ... -> TransformN -> Check Cost +// +----+ + +class Region { + /// All the instructions in the Region. Only new instructions generated during + /// vectorization are part of the Region. + SetVector Insts; + + /// A unique ID, used for debugging. + unsigned RegionID = 0; + + Context &Ctx; + + // TODO: Add cost modeling. + // TODO: Add a way to encode/decode region info to/from metadata. + +public: + Region(Context &Ctx); + ~Region(); + + Context &getContext() const { return Ctx; } + /// Returns the region's unique ID. + unsigned getID() const { return RegionID; } + + /// Adds I to the set. 
+ void add(Instruction *I); + /// Removes I from the set. + void remove(Instruction *I); + /// Returns true if I is in the Region. + bool contains(Instruction *I) const { return Insts.contains(I); } + /// Returns true if the Region has no instructions. + bool empty() const { return Insts.empty(); } + + using iterator = decltype(Insts.begin()); + iterator begin() { return Insts.begin(); } + iterator end() { return Insts.end(); } + iterator_range insts() { return make_range(begin(), end()); } + +#ifndef NDEBUG + /// This is an expensive check, meant for testing. + bool operator==(const Region &Other) const; + bool operator!=(const Region &other) const { return !(*this == other); } + + void dump(raw_ostream &OS) const; + void dump() const; + friend raw_ostream &operator<<(raw_ostream &OS, const Region &Rgn) { + Rgn.dump(OS); + return OS; + } +#endif +}; + +} // namespace llvm::sandboxir + +#endif // LLVM_TRANSFORMS_VECTORIZE_SANDBOXVECTORIZER_REGION_H diff --git a/llvm/lib/Transforms/Vectorize/CMakeLists.txt b/llvm/lib/Transforms/Vectorize/CMakeLists.txt index 59d04ac3cecd00..f33906b05fedd1 100644 --- a/llvm/lib/Transforms/Vectorize/CMakeLists.txt +++ b/llvm/lib/Transforms/Vectorize/CMakeLists.txt @@ -5,6 +5,7 @@ add_llvm_component_library(LLVMVectorize LoopVectorize.cpp SandboxVectorizer/DependencyGraph.cpp SandboxVectorizer/Passes/BottomUpVec.cpp + SandboxVectorizer/Region.cpp SandboxVectorizer/SandboxVectorizer.cpp SLPVectorizer.cpp Vectorize.cpp diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Region.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Region.cpp new file mode 100644 index 00000000000000..34aa9f3786f34c --- /dev/null +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Region.cpp @@ -0,0 +1,45 @@ +//===- Region.cpp ---------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Vectorize/SandboxVectorizer/Region.h" + +namespace llvm::sandboxir { + +Region::Region(Context &Ctx) : Ctx(Ctx) { + static unsigned StaticRegionID; + RegionID = StaticRegionID++; +} + +Region::~Region() {} + +void Region::add(Instruction *I) { Insts.insert(I); } + +void Region::remove(Instruction *I) { Insts.remove(I); } + +#ifndef NDEBUG +bool Region::operator==(const Region &Other) const { + if (Insts.size() != Other.Insts.size()) + return false; + if (!std::is_permutation(Insts.begin(), Insts.end(), Other.Insts.begin())) + return false; + return true; +} + +void Region::dump(raw_ostream &OS) const { + OS << "RegionID: " << getID() << "\n"; + for (auto *I : Insts) + OS << *I << "\n"; +} + +void Region::dump() const { + dump(dbgs()); + dbgs() << "\n"; +} +#endif // NDEBUG + +} // namespace llvm::sandboxir diff --git a/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/CMakeLists.txt b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/CMakeLists.txt index 2c7bf7d7e87541..10a730290608b7 100644 --- a/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/CMakeLists.txt +++ b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/CMakeLists.txt @@ -10,4 +10,5 @@ set(LLVM_LINK_COMPONENTS add_llvm_unittest(SandboxVectorizerTests DependencyGraphTest.cpp LegalityTest.cpp + RegionTest.cpp ) diff --git a/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/RegionTest.cpp b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/RegionTest.cpp new file mode 100644 index 00000000000000..2c7390c515f114 --- /dev/null +++ b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/RegionTest.cpp @@ -0,0 +1,81 @@ +//===- RegionTest.cpp -----------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Vectorize/SandboxVectorizer/Region.h" +#include "llvm/AsmParser/Parser.h" +#include "llvm/SandboxIR/SandboxIR.h" +#include "llvm/Support/SourceMgr.h" +#include "gmock/gmock-matchers.h" +#include "gtest/gtest.h" + +using namespace llvm; + +struct RegionTest : public testing::Test { + LLVMContext C; + std::unique_ptr M; + + void parseIR(LLVMContext &C, const char *IR) { + SMDiagnostic Err; + M = parseAssemblyString(IR, Err, C); + if (!M) + Err.print("RegionTest", errs()); + } +}; + +TEST_F(RegionTest, Basic) { + parseIR(C, R"IR( +define i8 @foo(i8 %v0, i8 %v1) { + %t0 = add i8 %v0, 1 + %t1 = add i8 %t0, %v1 + ret i8 %t1 +} +)IR"); + llvm::Function *LLVMF = &*M->getFunction("foo"); + sandboxir::Context Ctx(C); + auto *F = Ctx.createFunction(LLVMF); + auto *BB = &*F->begin(); + auto It = BB->begin(); + auto *T0 = cast(&*It++); + auto *T1 = cast(&*It++); + auto *Ret = cast(&*It++); + sandboxir::Region Rgn(Ctx); + + // Check getters + EXPECT_EQ(&Ctx, &Rgn.getContext()); + EXPECT_EQ(0U, Rgn.getID()); + + // Check add / remove / empty. + EXPECT_TRUE(Rgn.empty()); + Rgn.add(T0); + EXPECT_FALSE(Rgn.empty()); + Rgn.remove(T0); + EXPECT_TRUE(Rgn.empty()); + + // Check iteration. + Rgn.add(T0); + Rgn.add(T1); + Rgn.add(Ret); + // Use an ordered matcher because we're supposed to preserve the insertion + // order for determinism. 
+ EXPECT_THAT(Rgn.insts(), testing::ElementsAre(T0, T1, Ret)); + + // Check contains + EXPECT_TRUE(Rgn.contains(T0)); + Rgn.remove(T0); + EXPECT_FALSE(Rgn.contains(T0)); + +#ifndef NDEBUG + // Check equality comparison. Insert in reverse order into `Other` to check + // that comparison is order-independent. + sandboxir::Region Other(Ctx); + Other.add(Ret); + EXPECT_NE(Rgn, Other); + Other.add(T1); + EXPECT_EQ(Rgn, Other); +#endif +} From 51df8a33275408680f869bb206413373b5ca13e1 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Wed, 18 Sep 2024 18:36:55 +0000 Subject: [PATCH 103/321] [gn build] Port 71e434d30280 --- llvm/utils/gn/secondary/llvm/lib/Transforms/Vectorize/BUILD.gn | 1 + .../unittests/Transforms/Vectorize/SandboxVectorizer/BUILD.gn | 1 + 2 files changed, 2 insertions(+) diff --git a/llvm/utils/gn/secondary/llvm/lib/Transforms/Vectorize/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Transforms/Vectorize/BUILD.gn index 0900872d444939..853cf341e28447 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Transforms/Vectorize/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Transforms/Vectorize/BUILD.gn @@ -15,6 +15,7 @@ static_library("Vectorize") { "SLPVectorizer.cpp", "SandboxVectorizer/DependencyGraph.cpp", "SandboxVectorizer/Passes/BottomUpVec.cpp", + "SandboxVectorizer/Region.cpp", "SandboxVectorizer/SandboxVectorizer.cpp", "VPlan.cpp", "VPlanAnalysis.cpp", diff --git a/llvm/utils/gn/secondary/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/BUILD.gn index c990fb313f557a..a91cb838c5e2c5 100644 --- a/llvm/utils/gn/secondary/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/BUILD.gn @@ -12,5 +12,6 @@ unittest("SandboxVectorizerTests") { sources = [ "DependencyGraphTest.cpp", "LegalityTest.cpp", + "RegionTest.cpp", ] } From 1be4c9710bd09e2f56908ca6cee54cb80ca1774d Mon Sep 17 00:00:00 2001 From: Nicolas van Kempen Date: Wed, 18 Sep 2024 14:57:31 -0400 Subject: [PATCH 104/321] [clang-tidy][readability-container-contains] Extend to any class with contains (#107521) This check will now work out of the box with other containers that have a `contains` method, such as `folly::F14` or Abseil containers. It will also work with string types, which expose the same container-style membership interface. `std::string` and `std::string_view` will have a `contains` method starting with C++23. `llvm::StringRef` and `folly::StringPiece` are examples of existing implementations with a `contains` method.
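For illustration, a minimal sketch of the kind of user-defined container the generalized check can now handle. `MiniSet` and `hasAnswer` are hypothetical names invented for this example, not code referenced by the patch; the only requirement exercised is a public, const, single-parameter `contains` returning bool whose parameter type matches that of `count`/`find`:

    #include <set>

    // Hypothetical container: any class shaped like this is now in scope
    // for the check, not just the std:: associative containers.
    template <class T>
    struct MiniSet {
      std::set<T> Data;
      unsigned count(const T &V) const { return Data.count(V); }
      bool contains(const T &V) const { return Data.count(V) > 0; }
    };

    // Before the fix-it: return S.count(42) > 0;
    // After the fix-it:  return S.contains(42);
    bool hasAnswer(const MiniSet<int> &S) { return S.count(42) > 0; }

Running the check over `hasAnswer` should now produce the `contains` fix-it even though `MiniSet` is not a standard container.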
--- .../readability/ContainerContainsCheck.cpp | 42 ++-- .../readability/ContainerContainsCheck.h | 12 +- clang-tools-extra/docs/ReleaseNotes.rst | 4 + .../checks/readability/container-contains.rst | 38 ++-- .../readability/container-contains.cpp | 187 +++++++++++++++++- 5 files changed, 237 insertions(+), 46 deletions(-) diff --git a/clang-tools-extra/clang-tidy/readability/ContainerContainsCheck.cpp b/clang-tools-extra/clang-tidy/readability/ContainerContainsCheck.cpp index dbb50a060e5960..698231d777d2d4 100644 --- a/clang-tools-extra/clang-tidy/readability/ContainerContainsCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/ContainerContainsCheck.cpp @@ -13,30 +13,40 @@ using namespace clang::ast_matchers; namespace clang::tidy::readability { - void ContainerContainsCheck::registerMatchers(MatchFinder *Finder) { - const auto SupportedContainers = hasType( - hasUnqualifiedDesugaredType(recordType(hasDeclaration(cxxRecordDecl( - hasAnyName("::std::set", "::std::unordered_set", "::std::map", - "::std::unordered_map", "::std::multiset", - "::std::unordered_multiset", "::std::multimap", - "::std::unordered_multimap")))))); + const auto HasContainsMatchingParamType = hasMethod( + cxxMethodDecl(isConst(), parameterCountIs(1), returns(booleanType()), + hasName("contains"), unless(isDeleted()), isPublic(), + hasParameter(0, hasType(hasUnqualifiedDesugaredType( + equalsBoundNode("parameterType")))))); const auto CountCall = - cxxMemberCallExpr(on(SupportedContainers), - callee(cxxMethodDecl(hasName("count"))), - argumentCountIs(1)) + cxxMemberCallExpr( + argumentCountIs(1), + callee(cxxMethodDecl( + hasName("count"), + hasParameter(0, hasType(hasUnqualifiedDesugaredType( + type().bind("parameterType")))), + ofClass(cxxRecordDecl(HasContainsMatchingParamType))))) .bind("call"); const auto FindCall = - cxxMemberCallExpr(on(SupportedContainers), - callee(cxxMethodDecl(hasName("find"))), - argumentCountIs(1)) + cxxMemberCallExpr( + argumentCountIs(1), + callee(cxxMethodDecl( + hasName("find"), + hasParameter(0, hasType(hasUnqualifiedDesugaredType( + type().bind("parameterType")))), + ofClass(cxxRecordDecl(HasContainsMatchingParamType))))) .bind("call"); - const auto EndCall = cxxMemberCallExpr(on(SupportedContainers), - callee(cxxMethodDecl(hasName("end"))), - argumentCountIs(0)); + const auto EndCall = cxxMemberCallExpr( + argumentCountIs(0), + callee( + cxxMethodDecl(hasName("end"), + // In the matchers below, FindCall should always appear + // before EndCall so 'parameterType' is properly bound. + ofClass(cxxRecordDecl(HasContainsMatchingParamType))))); const auto Literal0 = integerLiteral(equals(0)); const auto Literal1 = integerLiteral(equals(1)); diff --git a/clang-tools-extra/clang-tidy/readability/ContainerContainsCheck.h b/clang-tools-extra/clang-tidy/readability/ContainerContainsCheck.h index 2e8276d684cd79..753603ed825372 100644 --- a/clang-tools-extra/clang-tidy/readability/ContainerContainsCheck.h +++ b/clang-tools-extra/clang-tidy/readability/ContainerContainsCheck.h @@ -13,8 +13,9 @@ namespace clang::tidy::readability { -/// Finds usages of `container.count()` and `find() == end()` which should be -/// replaced by a call to the `container.contains()` method introduced in C++20. +/// Finds usages of `container.count()` and +/// `container.find() == container.end()` which should be replaced by a call +/// to the `container.contains()` method. 
/// /// For the user-facing documentation see: /// http://clang.llvm.org/extra/clang-tidy/checks/readability/container-contains.html @@ -24,10 +25,11 @@ class ContainerContainsCheck : public ClangTidyCheck { : ClangTidyCheck(Name, Context) {} void registerMatchers(ast_matchers::MatchFinder *Finder) final; void check(const ast_matchers::MatchFinder::MatchResult &Result) final; - -protected: bool isLanguageVersionSupported(const LangOptions &LO) const final { - return LO.CPlusPlus20; + return LO.CPlusPlus; + } + std::optional getCheckTraversalKind() const override { + return TK_AsIs; } }; diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index d284bb62f7c7f4..82a761bd7f40ab 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -165,6 +165,10 @@ Changes in existing checks ` check to use ``std::endl`` as placeholder when lexer cannot get source text. +- Improved :doc:`readability-container-contains + ` check to let it work on + any class that has a ``contains`` method. + - Improved :doc:`readability-implicit-bool-conversion ` check by adding the option `UseUpperCaseLiteralSuffix` to select the diff --git a/clang-tools-extra/docs/clang-tidy/checks/readability/container-contains.rst b/clang-tools-extra/docs/clang-tidy/checks/readability/container-contains.rst index b28daecf7a2cf3..1cfbf4c511c588 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/readability/container-contains.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/readability/container-contains.rst @@ -3,23 +3,31 @@ readability-container-contains ============================== -Finds usages of ``container.count()`` and ``container.find() == container.end()`` which should be replaced by a call to the ``container.contains()`` method introduced in C++20. +Finds usages of ``container.count()`` and +``container.find() == container.end()`` which should be replaced by a call to +the ``container.contains()`` method. -Whether an element is contained inside a container should be checked with ``contains`` instead of ``count``/``find`` because ``contains`` conveys the intent more clearly. Furthermore, for containers which permit multiple entries per key (``multimap``, ``multiset``, ...), ``contains`` is more efficient than ``count`` because ``count`` has to do unnecessary additional work. +Whether an element is contained inside a container should be checked with +``contains`` instead of ``count``/``find`` because ``contains`` conveys the +intent more clearly. Furthermore, for containers which permit multiple entries +per key (``multimap``, ``multiset``, ...), ``contains`` is more efficient than +``count`` because ``count`` has to do unnecessary additional work. 
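+For a quick intuition, consider the following sketch (``MyMultiset`` is just
+an illustrative object for this documentation, not a requirement of the
+check)::
+
+  std::multiset<int> MyMultiset = {42, 42, 42};
+  // count() tallies every duplicate entry before the result is compared:
+  bool Found = MyMultiset.count(42) > 0;
+  // contains() can stop at the first match and states the intent directly:
+  bool FoundClearer = MyMultiset.contains(42);
+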
Examples: -=========================================== ============================== -Initial expression Result -------------------------------------------- ------------------------------ -``myMap.find(x) == myMap.end()`` ``!myMap.contains(x)`` -``myMap.find(x) != myMap.end()`` ``myMap.contains(x)`` -``if (myMap.count(x))`` ``if (myMap.contains(x))`` -``bool exists = myMap.count(x)`` ``bool exists = myMap.contains(x)`` -``bool exists = myMap.count(x) > 0`` ``bool exists = myMap.contains(x)`` -``bool exists = myMap.count(x) >= 1`` ``bool exists = myMap.contains(x)`` -``bool missing = myMap.count(x) == 0`` ``bool missing = !myMap.contains(x)`` -=========================================== ============================== +====================================== ===================================== +Initial expression Result +-------------------------------------- ------------------------------------- +``myMap.find(x) == myMap.end()`` ``!myMap.contains(x)`` +``myMap.find(x) != myMap.end()`` ``myMap.contains(x)`` +``if (myMap.count(x))`` ``if (myMap.contains(x))`` +``bool exists = myMap.count(x)`` ``bool exists = myMap.contains(x)`` +``bool exists = myMap.count(x) > 0`` ``bool exists = myMap.contains(x)`` +``bool exists = myMap.count(x) >= 1`` ``bool exists = myMap.contains(x)`` +``bool missing = myMap.count(x) == 0`` ``bool missing = !myMap.contains(x)`` +====================================== ===================================== -This check applies to ``std::set``, ``std::unordered_set``, ``std::map``, ``std::unordered_map`` and the corresponding multi-key variants. -It is only active for C++20 and later, as the ``contains`` method was only added in C++20. +This check will apply to any class that has a ``contains`` method, notably +including ``std::set``, ``std::unordered_set``, ``std::map``, and +``std::unordered_map`` as of C++20, and ``std::string`` and ``std::string_view`` +as of C++23. diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/container-contains.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/container-contains.cpp index 0ecb61b2e7df06..906515b075d4de 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/readability/container-contains.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/readability/container-contains.cpp @@ -240,7 +240,7 @@ int testMacroExpansion(std::unordered_set &MySet) { return 0; } -// The following map has the same interface like `std::map`. +// The following map has the same interface as `std::map`. template struct CustomMap { unsigned count(const Key &K) const; @@ -249,13 +249,180 @@ struct CustomMap { void *end(); }; -// The clang-tidy check is currently hard-coded against the `std::` containers -// and hence won't annotate the following instance. We might change this in the -// future and also detect the following case. -void *testDifferentCheckTypes(CustomMap &MyMap) { - if (MyMap.count(0)) - // NO-WARNING. 
- // CHECK-FIXES: if (MyMap.count(0)) - return nullptr; - return MyMap.find(2); +void testDifferentCheckTypes(CustomMap &MyMap) { + if (MyMap.count(0)) {}; + // CHECK-MESSAGES: :[[@LINE-1]]:{{[0-9]+}}: warning: use 'contains' to check for membership [readability-container-contains] + // CHECK-FIXES: if (MyMap.contains(0)) {}; + + MyMap.find(0) != MyMap.end(); + // CHECK-MESSAGES: :[[@LINE-1]]:{{[0-9]+}}: warning: use 'contains' to check for membership [readability-container-contains] + // CHECK-FIXES: MyMap.contains(0); +} + +struct MySubmap : public CustomMap {}; + +void testSubclass(MySubmap& MyMap) { + if (MyMap.count(0)) {}; + // CHECK-MESSAGES: :[[@LINE-1]]:{{[0-9]+}}: warning: use 'contains' to check for membership [readability-container-contains] + // CHECK-FIXES: if (MyMap.contains(0)) {}; +} + +using UsingMap = CustomMap; +struct MySubmap2 : public UsingMap {}; +using UsingMap2 = MySubmap2; + +void testUsing(UsingMap2& MyMap) { + if (MyMap.count(0)) {}; + // CHECK-MESSAGES: :[[@LINE-1]]:{{[0-9]+}}: warning: use 'contains' to check for membership [readability-container-contains] + // CHECK-FIXES: if (MyMap.contains(0)) {}; +} + +template +struct CustomMapContainsDeleted { + unsigned count(const Key &K) const; + bool contains(const Key &K) const = delete; + void *find(const Key &K); + void *end(); +}; + +struct SubmapContainsDeleted : public CustomMapContainsDeleted {}; + +void testContainsDeleted(CustomMapContainsDeleted &MyMap, + SubmapContainsDeleted &MyMap2) { + // No warning if the `contains` method is deleted. + if (MyMap.count(0)) {}; + if (MyMap2.count(0)) {}; +} + +template +struct CustomMapPrivateContains { + unsigned count(const Key &K) const; + void *find(const Key &K); + void *end(); + +private: + bool contains(const Key &K) const; +}; + +struct SubmapPrivateContains : public CustomMapPrivateContains {}; + +void testPrivateContains(CustomMapPrivateContains &MyMap, + SubmapPrivateContains &MyMap2) { + // No warning if the `contains` method is not public. + if (MyMap.count(0)) {}; + if (MyMap2.count(0)) {}; +} + +struct MyString {}; + +struct WeirdNonMatchingContains { + unsigned count(char) const; + bool contains(const MyString&) const; +}; + +void testWeirdNonMatchingContains(WeirdNonMatchingContains &MyMap) { + // No warning if there is no `contains` method with the right type. 
+ if (MyMap.count('a')) {}; +} + +template +struct SmallPtrSet { + using ConstPtrType = const T*; + unsigned count(ConstPtrType Ptr) const; + bool contains(ConstPtrType Ptr) const; +}; + +template +struct SmallPtrPtrSet { + using ConstPtrType = const T**; + unsigned count(ConstPtrType Ptr) const; + bool contains(ConstPtrType Ptr) const; +}; + +template +struct SmallPtrPtrPtrSet { + using ConstPtrType = const T***; + unsigned count(ConstPtrType Ptr) const; + bool contains(ConstPtrType Ptr) const; +}; + +void testSmallPtrSet(const int ***Ptr, + SmallPtrSet &MySet, + SmallPtrPtrSet &MySet2, + SmallPtrPtrPtrSet &MySet3) { + if (MySet.count(**Ptr)) {}; + // CHECK-MESSAGES: :[[@LINE-1]]:{{[0-9]+}}: warning: use 'contains' to check for membership [readability-container-contains] + // CHECK-FIXES: if (MySet.contains(**Ptr)) {}; + if (MySet2.count(*Ptr)) {}; + // CHECK-MESSAGES: :[[@LINE-1]]:{{[0-9]+}}: warning: use 'contains' to check for membership [readability-container-contains] + // CHECK-FIXES: if (MySet2.contains(*Ptr)) {}; + if (MySet3.count(Ptr)) {}; + // CHECK-MESSAGES: :[[@LINE-1]]:{{[0-9]+}}: warning: use 'contains' to check for membership [readability-container-contains] + // CHECK-FIXES: if (MySet3.contains(Ptr)) {}; +} + +struct X {}; +struct Y : public X {}; + +void testSubclassEntry(SmallPtrSet& Set, Y* Entry) { + if (Set.count(Entry)) {} + // CHECK-MESSAGES: :[[@LINE-1]]:{{[0-9]+}}: warning: use 'contains' to check for membership [readability-container-contains] + // CHECK-FIXES: if (Set.contains(Entry)) {} +} + +struct WeirdPointerApi { + unsigned count(int** Ptr) const; + bool contains(int* Ptr) const; +}; + +void testWeirdApi(WeirdPointerApi& Set, int* E) { + if (Set.count(&E)) {} +} + +void testIntUnsigned(std::set& S, unsigned U) { + if (S.count(U)) {} + // CHECK-MESSAGES: :[[@LINE-1]]:{{[0-9]+}}: warning: use 'contains' to check for membership [readability-container-contains] + // CHECK-FIXES: if (S.contains(U)) {} +} + +template +struct CustomSetConvertible { + unsigned count(const T &K) const; + bool contains(const T &K) const; +}; + +struct A {}; +struct B { B() = default; B(const A&) {} }; +struct C { operator A() const; }; + +void testConvertibleTypes() { + CustomSetConvertible MyMap; + if (MyMap.count(A())) {}; + // CHECK-MESSAGES: :[[@LINE-1]]:{{[0-9]+}}: warning: use 'contains' to check for membership [readability-container-contains] + // CHECK-FIXES: if (MyMap.contains(A())) {}; + + CustomSetConvertible MyMap2; + if (MyMap2.count(C())) {}; + // CHECK-MESSAGES: :[[@LINE-1]]:{{[0-9]+}}: warning: use 'contains' to check for membership [readability-container-contains] + // CHECK-FIXES: if (MyMap2.contains(C())) {}; + + if (MyMap2.count(C()) != 0) {}; + // CHECK-MESSAGES: :[[@LINE-1]]:{{[0-9]+}}: warning: use 'contains' to check for membership [readability-container-contains] + // CHECK-FIXES: if (MyMap2.contains(C())) {}; +} + +template +using Box = const U& ; + +template +struct CustomBoxedSet { + unsigned count(Box K) const; + bool contains(Box K) const; +}; + +void testBox() { + CustomBoxedSet Set; + if (Set.count(0)) {}; + // CHECK-MESSAGES: :[[@LINE-1]]:{{[0-9]+}}: warning: use 'contains' to check for membership [readability-container-contains] + // CHECK-FIXES: if (Set.contains(0)) {}; } From d5d1417659267f7247668f46cd51bd748b368fa3 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 18 Sep 2024 12:07:44 -0700 Subject: [PATCH 105/321] [RISCV][GISel] Use libcalls for rint, nearbyint, trunc, round, and roundeven intrinsics. 
(#108779) --- .../CodeGen/GlobalISel/LegalizerHelper.cpp | 6 + .../Target/RISCV/GISel/RISCVLegalizerInfo.cpp | 4 +- .../legalizer/legalize-fp-ceil-floor.mir | 98 ------ .../legalizer/legalize-fp-libcall.mir | 328 ++++++++++++++++++ 4 files changed, 337 insertions(+), 99 deletions(-) delete mode 100644 llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-fp-ceil-floor.mir create mode 100644 llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-fp-libcall.mir diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index 01e47bd2fb40fb..e64d3f51a01111 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -485,6 +485,10 @@ static RTLIB::Libcall getRTLibDesc(unsigned Opcode, unsigned Size) { RTLIBCASE(RINT_F); case TargetOpcode::G_FNEARBYINT: RTLIBCASE(NEARBYINT_F); + case TargetOpcode::G_INTRINSIC_TRUNC: + RTLIBCASE(TRUNC_F); + case TargetOpcode::G_INTRINSIC_ROUND: + RTLIBCASE(ROUND_F); case TargetOpcode::G_INTRINSIC_ROUNDEVEN: RTLIBCASE(ROUNDEVEN_F); case TargetOpcode::G_INTRINSIC_LRINT: @@ -1215,6 +1219,8 @@ LegalizerHelper::libcall(MachineInstr &MI, LostDebugLocObserver &LocObserver) { case TargetOpcode::G_FSQRT: case TargetOpcode::G_FRINT: case TargetOpcode::G_FNEARBYINT: + case TargetOpcode::G_INTRINSIC_TRUNC: + case TargetOpcode::G_INTRINSIC_ROUND: case TargetOpcode::G_INTRINSIC_ROUNDEVEN: { LLT LLTy = MRI.getType(MI.getOperand(0).getReg()); unsigned Size = LLTy.getSizeInBits(); diff --git a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp index c204683f4e79f8..192ba375d5a5d9 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp @@ -541,7 +541,9 @@ RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST) // FIXME: We can do custom inline expansion like SelectionDAG. // FIXME: Legal with Zfa. 
- getActionDefinitionsBuilder({G_FCEIL, G_FFLOOR}) + getActionDefinitionsBuilder({G_FCEIL, G_FFLOOR, G_FRINT, G_FNEARBYINT, + G_INTRINSIC_TRUNC, G_INTRINSIC_ROUND, + G_INTRINSIC_ROUNDEVEN}) .libcallFor({s32, s64}); getActionDefinitionsBuilder(G_VASTART).customFor({p0}); diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-fp-ceil-floor.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-fp-ceil-floor.mir deleted file mode 100644 index 1e184bd0c1120f..00000000000000 --- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-fp-ceil-floor.mir +++ /dev/null @@ -1,98 +0,0 @@ -# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=riscv32 -mattr=+d -run-pass=legalizer %s -o - \ -# RUN: | FileCheck %s -# RUN: llc -mtriple=riscv64 -mattr=+d -run-pass=legalizer %s -o - \ -# RUN: | FileCheck %s - ---- -name: ceil_f32 -body: | - bb.1: - liveins: $f10_f - - ; CHECK-LABEL: name: ceil_f32 - ; CHECK: liveins: $f10_f - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $f10_f - ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2 - ; CHECK-NEXT: $f10_f = COPY [[COPY]](s32) - ; CHECK-NEXT: PseudoCALL target-flags(riscv-call) &ceilf, csr_ilp32d_lp64d, implicit-def $x1, implicit $f10_f, implicit-def $f10_f - ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $f10_f - ; CHECK-NEXT: $f10_f = COPY [[COPY1]](s32) - ; CHECK-NEXT: PseudoRET implicit $f10_f - %0:_(s32) = COPY $f10_f - %1:_(s32) = G_FCEIL %0 - $f10_f = COPY %1(s32) - PseudoRET implicit $f10_f - -... ---- -name: floor_f32 -body: | - bb.1: - liveins: $f10_f - - ; CHECK-LABEL: name: floor_f32 - ; CHECK: liveins: $f10_f - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $f10_f - ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2 - ; CHECK-NEXT: $f10_f = COPY [[COPY]](s32) - ; CHECK-NEXT: PseudoCALL target-flags(riscv-call) &floorf, csr_ilp32d_lp64d, implicit-def $x1, implicit $f10_f, implicit-def $f10_f - ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $f10_f - ; CHECK-NEXT: $f10_f = COPY [[COPY1]](s32) - ; CHECK-NEXT: PseudoRET implicit $f10_f - %0:_(s32) = COPY $f10_f - %1:_(s32) = G_FFLOOR %0 - $f10_f = COPY %1(s32) - PseudoRET implicit $f10_f - -... ---- -name: ceil_f64 -body: | - bb.1: - liveins: $f10_d - - ; CHECK-LABEL: name: ceil_f64 - ; CHECK: liveins: $f10_d - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $f10_d - ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2 - ; CHECK-NEXT: $f10_d = COPY [[COPY]](s64) - ; CHECK-NEXT: PseudoCALL target-flags(riscv-call) &ceil, csr_ilp32d_lp64d, implicit-def $x1, implicit $f10_d, implicit-def $f10_d - ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $f10_d - ; CHECK-NEXT: $f10_d = COPY [[COPY1]](s64) - ; CHECK-NEXT: PseudoRET implicit $f10_d - %0:_(s64) = COPY $f10_d - %1:_(s64) = G_FCEIL %0 - $f10_d = COPY %1(s64) - PseudoRET implicit $f10_d - -... 
---- -name: floor_f64 -body: | - bb.1: - liveins: $f10_d - - ; CHECK-LABEL: name: floor_f64 - ; CHECK: liveins: $f10_d - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $f10_d - ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2 - ; CHECK-NEXT: $f10_d = COPY [[COPY]](s64) - ; CHECK-NEXT: PseudoCALL target-flags(riscv-call) &floor, csr_ilp32d_lp64d, implicit-def $x1, implicit $f10_d, implicit-def $f10_d - ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $f10_d - ; CHECK-NEXT: $f10_d = COPY [[COPY1]](s64) - ; CHECK-NEXT: PseudoRET implicit $f10_d - %0:_(s64) = COPY $f10_d - %1:_(s64) = G_FFLOOR %0 - $f10_d = COPY %1(s64) - PseudoRET implicit $f10_d - -... diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-fp-libcall.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-fp-libcall.mir new file mode 100644 index 00000000000000..3b4f6a065d9736 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-fp-libcall.mir @@ -0,0 +1,328 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=riscv32 -mattr=+d -run-pass=legalizer %s -o - \ +# RUN: | FileCheck %s +# RUN: llc -mtriple=riscv64 -mattr=+d -run-pass=legalizer %s -o - \ +# RUN: | FileCheck %s + +--- +name: ceil_f32 +body: | + bb.1: + liveins: $f10_f + + ; CHECK-LABEL: name: ceil_f32 + ; CHECK: liveins: $f10_f + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $f10_f + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2 + ; CHECK-NEXT: $f10_f = COPY [[COPY]](s32) + ; CHECK-NEXT: PseudoCALL target-flags(riscv-call) &ceilf, csr_ilp32d_lp64d, implicit-def $x1, implicit $f10_f, implicit-def $f10_f + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $f10_f + ; CHECK-NEXT: $f10_f = COPY [[COPY1]](s32) + ; CHECK-NEXT: PseudoRET implicit $f10_f + %0:_(s32) = COPY $f10_f + %1:_(s32) = G_FCEIL %0 + $f10_f = COPY %1(s32) + PseudoRET implicit $f10_f + +... +--- +name: floor_f32 +body: | + bb.1: + liveins: $f10_f + + ; CHECK-LABEL: name: floor_f32 + ; CHECK: liveins: $f10_f + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $f10_f + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2 + ; CHECK-NEXT: $f10_f = COPY [[COPY]](s32) + ; CHECK-NEXT: PseudoCALL target-flags(riscv-call) &floorf, csr_ilp32d_lp64d, implicit-def $x1, implicit $f10_f, implicit-def $f10_f + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $f10_f + ; CHECK-NEXT: $f10_f = COPY [[COPY1]](s32) + ; CHECK-NEXT: PseudoRET implicit $f10_f + %0:_(s32) = COPY $f10_f + %1:_(s32) = G_FFLOOR %0 + $f10_f = COPY %1(s32) + PseudoRET implicit $f10_f + +... 
+--- +name: trunc_f32 +body: | + bb.1: + liveins: $f10_f + + ; CHECK-LABEL: name: trunc_f32 + ; CHECK: liveins: $f10_f + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $f10_f + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2 + ; CHECK-NEXT: $f10_f = COPY [[COPY]](s32) + ; CHECK-NEXT: PseudoCALL target-flags(riscv-call) &truncf, csr_ilp32d_lp64d, implicit-def $x1, implicit $f10_f, implicit-def $f10_f + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $f10_f + ; CHECK-NEXT: $f10_f = COPY [[COPY1]](s32) + ; CHECK-NEXT: PseudoRET implicit $f10_f + %0:_(s32) = COPY $f10_f + %1:_(s32) = G_INTRINSIC_TRUNC %0 + $f10_f = COPY %1(s32) + PseudoRET implicit $f10_f + +... +--- +name: rint_f32 +body: | + bb.1: + liveins: $f10_f + + ; CHECK-LABEL: name: rint_f32 + ; CHECK: liveins: $f10_f + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $f10_f + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2 + ; CHECK-NEXT: $f10_f = COPY [[COPY]](s32) + ; CHECK-NEXT: PseudoCALL target-flags(riscv-call) &rintf, csr_ilp32d_lp64d, implicit-def $x1, implicit $f10_f, implicit-def $f10_f + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $f10_f + ; CHECK-NEXT: $f10_f = COPY [[COPY1]](s32) + ; CHECK-NEXT: PseudoRET implicit $f10_f + %0:_(s32) = COPY $f10_f + %1:_(s32) = G_FRINT %0 + $f10_f = COPY %1(s32) + PseudoRET implicit $f10_f + +... +--- +name: nearbyint_f32 +body: | + bb.1: + liveins: $f10_f + + ; CHECK-LABEL: name: nearbyint_f32 + ; CHECK: liveins: $f10_f + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $f10_f + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2 + ; CHECK-NEXT: $f10_f = COPY [[COPY]](s32) + ; CHECK-NEXT: PseudoCALL target-flags(riscv-call) &nearbyintf, csr_ilp32d_lp64d, implicit-def $x1, implicit $f10_f, implicit-def $f10_f + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $f10_f + ; CHECK-NEXT: $f10_f = COPY [[COPY1]](s32) + ; CHECK-NEXT: PseudoRET implicit $f10_f + %0:_(s32) = COPY $f10_f + %1:_(s32) = G_FNEARBYINT %0 + $f10_f = COPY %1(s32) + PseudoRET implicit $f10_f + +... +--- +name: round_f32 +body: | + bb.1: + liveins: $f10_f + + ; CHECK-LABEL: name: round_f32 + ; CHECK: liveins: $f10_f + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $f10_f + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2 + ; CHECK-NEXT: $f10_f = COPY [[COPY]](s32) + ; CHECK-NEXT: PseudoCALL target-flags(riscv-call) &roundf, csr_ilp32d_lp64d, implicit-def $x1, implicit $f10_f, implicit-def $f10_f + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $f10_f + ; CHECK-NEXT: $f10_f = COPY [[COPY1]](s32) + ; CHECK-NEXT: PseudoRET implicit $f10_f + %0:_(s32) = COPY $f10_f + %1:_(s32) = G_INTRINSIC_ROUND %0 + $f10_f = COPY %1(s32) + PseudoRET implicit $f10_f + +... 
+--- +name: roundeven_f32 +body: | + bb.1: + liveins: $f10_f + + ; CHECK-LABEL: name: roundeven_f32 + ; CHECK: liveins: $f10_f + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $f10_f + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2 + ; CHECK-NEXT: $f10_f = COPY [[COPY]](s32) + ; CHECK-NEXT: PseudoCALL target-flags(riscv-call) &roundevenf, csr_ilp32d_lp64d, implicit-def $x1, implicit $f10_f, implicit-def $f10_f + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $f10_f + ; CHECK-NEXT: $f10_f = COPY [[COPY1]](s32) + ; CHECK-NEXT: PseudoRET implicit $f10_f + %0:_(s32) = COPY $f10_f + %1:_(s32) = G_INTRINSIC_ROUNDEVEN %0 + $f10_f = COPY %1(s32) + PseudoRET implicit $f10_f + +... +--- +name: ceil_f64 +body: | + bb.1: + liveins: $f10_d + + ; CHECK-LABEL: name: ceil_f64 + ; CHECK: liveins: $f10_d + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $f10_d + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2 + ; CHECK-NEXT: $f10_d = COPY [[COPY]](s64) + ; CHECK-NEXT: PseudoCALL target-flags(riscv-call) &ceil, csr_ilp32d_lp64d, implicit-def $x1, implicit $f10_d, implicit-def $f10_d + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $f10_d + ; CHECK-NEXT: $f10_d = COPY [[COPY1]](s64) + ; CHECK-NEXT: PseudoRET implicit $f10_d + %0:_(s64) = COPY $f10_d + %1:_(s64) = G_FCEIL %0 + $f10_d = COPY %1(s64) + PseudoRET implicit $f10_d + +... +--- +name: floor_f64 +body: | + bb.1: + liveins: $f10_d + + ; CHECK-LABEL: name: floor_f64 + ; CHECK: liveins: $f10_d + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $f10_d + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2 + ; CHECK-NEXT: $f10_d = COPY [[COPY]](s64) + ; CHECK-NEXT: PseudoCALL target-flags(riscv-call) &floor, csr_ilp32d_lp64d, implicit-def $x1, implicit $f10_d, implicit-def $f10_d + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $f10_d + ; CHECK-NEXT: $f10_d = COPY [[COPY1]](s64) + ; CHECK-NEXT: PseudoRET implicit $f10_d + %0:_(s64) = COPY $f10_d + %1:_(s64) = G_FFLOOR %0 + $f10_d = COPY %1(s64) + PseudoRET implicit $f10_d + +... +--- +name: trunc_f64 +body: | + bb.1: + liveins: $f10_d + + ; CHECK-LABEL: name: trunc_f64 + ; CHECK: liveins: $f10_d + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $f10_d + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2 + ; CHECK-NEXT: $f10_d = COPY [[COPY]](s64) + ; CHECK-NEXT: PseudoCALL target-flags(riscv-call) &trunc, csr_ilp32d_lp64d, implicit-def $x1, implicit $f10_d, implicit-def $f10_d + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $f10_d + ; CHECK-NEXT: $f10_d = COPY [[COPY1]](s64) + ; CHECK-NEXT: PseudoRET implicit $f10_d + %0:_(s64) = COPY $f10_d + %1:_(s64) = G_INTRINSIC_TRUNC %0 + $f10_d = COPY %1(s64) + PseudoRET implicit $f10_d + +... 
+--- +name: rint_f64 +body: | + bb.1: + liveins: $f10_d + + ; CHECK-LABEL: name: rint_f64 + ; CHECK: liveins: $f10_d + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $f10_d + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2 + ; CHECK-NEXT: $f10_d = COPY [[COPY]](s64) + ; CHECK-NEXT: PseudoCALL target-flags(riscv-call) &rint, csr_ilp32d_lp64d, implicit-def $x1, implicit $f10_d, implicit-def $f10_d + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $f10_d + ; CHECK-NEXT: $f10_d = COPY [[COPY1]](s64) + ; CHECK-NEXT: PseudoRET implicit $f10_d + %0:_(s64) = COPY $f10_d + %1:_(s64) = G_FRINT %0 + $f10_d = COPY %1(s64) + PseudoRET implicit $f10_d + +... +--- +name: nearbyint_f64 +body: | + bb.1: + liveins: $f10_d + + ; CHECK-LABEL: name: nearbyint_f64 + ; CHECK: liveins: $f10_d + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $f10_d + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2 + ; CHECK-NEXT: $f10_d = COPY [[COPY]](s64) + ; CHECK-NEXT: PseudoCALL target-flags(riscv-call) &nearbyint, csr_ilp32d_lp64d, implicit-def $x1, implicit $f10_d, implicit-def $f10_d + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $f10_d + ; CHECK-NEXT: $f10_d = COPY [[COPY1]](s64) + ; CHECK-NEXT: PseudoRET implicit $f10_d + %0:_(s64) = COPY $f10_d + %1:_(s64) = G_FNEARBYINT %0 + $f10_d = COPY %1(s64) + PseudoRET implicit $f10_d + +... +--- +name: round_f64 +body: | + bb.1: + liveins: $f10_d + + ; CHECK-LABEL: name: round_f64 + ; CHECK: liveins: $f10_d + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $f10_d + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2 + ; CHECK-NEXT: $f10_d = COPY [[COPY]](s64) + ; CHECK-NEXT: PseudoCALL target-flags(riscv-call) &round, csr_ilp32d_lp64d, implicit-def $x1, implicit $f10_d, implicit-def $f10_d + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $f10_d + ; CHECK-NEXT: $f10_d = COPY [[COPY1]](s64) + ; CHECK-NEXT: PseudoRET implicit $f10_d + %0:_(s64) = COPY $f10_d + %1:_(s64) = G_INTRINSIC_ROUND %0 + $f10_d = COPY %1(s64) + PseudoRET implicit $f10_d + +... +--- +name: roundeven_f64 +body: | + bb.1: + liveins: $f10_d + + ; CHECK-LABEL: name: roundeven_f64 + ; CHECK: liveins: $f10_d + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $f10_d + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2 + ; CHECK-NEXT: $f10_d = COPY [[COPY]](s64) + ; CHECK-NEXT: PseudoCALL target-flags(riscv-call) &roundeven, csr_ilp32d_lp64d, implicit-def $x1, implicit $f10_d, implicit-def $f10_d + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $f10_d + ; CHECK-NEXT: $f10_d = COPY [[COPY1]](s64) + ; CHECK-NEXT: PseudoRET implicit $f10_d + %0:_(s64) = COPY $f10_d + %1:_(s64) = G_INTRINSIC_ROUNDEVEN %0 + $f10_d = COPY %1(s64) + PseudoRET implicit $f10_d + +... From b0bdc7fcc995fe6cf6ca9a184fc6ed211fc7b608 Mon Sep 17 00:00:00 2001 From: Peter Klausler Date: Wed, 18 Sep 2024 12:18:50 -0700 Subject: [PATCH 106/321] [flang] Fix subtle type naming bug in module file output (#108892) A derived type specification in semantics holds both its source name (for location purposes) and its ultimate derived type symbol. 
But for correct module file generation of a structure constructor using that derived type spec, the original symbol may be needed so that USE association can be exposed. Save both the original symbol and its ultimate symbol in the DerivedTypeSpec, and collect the right one when traversing expressions (specifically for handling initialization in module files). Fixes https://github.com/llvm/llvm-project/issues/108827. --- flang/include/flang/Evaluate/traverse.h | 2 +- flang/include/flang/Semantics/type.h | 9 +++-- flang/lib/Evaluate/characteristics.cpp | 5 +-- flang/lib/Semantics/check-declarations.cpp | 2 +- flang/lib/Semantics/resolve-names.cpp | 39 +++++++++++--------- flang/lib/Semantics/type.cpp | 11 +++--- flang/test/Semantics/get_team.f90 | 2 +- flang/test/Semantics/modfile68.f90 | 42 ++++++++++++++++++++++ flang/test/Semantics/modproc01.f90 | 8 +++-- 9 files changed, 88 insertions(+), 32 deletions(-) create mode 100644 flang/test/Semantics/modfile68.f90 diff --git a/flang/include/flang/Evaluate/traverse.h b/flang/include/flang/Evaluate/traverse.h index 7f4a67d97e64e7..90b93f6afd3515 100644 --- a/flang/include/flang/Evaluate/traverse.h +++ b/flang/include/flang/Evaluate/traverse.h @@ -217,7 +217,7 @@ class Traverse { return CombineContents(x); } Result operator()(const semantics::DerivedTypeSpec &x) const { - return Combine(x.typeSymbol(), x.parameters()); + return Combine(x.originalTypeSymbol(), x.parameters()); } Result operator()(const StructureConstructorValues::value_type &x) const { return visitor_(x.second); diff --git a/flang/include/flang/Semantics/type.h b/flang/include/flang/Semantics/type.h index e2d47d38f927f7..e2131e7e160cb6 100644 --- a/flang/include/flang/Semantics/type.h +++ b/flang/include/flang/Semantics/type.h @@ -259,6 +259,7 @@ class DerivedTypeSpec { DerivedTypeSpec(DerivedTypeSpec &&); const SourceName &name() const { return name_; } + const Symbol &originalTypeSymbol() const { return originalTypeSymbol_; } const Symbol &typeSymbol() const { return typeSymbol_; } const Scope *scope() const { return scope_; } // Return scope_ if it is set, or the typeSymbol_ scope otherwise. 
@@ -319,7 +320,8 @@ class DerivedTypeSpec {
 
 private:
   SourceName name_;
-  const Symbol &typeSymbol_;
+  const Symbol &originalTypeSymbol_;
+  const Symbol &typeSymbol_; // == originalTypeSymbol_.GetUltimate()
   const Scope *scope_{nullptr}; // same as typeSymbol_.scope() unless PDT
   bool cooked_{false};
   bool evaluated_{false};
@@ -328,8 +330,9 @@ class DerivedTypeSpec {
   ParameterMapType parameters_;
   Category category_{Category::DerivedType};
   bool RawEquals(const DerivedTypeSpec &that) const {
-    return &typeSymbol_ == &that.typeSymbol_ && cooked_ == that.cooked_ &&
-        rawParameters_ == that.rawParameters_;
+    return &typeSymbol_ == &that.typeSymbol_ &&
+        &originalTypeSymbol_ == &that.originalTypeSymbol_ &&
+        cooked_ == that.cooked_ && rawParameters_ == that.rawParameters_;
   }
   friend llvm::raw_ostream &operator<<(
       llvm::raw_ostream &, const DerivedTypeSpec &);
diff --git a/flang/lib/Evaluate/characteristics.cpp b/flang/lib/Evaluate/characteristics.cpp
index 70e24d6e82eb8e..2496e4427fe7ae 100644
--- a/flang/lib/Evaluate/characteristics.cpp
+++ b/flang/lib/Evaluate/characteristics.cpp
@@ -66,8 +66,9 @@ bool ShapesAreCompatible(const std::optional<Shape> &x,
 }
 
 bool TypeAndShape::operator==(const TypeAndShape &that) const {
-  return type_ == that.type_ && ShapesAreCompatible(shape_, that.shape_) &&
-      attrs_ == that.attrs_ && corank_ == that.corank_;
+  return type_.IsEquivalentTo(that.type_) &&
+      ShapesAreCompatible(shape_, that.shape_) && attrs_ == that.attrs_ &&
+      corank_ == that.corank_;
 }
 
 TypeAndShape &TypeAndShape::Rewrite(FoldingContext &context) {
diff --git a/flang/lib/Semantics/check-declarations.cpp b/flang/lib/Semantics/check-declarations.cpp
index b852fbf12a6e40..dfd49db74eea73 100644
--- a/flang/lib/Semantics/check-declarations.cpp
+++ b/flang/lib/Semantics/check-declarations.cpp
@@ -2519,7 +2519,7 @@ void CheckHelper::CheckProcBinding(
           ? "A NOPASS type-bound procedure may not override a passed-argument procedure"_err_en_US
           : "A passed-argument type-bound procedure may not override a NOPASS procedure"_err_en_US);
     } else {
-      const auto *bindingChars{Characterize(binding.symbol())};
+      const auto *bindingChars{Characterize(symbol)};
      const auto *overriddenChars{Characterize(*overridden)};
       if (bindingChars && overriddenChars) {
         if (isNopass) {
diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp
index 7c692440d24730..0ff2795cc98477 100644
--- a/flang/lib/Semantics/resolve-names.cpp
+++ b/flang/lib/Semantics/resolve-names.cpp
@@ -3053,11 +3053,16 @@ void ModuleVisitor::DoAddUse(SourceName location, SourceName localName,
   const Symbol &useUltimate{useSymbol.GetUltimate()};
   const auto *useGeneric{useUltimate.detailsIf<GenericDetails>()};
   if (localSymbol->has<GenericDetails>()) {
-    if (useGeneric && useGeneric->specific() &&
-        IsProcedurePointer(*useGeneric->specific())) {
-      // We are use-associating a generic that shadows a procedure pointer.
-      // Local references that might be made to that procedure pointer should
-      // use a UseDetails symbol for proper data addressing. So create an
+    if (useGeneric &&
+        ((useGeneric->specific() &&
+             IsProcedurePointer(*useGeneric->specific())) ||
+            (useGeneric->derivedType() &&
+                useUltimate.name() != localSymbol->name()))) {
+      // We are use-associating a generic that either shadows a procedure
+      // pointer or shadows a derived type of the same name.
+      // Local references that might be made to the procedure pointer should
+      // use a UseDetails symbol for proper data addressing, and a derived
+      // type needs to be in scope with the renamed name. So create an
       // empty local generic now into which the use-associated generic may
       // be copied.
       localSymbol->set_details(GenericDetails{});
@@ -3153,9 +3158,15 @@ void ModuleVisitor::DoAddUse(SourceName location, SourceName localName,
   if (!useDerivedType) {
     combinedDerivedType = localDerivedType;
   } else if (!localDerivedType) {
-    combinedDerivedType = useDerivedType;
+    if (useDerivedType->name() == localName) {
+      combinedDerivedType = useDerivedType;
+    } else {
+      Symbol &combined{currScope().MakeSymbol(localName,
+          useDerivedType->attrs(), UseDetails{localName, *useDerivedType})};
+      combinedDerivedType = &combined;
+    }
   } else {
-    const Scope *localScope{localDerivedType->scope()};
+    const Scope *localScope{localDerivedType->GetUltimate().scope()};
     const Scope *useScope{useDerivedType->GetUltimate().scope()};
     if (localScope && useScope && localScope->derivedTypeSpec() &&
         useScope->derivedTypeSpec() &&
@@ -6776,9 +6787,7 @@ std::optional<DerivedTypeSpec> DeclarationVisitor::ResolveDerivedType(
   }
   if (CheckUseError(name)) {
     return std::nullopt;
-  }
-  symbol = &symbol->GetUltimate();
-  if (symbol->has<DerivedTypeDetails>()) {
+  } else if (symbol->GetUltimate().has<DerivedTypeDetails>()) {
     return DerivedTypeSpec{name.source, *symbol};
   } else {
     Say(name, "'%s' is not a derived type"_err_en_US);
@@ -7120,12 +7129,10 @@ bool ConstructVisitor::Pre(const parser::DataStmtValue &x) {
   auto &mutableData{const_cast<parser::DataStmtConstant &>(data)};
   if (auto *elem{parser::Unwrap<parser::ArrayElement>(mutableData)}) {
     if (const auto *name{std::get_if<parser::Name>(&elem->base.u)}) {
-      if (const Symbol * symbol{FindSymbol(*name)}) {
-        const Symbol &ultimate{symbol->GetUltimate()};
-        if (ultimate.has<DerivedTypeDetails>()) {
-          mutableData.u = elem->ConvertToStructureConstructor(
-              DerivedTypeSpec{name->source, ultimate});
-        }
+      if (const Symbol * symbol{FindSymbol(*name)};
+          symbol && symbol->GetUltimate().has<DerivedTypeDetails>()) {
+        mutableData.u = elem->ConvertToStructureConstructor(
+            DerivedTypeSpec{name->source, *symbol});
       }
     }
   }
diff --git a/flang/lib/Semantics/type.cpp b/flang/lib/Semantics/type.cpp
index 810b9829b0b8db..e867d7ad6e2536 100644
--- a/flang/lib/Semantics/type.cpp
+++ b/flang/lib/Semantics/type.cpp
@@ -22,8 +22,9 @@ namespace Fortran::semantics {
 
 DerivedTypeSpec::DerivedTypeSpec(SourceName name, const Symbol &typeSymbol)
-    : name_{name}, typeSymbol_{typeSymbol} {
-  CHECK(typeSymbol.has<DerivedTypeDetails>());
+    : name_{name}, originalTypeSymbol_{typeSymbol},
+      typeSymbol_{typeSymbol.GetUltimate()} {
+  CHECK(typeSymbol_.has<DerivedTypeDetails>());
 }
 DerivedTypeSpec::DerivedTypeSpec(const DerivedTypeSpec &that) = default;
 DerivedTypeSpec::DerivedTypeSpec(DerivedTypeSpec &&that) = default;
@@ -340,9 +341,7 @@ void DerivedTypeSpec::Instantiate(Scope &containingScope) {
   const Scope &typeScope{DEREF(typeSymbol_.scope())};
   if (!MightBeParameterized()) {
     scope_ = &typeScope;
-    if (typeScope.derivedTypeSpec()) {
-      CHECK(*this == *typeScope.derivedTypeSpec());
-    } else {
+    if (!typeScope.derivedTypeSpec() || *this != *typeScope.derivedTypeSpec()) {
       Scope &mutableTypeScope{const_cast<Scope &>(typeScope)};
       mutableTypeScope.set_derivedTypeSpec(*this);
       InstantiateNonPDTScope(mutableTypeScope, containingScope);
@@ -664,7 +663,7 @@ std::string DerivedTypeSpec::VectorTypeAsFortran() const {
 std::string DerivedTypeSpec::AsFortran() const {
   std::string buf;
   llvm::raw_string_ostream ss{buf};
-  ss << name_;
+  ss << originalTypeSymbol_.name();
   if (!rawParameters_.empty()) {
     CHECK(parameters_.empty());
     ss << '(';
diff --git a/flang/test/Semantics/get_team.f90 b/flang/test/Semantics/get_team.f90
index a28b0d72f23ffe..7e4886703d17c2 100644
--- a/flang/test/Semantics/get_team.f90
+++
b/flang/test/Semantics/get_team.f90 @@ -49,7 +49,7 @@ program get_team_test !ERROR: repeated keyword argument to intrinsic 'get_team' result_team = get_team(level=initial_team, level=parent_team) - !ERROR: No intrinsic or user-defined ASSIGNMENT(=) matches operand types LOGICAL(4) and TYPE(__builtin_team_type) + !ERROR: No intrinsic or user-defined ASSIGNMENT(=) matches operand types LOGICAL(4) and TYPE(team_type) wrong_result_type = get_team() end program get_team_test diff --git a/flang/test/Semantics/modfile68.f90 b/flang/test/Semantics/modfile68.f90 new file mode 100644 index 00000000000000..550560303f082d --- /dev/null +++ b/flang/test/Semantics/modfile68.f90 @@ -0,0 +1,42 @@ +! RUN: %python %S/test_modfile.py %s %flang_fc1 +module m1 + use iso_c_binding, only : c_ptr, c_null_ptr + private + public :: t1 + type :: t1 + type(c_ptr) :: c_ptr = c_null_ptr + end type +end + +!Expect: m1.mod +!module m1 +!use,intrinsic::__fortran_builtins,only:__builtin_c_ptr +!use,intrinsic::iso_c_binding,only:c_ptr +!use,intrinsic::iso_c_binding,only:c_null_ptr +!private::__builtin_c_ptr +!private::c_ptr +!private::c_null_ptr +!type::t1 +!type(c_ptr)::c_ptr=__builtin_c_ptr(__address=0_8) +!end type +!end + +module m2 + use m1, only : t1 + private + public :: t2 + type :: t2 + type(t1) :: x = t1() + end type +end + +!Expect: m2.mod +!module m2 +!use,intrinsic::__fortran_builtins,only:__builtin_c_ptr +!use m1,only:t1 +!private::__builtin_c_ptr +!private::t1 +!type::t2 +!type(t1)::x=t1(c_ptr=__builtin_c_ptr(__address=0_8)) +!end type +!end diff --git a/flang/test/Semantics/modproc01.f90 b/flang/test/Semantics/modproc01.f90 index 5652e15750c7e9..5f45362e950934 100644 --- a/flang/test/Semantics/modproc01.f90 +++ b/flang/test/Semantics/modproc01.f90 @@ -144,8 +144,12 @@ program test !CHECK: a1, ALLOCATABLE size=40 offset=0: ObjectEntity type: TYPE(pdt2(k2=1_4,l2=3_4)) !CHECK: k1: TypeParam type:INTEGER(4) Kind init:1_4 !CHECK: l1: TypeParam type:INTEGER(4) Len init:3_4 -!CHECK: DerivedType scope: size=1 alignment=1 instantiation of pdt2(k2=1_4,l2=3_4) -!CHECK: a2: ObjectEntity type: TYPE(pdt1(k1=1_4,l1=3_4)) shape: 1_8:1_8 +!CHECK: DerivedType scope: size=48 alignment=8 instantiation of pdt2(k2=1_4,l2=3_4) sourceRange=0 bytes +!CHECK: a2 size=40 offset=8: ObjectEntity type: TYPE(pdt1(k1=1_4,l1=3_4)) shape: 1_8:1_8 !CHECK: j2 size=1 offset=0: ObjectEntity type: INTEGER(1) !CHECK: k2: TypeParam type:INTEGER(4) Kind init:1_4 !CHECK: l2: TypeParam type:INTEGER(4) Len init:3_4 +!CHECK: DerivedType scope: size=40 alignment=8 instantiation of pdt1(k1=1_4,l1=3_4) sourceRange=0 bytes +!CHECK: a1, ALLOCATABLE size=40 offset=0: ObjectEntity type: TYPE(pdt2(k2=1_4,l2=3_4)) +!CHECK: k1: TypeParam type:INTEGER(4) Kind init:1_4 +!CHECK: l1: TypeParam type:INTEGER(4) Len init:3_4 From 5f11d38d019b8447a3f76c978a5beae4639015de Mon Sep 17 00:00:00 2001 From: Peter Klausler Date: Wed, 18 Sep 2024 12:19:18 -0700 Subject: [PATCH 107/321] [flang] Fix code that deletes unit from bad OPEN (#108994) When an OPEN statement fails, a unit that was created for the OPEN needs to be removed from the unit map. The code that tried to do this was incorrect -- it needs to re-acquire the unit via LookUpForClose as a CLOSE statement does. (The failure to do this completely was leaving a zombie unit active that could break a later OPEN on the same unit number.) 
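
A minimal reproducer of this failure mode looks roughly like the sketch
below. (This is an illustrative example only, not a test from this patch;
the unit number, file names, and OPEN specifiers are invented.)

    program reopen_after_failed_open
      implicit none
      integer :: ios
      ! This OPEN is expected to fail: STATUS='OLD' requires an existing
      ! file, so the runtime creates a unit for 10 and must then delete it.
      open(10, file='no-such-file', status='old', iostat=ios)
      ! Before this fix, the failed OPEN could leave a zombie unit active,
      ! so this later, valid OPEN on the same unit number could misbehave.
      open(10, file='out.txt', status='replace', iostat=ios)
    end program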
---
 flang/runtime/io-api.cpp  | 2 +-
 flang/runtime/io-stmt.cpp | 7 +++++--
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/flang/runtime/io-api.cpp b/flang/runtime/io-api.cpp
index e3c6b9e5ca8959..39ac8c9eb6defb 100644
--- a/flang/runtime/io-api.cpp
+++ b/flang/runtime/io-api.cpp
@@ -948,7 +948,7 @@ bool IODEF(SetRecl)(Cookie cookie, std::size_t n) {
     io.GetIoErrorHandler().Crash(
         "SetRecl() called after GetNewUnit() for an OPEN statement");
   }
-  if (n <= 0) {
+  if (static_cast<std::int64_t>(n) <= 0) {
     io.GetIoErrorHandler().SignalError("RECL= must be greater than zero");
     return false;
   } else if (open->wasExtant() &&
diff --git a/flang/runtime/io-stmt.cpp b/flang/runtime/io-stmt.cpp
index 265bd0dc9d9499..cd7a196335d31e 100644
--- a/flang/runtime/io-stmt.cpp
+++ b/flang/runtime/io-stmt.cpp
@@ -329,8 +329,11 @@ void OpenStatementState::CompleteOperation() {
   }
   if (!wasExtant_ && InError()) {
     // Release the new unit on failure
-    unit().CloseUnit(CloseStatus::Delete, *this);
-    unit().DestroyClosed();
+    if (ExternalFileUnit *
+            toClose{unit().LookUpForClose(unit().unitNumber())}) {
+      toClose->Close(CloseStatus::Delete, *this);
+      toClose->DestroyClosed();
+    }
   }
   IoStatementBase::CompleteOperation();
 }

From 1e19e1e1a471f648ff63f02114648211666669ca Mon Sep 17 00:00:00 2001
From: Peter Klausler
Date: Wed, 18 Sep 2024 12:20:39 -0700
Subject: [PATCH 108/321] [flang] Catch untyped entities in interfaces with
 IMPLICIT NONE (#109018)

The order of operations in name resolution wasn't converting named
entities to objects by the time that they were subjected to the implicit
typing rules in the case of interface blocks. This led to entities
remaining untyped without error, leading to a crash in module file
generation.

Fixes https://github.com/llvm/llvm-project/issues/108975.
---
 flang/lib/Semantics/resolve-names.cpp |  3 +++
 flang/test/Semantics/implicit16.f90   | 12 ++++++++++++
 2 files changed, 15 insertions(+)
 create mode 100644 flang/test/Semantics/implicit16.f90

diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp
index 0ff2795cc98477..5414787d85f7f7 100644
--- a/flang/lib/Semantics/resolve-names.cpp
+++ b/flang/lib/Semantics/resolve-names.cpp
@@ -8748,6 +8748,9 @@ void ResolveNamesVisitor::FinishSpecificationPart(
   CheckImports();
   for (auto &pair : currScope()) {
     auto &symbol{*pair.second};
+    if (inInterfaceBlock()) {
+      ConvertToObjectEntity(symbol);
+    }
     if (NeedsExplicitType(symbol)) {
       ApplyImplicitRules(symbol);
     }
diff --git a/flang/test/Semantics/implicit16.f90 b/flang/test/Semantics/implicit16.f90
new file mode 100644
index 00000000000000..4a03e0c15747df
--- /dev/null
+++ b/flang/test/Semantics/implicit16.f90
@@ -0,0 +1,12 @@
+! RUN: %python %S/test_errors.py %s %flang_fc1
+interface
+!ERROR: No explicit type declared for 'a'
+  subroutine s(a)
+    implicit none
+  end
+!ERROR: No explicit type declared for 'f'
+  function f()
+    implicit none
+  end
+end interface
+end

From a800ffac4115259a76d803512eda31e4de787570 Mon Sep 17 00:00:00 2001
From: Andrea Faulds
Date: Wed, 18 Sep 2024 21:55:53 +0200
Subject: [PATCH 109/321] [mlir][gpu] Disjoint patterns for lowering clustered
 subgroup reduce (#109158)

Making the existing populateGpuLowerSubgroupReduceToShufflePatterns()
function also cover the new "clustered" subgroup reductions is proving
to be inconvenient, because certain backends may have more specific
lowerings that only cover the non-clustered type, and this creates pass
ordering constraints.
This commit removes coverage of clustered reductions from this function
in favour of a new separate function, which makes controlling the
lowering much more straightforward.
---
 .../mlir/Dialect/GPU/Transforms/Passes.h      | 10 +++++
 .../GPU/Transforms/SubgroupReduceLowering.cpp | 37 ++++++++++++++++---
 mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp  |  5 ++-
 3 files changed, 46 insertions(+), 6 deletions(-)

diff --git a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
index 67baa8777a6fcc..8eb711962583da 100644
--- a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
+++ b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
@@ -73,10 +73,20 @@ void populateGpuBreakDownSubgroupReducePatterns(
 /// Collect a set of patterns to lower `gpu.subgroup_reduce` into `gpu.shuffle`
 /// ops over `shuffleBitwidth` scalar types. Assumes that the subgroup has
 /// `subgroupSize` lanes. Uses the butterfly shuffle algorithm.
+///
+/// The patterns populated by this function will ignore ops with the
+/// `cluster_size` attribute.
+/// `populateGpuLowerClusteredSubgroupReduceToShufflePatterns` is the opposite.
 void populateGpuLowerSubgroupReduceToShufflePatterns(
     RewritePatternSet &patterns, unsigned subgroupSize,
     unsigned shuffleBitwidth = 32, PatternBenefit benefit = 1);
 
+/// Disjoint counterpart of `populateGpuLowerSubgroupReduceToShufflePatterns`
+/// that only matches `gpu.subgroup_reduce` ops with a `cluster_size`.
+void populateGpuLowerClusteredSubgroupReduceToShufflePatterns(
+    RewritePatternSet &patterns, unsigned subgroupSize,
+    unsigned shuffleBitwidth = 32, PatternBenefit benefit = 1);
+
 /// Collect all patterns to rewrite ops within the GPU dialect.
 inline void populateGpuRewritePatterns(RewritePatternSet &patterns) {
   populateGpuAllReducePatterns(patterns);
diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
index b166f1cd469a4d..185f824351a230 100644
--- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
@@ -210,13 +210,21 @@ Value createSubgroupShuffleReduction(OpBuilder &builder, Location loc,
 struct ScalarSubgroupReduceToShuffles final
     : OpRewritePattern<gpu::SubgroupReduceOp> {
   ScalarSubgroupReduceToShuffles(MLIRContext *ctx, unsigned subgroupSize,
-                                 unsigned shuffleBitwidth,
+                                 unsigned shuffleBitwidth, bool matchClustered,
                                  PatternBenefit benefit)
       : OpRewritePattern(ctx, benefit), subgroupSize(subgroupSize),
-        shuffleBitwidth(shuffleBitwidth) {}
+        shuffleBitwidth(shuffleBitwidth), matchClustered(matchClustered) {}
 
   LogicalResult matchAndRewrite(gpu::SubgroupReduceOp op,
                                 PatternRewriter &rewriter) const override {
+    if (op.getClusterSize().has_value() != matchClustered) {
+      return rewriter.notifyMatchFailure(
+          op, llvm::formatv("op is {0}clustered but pattern is configured to "
+                            "only match {1}clustered ops",
+                            matchClustered ? "non-" : "",
+                            matchClustered ? "" : "non-"));
+    }
+
     auto ci = getAndValidateClusterInfo(op, subgroupSize);
     if (failed(ci))
       return failure();
@@ -262,19 +270,28 @@ struct ScalarSubgroupReduceToShuffles final
 private:
   unsigned subgroupSize = 0;
   unsigned shuffleBitwidth = 0;
+  bool matchClustered = false;
 };
 
 /// Lowers vector gpu subgroup reductions to a series of shuffles.
 struct VectorSubgroupReduceToShuffles final
     : OpRewritePattern<gpu::SubgroupReduceOp> {
   VectorSubgroupReduceToShuffles(MLIRContext *ctx, unsigned subgroupSize,
-                                 unsigned shuffleBitwidth,
+                                 unsigned shuffleBitwidth, bool matchClustered,
                                  PatternBenefit benefit)
       : OpRewritePattern(ctx, benefit), subgroupSize(subgroupSize),
-        shuffleBitwidth(shuffleBitwidth) {}
+        shuffleBitwidth(shuffleBitwidth), matchClustered(matchClustered) {}
 
   LogicalResult matchAndRewrite(gpu::SubgroupReduceOp op,
                                 PatternRewriter &rewriter) const override {
+    if (op.getClusterSize().has_value() != matchClustered) {
+      return rewriter.notifyMatchFailure(
+          op, llvm::formatv("op is {0}clustered but pattern is configured to "
+                            "only match {1}clustered ops",
+                            matchClustered ? "non-" : "",
+                            matchClustered ? "" : "non-"));
+    }
+
     auto ci = getAndValidateClusterInfo(op, subgroupSize);
     if (failed(ci))
       return failure();
@@ -343,6 +360,7 @@ struct VectorSubgroupReduceToShuffles final
 private:
   unsigned subgroupSize = 0;
   unsigned shuffleBitwidth = 0;
+  bool matchClustered = false;
 };
 } // namespace
 
@@ -358,5 +376,14 @@ void mlir::populateGpuLowerSubgroupReduceToShufflePatterns(
     RewritePatternSet &patterns, unsigned subgroupSize,
     unsigned shuffleBitwidth, PatternBenefit benefit) {
   patterns.add<ScalarSubgroupReduceToShuffles, VectorSubgroupReduceToShuffles>(
-      patterns.getContext(), subgroupSize, shuffleBitwidth, benefit);
+      patterns.getContext(), subgroupSize, shuffleBitwidth,
+      /*matchClustered=*/false, benefit);
+}
+
+void mlir::populateGpuLowerClusteredSubgroupReduceToShufflePatterns(
+    RewritePatternSet &patterns, unsigned subgroupSize,
+    unsigned shuffleBitwidth, PatternBenefit benefit) {
+  patterns.add<ScalarSubgroupReduceToShuffles, VectorSubgroupReduceToShuffles>(
+      patterns.getContext(), subgroupSize, shuffleBitwidth,
+      /*matchClustered=*/true, benefit);
 }
diff --git a/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp b/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp
index 99a914506b011a..74d057c0b7b6cb 100644
--- a/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp
+++ b/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp
@@ -78,9 +78,12 @@ struct TestGpuSubgroupReduceLoweringPass
     populateGpuBreakDownSubgroupReducePatterns(patterns,
                                                /*maxShuffleBitwidth=*/32,
                                                PatternBenefit(2));
-    if (expandToShuffles)
+    if (expandToShuffles) {
       populateGpuLowerSubgroupReduceToShufflePatterns(
           patterns, /*subgroupSize=*/32, /*shuffleBitwidth=*/32);
+      populateGpuLowerClusteredSubgroupReduceToShufflePatterns(
+          patterns, /*subgroupSize=*/32, /*shuffleBitwidth=*/32);
+    }
 
     (void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns));
   }

From 9284e1870d27e44845c8e0d1e9e0a1817dc59474 Mon Sep 17 00:00:00 2001
From: Rahul Joshi
Date: Wed, 18 Sep 2024 13:06:41 -0700
Subject: [PATCH 110/321] [LLVM][TableGen] Change DAGISel code to use const
 RecordKeeper (#109038)

Change DAGISel code to use const RecordKeeper. This is a part of effort
to have better const correctness in TableGen backends:

https://discourse.llvm.org/t/psa-planned-changes-to-tablegen-getallderiveddefinitions-api-potential-downstream-breakages/81089
---
 llvm/utils/TableGen/DAGISelEmitter.cpp    | 10 +++++-----
 llvm/utils/TableGen/DAGISelMatcherGen.cpp | 20 ++++++++++----------
 2 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/llvm/utils/TableGen/DAGISelEmitter.cpp b/llvm/utils/TableGen/DAGISelEmitter.cpp
index 6c72103f6251f5..2cceb22afdb99b 100644
--- a/llvm/utils/TableGen/DAGISelEmitter.cpp
+++ b/llvm/utils/TableGen/DAGISelEmitter.cpp
@@ -25,11 +25,11 @@ namespace {
 /// DAGISelEmitter - The top-level class which coordinates construction
 /// and emission of the instruction selector.
 class DAGISelEmitter {
-  RecordKeeper &Records; // Just so we can get at the timing functions.
-  CodeGenDAGPatterns CGP;
+  const RecordKeeper &Records; // Just so we can get at the timing functions.
+  const CodeGenDAGPatterns CGP;
 
 public:
-  explicit DAGISelEmitter(RecordKeeper &R) : Records(R), CGP(R) {}
+  explicit DAGISelEmitter(const RecordKeeper &R) : Records(R), CGP(R) {}
   void run(raw_ostream &OS);
 };
 } // End anonymous namespace
@@ -81,8 +81,8 @@ namespace {
 // In particular, we want to match maximal patterns first and lowest cost within
 // a particular complexity first.
 struct PatternSortingPredicate {
-  PatternSortingPredicate(CodeGenDAGPatterns &cgp) : CGP(cgp) {}
-  CodeGenDAGPatterns &CGP;
+  PatternSortingPredicate(const CodeGenDAGPatterns &cgp) : CGP(cgp) {}
+  const CodeGenDAGPatterns &CGP;
 
   bool operator()(const PatternToMatch *LHS, const PatternToMatch *RHS) {
     const TreePatternNode &LT = LHS->getSrcPattern();
diff --git a/llvm/utils/TableGen/DAGISelMatcherGen.cpp b/llvm/utils/TableGen/DAGISelMatcherGen.cpp
index 5cb393ae7a538d..e159cf1bbefd33 100644
--- a/llvm/utils/TableGen/DAGISelMatcherGen.cpp
+++ b/llvm/utils/TableGen/DAGISelMatcherGen.cpp
@@ -23,7 +23,7 @@ using namespace llvm;
 /// getRegisterValueType - Look up and return the ValueType of the specified
 /// register. If the register is a member of multiple register classes, they
 /// must all have the same type.
-static MVT::SimpleValueType getRegisterValueType(Record *R,
+static MVT::SimpleValueType getRegisterValueType(const Record *R,
                                                  const CodeGenTarget &T) {
   bool FoundRC = false;
   MVT::SimpleValueType VT = MVT::Other;
@@ -91,7 +91,7 @@ class MatcherGen {
   /// PhysRegInputs - List list has an entry for each explicitly specified
   /// physreg input to the pattern.  The first elt is the Register node, the
   /// second is the recorded slot number the input pattern match saved it in.
-  SmallVector<std::pair<Record *, unsigned>, 2> PhysRegInputs;
+  SmallVector<std::pair<const Record *, unsigned>, 2> PhysRegInputs;
 
   /// Matcher - This is the top level of the generated matcher, the result.
   Matcher *TheMatcher;
@@ -220,13 +220,13 @@ void MatcherGen::EmitLeafMatchCode(const TreePatternNode &N) {
     return;
   }
 
-  DefInit *DI = dyn_cast<DefInit>(N.getLeafValue());
+  const DefInit *DI = dyn_cast<DefInit>(N.getLeafValue());
   if (!DI) {
     errs() << "Unknown leaf kind: " << N << "\n";
     abort();
  }
 
-  Record *LeafRec = DI->getDef();
+  const Record *LeafRec = DI->getDef();
 
   // A ValueType leaf node can represent a register when named, or itself when
   // unnamed.
@@ -673,7 +673,7 @@ void MatcherGen::EmitResultLeafAsOperand(const TreePatternNode &N,
 
   // If this is an explicit register reference, handle it.
   if (DefInit *DI = dyn_cast<DefInit>(N.getLeafValue())) {
-    Record *Def = DI->getDef();
+    const Record *Def = DI->getDef();
     if (Def->isSubClassOf("Register")) {
       const CodeGenRegister *Reg = CGP.getTargetInfo().getRegBank().getReg(Def);
       AddMatcher(new EmitRegisterMatcher(Reg, N.getSimpleType(0)));
@@ -690,7 +690,7 @@ void MatcherGen::EmitResultLeafAsOperand(const TreePatternNode &N,
     if (Def->getName() == "undef_tied_input") {
       MVT::SimpleValueType ResultVT = N.getSimpleType(0);
       auto IDOperandNo = NextRecordedOperandNo++;
-      Record *ImpDef = Def->getRecords().getDef("IMPLICIT_DEF");
+      const Record *ImpDef = Def->getRecords().getDef("IMPLICIT_DEF");
       CodeGenInstruction &II = CGP.getTargetInfo().getInstruction(ImpDef);
       AddMatcher(new EmitNodeMatcher(II, ResultVT, std::nullopt, false, false,
                                      false, false, -1, IDOperandNo));
@@ -907,11 +907,11 @@ void MatcherGen::EmitResultInstructionAsOperand(
   if (isRoot && !Pattern.getDstRegs().empty()) {
     // If the root came from an implicit def in the instruction handling stuff,
     // don't re-add it.
-    Record *HandledReg = nullptr;
+    const Record *HandledReg = nullptr;
     if (II.HasOneImplicitDefWithKnownVT(CGT) != MVT::Other)
       HandledReg = II.ImplicitDefs[0];
 
-    for (Record *Reg : Pattern.getDstRegs()) {
+    for (const Record *Reg : Pattern.getDstRegs()) {
       if (!Reg->isSubClassOf("Register") || Reg == HandledReg)
         continue;
       ResultVTs.push_back(getRegisterValueType(Reg, CGT));
@@ -1042,7 +1042,7 @@ void MatcherGen::EmitResultCode() {
   if (!Pattern.getDstRegs().empty()) {
     // If the root came from an implicit def in the instruction handling stuff,
     // don't re-add it.
-    Record *HandledReg = nullptr;
+    const Record *HandledReg = nullptr;
     const TreePatternNode &DstPat = Pattern.getDstPattern();
     if (!DstPat.isLeaf() && DstPat.getOperator()->isSubClassOf("Instruction")) {
       const CodeGenTarget &CGT = CGP.getTargetInfo();
@@ -1052,7 +1052,7 @@ void MatcherGen::EmitResultCode() {
       HandledReg = II.ImplicitDefs[0];
     }
 
-    for (Record *Reg : Pattern.getDstRegs()) {
+    for (const Record *Reg : Pattern.getDstRegs()) {
       if (!Reg->isSubClassOf("Register") || Reg == HandledReg)
         continue;
       ++NumSrcResults;

From 9ddb1cd5648c0cf5b61fb661bdb5e8fa0a056c08 Mon Sep 17 00:00:00 2001
From: Rahul Joshi
Date: Wed, 18 Sep 2024 13:07:09 -0700
Subject: [PATCH 111/321] [LLVM][TableGen] Change DFAPacketizerEmitter to use
 const RecordKeeper (#109044)

Change DFAPacketizerEmitter to use const RecordKeeper. This is a part
of effort to have better const correctness in TableGen backends:

https://discourse.llvm.org/t/psa-planned-changes-to-tablegen-getallderiveddefinitions-api-potential-downstream-breakages/81089
---
 llvm/utils/TableGen/DFAPacketizerEmitter.cpp | 30 ++++++++++----------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/llvm/utils/TableGen/DFAPacketizerEmitter.cpp b/llvm/utils/TableGen/DFAPacketizerEmitter.cpp
index 4070bafded9cc2..42155e78d0a262 100644
--- a/llvm/utils/TableGen/DFAPacketizerEmitter.cpp
+++ b/llvm/utils/TableGen/DFAPacketizerEmitter.cpp
@@ -61,7 +61,7 @@ struct ScheduleClass {
 class DFAPacketizerEmitter {
 private:
   std::string TargetName;
-  RecordKeeper &Records;
+  const RecordKeeper &Records;
 
   UniqueVector<ResourceVector> UniqueResources;
   std::vector<ScheduleClass> ScheduleClasses;
   std::map<uint64_t, uint64_t> ComboBitToBitsMap;
 
 public:
-  DFAPacketizerEmitter(RecordKeeper &R);
+  DFAPacketizerEmitter(const RecordKeeper &R);
 
   // Construct a map of function unit names to bits.
int collectAllFuncUnits(ArrayRef ProcModels); // Construct a map from a combo function unit bit to the bits of all included // functional units. - int collectAllComboFuncs(ArrayRef ComboFuncList); + int collectAllComboFuncs(ArrayRef ComboFuncList); ResourceVector getResourcesForItinerary(const Record *Itinerary); void createScheduleClasses(unsigned ItineraryIdx, - const ConstRecVec &Itineraries); + ArrayRef Itineraries); // Emit code for a subset of itineraries. void emitForItineraries(raw_ostream &OS, @@ -91,7 +91,7 @@ class DFAPacketizerEmitter { }; } // end anonymous namespace -DFAPacketizerEmitter::DFAPacketizerEmitter(RecordKeeper &R) +DFAPacketizerEmitter::DFAPacketizerEmitter(const RecordKeeper &R) : TargetName(std::string(CodeGenTarget(R).getName())), Records(R) {} int DFAPacketizerEmitter::collectAllFuncUnits( @@ -108,7 +108,7 @@ int DFAPacketizerEmitter::collectAllFuncUnits( int totalFUs = 0; // Parse functional units for all the itineraries. for (const Record *Proc : ProcItinList) { - std::vector FUs = Proc->getValueAsListOfDefs("FU"); + std::vector FUs = Proc->getValueAsListOfConstDefs("FU"); LLVM_DEBUG(dbgs() << " FU:" << " (" << FUs.size() << " FUs) " << Proc->getName()); @@ -130,7 +130,7 @@ int DFAPacketizerEmitter::collectAllFuncUnits( } int DFAPacketizerEmitter::collectAllComboFuncs( - ArrayRef ComboFuncList) { + ArrayRef ComboFuncList) { LLVM_DEBUG(dbgs() << "-------------------------------------------------------" "----------------------\n"); LLVM_DEBUG(dbgs() << "collectAllComboFuncs"); @@ -138,8 +138,8 @@ int DFAPacketizerEmitter::collectAllComboFuncs( int numCombos = 0; for (unsigned i = 0, N = ComboFuncList.size(); i < N; ++i) { - Record *Func = ComboFuncList[i]; - std::vector FUs = Func->getValueAsListOfDefs("CFD"); + const Record *Func = ComboFuncList[i]; + std::vector FUs = Func->getValueAsListOfConstDefs("CFD"); LLVM_DEBUG(dbgs() << " CFD:" << i << " (" << FUs.size() << " combo FUs) " << Func->getName() << "\n"); @@ -148,16 +148,16 @@ int DFAPacketizerEmitter::collectAllComboFuncs( for (unsigned j = 0, N = FUs.size(); j < N; ++j) { assert((j < DFA_MAX_RESOURCES) && "Exceeded maximum number of DFA resources"); - Record *FuncData = FUs[j]; - Record *ComboFunc = FuncData->getValueAsDef("TheComboFunc"); - const std::vector &FuncList = - FuncData->getValueAsListOfDefs("FuncList"); + const Record *FuncData = FUs[j]; + const Record *ComboFunc = FuncData->getValueAsDef("TheComboFunc"); + const std::vector FuncList = + FuncData->getValueAsListOfConstDefs("FuncList"); const std::string &ComboFuncName = std::string(ComboFunc->getName()); uint64_t ComboBit = FUNameToBitsMap[ComboFuncName]; uint64_t ComboResources = ComboBit; LLVM_DEBUG(dbgs() << " combo: " << ComboFuncName << ":0x" << Twine::utohexstr(ComboResources) << "\n"); - for (auto *K : FuncList) { + for (const Record *K : FuncList) { std::string FuncName = std::string(K->getName()); uint64_t FuncResources = FUNameToBitsMap[FuncName]; LLVM_DEBUG(dbgs() << " " << FuncName << ":0x" @@ -190,7 +190,7 @@ DFAPacketizerEmitter::getResourcesForItinerary(const Record *Itinerary) { } void DFAPacketizerEmitter::createScheduleClasses( - unsigned ItineraryIdx, const ConstRecVec &Itineraries) { + unsigned ItineraryIdx, ArrayRef Itineraries) { unsigned Idx = 0; for (const Record *Itinerary : Itineraries) { if (!Itinerary) { From bde51d9b0d473447ea12fb14924f14ea167eec85 Mon Sep 17 00:00:00 2001 From: Xing Xue Date: Wed, 18 Sep 2024 16:12:39 -0400 Subject: [PATCH 112/321] [libomp][AIX] Ensure only libomp.a is published on AIX 
(#109016) For `libomp` on AIX, we build shared object `libomp.so` first and then archive it into `libomp.a`. Due to a CMake for AIX problem, the install step also tries to publish `libomp.so`. While we use a script to build `libomp.a` out-of-tree for Clang and avoided the problem, this chokes the in-tree build for Flang. The issue will be reported to CMake but before a fixed CMake is available, this patch ensures only `libomp.a` is published. --- openmp/runtime/src/CMakeLists.txt | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/openmp/runtime/src/CMakeLists.txt b/openmp/runtime/src/CMakeLists.txt index f106694841ce8d..2dd54b5116d920 100644 --- a/openmp/runtime/src/CMakeLists.txt +++ b/openmp/runtime/src/CMakeLists.txt @@ -426,7 +426,11 @@ if(WIN32) endforeach() else() - install(TARGETS omp ${export_to_llvmexports} ${LIBOMP_INSTALL_KIND} DESTINATION "${OPENMP_INSTALL_LIBDIR}") + if(${CMAKE_SYSTEM_NAME} MATCHES "AIX") + install(FILES ${LIBOMP_LIBRARY_DIR}/libomp.a DESTINATION "${OPENMP_INSTALL_LIBDIR}" COMPONENT runtime) + else() + install(TARGETS omp ${export_to_llvmexports} ${LIBOMP_INSTALL_KIND} DESTINATION "${OPENMP_INSTALL_LIBDIR}") + endif() if(${LIBOMP_INSTALL_ALIASES}) # Create aliases (symlinks) of the library for backwards compatibility From 644899addd8fd789c93e9a0f0727d37eb1b29c55 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 18 Sep 2024 12:32:48 -0700 Subject: [PATCH 113/321] [RISCV][GISel] Port portions of float-intrinsics.ll and double-intrinsics.ll. NFC Remove the legalizer test for the same intrinsics as it is no longer interesting with end to end tests. --- .../RISCV/GlobalISel/double-intrinsics.ll | 264 +++++++++++ .../RISCV/GlobalISel/float-intrinsics.ll | 441 ++++++++++++++++++ .../legalizer/legalize-fp-libcall.mir | 328 ------------- 3 files changed, 705 insertions(+), 328 deletions(-) create mode 100644 llvm/test/CodeGen/RISCV/GlobalISel/double-intrinsics.ll create mode 100644 llvm/test/CodeGen/RISCV/GlobalISel/float-intrinsics.ll delete mode 100644 llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-fp-libcall.mir diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/double-intrinsics.ll b/llvm/test/CodeGen/RISCV/GlobalISel/double-intrinsics.ll new file mode 100644 index 00000000000000..ad461f8f24b917 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/GlobalISel/double-intrinsics.ll @@ -0,0 +1,264 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -global-isel -mattr=+d \ +; RUN: -verify-machineinstrs -target-abi=ilp32d \ +; RUN: | FileCheck -check-prefixes=CHECKIFD,RV32IFD %s +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -global-isel -mattr=+d \ +; RUN: -verify-machineinstrs -target-abi=lp64d \ +; RUN: | FileCheck -check-prefixes=CHECKIFD,RV64IFD %s + +declare double @llvm.sqrt.f64(double) + +define double @sqrt_f64(double %a) nounwind { +; CHECKIFD-LABEL: sqrt_f64: +; CHECKIFD: # %bb.0: +; CHECKIFD-NEXT: fsqrt.d fa0, fa0 +; CHECKIFD-NEXT: ret + %1 = call double @llvm.sqrt.f64(double %a) + ret double %1 +} + +declare double @llvm.fma.f64(double, double, double) + +define double @fma_f64(double %a, double %b, double %c) nounwind { +; CHECKIFD-LABEL: fma_f64: +; CHECKIFD: # %bb.0: +; CHECKIFD-NEXT: fmadd.d fa0, fa0, fa1, fa2 +; CHECKIFD-NEXT: ret + %1 = call double @llvm.fma.f64(double %a, double %b, double %c) + ret double %1 +} + +declare double @llvm.fmuladd.f64(double, double, double) + +define double @fmuladd_f64(double %a, double %b, double %c) nounwind 
{ +; CHECKIFD-LABEL: fmuladd_f64: +; CHECKIFD: # %bb.0: +; CHECKIFD-NEXT: fmadd.d fa0, fa0, fa1, fa2 +; CHECKIFD-NEXT: ret + %1 = call double @llvm.fmuladd.f64(double %a, double %b, double %c) + ret double %1 +} + +declare double @llvm.fabs.f64(double) + +define double @fabs_f64(double %a) nounwind { +; CHECKIFD-LABEL: fabs_f64: +; CHECKIFD: # %bb.0: +; CHECKIFD-NEXT: fabs.d fa0, fa0 +; CHECKIFD-NEXT: ret + %1 = call double @llvm.fabs.f64(double %a) + ret double %1 +} + +declare double @llvm.minnum.f64(double, double) + +define double @minnum_f64(double %a, double %b) nounwind { +; CHECKIFD-LABEL: minnum_f64: +; CHECKIFD: # %bb.0: +; CHECKIFD-NEXT: fmin.d fa0, fa0, fa1 +; CHECKIFD-NEXT: ret + %1 = call double @llvm.minnum.f64(double %a, double %b) + ret double %1 +} + +declare double @llvm.maxnum.f64(double, double) + +define double @maxnum_f64(double %a, double %b) nounwind { +; CHECKIFD-LABEL: maxnum_f64: +; CHECKIFD: # %bb.0: +; CHECKIFD-NEXT: fmax.d fa0, fa0, fa1 +; CHECKIFD-NEXT: ret + %1 = call double @llvm.maxnum.f64(double %a, double %b) + ret double %1 +} + +declare double @llvm.copysign.f64(double, double) + +define double @copysign_f64(double %a, double %b) nounwind { +; CHECKIFD-LABEL: copysign_f64: +; CHECKIFD: # %bb.0: +; CHECKIFD-NEXT: fsgnj.d fa0, fa0, fa1 +; CHECKIFD-NEXT: ret + %1 = call double @llvm.copysign.f64(double %a, double %b) + ret double %1 +} + +declare double @llvm.floor.f64(double) + +define double @floor_f64(double %a) nounwind { +; RV32IFD-LABEL: floor_f64: +; RV32IFD: # %bb.0: +; RV32IFD-NEXT: addi sp, sp, -16 +; RV32IFD-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32IFD-NEXT: call floor +; RV32IFD-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32IFD-NEXT: addi sp, sp, 16 +; RV32IFD-NEXT: ret +; +; RV64IFD-LABEL: floor_f64: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: addi sp, sp, -16 +; RV64IFD-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64IFD-NEXT: call floor +; RV64IFD-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64IFD-NEXT: addi sp, sp, 16 +; RV64IFD-NEXT: ret + %1 = call double @llvm.floor.f64(double %a) + ret double %1 +} + +declare double @llvm.ceil.f64(double) + +define double @ceil_f64(double %a) nounwind { +; RV32IFD-LABEL: ceil_f64: +; RV32IFD: # %bb.0: +; RV32IFD-NEXT: addi sp, sp, -16 +; RV32IFD-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32IFD-NEXT: call ceil +; RV32IFD-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32IFD-NEXT: addi sp, sp, 16 +; RV32IFD-NEXT: ret +; +; RV64IFD-LABEL: ceil_f64: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: addi sp, sp, -16 +; RV64IFD-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64IFD-NEXT: call ceil +; RV64IFD-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64IFD-NEXT: addi sp, sp, 16 +; RV64IFD-NEXT: ret + %1 = call double @llvm.ceil.f64(double %a) + ret double %1 +} + +declare double @llvm.trunc.f64(double) + +define double @trunc_f64(double %a) nounwind { +; RV32IFD-LABEL: trunc_f64: +; RV32IFD: # %bb.0: +; RV32IFD-NEXT: addi sp, sp, -16 +; RV32IFD-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32IFD-NEXT: call trunc +; RV32IFD-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32IFD-NEXT: addi sp, sp, 16 +; RV32IFD-NEXT: ret +; +; RV64IFD-LABEL: trunc_f64: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: addi sp, sp, -16 +; RV64IFD-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64IFD-NEXT: call trunc +; RV64IFD-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64IFD-NEXT: addi sp, sp, 16 +; RV64IFD-NEXT: ret + %1 = call double @llvm.trunc.f64(double %a) + ret double %1 +} + +declare double @llvm.rint.f64(double) + +define 
double @rint_f64(double %a) nounwind { +; RV32IFD-LABEL: rint_f64: +; RV32IFD: # %bb.0: +; RV32IFD-NEXT: addi sp, sp, -16 +; RV32IFD-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32IFD-NEXT: call rint +; RV32IFD-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32IFD-NEXT: addi sp, sp, 16 +; RV32IFD-NEXT: ret +; +; RV64IFD-LABEL: rint_f64: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: addi sp, sp, -16 +; RV64IFD-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64IFD-NEXT: call rint +; RV64IFD-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64IFD-NEXT: addi sp, sp, 16 +; RV64IFD-NEXT: ret + %1 = call double @llvm.rint.f64(double %a) + ret double %1 +} + +declare double @llvm.nearbyint.f64(double) + +define double @nearbyint_f64(double %a) nounwind { +; RV32IFD-LABEL: nearbyint_f64: +; RV32IFD: # %bb.0: +; RV32IFD-NEXT: addi sp, sp, -16 +; RV32IFD-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32IFD-NEXT: call nearbyint +; RV32IFD-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32IFD-NEXT: addi sp, sp, 16 +; RV32IFD-NEXT: ret +; +; RV64IFD-LABEL: nearbyint_f64: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: addi sp, sp, -16 +; RV64IFD-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64IFD-NEXT: call nearbyint +; RV64IFD-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64IFD-NEXT: addi sp, sp, 16 +; RV64IFD-NEXT: ret + %1 = call double @llvm.nearbyint.f64(double %a) + ret double %1 +} + +declare double @llvm.round.f64(double) + +define double @round_f64(double %a) nounwind { +; RV32IFD-LABEL: round_f64: +; RV32IFD: # %bb.0: +; RV32IFD-NEXT: addi sp, sp, -16 +; RV32IFD-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32IFD-NEXT: call round +; RV32IFD-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32IFD-NEXT: addi sp, sp, 16 +; RV32IFD-NEXT: ret +; +; RV64IFD-LABEL: round_f64: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: addi sp, sp, -16 +; RV64IFD-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64IFD-NEXT: call round +; RV64IFD-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64IFD-NEXT: addi sp, sp, 16 +; RV64IFD-NEXT: ret + %1 = call double @llvm.round.f64(double %a) + ret double %1 +} + +declare double @llvm.roundeven.f64(double) + +define double @roundeven_f64(double %a) nounwind { +; RV32IFD-LABEL: roundeven_f64: +; RV32IFD: # %bb.0: +; RV32IFD-NEXT: addi sp, sp, -16 +; RV32IFD-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32IFD-NEXT: call roundeven +; RV32IFD-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32IFD-NEXT: addi sp, sp, 16 +; RV32IFD-NEXT: ret +; +; RV64IFD-LABEL: roundeven_f64: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: addi sp, sp, -16 +; RV64IFD-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64IFD-NEXT: call roundeven +; RV64IFD-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64IFD-NEXT: addi sp, sp, 16 +; RV64IFD-NEXT: ret + %1 = call double @llvm.roundeven.f64(double %a) + ret double %1 +} + +declare i1 @llvm.is.fpclass.f64(double, i32) +define i1 @isnan_d_fpclass(double %x) { +; CHECKIFD-LABEL: isnan_d_fpclass: +; CHECKIFD: # %bb.0: +; CHECKIFD-NEXT: fclass.d a0, fa0 +; CHECKIFD-NEXT: andi a0, a0, 768 +; CHECKIFD-NEXT: snez a0, a0 +; CHECKIFD-NEXT: ret + %1 = call i1 @llvm.is.fpclass.f64(double %x, i32 3) ; nan + ret i1 %1 +} diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/float-intrinsics.ll b/llvm/test/CodeGen/RISCV/GlobalISel/float-intrinsics.ll new file mode 100644 index 00000000000000..39a5beb317ab91 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/GlobalISel/float-intrinsics.ll @@ -0,0 +1,441 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: sed 's/iXLen/i32/g' %s | llc 
-mtriple=riscv32 -global-isel -mattr=+f \ +; RUN: -verify-machineinstrs -target-abi=ilp32f \ +; RUN: | FileCheck -check-prefix=RV32IF %s +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -global-isel -mattr=+d \ +; RUN: -verify-machineinstrs -target-abi=ilp32f \ +; RUN: | FileCheck -check-prefix=RV32IF %s +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -global-isel -mattr=+f \ +; RUN: -verify-machineinstrs -target-abi=lp64f \ +; RUN: | FileCheck -check-prefix=RV64IF %s +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -global-isel -mattr=+d \ +; RUN: -verify-machineinstrs -target-abi=lp64d \ +; RUN: | FileCheck -check-prefix=RV64IF %s + +define float @sqrt_f32(float %a) nounwind { +; RV32IF-LABEL: sqrt_f32: +; RV32IF: # %bb.0: +; RV32IF-NEXT: fsqrt.s fa0, fa0 +; RV32IF-NEXT: ret +; +; RV64IF-LABEL: sqrt_f32: +; RV64IF: # %bb.0: +; RV64IF-NEXT: fsqrt.s fa0, fa0 +; RV64IF-NEXT: ret + %1 = call float @llvm.sqrt.f32(float %a) + ret float %1 +} + +define float @fma_f32(float %a, float %b, float %c) nounwind { +; RV32IF-LABEL: fma_f32: +; RV32IF: # %bb.0: +; RV32IF-NEXT: fmadd.s fa0, fa0, fa1, fa2 +; RV32IF-NEXT: ret +; +; RV64IF-LABEL: fma_f32: +; RV64IF: # %bb.0: +; RV64IF-NEXT: fmadd.s fa0, fa0, fa1, fa2 +; RV64IF-NEXT: ret + %1 = call float @llvm.fma.f32(float %a, float %b, float %c) + ret float %1 +} + +define float @fmuladd_f32(float %a, float %b, float %c) nounwind { +; RV32IF-LABEL: fmuladd_f32: +; RV32IF: # %bb.0: +; RV32IF-NEXT: fmadd.s fa0, fa0, fa1, fa2 +; RV32IF-NEXT: ret +; +; RV64IF-LABEL: fmuladd_f32: +; RV64IF: # %bb.0: +; RV64IF-NEXT: fmadd.s fa0, fa0, fa1, fa2 +; RV64IF-NEXT: ret + %1 = call float @llvm.fmuladd.f32(float %a, float %b, float %c) + ret float %1 +} + +define float @fabs_f32(float %a) nounwind { +; RV32IF-LABEL: fabs_f32: +; RV32IF: # %bb.0: +; RV32IF-NEXT: fabs.s fa0, fa0 +; RV32IF-NEXT: ret +; +; RV64IF-LABEL: fabs_f32: +; RV64IF: # %bb.0: +; RV64IF-NEXT: fabs.s fa0, fa0 +; RV64IF-NEXT: ret + %1 = call float @llvm.fabs.f32(float %a) + ret float %1 +} + +define float @minnum_f32(float %a, float %b) nounwind { +; RV32IF-LABEL: minnum_f32: +; RV32IF: # %bb.0: +; RV32IF-NEXT: fmin.s fa0, fa0, fa1 +; RV32IF-NEXT: ret +; +; RV64IF-LABEL: minnum_f32: +; RV64IF: # %bb.0: +; RV64IF-NEXT: fmin.s fa0, fa0, fa1 +; RV64IF-NEXT: ret + %1 = call float @llvm.minnum.f32(float %a, float %b) + ret float %1 +} + +define float @maxnum_f32(float %a, float %b) nounwind { +; RV32IF-LABEL: maxnum_f32: +; RV32IF: # %bb.0: +; RV32IF-NEXT: fmax.s fa0, fa0, fa1 +; RV32IF-NEXT: ret +; +; RV64IF-LABEL: maxnum_f32: +; RV64IF: # %bb.0: +; RV64IF-NEXT: fmax.s fa0, fa0, fa1 +; RV64IF-NEXT: ret + %1 = call float @llvm.maxnum.f32(float %a, float %b) + ret float %1 +} + +define float @copysign_f32(float %a, float %b) nounwind { +; RV32IF-LABEL: copysign_f32: +; RV32IF: # %bb.0: +; RV32IF-NEXT: fsgnj.s fa0, fa0, fa1 +; RV32IF-NEXT: ret +; +; RV64IF-LABEL: copysign_f32: +; RV64IF: # %bb.0: +; RV64IF-NEXT: fsgnj.s fa0, fa0, fa1 +; RV64IF-NEXT: ret + %1 = call float @llvm.copysign.f32(float %a, float %b) + ret float %1 +} + +define float @ceil_f32(float %a) nounwind { +; RV32IF-LABEL: ceil_f32: +; RV32IF: # %bb.0: +; RV32IF-NEXT: addi sp, sp, -16 +; RV32IF-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32IF-NEXT: call ceilf +; RV32IF-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32IF-NEXT: addi sp, sp, 16 +; RV32IF-NEXT: ret +; +; RV64IF-LABEL: ceil_f32: +; RV64IF: # %bb.0: +; RV64IF-NEXT: addi sp, sp, -16 +; RV64IF-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64IF-NEXT: call ceilf +; 
RV64IF-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64IF-NEXT: addi sp, sp, 16 +; RV64IF-NEXT: ret + %1 = call float @llvm.ceil.f32(float %a) + ret float %1 +} + +define float @trunc_f32(float %a) nounwind { +; RV32IF-LABEL: trunc_f32: +; RV32IF: # %bb.0: +; RV32IF-NEXT: addi sp, sp, -16 +; RV32IF-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32IF-NEXT: call truncf +; RV32IF-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32IF-NEXT: addi sp, sp, 16 +; RV32IF-NEXT: ret +; +; RV64IF-LABEL: trunc_f32: +; RV64IF: # %bb.0: +; RV64IF-NEXT: addi sp, sp, -16 +; RV64IF-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64IF-NEXT: call truncf +; RV64IF-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64IF-NEXT: addi sp, sp, 16 +; RV64IF-NEXT: ret + %1 = call float @llvm.trunc.f32(float %a) + ret float %1 +} + +define float @rint_f32(float %a) nounwind { +; RV32IF-LABEL: rint_f32: +; RV32IF: # %bb.0: +; RV32IF-NEXT: addi sp, sp, -16 +; RV32IF-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32IF-NEXT: call rintf +; RV32IF-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32IF-NEXT: addi sp, sp, 16 +; RV32IF-NEXT: ret +; +; RV64IF-LABEL: rint_f32: +; RV64IF: # %bb.0: +; RV64IF-NEXT: addi sp, sp, -16 +; RV64IF-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64IF-NEXT: call rintf +; RV64IF-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64IF-NEXT: addi sp, sp, 16 +; RV64IF-NEXT: ret + %1 = call float @llvm.rint.f32(float %a) + ret float %1 +} + +define float @nearbyint_f32(float %a) nounwind { +; RV32IF-LABEL: nearbyint_f32: +; RV32IF: # %bb.0: +; RV32IF-NEXT: addi sp, sp, -16 +; RV32IF-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32IF-NEXT: call nearbyintf +; RV32IF-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32IF-NEXT: addi sp, sp, 16 +; RV32IF-NEXT: ret +; +; RV64IF-LABEL: nearbyint_f32: +; RV64IF: # %bb.0: +; RV64IF-NEXT: addi sp, sp, -16 +; RV64IF-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64IF-NEXT: call nearbyintf +; RV64IF-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64IF-NEXT: addi sp, sp, 16 +; RV64IF-NEXT: ret + %1 = call float @llvm.nearbyint.f32(float %a) + ret float %1 +} + +define float @round_f32(float %a) nounwind { +; RV32IF-LABEL: round_f32: +; RV32IF: # %bb.0: +; RV32IF-NEXT: addi sp, sp, -16 +; RV32IF-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32IF-NEXT: call roundf +; RV32IF-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32IF-NEXT: addi sp, sp, 16 +; RV32IF-NEXT: ret +; +; RV64IF-LABEL: round_f32: +; RV64IF: # %bb.0: +; RV64IF-NEXT: addi sp, sp, -16 +; RV64IF-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64IF-NEXT: call roundf +; RV64IF-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64IF-NEXT: addi sp, sp, 16 +; RV64IF-NEXT: ret + %1 = call float @llvm.round.f32(float %a) + ret float %1 +} + +define float @roundeven_f32(float %a) nounwind { +; RV32IF-LABEL: roundeven_f32: +; RV32IF: # %bb.0: +; RV32IF-NEXT: addi sp, sp, -16 +; RV32IF-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32IF-NEXT: call roundevenf +; RV32IF-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32IF-NEXT: addi sp, sp, 16 +; RV32IF-NEXT: ret +; +; RV64IF-LABEL: roundeven_f32: +; RV64IF: # %bb.0: +; RV64IF-NEXT: addi sp, sp, -16 +; RV64IF-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64IF-NEXT: call roundevenf +; RV64IF-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64IF-NEXT: addi sp, sp, 16 +; RV64IF-NEXT: ret + %1 = call float @llvm.roundeven.f32(float %a) + ret float %1 +} + +define i1 @fpclass(float %x) { +; RV32IF-LABEL: fpclass: +; RV32IF: # %bb.0: +; RV32IF-NEXT: fclass.s a0, fa0 +; RV32IF-NEXT: andi 
a0, a0, 927 +; RV32IF-NEXT: snez a0, a0 +; RV32IF-NEXT: ret +; +; RV64IF-LABEL: fpclass: +; RV64IF: # %bb.0: +; RV64IF-NEXT: fclass.s a0, fa0 +; RV64IF-NEXT: andi a0, a0, 927 +; RV64IF-NEXT: snez a0, a0 +; RV64IF-NEXT: ret + %cmp = call i1 @llvm.is.fpclass.f32(float %x, i32 639) + ret i1 %cmp +} + +define i1 @isnan_fpclass(float %x) { +; RV32IF-LABEL: isnan_fpclass: +; RV32IF: # %bb.0: +; RV32IF-NEXT: fclass.s a0, fa0 +; RV32IF-NEXT: andi a0, a0, 768 +; RV32IF-NEXT: snez a0, a0 +; RV32IF-NEXT: ret +; +; RV64IF-LABEL: isnan_fpclass: +; RV64IF: # %bb.0: +; RV64IF-NEXT: fclass.s a0, fa0 +; RV64IF-NEXT: andi a0, a0, 768 +; RV64IF-NEXT: snez a0, a0 +; RV64IF-NEXT: ret + %1 = call i1 @llvm.is.fpclass.f32(float %x, i32 3) ; nan + ret i1 %1 +} + +define i1 @isqnan_fpclass(float %x) { +; RV32IF-LABEL: isqnan_fpclass: +; RV32IF: # %bb.0: +; RV32IF-NEXT: fclass.s a0, fa0 +; RV32IF-NEXT: andi a0, a0, 512 +; RV32IF-NEXT: snez a0, a0 +; RV32IF-NEXT: ret +; +; RV64IF-LABEL: isqnan_fpclass: +; RV64IF: # %bb.0: +; RV64IF-NEXT: fclass.s a0, fa0 +; RV64IF-NEXT: andi a0, a0, 512 +; RV64IF-NEXT: snez a0, a0 +; RV64IF-NEXT: ret + %1 = call i1 @llvm.is.fpclass.f32(float %x, i32 2) ; qnan + ret i1 %1 +} + +define i1 @issnan_fpclass(float %x) { +; RV32IF-LABEL: issnan_fpclass: +; RV32IF: # %bb.0: +; RV32IF-NEXT: fclass.s a0, fa0 +; RV32IF-NEXT: andi a0, a0, 256 +; RV32IF-NEXT: snez a0, a0 +; RV32IF-NEXT: ret +; +; RV64IF-LABEL: issnan_fpclass: +; RV64IF: # %bb.0: +; RV64IF-NEXT: fclass.s a0, fa0 +; RV64IF-NEXT: andi a0, a0, 256 +; RV64IF-NEXT: snez a0, a0 +; RV64IF-NEXT: ret + %1 = call i1 @llvm.is.fpclass.f32(float %x, i32 1) ; snan + ret i1 %1 +} + +define i1 @isinf_fpclass(float %x) { +; RV32IF-LABEL: isinf_fpclass: +; RV32IF: # %bb.0: +; RV32IF-NEXT: fclass.s a0, fa0 +; RV32IF-NEXT: andi a0, a0, 129 +; RV32IF-NEXT: snez a0, a0 +; RV32IF-NEXT: ret +; +; RV64IF-LABEL: isinf_fpclass: +; RV64IF: # %bb.0: +; RV64IF-NEXT: fclass.s a0, fa0 +; RV64IF-NEXT: andi a0, a0, 129 +; RV64IF-NEXT: snez a0, a0 +; RV64IF-NEXT: ret + %1 = call i1 @llvm.is.fpclass.f32(float %x, i32 516) ; 0x204 = "inf" + ret i1 %1 +} + +define i1 @isposinf_fpclass(float %x) { +; RV32IF-LABEL: isposinf_fpclass: +; RV32IF: # %bb.0: +; RV32IF-NEXT: fclass.s a0, fa0 +; RV32IF-NEXT: andi a0, a0, 128 +; RV32IF-NEXT: snez a0, a0 +; RV32IF-NEXT: ret +; +; RV64IF-LABEL: isposinf_fpclass: +; RV64IF: # %bb.0: +; RV64IF-NEXT: fclass.s a0, fa0 +; RV64IF-NEXT: andi a0, a0, 128 +; RV64IF-NEXT: snez a0, a0 +; RV64IF-NEXT: ret + %1 = call i1 @llvm.is.fpclass.f32(float %x, i32 512) ; 0x200 = "+inf" + ret i1 %1 +} + +define i1 @isneginf_fpclass(float %x) { +; RV32IF-LABEL: isneginf_fpclass: +; RV32IF: # %bb.0: +; RV32IF-NEXT: fclass.s a0, fa0 +; RV32IF-NEXT: andi a0, a0, 1 +; RV32IF-NEXT: snez a0, a0 +; RV32IF-NEXT: ret +; +; RV64IF-LABEL: isneginf_fpclass: +; RV64IF: # %bb.0: +; RV64IF-NEXT: fclass.s a0, fa0 +; RV64IF-NEXT: andi a0, a0, 1 +; RV64IF-NEXT: snez a0, a0 +; RV64IF-NEXT: ret + %1 = call i1 @llvm.is.fpclass.f32(float %x, i32 4) ; "-inf" + ret i1 %1 +} + +define i1 @isfinite_fpclass(float %x) { +; RV32IF-LABEL: isfinite_fpclass: +; RV32IF: # %bb.0: +; RV32IF-NEXT: fclass.s a0, fa0 +; RV32IF-NEXT: andi a0, a0, 126 +; RV32IF-NEXT: snez a0, a0 +; RV32IF-NEXT: ret +; +; RV64IF-LABEL: isfinite_fpclass: +; RV64IF: # %bb.0: +; RV64IF-NEXT: fclass.s a0, fa0 +; RV64IF-NEXT: andi a0, a0, 126 +; RV64IF-NEXT: snez a0, a0 +; RV64IF-NEXT: ret + %1 = call i1 @llvm.is.fpclass.f32(float %x, i32 504) ; 0x1f8 = "finite" + ret i1 %1 +} + +define i1 @isposfinite_fpclass(float 
%x) { +; RV32IF-LABEL: isposfinite_fpclass: +; RV32IF: # %bb.0: +; RV32IF-NEXT: fclass.s a0, fa0 +; RV32IF-NEXT: andi a0, a0, 112 +; RV32IF-NEXT: snez a0, a0 +; RV32IF-NEXT: ret +; +; RV64IF-LABEL: isposfinite_fpclass: +; RV64IF: # %bb.0: +; RV64IF-NEXT: fclass.s a0, fa0 +; RV64IF-NEXT: andi a0, a0, 112 +; RV64IF-NEXT: snez a0, a0 +; RV64IF-NEXT: ret + %1 = call i1 @llvm.is.fpclass.f32(float %x, i32 448) ; 0x1c0 = "+finite" + ret i1 %1 +} + +define i1 @isnegfinite_fpclass(float %x) { +; RV32IF-LABEL: isnegfinite_fpclass: +; RV32IF: # %bb.0: +; RV32IF-NEXT: fclass.s a0, fa0 +; RV32IF-NEXT: andi a0, a0, 14 +; RV32IF-NEXT: snez a0, a0 +; RV32IF-NEXT: ret +; +; RV64IF-LABEL: isnegfinite_fpclass: +; RV64IF: # %bb.0: +; RV64IF-NEXT: fclass.s a0, fa0 +; RV64IF-NEXT: andi a0, a0, 14 +; RV64IF-NEXT: snez a0, a0 +; RV64IF-NEXT: ret + %1 = call i1 @llvm.is.fpclass.f32(float %x, i32 56) ; 0x38 = "-finite" + ret i1 %1 +} + +define i1 @isnotfinite_fpclass(float %x) { +; RV32IF-LABEL: isnotfinite_fpclass: +; RV32IF: # %bb.0: +; RV32IF-NEXT: fclass.s a0, fa0 +; RV32IF-NEXT: andi a0, a0, 897 +; RV32IF-NEXT: snez a0, a0 +; RV32IF-NEXT: ret +; +; RV64IF-LABEL: isnotfinite_fpclass: +; RV64IF: # %bb.0: +; RV64IF-NEXT: fclass.s a0, fa0 +; RV64IF-NEXT: andi a0, a0, 897 +; RV64IF-NEXT: snez a0, a0 +; RV64IF-NEXT: ret + %1 = call i1 @llvm.is.fpclass.f32(float %x, i32 519) ; ox207 = "inf|nan" + ret i1 %1 +} diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-fp-libcall.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-fp-libcall.mir deleted file mode 100644 index 3b4f6a065d9736..00000000000000 --- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-fp-libcall.mir +++ /dev/null @@ -1,328 +0,0 @@ -# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=riscv32 -mattr=+d -run-pass=legalizer %s -o - \ -# RUN: | FileCheck %s -# RUN: llc -mtriple=riscv64 -mattr=+d -run-pass=legalizer %s -o - \ -# RUN: | FileCheck %s - ---- -name: ceil_f32 -body: | - bb.1: - liveins: $f10_f - - ; CHECK-LABEL: name: ceil_f32 - ; CHECK: liveins: $f10_f - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $f10_f - ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2 - ; CHECK-NEXT: $f10_f = COPY [[COPY]](s32) - ; CHECK-NEXT: PseudoCALL target-flags(riscv-call) &ceilf, csr_ilp32d_lp64d, implicit-def $x1, implicit $f10_f, implicit-def $f10_f - ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $f10_f - ; CHECK-NEXT: $f10_f = COPY [[COPY1]](s32) - ; CHECK-NEXT: PseudoRET implicit $f10_f - %0:_(s32) = COPY $f10_f - %1:_(s32) = G_FCEIL %0 - $f10_f = COPY %1(s32) - PseudoRET implicit $f10_f - -... ---- -name: floor_f32 -body: | - bb.1: - liveins: $f10_f - - ; CHECK-LABEL: name: floor_f32 - ; CHECK: liveins: $f10_f - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $f10_f - ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2 - ; CHECK-NEXT: $f10_f = COPY [[COPY]](s32) - ; CHECK-NEXT: PseudoCALL target-flags(riscv-call) &floorf, csr_ilp32d_lp64d, implicit-def $x1, implicit $f10_f, implicit-def $f10_f - ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $f10_f - ; CHECK-NEXT: $f10_f = COPY [[COPY1]](s32) - ; CHECK-NEXT: PseudoRET implicit $f10_f - %0:_(s32) = COPY $f10_f - %1:_(s32) = G_FFLOOR %0 - $f10_f = COPY %1(s32) - PseudoRET implicit $f10_f - -... 
---- -name: trunc_f32 -body: | - bb.1: - liveins: $f10_f - - ; CHECK-LABEL: name: trunc_f32 - ; CHECK: liveins: $f10_f - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $f10_f - ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2 - ; CHECK-NEXT: $f10_f = COPY [[COPY]](s32) - ; CHECK-NEXT: PseudoCALL target-flags(riscv-call) &truncf, csr_ilp32d_lp64d, implicit-def $x1, implicit $f10_f, implicit-def $f10_f - ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $f10_f - ; CHECK-NEXT: $f10_f = COPY [[COPY1]](s32) - ; CHECK-NEXT: PseudoRET implicit $f10_f - %0:_(s32) = COPY $f10_f - %1:_(s32) = G_INTRINSIC_TRUNC %0 - $f10_f = COPY %1(s32) - PseudoRET implicit $f10_f - -... ---- -name: rint_f32 -body: | - bb.1: - liveins: $f10_f - - ; CHECK-LABEL: name: rint_f32 - ; CHECK: liveins: $f10_f - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $f10_f - ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2 - ; CHECK-NEXT: $f10_f = COPY [[COPY]](s32) - ; CHECK-NEXT: PseudoCALL target-flags(riscv-call) &rintf, csr_ilp32d_lp64d, implicit-def $x1, implicit $f10_f, implicit-def $f10_f - ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $f10_f - ; CHECK-NEXT: $f10_f = COPY [[COPY1]](s32) - ; CHECK-NEXT: PseudoRET implicit $f10_f - %0:_(s32) = COPY $f10_f - %1:_(s32) = G_FRINT %0 - $f10_f = COPY %1(s32) - PseudoRET implicit $f10_f - -... ---- -name: nearbyint_f32 -body: | - bb.1: - liveins: $f10_f - - ; CHECK-LABEL: name: nearbyint_f32 - ; CHECK: liveins: $f10_f - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $f10_f - ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2 - ; CHECK-NEXT: $f10_f = COPY [[COPY]](s32) - ; CHECK-NEXT: PseudoCALL target-flags(riscv-call) &nearbyintf, csr_ilp32d_lp64d, implicit-def $x1, implicit $f10_f, implicit-def $f10_f - ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $f10_f - ; CHECK-NEXT: $f10_f = COPY [[COPY1]](s32) - ; CHECK-NEXT: PseudoRET implicit $f10_f - %0:_(s32) = COPY $f10_f - %1:_(s32) = G_FNEARBYINT %0 - $f10_f = COPY %1(s32) - PseudoRET implicit $f10_f - -... ---- -name: round_f32 -body: | - bb.1: - liveins: $f10_f - - ; CHECK-LABEL: name: round_f32 - ; CHECK: liveins: $f10_f - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $f10_f - ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2 - ; CHECK-NEXT: $f10_f = COPY [[COPY]](s32) - ; CHECK-NEXT: PseudoCALL target-flags(riscv-call) &roundf, csr_ilp32d_lp64d, implicit-def $x1, implicit $f10_f, implicit-def $f10_f - ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $f10_f - ; CHECK-NEXT: $f10_f = COPY [[COPY1]](s32) - ; CHECK-NEXT: PseudoRET implicit $f10_f - %0:_(s32) = COPY $f10_f - %1:_(s32) = G_INTRINSIC_ROUND %0 - $f10_f = COPY %1(s32) - PseudoRET implicit $f10_f - -... 
---- -name: roundeven_f32 -body: | - bb.1: - liveins: $f10_f - - ; CHECK-LABEL: name: roundeven_f32 - ; CHECK: liveins: $f10_f - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $f10_f - ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2 - ; CHECK-NEXT: $f10_f = COPY [[COPY]](s32) - ; CHECK-NEXT: PseudoCALL target-flags(riscv-call) &roundevenf, csr_ilp32d_lp64d, implicit-def $x1, implicit $f10_f, implicit-def $f10_f - ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $f10_f - ; CHECK-NEXT: $f10_f = COPY [[COPY1]](s32) - ; CHECK-NEXT: PseudoRET implicit $f10_f - %0:_(s32) = COPY $f10_f - %1:_(s32) = G_INTRINSIC_ROUNDEVEN %0 - $f10_f = COPY %1(s32) - PseudoRET implicit $f10_f - -... ---- -name: ceil_f64 -body: | - bb.1: - liveins: $f10_d - - ; CHECK-LABEL: name: ceil_f64 - ; CHECK: liveins: $f10_d - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $f10_d - ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2 - ; CHECK-NEXT: $f10_d = COPY [[COPY]](s64) - ; CHECK-NEXT: PseudoCALL target-flags(riscv-call) &ceil, csr_ilp32d_lp64d, implicit-def $x1, implicit $f10_d, implicit-def $f10_d - ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $f10_d - ; CHECK-NEXT: $f10_d = COPY [[COPY1]](s64) - ; CHECK-NEXT: PseudoRET implicit $f10_d - %0:_(s64) = COPY $f10_d - %1:_(s64) = G_FCEIL %0 - $f10_d = COPY %1(s64) - PseudoRET implicit $f10_d - -... ---- -name: floor_f64 -body: | - bb.1: - liveins: $f10_d - - ; CHECK-LABEL: name: floor_f64 - ; CHECK: liveins: $f10_d - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $f10_d - ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2 - ; CHECK-NEXT: $f10_d = COPY [[COPY]](s64) - ; CHECK-NEXT: PseudoCALL target-flags(riscv-call) &floor, csr_ilp32d_lp64d, implicit-def $x1, implicit $f10_d, implicit-def $f10_d - ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $f10_d - ; CHECK-NEXT: $f10_d = COPY [[COPY1]](s64) - ; CHECK-NEXT: PseudoRET implicit $f10_d - %0:_(s64) = COPY $f10_d - %1:_(s64) = G_FFLOOR %0 - $f10_d = COPY %1(s64) - PseudoRET implicit $f10_d - -... ---- -name: trunc_f64 -body: | - bb.1: - liveins: $f10_d - - ; CHECK-LABEL: name: trunc_f64 - ; CHECK: liveins: $f10_d - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $f10_d - ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2 - ; CHECK-NEXT: $f10_d = COPY [[COPY]](s64) - ; CHECK-NEXT: PseudoCALL target-flags(riscv-call) &trunc, csr_ilp32d_lp64d, implicit-def $x1, implicit $f10_d, implicit-def $f10_d - ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $f10_d - ; CHECK-NEXT: $f10_d = COPY [[COPY1]](s64) - ; CHECK-NEXT: PseudoRET implicit $f10_d - %0:_(s64) = COPY $f10_d - %1:_(s64) = G_INTRINSIC_TRUNC %0 - $f10_d = COPY %1(s64) - PseudoRET implicit $f10_d - -... 
---- -name: rint_f64 -body: | - bb.1: - liveins: $f10_d - - ; CHECK-LABEL: name: rint_f64 - ; CHECK: liveins: $f10_d - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $f10_d - ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2 - ; CHECK-NEXT: $f10_d = COPY [[COPY]](s64) - ; CHECK-NEXT: PseudoCALL target-flags(riscv-call) &rint, csr_ilp32d_lp64d, implicit-def $x1, implicit $f10_d, implicit-def $f10_d - ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $f10_d - ; CHECK-NEXT: $f10_d = COPY [[COPY1]](s64) - ; CHECK-NEXT: PseudoRET implicit $f10_d - %0:_(s64) = COPY $f10_d - %1:_(s64) = G_FRINT %0 - $f10_d = COPY %1(s64) - PseudoRET implicit $f10_d - -... ---- -name: nearbyint_f64 -body: | - bb.1: - liveins: $f10_d - - ; CHECK-LABEL: name: nearbyint_f64 - ; CHECK: liveins: $f10_d - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $f10_d - ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2 - ; CHECK-NEXT: $f10_d = COPY [[COPY]](s64) - ; CHECK-NEXT: PseudoCALL target-flags(riscv-call) &nearbyint, csr_ilp32d_lp64d, implicit-def $x1, implicit $f10_d, implicit-def $f10_d - ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $f10_d - ; CHECK-NEXT: $f10_d = COPY [[COPY1]](s64) - ; CHECK-NEXT: PseudoRET implicit $f10_d - %0:_(s64) = COPY $f10_d - %1:_(s64) = G_FNEARBYINT %0 - $f10_d = COPY %1(s64) - PseudoRET implicit $f10_d - -... ---- -name: round_f64 -body: | - bb.1: - liveins: $f10_d - - ; CHECK-LABEL: name: round_f64 - ; CHECK: liveins: $f10_d - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $f10_d - ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2 - ; CHECK-NEXT: $f10_d = COPY [[COPY]](s64) - ; CHECK-NEXT: PseudoCALL target-flags(riscv-call) &round, csr_ilp32d_lp64d, implicit-def $x1, implicit $f10_d, implicit-def $f10_d - ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $f10_d - ; CHECK-NEXT: $f10_d = COPY [[COPY1]](s64) - ; CHECK-NEXT: PseudoRET implicit $f10_d - %0:_(s64) = COPY $f10_d - %1:_(s64) = G_INTRINSIC_ROUND %0 - $f10_d = COPY %1(s64) - PseudoRET implicit $f10_d - -... ---- -name: roundeven_f64 -body: | - bb.1: - liveins: $f10_d - - ; CHECK-LABEL: name: roundeven_f64 - ; CHECK: liveins: $f10_d - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $f10_d - ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2 - ; CHECK-NEXT: $f10_d = COPY [[COPY]](s64) - ; CHECK-NEXT: PseudoCALL target-flags(riscv-call) &roundeven, csr_ilp32d_lp64d, implicit-def $x1, implicit $f10_d, implicit-def $f10_d - ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $f10_d - ; CHECK-NEXT: $f10_d = COPY [[COPY1]](s64) - ; CHECK-NEXT: PseudoRET implicit $f10_d - %0:_(s64) = COPY $f10_d - %1:_(s64) = G_INTRINSIC_ROUNDEVEN %0 - $f10_d = COPY %1(s64) - PseudoRET implicit $f10_d - -... From abb317ff9aba8a58449d91f6162597e54d02a57c Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 18 Sep 2024 14:18:04 -0700 Subject: [PATCH 114/321] [clang-tidy] Fix performance-unnecessary-value-param (#109145) This patch essentially reverts #108674 while adding a testcase that triggers a crash in clang-tidy. Fixes #108963. 
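To make the failure mode concrete, here is a minimal, self-contained sketch of the
iterator-invalidation hazard this revert guards against. All names below
(`Analyzer`, `Cache`, `getAnalyzer`) are invented for illustration; the real code
uses `FunctionParmMutationAnalyzer` and the `FuncParmAnalyzer` map (see the diff
that follows), and the sketch assumes only standard C++17.

```cpp
#include <memory>
#include <unordered_map>

// Invented stand-in for FunctionParmMutationAnalyzer.
struct Analyzer {
  explicit Analyzer(int Key);
};

// Invented stand-in for the FuncParmAnalyzer cache.
static std::unordered_map<int, std::unique_ptr<Analyzer>> Cache;

Analyzer::Analyzer(int Key) {
  // Analyzing one function may recursively create analyzers for its
  // callees; the recursive insertion can force the map to rehash.
  if (Key > 0)
    Cache.try_emplace(Key - 1, std::make_unique<Analyzer>(Key - 1));
}

Analyzer *getAnalyzer(int Key) {
  // Hazardous pattern (what #108674 introduced): an iterator obtained
  // *before* constructing the value may be invalidated by the recursive
  // insertions performed during construction:
  //   auto [It, Inserted] = Cache.try_emplace(Key);
  //   if (Inserted)
  //     It->second = std::make_unique<Analyzer>(Key); // 'It' may be stale
  //
  // Pattern this patch restores: construct the value first, insert after,
  // so no iterator is held across the mutation.
  auto It = Cache.find(Key);
  if (It == Cache.end())
    It = Cache.try_emplace(Key, std::make_unique<Analyzer>(Key)).first;
  return It->second.get();
}
```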
---
 .../unnecessary-value-param-crash.cpp         | 23 +++++++++++++++++++
 .../Analysis/Analyses/ExprMutationAnalyzer.h  | 17 ++++++++++----
 2 files changed, 36 insertions(+), 4 deletions(-)
 create mode 100644 clang-tools-extra/test/clang-tidy/checkers/performance/unnecessary-value-param-crash.cpp

diff --git a/clang-tools-extra/test/clang-tidy/checkers/performance/unnecessary-value-param-crash.cpp b/clang-tools-extra/test/clang-tidy/checkers/performance/unnecessary-value-param-crash.cpp
new file mode 100644
index 00000000000000..99c2fe905bdf37
--- /dev/null
+++ b/clang-tools-extra/test/clang-tidy/checkers/performance/unnecessary-value-param-crash.cpp
@@ -0,0 +1,23 @@
+// RUN: %check_clang_tidy -std=c++14-or-later %s performance-unnecessary-value-param %t
+
+// The test case used to crash clang-tidy.
+// https://github.com/llvm/llvm-project/issues/108963
+
+struct A
+{
+  template <typename T> A(T&&) {}
+};
+
+struct B
+{
+  ~B();
+};
+
+struct C
+{
+  A a;
+  C(B, int i) : a(i) {}
+  // CHECK-MESSAGES: [[@LINE-1]]:6: warning: the parameter #1 is copied for each invocation but only used as a const reference; consider making it a const reference
+};
+
+C c(B(), 0);
diff --git a/clang/include/clang/Analysis/Analyses/ExprMutationAnalyzer.h b/clang/include/clang/Analysis/Analyses/ExprMutationAnalyzer.h
index b7b84852168e2e..c7a5b016c949d0 100644
--- a/clang/include/clang/Analysis/Analyses/ExprMutationAnalyzer.h
+++ b/clang/include/clang/Analysis/Analyses/ExprMutationAnalyzer.h
@@ -118,10 +118,19 @@ class FunctionParmMutationAnalyzer {
   static FunctionParmMutationAnalyzer *
   getFunctionParmMutationAnalyzer(const FunctionDecl &Func, ASTContext &Context,
                                   ExprMutationAnalyzer::Memoized &Memorized) {
-    auto [it, Inserted] = Memorized.FuncParmAnalyzer.try_emplace(&Func);
-    if (Inserted)
-      it->second = std::unique_ptr<FunctionParmMutationAnalyzer>(
-          new FunctionParmMutationAnalyzer(Func, Context, Memorized));
+    auto it = Memorized.FuncParmAnalyzer.find(&Func);
+    if (it == Memorized.FuncParmAnalyzer.end()) {
+      // Creating a new instance of FunctionParmMutationAnalyzer below may add
+      // additional elements to FuncParmAnalyzer. If we did try_emplace before
+      // creating a new instance, the returned iterator of try_emplace could be
+      // invalidated.
+      it =
+          Memorized.FuncParmAnalyzer
+              .try_emplace(&Func, std::unique_ptr<FunctionParmMutationAnalyzer>(
+                                      new FunctionParmMutationAnalyzer(
+                                          Func, Context, Memorized)))
+              .first;
+    }
     return it->getSecond().get();
   }

From e0ad34e56590fa2e6ffdf617e044de7eadee2139 Mon Sep 17 00:00:00 2001
From: Matheus Izvekov
Date: Wed, 18 Sep 2024 18:22:14 -0300
Subject: [PATCH 115/321] [clang] Use canonical type for substitution which
 might be incomplete (#109065)

When checking deduction consistency, a substitution can be incomplete
such that only sugar parts refer to non-deduced template parameters.

This would not otherwise lead to an inconsistent deduction, so this
patch makes it so we canonicalize the types before substitution in
order to avoid that possibility, for now.

When we are able to produce substitution failure diagnostics for
partial ordering, we might want to improve the TemplateInstantiator so
that it does not fail in that case.

This fixes a regression on top of #100692, which was reported on the
PR. This was never released, so there are no release notes.
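To make the "sugar-only" dependence concrete, here is a reduced sketch of the
affected pattern. It mirrors the `incomplete_on_sugar` test added by this
patch (see the diff below); the inline comments are an informal reading of
the fix, not wording taken from the patch itself.

```cpp
// During partial ordering, substituting into 'T[P]' with only T deduced
// leaves the non-deduced parameter 'P' referenced solely from array-bound
// sugar (the parameter itself decays to 'T*'). Canonicalizing the type
// before substitution sidesteps the incomplete substitution.
template <unsigned P, class T> void f(T[P]) = delete;
template <unsigned P> void f(int[][P]); // more specialized overload

void test() {
  int array[1][8];
  f<8>(array); // previously regressed; should pick the second overload
}
```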
---
 clang/lib/Sema/SemaTemplateDeduction.cpp |  7 +++++--
 clang/test/SemaTemplate/GH18291.cpp      |  9 +++++++++
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/clang/lib/Sema/SemaTemplateDeduction.cpp b/clang/lib/Sema/SemaTemplateDeduction.cpp
index b50648d5752ce5..7d83b86a007337 100644
--- a/clang/lib/Sema/SemaTemplateDeduction.cpp
+++ b/clang/lib/Sema/SemaTemplateDeduction.cpp
@@ -5505,8 +5505,11 @@ static TemplateDeductionResult CheckDeductionConsistency(
   Sema::ArgumentPackSubstitutionIndexRAII PackIndex(
       S, ArgIdx != -1 ? ::getPackIndexForParam(S, FTD, MLTAL, ArgIdx) : -1);
   bool IsIncompleteSubstitution = false;
-  QualType InstP = S.SubstType(P, MLTAL, FTD->getLocation(), FTD->getDeclName(),
-                               &IsIncompleteSubstitution);
+  // FIXME: A substitution can be incomplete on a non-structural part of the
+  // type. Use the canonical type for now, until the TemplateInstantiator can
+  // deal with that.
+  QualType InstP = S.SubstType(P.getCanonicalType(), MLTAL, FTD->getLocation(),
+                               FTD->getDeclName(), &IsIncompleteSubstitution);
   if (InstP.isNull() && !IsIncompleteSubstitution)
     return TemplateDeductionResult::SubstitutionFailure;
   if (!CheckConsistency)
diff --git a/clang/test/SemaTemplate/GH18291.cpp b/clang/test/SemaTemplate/GH18291.cpp
index 820564ffa6f1a0..2e9754b6561740 100644
--- a/clang/test/SemaTemplate/GH18291.cpp
+++ b/clang/test/SemaTemplate/GH18291.cpp
@@ -112,3 +112,12 @@ namespace static_vs_nonstatic {
   }
   } // namespace explicit_obj_param
 } // namespace static_vs_nonstatic
+
+namespace incomplete_on_sugar {
+  template <unsigned P, class T> void f(T[P]) = delete;
+  template <unsigned P> void f(int[][P]);
+  void test() {
+    int array[1][8];
+    f<8>(array);
+  }
+} // namespace incomplete_on_sugar

From a7c174502aef45b2d33291129cce10c085fef944 Mon Sep 17 00:00:00 2001
From: Pavel Labath
Date: Wed, 11 Sep 2024 10:35:37 +0200
Subject: [PATCH 116/321] [lldb] Only send "posix" error codes through the
 gdb-remote protocol

The other side has no way of telling which namespace these codes belong
to, so mashing them all together is not very helpful. I'm mainly doing
this to simplify some code in a pending patch, and I've picked the
posix error category semi-randomly. If we wanted to be serious about
assigning meaning to these error codes, we should create a special
error category for "gdb errors".
---
 .../Process/gdb-remote/GDBRemoteCommunicationServer.cpp | 7 ++++---
 .../gdb-remote/GDBRemoteCommunicationServerTest.cpp     | 3 ++-
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServer.cpp b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServer.cpp
index 9b72cb00352821..d4aa90b2c7731a 100644
--- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServer.cpp
+++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServer.cpp
@@ -103,13 +103,14 @@ GDBRemoteCommunicationServer::SendErrorResponse(uint8_t err) {
 
 GDBRemoteCommunication::PacketResult
 GDBRemoteCommunicationServer::SendErrorResponse(const Status &error) {
+  uint8_t code = error.GetType() == eErrorTypePOSIX ? error.GetError() : 0xff;
   if (m_send_error_strings) {
     lldb_private::StreamString packet;
-    packet.Printf("E%2.2x;", static_cast<uint8_t>(error.GetError()));
+    packet.Printf("E%2.2x;", code);
     packet.PutStringAsRawHex8(error.AsCString());
     return SendPacketNoLock(packet.GetString());
-  } else
-    return SendErrorResponse(error.GetError());
+  }
+  return SendErrorResponse(code);
 }
 
 GDBRemoteCommunication::PacketResult
diff --git a/lldb/unittests/Process/gdb-remote/GDBRemoteCommunicationServerTest.cpp b/lldb/unittests/Process/gdb-remote/GDBRemoteCommunicationServerTest.cpp
index 69ca1720c04fc9..ba9ca6ea73e3be 100644
--- a/lldb/unittests/Process/gdb-remote/GDBRemoteCommunicationServerTest.cpp
+++ b/lldb/unittests/Process/gdb-remote/GDBRemoteCommunicationServerTest.cpp
@@ -12,6 +12,7 @@
 #include "Plugins/Process/gdb-remote/GDBRemoteCommunicationServer.h"
 #include "lldb/Utility/Connection.h"
 #include "lldb/Utility/UnimplementedError.h"
+#include "lldb/lldb-enumerations.h"
 
 namespace lldb_private {
 namespace process_gdb_remote {
@@ -25,7 +26,7 @@ TEST(GDBRemoteCommunicationServerTest, SendErrorResponse_ErrorNumber) {
 
 TEST(GDBRemoteCommunicationServerTest, SendErrorResponse_Status) {
   MockServerWithMockConnection server;
-  Status status(0x42, lldb::eErrorTypeGeneric, "Test error message");
+  Status status(0x42, lldb::eErrorTypePOSIX, "Test error message");
   server.SendErrorResponse(status);
 
   EXPECT_THAT(

From 06939fa2e140a171132275ec0ea1857d20c5dbdd Mon Sep 17 00:00:00 2001
From: Adrian Prantl
Date: Wed, 18 Sep 2024 14:54:49 -0700
Subject: [PATCH 117/321] [lldb] Change the implementation of Status to store
 an llvm::Error (NFC) (#106774)

(based on a conversation I had with @labath yesterday in
https://github.com/llvm/llvm-project/pull/106442)

Most APIs that currently vend a Status would be better served by
returning llvm::Expected<> instead. If possible, APIs should be
refactored to avoid Status. The only legitimate long-term uses of
Status are objects that need to store an error for a long time (which
should be questioned as a design decision, too).

This patch makes the transition to llvm::Error easier by making the
places that cannot switch to llvm::Error explicit: They are marked
with a call to Status::clone(). Every other API can and should be
refactored to use llvm::Expected. In the end Status should only be
used in very few places.

Whenever an unchecked Error is dropped by Status it logs this to the
verbose API channel.

Implementation notes:

This patch introduces two new kinds of error_category as well as new
llvm::Error types. Here is the mapping of lldb::ErrorType to
llvm::Errors:

```
   (eErrorTypeInvalid)
   eErrorTypeGeneric      llvm::StringError
   eErrorTypePOSIX        llvm::ECError
   eErrorTypeMachKernel   MachKernelError
   eErrorTypeExpression   llvm::ErrorList
   eErrorTypeWin32        Win32Error
```
---
 lldb/include/lldb/Utility/Status.h            |  85 +++++-
 .../Python/PythonDataObjects.cpp              |  31 ++-
 lldb/source/Utility/Status.cpp                | 253 ++++++++++++------
 .../Host/SocketTestUtilities.cpp              |  16 +-
 lldb/unittests/Utility/StatusTest.cpp         |   8 +
 5 files changed, 285 insertions(+), 108 deletions(-)

diff --git a/lldb/include/lldb/Utility/Status.h b/lldb/include/lldb/Utility/Status.h
index 795c830b965173..4a09c38ce62f1b 100644
--- a/lldb/include/lldb/Utility/Status.h
+++ b/lldb/include/lldb/Utility/Status.h
@@ -28,6 +28,69 @@ namespace lldb_private {
 
 const char *ExpressionResultAsCString(lldb::ExpressionResults result);
 
+/// Going a bit against the spirit of llvm::Error,
+/// lldb_private::Status needs to store errors long-term and sometimes
+/// copy them. This base class defines an interface for this
This base class defines an interface for this +/// operation. +class CloneableError + : public llvm::ErrorInfo { +public: + using llvm::ErrorInfo::ErrorInfo; + CloneableError() : ErrorInfo() {} + virtual std::unique_ptr Clone() const = 0; + static char ID; +}; + +/// Common base class for all error-code errors. +class CloneableECError + : public llvm::ErrorInfo { +public: + using llvm::ErrorInfo::ErrorInfo; + CloneableECError() = delete; + CloneableECError(std::error_code ec) : ErrorInfo(), EC(ec) {} + std::error_code convertToErrorCode() const override { return EC; } + void log(llvm::raw_ostream &OS) const override { OS << EC.message(); } + std::unique_ptr Clone() const override; + static char ID; + +protected: + std::error_code EC; +}; + +/// FIXME: Move these declarations closer to where they're used. +class MachKernelError + : public llvm::ErrorInfo { +public: + using llvm::ErrorInfo::ErrorInfo; + MachKernelError(std::error_code ec) : ErrorInfo(ec) {} + std::string message() const override; + std::unique_ptr Clone() const override; + static char ID; +}; + +class Win32Error : public llvm::ErrorInfo { +public: + using llvm::ErrorInfo::ErrorInfo; + Win32Error(std::error_code ec, const llvm::Twine &msg = {}) : ErrorInfo(ec) {} + std::string message() const override; + std::unique_ptr Clone() const override; + static char ID; +}; + +class ExpressionError + : public llvm::ErrorInfo { +public: + using llvm::ErrorInfo::ErrorInfo; + ExpressionError(std::error_code ec, std::string msg = {}) + : ErrorInfo(ec), m_string(msg) {} + std::unique_ptr Clone() const override; + std::string message() const override { return m_string; } + static char ID; + +protected: + std::string m_string; +}; + /// \class Status Status.h "lldb/Utility/Status.h" An error handling class. /// /// This class is designed to be able to hold any error code that can be @@ -100,9 +163,7 @@ class Status { } static Status FromExpressionError(lldb::ExpressionResults result, - std::string msg) { - return Status(result, lldb::eErrorTypeExpression, msg); - } + std::string msg); /// Set the current error to errno. /// @@ -115,6 +176,7 @@ class Status { const Status &operator=(Status &&); /// Avoid using this in new code. Migrate APIs to llvm::Expected instead. static Status FromError(llvm::Error error); + /// FIXME: Replace this with a takeError() method. llvm::Error ToError() const; /// Don't call this function in new code. Instead, redesign the API @@ -149,12 +211,20 @@ class Status { /// Access the error value. /// + /// If the internally stored \ref llvm::Error is an \ref + /// llvm::ErrorList then this returns the error value of the first + /// error. + /// /// \return /// The error value. ValueType GetError() const; /// Access the error type. /// + /// If the internally stored \ref llvm::Error is an \ref + /// llvm::ErrorList then this returns the error value of the first + /// error. + /// /// \return /// The error type enumeration value. lldb::ErrorType GetType() const; @@ -170,12 +240,9 @@ class Status { bool Success() const; protected: - Status(llvm::Error error); - /// Status code as an integer value. - ValueType m_code = 0; - /// The type of the above error code. - lldb::ErrorType m_type = lldb::eErrorTypeInvalid; - /// A string representation of the error code. + Status(llvm::Error error) : m_error(std::move(error)) {} + llvm::Error m_error; + /// TODO: Replace this with just callling toString(m_error). 
   mutable std::string m_string;
 };
 
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/PythonDataObjects.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/PythonDataObjects.cpp
index 24cf3430006329..6ddd00df3a2180 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/PythonDataObjects.cpp
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/PythonDataObjects.cpp
@@ -993,8 +993,8 @@ void PythonException::Restore() {
 }
 
 PythonException::~PythonException() {
-  Py_XDECREF(m_exception_type);
   Py_XDECREF(m_exception);
+  Py_XDECREF(m_exception_type);
   Py_XDECREF(m_traceback);
   Py_XDECREF(m_repr_bytes);
 }
@@ -1108,9 +1108,10 @@ template <typename Base> class OwnedPythonFile : public Base {
       py_error = Status::FromError(r.takeError());
     }
     base_error = Base::Close();
+    // Cloning since the wrapped exception may still reference the PyThread.
    if (py_error.Fail())
-      return py_error;
-    return base_error;
+      return py_error.Clone();
+    return base_error.Clone();
   };
 
   PyObject *GetPythonObject() const {
@@ -1196,7 +1197,8 @@ class PythonIOFile : public OwnedPythonFile<File> {
       return Flush();
     auto r = m_py_obj.CallMethod("close");
     if (!r)
-      return Status::FromError(r.takeError());
+      // Cloning since the wrapped exception may still reference the PyThread.
+      return Status::FromError(r.takeError()).Clone();
     return Status();
   }
 
@@ -1204,7 +1206,8 @@ class PythonIOFile : public OwnedPythonFile<File> {
     GIL takeGIL;
     auto r = m_py_obj.CallMethod("flush");
     if (!r)
-      return Status::FromError(r.takeError());
+      // Cloning since the wrapped exception may still reference the PyThread.
+      return Status::FromError(r.takeError()).Clone();
     return Status();
   }
 
@@ -1240,7 +1243,8 @@ class BinaryPythonFile : public PythonIOFile {
     PyObject *pybuffer_p = PyMemoryView_FromMemory(
        const_cast<char *>((const char *)buf), num_bytes, PyBUF_READ);
     if (!pybuffer_p)
-      return Status::FromError(llvm::make_error<PythonException>());
+      // Cloning since the wrapped exception may still reference the PyThread.
+      return Status::FromError(llvm::make_error<PythonException>()).Clone();
     auto pybuffer = Take<PythonObject>(pybuffer_p);
     num_bytes = 0;
     auto bytes_written = As<long long>(m_py_obj.CallMethod("write", pybuffer));
@@ -1260,7 +1264,8 @@
     auto pybuffer_obj =
        m_py_obj.CallMethod("read", (unsigned long long)num_bytes);
     if (!pybuffer_obj)
-      return Status::FromError(pybuffer_obj.takeError());
+      // Cloning since the wrapped exception may still reference the PyThread.
+      return Status::FromError(pybuffer_obj.takeError()).Clone();
     num_bytes = 0;
     if (pybuffer_obj.get().IsNone()) {
       // EOF
       return Status();
     }
     auto pybuffer = PythonBuffer::Create(pybuffer_obj.get());
     if (!pybuffer)
-      return Status::FromError(pybuffer.takeError());
+      // Cloning since the wrapped exception may still reference the PyThread.
+      return Status::FromError(pybuffer.takeError()).Clone();
     memcpy(buf, pybuffer.get().get().buf, pybuffer.get().get().len);
     num_bytes = pybuffer.get().get().len;
     return Status();
@@ -1300,7 +1306,8 @@ class TextPythonFile : public PythonIOFile {
     auto bytes_written =
        As<long long>(m_py_obj.CallMethod("write", pystring.get()));
     if (!bytes_written)
-      return Status::FromError(bytes_written.takeError());
+      // Cloning since the wrapped exception may still reference the PyThread.
+ return Status::FromError(bytes_written.takeError()).Clone(); if (bytes_written.get() < 0) return Status::FromErrorString( ".write() method returned a negative number!"); @@ -1321,14 +1328,16 @@ class TextPythonFile : public PythonIOFile { auto pystring = As( m_py_obj.CallMethod("read", (unsigned long long)num_chars)); if (!pystring) - return Status::FromError(pystring.takeError()); + // Cloning since the wrapped exception may still reference the PyThread. + return Status::FromError(pystring.takeError()).Clone(); if (pystring.get().IsNone()) { // EOF return Status(); } auto stringref = pystring.get().AsUTF8(); if (!stringref) - return Status::FromError(stringref.takeError()); + // Cloning since the wrapped exception may still reference the PyThread. + return Status::FromError(stringref.takeError()).Clone(); num_bytes = stringref.get().size(); memcpy(buf, stringref.get().begin(), num_bytes); return Status(); diff --git a/lldb/source/Utility/Status.cpp b/lldb/source/Utility/Status.cpp index 4af3af5fba0185..faa8d3a83c7ed1 100644 --- a/lldb/source/Utility/Status.cpp +++ b/lldb/source/Utility/Status.cpp @@ -8,6 +8,8 @@ #include "lldb/Utility/Status.h" +#include "lldb/Utility/LLDBLog.h" +#include "lldb/Utility/Log.h" #include "lldb/Utility/VASPrintf.h" #include "lldb/lldb-defines.h" #include "lldb/lldb-enumerations.h" @@ -37,48 +39,78 @@ class raw_ostream; using namespace lldb; using namespace lldb_private; -Status::Status() {} +char CloneableError::ID; +char CloneableECError::ID; +char MachKernelError::ID; +char Win32Error::ID; +char ExpressionError::ID; + +namespace { +/// A std::error_code category for eErrorTypeGeneric. +class LLDBGenericCategory : public std::error_category { + const char *name() const override { return "LLDBGenericCategory"; } + std::string message(int __ev) const override { return "generic LLDB error"; }; +}; +LLDBGenericCategory &lldb_generic_category() { + static LLDBGenericCategory g_generic_category; + return g_generic_category; +} + +/// A std::error_code category for eErrorTypeExpression. 
class ExpressionCategory : public std::error_category {
+  const char *name() const override { return "LLDBExpressionCategory"; }
+  std::string message(int __ev) const override {
+    return ExpressionResultAsCString(
+        static_cast<lldb::ExpressionResults>(__ev));
+  };
+};
+ExpressionCategory &expression_category() {
+  static ExpressionCategory g_expression_category;
+  return g_expression_category;
+}
+} // namespace
+
+Status::Status() : m_error(llvm::Error::success()) {}
+
+static llvm::Error ErrorFromEnums(Status::ValueType err, ErrorType type,
+                                  std::string msg) {
+  switch (type) {
+  case eErrorTypeMachKernel:
+    return llvm::make_error<MachKernelError>(
+        std::error_code(err, std::system_category()));
+  case eErrorTypeWin32:
+    return llvm::make_error<Win32Error>(
+        std::error_code(err, std::system_category()));
+  case eErrorTypePOSIX:
+    if (msg.empty())
+      return llvm::errorCodeToError(
+          std::error_code(err, std::generic_category()));
+    return llvm::createStringError(
+        std::move(msg), std::error_code(err, std::generic_category()));
+  default:
+    return llvm::createStringError(
+        std::move(msg), std::error_code(err, lldb_generic_category()));
+  }
+}
 
 Status::Status(ValueType err, ErrorType type, std::string msg)
-    : m_code(err), m_type(type), m_string(std::move(msg)) {}
+    : m_error(ErrorFromEnums(err, type, msg)) {}
 
-// This logic is confusing because c++ calls the traditional (posix) errno codes
+// This logic is confusing because C++ calls the traditional (posix) errno codes
 // "generic errors", while we use the term "generic" to mean completely
 // arbitrary (text-based) errors.
 Status::Status(std::error_code EC)
-    : m_code(EC.value()),
-      m_type(EC.category() == std::generic_category() ? eErrorTypePOSIX
-                                                      : eErrorTypeGeneric),
-      m_string(EC.message()) {}
+    : m_error(!EC ? llvm::Error::success() : llvm::errorCodeToError(EC)) {}
 
 Status::Status(std::string err_str)
-    : m_code(LLDB_GENERIC_ERROR), m_type(eErrorTypeGeneric),
-      m_string(std::move(err_str)) {}
-
-Status::Status(llvm::Error error) {
-  if (!error) {
-    Clear();
-    return;
-  }
+    : m_error(
+          llvm::createStringError(llvm::inconvertibleErrorCode(), err_str)) {}
 
-  // if the error happens to be a errno error, preserve the error code
-  error = llvm::handleErrors(
-      std::move(error), [&](std::unique_ptr<llvm::ECError> e) -> llvm::Error {
-        std::error_code ec = e->convertToErrorCode();
-        if (ec.category() == std::generic_category()) {
-          m_code = ec.value();
-          m_type = ErrorType::eErrorTypePOSIX;
-          return llvm::Error::success();
-        }
-        return llvm::Error(std::move(e));
-      });
-
-  // Otherwise, just preserve the message
-  if (error) {
-    m_code = LLDB_GENERIC_ERROR;
-    m_type = eErrorTypeGeneric;
-    m_string = llvm::toString(std::move(error));
-  }
+const Status &Status::operator=(Status &&other) {
+  Clear();
+  llvm::consumeError(std::move(m_error));
+  m_error = std::move(other.m_error);
+  return *this;
 }
 
@@ -94,25 +126,33 @@ Status Status::FromErrorStringWithFormat(const char *format, ...) {
   return Status(string);
 }
 
-Status Status::FromError(llvm::Error error) { return Status(std::move(error)); }
+Status Status::FromExpressionError(lldb::ExpressionResults result,
+                                   std::string msg) {
+  return Status(llvm::make_error<ExpressionError>(
+      std::error_code(result, expression_category()), msg));
+}
 
-llvm::Error Status::ToError() const {
-  if (Success())
-    return llvm::Error::success();
-  if (m_type == ErrorType::eErrorTypePOSIX)
-    return llvm::errorCodeToError(
-        std::error_code(m_code, std::generic_category()));
-  return llvm::createStringError(AsCString());
+/// Creates a deep copy of all known errors and converts all other
+/// errors to a new llvm::StringError.
+static llvm::Error CloneError(const llvm::Error &error) {
+  llvm::Error result = llvm::Error::success();
+  auto clone = [](const llvm::ErrorInfoBase &e) {
+    if (e.isA<CloneableError>())
+      return llvm::Error(static_cast<const CloneableError &>(e).Clone());
+    return llvm::make_error<llvm::StringError>(e.message(),
+                                               e.convertToErrorCode(), true);
+  };
+  visitErrors(error, [&](const llvm::ErrorInfoBase &e) {
+    result = joinErrors(std::move(result), clone(e));
+  });
+  return result;
 }
 
-Status::~Status() = default;
+Status Status::FromError(llvm::Error error) { return Status(std::move(error)); }
+
+llvm::Error Status::ToError() const { return CloneError(m_error); }
+
+Status::~Status() { llvm::consumeError(std::move(m_error)); }
 
 #ifdef _WIN32
 static std::string RetrieveWin32ErrorString(uint32_t error_code) {
@@ -140,6 +180,37 @@ static std::string RetrieveWin32ErrorString(uint32_t error_code) {
 }
 #endif
 
+std::string MachKernelError::message() const {
+#if defined(__APPLE__)
+  if (const char *s = ::mach_error_string(convertToErrorCode().value()))
+    return s;
+#endif
+  return "MachKernelError";
+}
+
+std::string Win32Error::message() const {
+#if defined(_WIN32)
+  return RetrieveWin32ErrorString(convertToErrorCode().value());
+#endif
+  return "Win32Error";
+}
+
+std::unique_ptr<CloneableError> CloneableECError::Clone() const {
+  return std::make_unique<CloneableECError>(convertToErrorCode());
+}
+
+std::unique_ptr<CloneableError> MachKernelError::Clone() const {
+  return std::make_unique<MachKernelError>(convertToErrorCode());
+}
+
+std::unique_ptr<CloneableError> Win32Error::Clone() const {
+  return std::make_unique<Win32Error>(convertToErrorCode());
+}
+
+std::unique_ptr<CloneableError> ExpressionError::Clone() const {
+  return std::make_unique<ExpressionError>(convertToErrorCode(), message());
+}
+
 // Get the error value as a NULL C string. The error string will be fetched and
 // cached on demand. The cached error string value will remain until the error
 // value is changed or cleared.
@@ -147,29 +218,12 @@ const char *Status::AsCString(const char *default_error_str) const {
   if (Success())
     return nullptr;
 
-  if (m_string.empty()) {
-    switch (m_type) {
-    case eErrorTypeMachKernel:
-#if defined(__APPLE__)
-      if (const char *s = ::mach_error_string(m_code))
-        m_string.assign(s);
-#endif
-      break;
-
-    case eErrorTypePOSIX:
-      m_string = llvm::sys::StrError(m_code);
-      break;
-
-    case eErrorTypeWin32:
-#if defined(_WIN32)
-      m_string = RetrieveWin32ErrorString(m_code);
-#endif
-      break;
+  m_string = llvm::toStringWithoutConsuming(m_error);
+  // Backwards compatibility with older implementations of Status.
+ if (m_error.isA()) + if (!m_string.empty() && m_string[m_string.size() - 1] == '\n') + m_string.pop_back(); - default: - break; - } - } if (m_string.empty()) { if (default_error_str) m_string.assign(default_error_str); @@ -181,29 +235,64 @@ const char *Status::AsCString(const char *default_error_str) const { // Clear the error and any cached error string that it might contain. void Status::Clear() { - m_code = 0; - m_type = eErrorTypeInvalid; - m_string.clear(); + if (m_error) + LLDB_LOG_ERRORV(GetLog(LLDBLog::API), std::move(m_error), + "dropping error {0}"); + m_error = llvm::Error::success(); } -// Access the error value. -Status::ValueType Status::GetError() const { return m_code; } +Status::ValueType Status::GetError() const { + Status::ValueType result = 0; + llvm::visitErrors(m_error, [&](const llvm::ErrorInfoBase &error) { + // Return the first only. + if (result) + return; + std::error_code ec = error.convertToErrorCode(); + result = ec.value(); + }); + return result; +} // Access the error type. -ErrorType Status::GetType() const { return m_type; } +ErrorType Status::GetType() const { + ErrorType result = eErrorTypeInvalid; + llvm::visitErrors(m_error, [&](const llvm::ErrorInfoBase &error) { + // Return the first only. + if (result != eErrorTypeInvalid) + return; + if (error.isA()) + result = eErrorTypeMachKernel; + else if (error.isA()) + result = eErrorTypeWin32; + else if (error.isA()) + result = eErrorTypeExpression; + else if (error.convertToErrorCode().category() == std::generic_category()) + result = eErrorTypePOSIX; + else if (error.convertToErrorCode().category() == lldb_generic_category() || + error.convertToErrorCode() == llvm::inconvertibleErrorCode()) + result = eErrorTypeGeneric; + else + result = eErrorTypeInvalid; + }); + return result; +} -// Returns true if this object contains a value that describes an error or -// otherwise non-success result. -bool Status::Fail() const { return m_code != 0; } +bool Status::Fail() const { + // Note that this does not clear the checked flag in + // m_error. Otherwise we'd need to make this thread-safe. + return m_error.isA(); +} Status Status::FromErrno() { - // Update the error value to be "errno" and update the type to be "POSIX". - return Status(errno, eErrorTypePOSIX); + std::error_code ec = llvm::errnoAsErrorCode(); + if (ec) + return Status::FromError(llvm::make_error(ec)); + return Status(); } // Returns true if the error code in this object is considered a successful // return value. 
-bool Status::Success() const { return m_code == 0; } +bool Status::Success() const { return !Fail(); } void llvm::format_provider::format( const lldb_private::Status &error, llvm::raw_ostream &OS, diff --git a/lldb/unittests/TestingSupport/Host/SocketTestUtilities.cpp b/lldb/unittests/TestingSupport/Host/SocketTestUtilities.cpp index 2455a4f6f5d490..86fb5b05ea8009 100644 --- a/lldb/unittests/TestingSupport/Host/SocketTestUtilities.cpp +++ b/lldb/unittests/TestingSupport/Host/SocketTestUtilities.cpp @@ -102,12 +102,16 @@ static bool CheckIPSupport(llvm::StringRef Proto, llvm::StringRef Addr) { Proto, Err) .str(); bool HasProtocolError = false; - handleAllErrors(std::move(Err), [&](std::unique_ptr ECErr) { - std::error_code ec = ECErr->convertToErrorCode(); - if (ec == std::make_error_code(std::errc::address_family_not_supported) || - ec == std::make_error_code(std::errc::address_not_available)) - HasProtocolError = true; - }); + handleAllErrors( + std::move(Err), + [&](std::unique_ptr ECErr) { + std::error_code ec = ECErr->convertToErrorCode(); + if (ec == + std::make_error_code(std::errc::address_family_not_supported) || + ec == std::make_error_code(std::errc::address_not_available)) + HasProtocolError = true; + }, + [](const llvm::ErrorInfoBase &) {}); if (HasProtocolError) { GTEST_LOG_(WARNING) << llvm::formatv( diff --git a/lldb/unittests/Utility/StatusTest.cpp b/lldb/unittests/Utility/StatusTest.cpp index be4f2beebcdb52..e37c94ac17f2d0 100644 --- a/lldb/unittests/Utility/StatusTest.cpp +++ b/lldb/unittests/Utility/StatusTest.cpp @@ -70,6 +70,14 @@ TEST(StatusTest, ErrorConversion) { llvm::Error foo = Status::FromErrorString("foo").ToError(); EXPECT_TRUE(bool(foo)); EXPECT_EQ("foo", llvm::toString(std::move(foo))); + + llvm::Error eperm = llvm::errorCodeToError({EPERM, std::generic_category()}); + llvm::Error eintr = llvm::errorCodeToError({EINTR, std::generic_category()}); + llvm::Error elist = llvm::joinErrors(std::move(eperm), std::move(eintr)); + elist = llvm::joinErrors(std::move(elist), llvm::createStringError("foo")); + Status list = Status::FromError(std::move(elist)); + EXPECT_EQ((int)list.GetError(), EPERM); + EXPECT_EQ(list.GetType(), eErrorTypePOSIX); } #ifdef _WIN32 From b4a8e877ee3002a8cfd613f7950afcbe1d98821c Mon Sep 17 00:00:00 2001 From: Adrian Prantl Date: Wed, 18 Sep 2024 15:03:41 -0700 Subject: [PATCH 118/321] Add noexcept qualifier to placate g++ --- lldb/source/Utility/Status.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/lldb/source/Utility/Status.cpp b/lldb/source/Utility/Status.cpp index faa8d3a83c7ed1..f557cb540b5655 100644 --- a/lldb/source/Utility/Status.cpp +++ b/lldb/source/Utility/Status.cpp @@ -48,7 +48,7 @@ char ExpressionError::ID; namespace { /// A std::error_code category for eErrorTypeGeneric. class LLDBGenericCategory : public std::error_category { - const char *name() const override { return "LLDBGenericCategory"; } + const char *name() const noexcept override { return "LLDBGenericCategory"; } std::string message(int __ev) const override { return "generic LLDB error"; }; }; LLDBGenericCategory &lldb_generic_category() { @@ -58,7 +58,9 @@ LLDBGenericCategory &lldb_generic_category() { /// A std::error_code category for eErrorTypeExpression. 
 class ExpressionCategory : public std::error_category {
-  const char *name() const override { return "LLDBExpressionCategory"; }
+  const char *name() const noexcept override {
+    return "LLDBExpressionCategory";
+  }
   std::string message(int __ev) const override {
     return ExpressionResultAsCString(
         static_cast<lldb::ExpressionResults>(__ev));

From 775de20c3a0a149158cdafce66ef29510a436f1f Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Wed, 18 Sep 2024 15:35:05 -0700
Subject: [PATCH 119/321] [RISCV][GISel] Support unaligned-scalar-mem.
 (#108905)

We need to set the required alignment to 8 with unaligned-scalar-mem.
If we don't do this, the legalizer will try to lower the unaligned
load/store, and the lowering code will call allowsMemoryAccess to verify
what it's supposed to do. allowsMemoryAccess will say the unaligned
access is allowed, so the legalizer gives up.

---
 .../Target/RISCV/GISel/RISCVLegalizerInfo.cpp |  62 ++++++----
 .../legalizer/legalize-load-rv32.mir          |  84 ++++++++++++-
 .../legalizer/legalize-load-rv64.mir          | 114 +++++++++++++++++-
 .../legalizer/legalize-store-rv32.mir         |  72 ++++++++++-
 .../legalizer/legalize-store-rv64.mir         |  90 +++++++++++++-
 5 files changed, 394 insertions(+), 28 deletions(-)

diff --git a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
index 192ba375d5a5d9..055193bcc2c8db 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
+++ b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
@@ -287,34 +287,48 @@ RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST)
 
   auto &LoadActions = getActionDefinitionsBuilder(G_LOAD);
   auto &StoreActions = getActionDefinitionsBuilder(G_STORE);
+  auto &ExtLoadActions = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD});
 
-  LoadActions
-      .legalForTypesWithMemDesc({{s32, p0, s8, 8},
-                                 {s32, p0, s16, 16},
-                                 {s32, p0, s32, 32},
-                                 {p0, p0, sXLen, XLen}});
-  StoreActions
-      .legalForTypesWithMemDesc({{s32, p0, s8, 8},
-                                 {s32, p0, s16, 16},
-                                 {s32, p0, s32, 32},
-                                 {p0, p0, sXLen, XLen}});
-  auto &ExtLoadActions =
-      getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
-          .legalForTypesWithMemDesc({{s32, p0, s8, 8}, {s32, p0, s16, 16}});
+  // Return the alignment needed for scalar memory ops. If unaligned scalar mem
+  // is supported, we only require byte alignment. Otherwise, we need the memory
+  // op to be natively aligned.
+  auto getScalarMemAlign = [&ST](unsigned Size) {
+    return ST.enableUnalignedScalarMem() ?
8 : Size; + }; + + LoadActions.legalForTypesWithMemDesc( + {{s32, p0, s8, getScalarMemAlign(8)}, + {s32, p0, s16, getScalarMemAlign(16)}, + {s32, p0, s32, getScalarMemAlign(32)}, + {p0, p0, sXLen, getScalarMemAlign(XLen)}}); + StoreActions.legalForTypesWithMemDesc( + {{s32, p0, s8, getScalarMemAlign(8)}, + {s32, p0, s16, getScalarMemAlign(16)}, + {s32, p0, s32, getScalarMemAlign(32)}, + {p0, p0, sXLen, getScalarMemAlign(XLen)}}); + ExtLoadActions.legalForTypesWithMemDesc( + {{s32, p0, s8, getScalarMemAlign(8)}, + {s32, p0, s16, getScalarMemAlign(16)}}); if (XLen == 64) { - LoadActions.legalForTypesWithMemDesc({{s64, p0, s8, 8}, - {s64, p0, s16, 16}, - {s64, p0, s32, 32}, - {s64, p0, s64, 64}}); - StoreActions.legalForTypesWithMemDesc({{s64, p0, s8, 8}, - {s64, p0, s16, 16}, - {s64, p0, s32, 32}, - {s64, p0, s64, 64}}); + LoadActions.legalForTypesWithMemDesc( + {{s64, p0, s8, getScalarMemAlign(8)}, + {s64, p0, s16, getScalarMemAlign(16)}, + {s64, p0, s32, getScalarMemAlign(32)}, + {s64, p0, s64, getScalarMemAlign(64)}}); + StoreActions.legalForTypesWithMemDesc( + {{s64, p0, s8, getScalarMemAlign(8)}, + {s64, p0, s16, getScalarMemAlign(16)}, + {s64, p0, s32, getScalarMemAlign(32)}, + {s64, p0, s64, getScalarMemAlign(64)}}); ExtLoadActions.legalForTypesWithMemDesc( - {{s64, p0, s8, 8}, {s64, p0, s16, 16}, {s64, p0, s32, 32}}); + {{s64, p0, s8, getScalarMemAlign(8)}, + {s64, p0, s16, getScalarMemAlign(16)}, + {s64, p0, s32, getScalarMemAlign(32)}}); } else if (ST.hasStdExtD()) { - LoadActions.legalForTypesWithMemDesc({{s64, p0, s64, 64}}); - StoreActions.legalForTypesWithMemDesc({{s64, p0, s64, 64}}); + LoadActions.legalForTypesWithMemDesc( + {{s64, p0, s64, getScalarMemAlign(64)}}); + StoreActions.legalForTypesWithMemDesc( + {{s64, p0, s64, getScalarMemAlign(64)}}); } // Vector loads/stores. 
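
The essence of the hunk above is that scalar load/store legality is now a function of the subtarget: with +unaligned-scalar-mem, any byte-aligned access is declared legal and the legalizer leaves it alone; without the feature, natural alignment is still required and misaligned accesses get broken apart. Below is a minimal standalone sketch of that predicate; the Subtarget type and the driver are illustrative stand-ins, not LLVM API.

// Sketch: legality alignment keyed on a subtarget feature (illustrative).
#include <cstdio>

struct Subtarget {
  bool UnalignedScalarMem = false; // models +unaligned-scalar-mem

  // With the feature enabled, byte alignment (8 bits) is enough for any
  // scalar memory op; otherwise the op must be aligned to its own size.
  unsigned getScalarMemAlign(unsigned SizeInBits) const {
    return UnalignedScalarMem ? 8 : SizeInBits;
  }
};

int main() {
  Subtarget Fast{true}, Slow{false};
  // A 32-bit load: legal at byte alignment only when the feature is on.
  std::printf("s32 load needs %u-bit alignment with the feature, %u without\n",
              Fast.getScalarMemAlign(32), Slow.getScalarMemAlign(32));
  return 0;
}

The same predicate feeds every legalForTypesWithMemDesc entry, which is why the patch threads getScalarMemAlign through the load, store, and extending-load rules rather than special-casing one of them.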
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-load-rv32.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-load-rv32.mir index f925d245150864..bed44eb657da91 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-load-rv32.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-load-rv32.mir @@ -1,6 +1,8 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=riscv32 -run-pass=legalizer %s -o - \ -# RUN: | FileCheck %s +# RUN: | FileCheck %s +# RUN: llc -mtriple=riscv32 -mattr=+unaligned-scalar-mem -run-pass=legalizer %s -o - \ +# RUN: | FileCheck %s --check-prefix=UNALIGNED --- name: load_i8 @@ -26,6 +28,14 @@ body: | ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s8)) ; CHECK-NEXT: $x10 = COPY [[LOAD]](s32) ; CHECK-NEXT: PseudoRET implicit $x10 + ; + ; UNALIGNED-LABEL: name: load_i8 + ; UNALIGNED: liveins: $x10 + ; UNALIGNED-NEXT: {{ $}} + ; UNALIGNED-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10 + ; UNALIGNED-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s8)) + ; UNALIGNED-NEXT: $x10 = COPY [[LOAD]](s32) + ; UNALIGNED-NEXT: PseudoRET implicit $x10 %0:_(p0) = COPY $x10 %1:_(s8) = G_LOAD %0(p0) :: (load (s8)) %2:_(s32) = G_ANYEXT %1(s8) @@ -57,6 +67,14 @@ body: | ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16)) ; CHECK-NEXT: $x10 = COPY [[LOAD]](s32) ; CHECK-NEXT: PseudoRET implicit $x10 + ; + ; UNALIGNED-LABEL: name: load_i16 + ; UNALIGNED: liveins: $x10 + ; UNALIGNED-NEXT: {{ $}} + ; UNALIGNED-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10 + ; UNALIGNED-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16)) + ; UNALIGNED-NEXT: $x10 = COPY [[LOAD]](s32) + ; UNALIGNED-NEXT: PseudoRET implicit $x10 %0:_(p0) = COPY $x10 %1:_(s16) = G_LOAD %0(p0) :: (load (s16)) %2:_(s32) = G_ANYEXT %1(s16) @@ -87,6 +105,14 @@ body: | ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32)) ; CHECK-NEXT: $x10 = COPY [[LOAD]](s32) ; CHECK-NEXT: PseudoRET implicit $x10 + ; + ; UNALIGNED-LABEL: name: load_i32 + ; UNALIGNED: liveins: $x10 + ; UNALIGNED-NEXT: {{ $}} + ; UNALIGNED-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10 + ; UNALIGNED-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32)) + ; UNALIGNED-NEXT: $x10 = COPY [[LOAD]](s32) + ; UNALIGNED-NEXT: PseudoRET implicit $x10 %0:_(p0) = COPY $x10 %1:_(s32) = G_LOAD %0(p0) :: (load (s32)) $x10 = COPY %1(s32) @@ -122,6 +148,18 @@ body: | ; CHECK-NEXT: $x10 = COPY [[LOAD]](s32) ; CHECK-NEXT: $x11 = COPY [[LOAD1]](s32) ; CHECK-NEXT: PseudoRET implicit $x10, implicit $x11 + ; + ; UNALIGNED-LABEL: name: load_i64 + ; UNALIGNED: liveins: $x10 + ; UNALIGNED-NEXT: {{ $}} + ; UNALIGNED-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10 + ; UNALIGNED-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 8) + ; UNALIGNED-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s32) from unknown-address + 4) + ; UNALIGNED-NEXT: $x10 = COPY [[LOAD]](s32) + ; UNALIGNED-NEXT: $x11 = COPY [[LOAD1]](s32) + ; UNALIGNED-NEXT: PseudoRET implicit $x10, implicit $x11 %0:_(p0) = COPY $x10 %1:_(s64) = G_LOAD %0(p0) :: (load (s64)) %2:_(s32), %3:_(s32) = G_UNMERGE_VALUES %1(s64) @@ -153,6 +191,14 @@ body: | ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(p0) = G_LOAD [[COPY]](p0) :: (load (p0), align 8) ; CHECK-NEXT: $x10 = COPY [[LOAD]](p0) ; CHECK-NEXT: 
PseudoRET implicit $x10 + ; + ; UNALIGNED-LABEL: name: load_ptr + ; UNALIGNED: liveins: $x10 + ; UNALIGNED-NEXT: {{ $}} + ; UNALIGNED-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10 + ; UNALIGNED-NEXT: [[LOAD:%[0-9]+]]:_(p0) = G_LOAD [[COPY]](p0) :: (load (p0), align 8) + ; UNALIGNED-NEXT: $x10 = COPY [[LOAD]](p0) + ; UNALIGNED-NEXT: PseudoRET implicit $x10 %0:_(p0) = COPY $x10 %1:_(p0) = G_LOAD %0(p0) :: (load (p0), align 8) $x10 = COPY %1(p0) @@ -189,6 +235,14 @@ body: | ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] ; CHECK-NEXT: $x10 = COPY [[OR]](s32) ; CHECK-NEXT: PseudoRET implicit $x10 + ; + ; UNALIGNED-LABEL: name: load_i16_unaligned + ; UNALIGNED: liveins: $x10 + ; UNALIGNED-NEXT: {{ $}} + ; UNALIGNED-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10 + ; UNALIGNED-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 1) + ; UNALIGNED-NEXT: $x10 = COPY [[LOAD]](s32) + ; UNALIGNED-NEXT: PseudoRET implicit $x10 %0:_(p0) = COPY $x10 %1:_(s16) = G_LOAD %0(p0) :: (load (s16), align 1) %2:_(s32) = G_ANYEXT %1(s16) @@ -237,6 +291,14 @@ body: | ; CHECK-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] ; CHECK-NEXT: $x10 = COPY [[OR2]](s32) ; CHECK-NEXT: PseudoRET implicit $x10 + ; + ; UNALIGNED-LABEL: name: load_i32_unaligned + ; UNALIGNED: liveins: $x10 + ; UNALIGNED-NEXT: {{ $}} + ; UNALIGNED-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10 + ; UNALIGNED-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 1) + ; UNALIGNED-NEXT: $x10 = COPY [[LOAD]](s32) + ; UNALIGNED-NEXT: PseudoRET implicit $x10 %0:_(p0) = COPY $x10 %1:_(s32) = G_LOAD %0(p0) :: (load (s32), align 1) $x10 = COPY %1(s32) @@ -272,6 +334,14 @@ body: | ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] ; CHECK-NEXT: $x10 = COPY [[OR]](s32) ; CHECK-NEXT: PseudoRET implicit $x10 + ; + ; UNALIGNED-LABEL: name: load_i32_align2 + ; UNALIGNED: liveins: $x10 + ; UNALIGNED-NEXT: {{ $}} + ; UNALIGNED-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10 + ; UNALIGNED-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 2) + ; UNALIGNED-NEXT: $x10 = COPY [[LOAD]](s32) + ; UNALIGNED-NEXT: PseudoRET implicit $x10 %0:_(p0) = COPY $x10 %1:_(s32) = G_LOAD %0(p0) :: (load (s32), align 2) $x10 = COPY %1(s32) @@ -343,6 +413,18 @@ body: | ; CHECK-NEXT: $x10 = COPY [[OR2]](s32) ; CHECK-NEXT: $x11 = COPY [[OR5]](s32) ; CHECK-NEXT: PseudoRET implicit $x10, implicit $x11 + ; + ; UNALIGNED-LABEL: name: load_i64_unaligned + ; UNALIGNED: liveins: $x10 + ; UNALIGNED-NEXT: {{ $}} + ; UNALIGNED-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10 + ; UNALIGNED-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 1) + ; UNALIGNED-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s32) from unknown-address + 4, align 1) + ; UNALIGNED-NEXT: $x10 = COPY [[LOAD]](s32) + ; UNALIGNED-NEXT: $x11 = COPY [[LOAD1]](s32) + ; UNALIGNED-NEXT: PseudoRET implicit $x10, implicit $x11 %0:_(p0) = COPY $x10 %1:_(s64) = G_LOAD %0(p0) :: (load (s64), align 1) %2:_(s32), %3:_(s32) = G_UNMERGE_VALUES %1(s64) diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-load-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-load-rv64.mir index 933bc589f6018e..491e4a358b1ad6 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-load-rv64.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-load-rv64.mir @@ -1,6 
+1,8 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=riscv64 -run-pass=legalizer %s -o - \ -# RUN: | FileCheck %s +# RUN: | FileCheck %s +# RUN: llc -mtriple=riscv64 -mattr=+unaligned-scalar-mem -run-pass=legalizer %s -o - \ +# RUN: | FileCheck %s --check-prefix=UNALIGNED --- name: load_i8 @@ -27,6 +29,15 @@ body: | ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[LOAD]](s32) ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64) ; CHECK-NEXT: PseudoRET implicit $x10 + ; + ; UNALIGNED-LABEL: name: load_i8 + ; UNALIGNED: liveins: $x10 + ; UNALIGNED-NEXT: {{ $}} + ; UNALIGNED-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10 + ; UNALIGNED-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s8)) + ; UNALIGNED-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[LOAD]](s32) + ; UNALIGNED-NEXT: $x10 = COPY [[ANYEXT]](s64) + ; UNALIGNED-NEXT: PseudoRET implicit $x10 %0:_(p0) = COPY $x10 %1:_(s8) = G_LOAD %0(p0) :: (load (s8)) %2:_(s64) = G_ANYEXT %1(s8) @@ -59,6 +70,15 @@ body: | ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[LOAD]](s32) ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64) ; CHECK-NEXT: PseudoRET implicit $x10 + ; + ; UNALIGNED-LABEL: name: load_i16 + ; UNALIGNED: liveins: $x10 + ; UNALIGNED-NEXT: {{ $}} + ; UNALIGNED-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10 + ; UNALIGNED-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16)) + ; UNALIGNED-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[LOAD]](s32) + ; UNALIGNED-NEXT: $x10 = COPY [[ANYEXT]](s64) + ; UNALIGNED-NEXT: PseudoRET implicit $x10 %0:_(p0) = COPY $x10 %1:_(s16) = G_LOAD %0(p0) :: (load (s16)) %2:_(s64) = G_ANYEXT %1(s16) @@ -91,6 +111,15 @@ body: | ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[LOAD]](s32) ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64) ; CHECK-NEXT: PseudoRET implicit $x10 + ; + ; UNALIGNED-LABEL: name: load_i32 + ; UNALIGNED: liveins: $x10 + ; UNALIGNED-NEXT: {{ $}} + ; UNALIGNED-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10 + ; UNALIGNED-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32)) + ; UNALIGNED-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[LOAD]](s32) + ; UNALIGNED-NEXT: $x10 = COPY [[ANYEXT]](s64) + ; UNALIGNED-NEXT: PseudoRET implicit $x10 %0:_(p0) = COPY $x10 %1:_(s32) = G_LOAD %0(p0) :: (load (s32)) %2:_(s64) = G_ANYEXT %1(s32) @@ -121,6 +150,14 @@ body: | ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p0) :: (load (s64)) ; CHECK-NEXT: $x10 = COPY [[LOAD]](s64) ; CHECK-NEXT: PseudoRET implicit $x10 + ; + ; UNALIGNED-LABEL: name: load_i64 + ; UNALIGNED: liveins: $x10 + ; UNALIGNED-NEXT: {{ $}} + ; UNALIGNED-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10 + ; UNALIGNED-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p0) :: (load (s64)) + ; UNALIGNED-NEXT: $x10 = COPY [[LOAD]](s64) + ; UNALIGNED-NEXT: PseudoRET implicit $x10 %0:_(p0) = COPY $x10 %1:_(s64) = G_LOAD %0(p0) :: (load (s64)) $x10 = COPY %1(s64) @@ -156,6 +193,18 @@ body: | ; CHECK-NEXT: $x10 = COPY [[LOAD]](s64) ; CHECK-NEXT: $x11 = COPY [[LOAD1]](s64) ; CHECK-NEXT: PseudoRET implicit $x10, implicit $x11 + ; + ; UNALIGNED-LABEL: name: load_i128 + ; UNALIGNED: liveins: $x10 + ; UNALIGNED-NEXT: {{ $}} + ; UNALIGNED-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10 + ; UNALIGNED-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p0) :: (load (s64)) + ; UNALIGNED-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; UNALIGNED-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) + ; UNALIGNED-NEXT: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD]](p0) :: (load (s64) from unknown-address + 
8) + ; UNALIGNED-NEXT: $x10 = COPY [[LOAD]](s64) + ; UNALIGNED-NEXT: $x11 = COPY [[LOAD1]](s64) + ; UNALIGNED-NEXT: PseudoRET implicit $x10, implicit $x11 %0:_(p0) = COPY $x10 %1:_(s128) = G_LOAD %0(p0) :: (load (s128), align 8) %2:_(s64), %3:_(s64) = G_UNMERGE_VALUES %1(s128) @@ -187,6 +236,14 @@ body: | ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(p0) = G_LOAD [[COPY]](p0) :: (load (p0)) ; CHECK-NEXT: $x10 = COPY [[LOAD]](p0) ; CHECK-NEXT: PseudoRET implicit $x10 + ; + ; UNALIGNED-LABEL: name: load_ptr + ; UNALIGNED: liveins: $x10 + ; UNALIGNED-NEXT: {{ $}} + ; UNALIGNED-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10 + ; UNALIGNED-NEXT: [[LOAD:%[0-9]+]]:_(p0) = G_LOAD [[COPY]](p0) :: (load (p0)) + ; UNALIGNED-NEXT: $x10 = COPY [[LOAD]](p0) + ; UNALIGNED-NEXT: PseudoRET implicit $x10 %0:_(p0) = COPY $x10 %1:_(p0) = G_LOAD %0(p0) :: (load (p0)) $x10 = COPY %1(p0) @@ -224,6 +281,15 @@ body: | ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR]](s32) ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64) ; CHECK-NEXT: PseudoRET implicit $x10 + ; + ; UNALIGNED-LABEL: name: load_i16_unaligned + ; UNALIGNED: liveins: $x10 + ; UNALIGNED-NEXT: {{ $}} + ; UNALIGNED-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10 + ; UNALIGNED-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 1) + ; UNALIGNED-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[LOAD]](s32) + ; UNALIGNED-NEXT: $x10 = COPY [[ANYEXT]](s64) + ; UNALIGNED-NEXT: PseudoRET implicit $x10 %0:_(p0) = COPY $x10 %1:_(s16) = G_LOAD %0(p0) :: (load (s16), align 1) %2:_(s64) = G_ANYEXT %1(s16) @@ -274,6 +340,15 @@ body: | ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR2]](s32) ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64) ; CHECK-NEXT: PseudoRET implicit $x10 + ; + ; UNALIGNED-LABEL: name: load_i32_unaligned + ; UNALIGNED: liveins: $x10 + ; UNALIGNED-NEXT: {{ $}} + ; UNALIGNED-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10 + ; UNALIGNED-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 1) + ; UNALIGNED-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[LOAD]](s32) + ; UNALIGNED-NEXT: $x10 = COPY [[ANYEXT]](s64) + ; UNALIGNED-NEXT: PseudoRET implicit $x10 %0:_(p0) = COPY $x10 %1:_(s32) = G_LOAD %0(p0) :: (load (s32), align 1) %2:_(s64) = G_ANYEXT %1(s32) @@ -312,6 +387,15 @@ body: | ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR]](s32) ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64) ; CHECK-NEXT: PseudoRET implicit $x10 + ; + ; UNALIGNED-LABEL: name: load_i32_align2 + ; UNALIGNED: liveins: $x10 + ; UNALIGNED-NEXT: {{ $}} + ; UNALIGNED-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10 + ; UNALIGNED-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 2) + ; UNALIGNED-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[LOAD]](s32) + ; UNALIGNED-NEXT: $x10 = COPY [[ANYEXT]](s64) + ; UNALIGNED-NEXT: PseudoRET implicit $x10 %0:_(p0) = COPY $x10 %1:_(s32) = G_LOAD %0(p0) :: (load (s32), align 2) %2:_(s64) = G_ANYEXT %1(s32) @@ -384,6 +468,14 @@ body: | ; CHECK-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[OR2]] ; CHECK-NEXT: $x10 = COPY [[OR6]](s64) ; CHECK-NEXT: PseudoRET implicit $x10 + ; + ; UNALIGNED-LABEL: name: load_i64_unaligned + ; UNALIGNED: liveins: $x10 + ; UNALIGNED-NEXT: {{ $}} + ; UNALIGNED-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10 + ; UNALIGNED-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p0) :: (load (s64), align 1) + ; UNALIGNED-NEXT: $x10 = COPY [[LOAD]](s64) + ; UNALIGNED-NEXT: PseudoRET implicit $x10 %0:_(p0) = COPY $x10 %1:_(s64) = G_LOAD %0(p0) :: (load (s64), align 1) $x10 = COPY %1(s64) @@ -431,6 +523,14 @@ body: | ; 
CHECK-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[OR]] ; CHECK-NEXT: $x10 = COPY [[OR2]](s64) ; CHECK-NEXT: PseudoRET implicit $x10 + ; + ; UNALIGNED-LABEL: name: load_i64_align2 + ; UNALIGNED: liveins: $x10 + ; UNALIGNED-NEXT: {{ $}} + ; UNALIGNED-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10 + ; UNALIGNED-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p0) :: (load (s64), align 2) + ; UNALIGNED-NEXT: $x10 = COPY [[LOAD]](s64) + ; UNALIGNED-NEXT: PseudoRET implicit $x10 %0:_(p0) = COPY $x10 %1:_(s64) = G_LOAD %0(p0) :: (load (s64), align 2) $x10 = COPY %1(s64) @@ -550,6 +650,18 @@ body: | ; CHECK-NEXT: $x10 = COPY [[OR6]](s64) ; CHECK-NEXT: $x11 = COPY [[OR13]](s64) ; CHECK-NEXT: PseudoRET implicit $x10, implicit $x11 + ; + ; UNALIGNED-LABEL: name: load_i128_unaligned + ; UNALIGNED: liveins: $x10 + ; UNALIGNED-NEXT: {{ $}} + ; UNALIGNED-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10 + ; UNALIGNED-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p0) :: (load (s64), align 1) + ; UNALIGNED-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; UNALIGNED-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) + ; UNALIGNED-NEXT: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD]](p0) :: (load (s64) from unknown-address + 8, align 1) + ; UNALIGNED-NEXT: $x10 = COPY [[LOAD]](s64) + ; UNALIGNED-NEXT: $x11 = COPY [[LOAD1]](s64) + ; UNALIGNED-NEXT: PseudoRET implicit $x10, implicit $x11 %0:_(p0) = COPY $x10 %1:_(s128) = G_LOAD %0(p0) :: (load (s128), align 1) %2:_(s64), %3:_(s64) = G_UNMERGE_VALUES %1(s128) diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-store-rv32.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-store-rv32.mir index 2ece5a8c9d4142..791bdb30c490f9 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-store-rv32.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-store-rv32.mir @@ -1,6 +1,8 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=riscv32 -run-pass=legalizer %s -o - \ -# RUN: | FileCheck %s +# RUN: | FileCheck %s +# RUN: llc -mtriple=riscv32 -mattr=+unaligned-scalar-mem -run-pass=legalizer %s -o - \ +# RUN: | FileCheck %s --check-prefix=UNALIGNED --- name: store_i8 @@ -26,6 +28,14 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $x11 ; CHECK-NEXT: G_STORE [[COPY]](s32), [[COPY1]](p0) :: (store (s8)) ; CHECK-NEXT: PseudoRET + ; + ; UNALIGNED-LABEL: name: store_i8 + ; UNALIGNED: liveins: $x10, $x11 + ; UNALIGNED-NEXT: {{ $}} + ; UNALIGNED-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10 + ; UNALIGNED-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $x11 + ; UNALIGNED-NEXT: G_STORE [[COPY]](s32), [[COPY1]](p0) :: (store (s8)) + ; UNALIGNED-NEXT: PseudoRET %2:_(s32) = COPY $x10 %0:_(s8) = G_TRUNC %2(s32) %1:_(p0) = COPY $x11 @@ -57,6 +67,14 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $x11 ; CHECK-NEXT: G_STORE [[COPY]](s32), [[COPY1]](p0) :: (store (s16)) ; CHECK-NEXT: PseudoRET + ; + ; UNALIGNED-LABEL: name: store_i16 + ; UNALIGNED: liveins: $x10, $x11 + ; UNALIGNED-NEXT: {{ $}} + ; UNALIGNED-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10 + ; UNALIGNED-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $x11 + ; UNALIGNED-NEXT: G_STORE [[COPY]](s32), [[COPY1]](p0) :: (store (s16)) + ; UNALIGNED-NEXT: PseudoRET %2:_(s32) = COPY $x10 %0:_(s16) = G_TRUNC %2(s32) %1:_(p0) = COPY $x11 @@ -87,6 +105,14 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $x11 ; CHECK-NEXT: G_STORE [[COPY]](s32), [[COPY1]](p0) :: (store (s32)) ; CHECK-NEXT: PseudoRET + ; + ; UNALIGNED-LABEL: name: store_i32 + ; UNALIGNED: 
liveins: $x10, $x11 + ; UNALIGNED-NEXT: {{ $}} + ; UNALIGNED-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10 + ; UNALIGNED-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $x11 + ; UNALIGNED-NEXT: G_STORE [[COPY]](s32), [[COPY1]](p0) :: (store (s32)) + ; UNALIGNED-NEXT: PseudoRET %0:_(s32) = COPY $x10 %1:_(p0) = COPY $x11 G_STORE %0(s32), %1(p0) :: (store (s32)) @@ -122,6 +148,18 @@ body: | ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY2]], [[C]](s32) ; CHECK-NEXT: G_STORE [[COPY1]](s32), [[PTR_ADD]](p0) :: (store (s32) into unknown-address + 4) ; CHECK-NEXT: PseudoRET + ; + ; UNALIGNED-LABEL: name: store_i64 + ; UNALIGNED: liveins: $x10, $x11, $x12 + ; UNALIGNED-NEXT: {{ $}} + ; UNALIGNED-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10 + ; UNALIGNED-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11 + ; UNALIGNED-NEXT: [[COPY2:%[0-9]+]]:_(p0) = COPY $x12 + ; UNALIGNED-NEXT: G_STORE [[COPY]](s32), [[COPY2]](p0) :: (store (s32), align 8) + ; UNALIGNED-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY2]], [[C]](s32) + ; UNALIGNED-NEXT: G_STORE [[COPY1]](s32), [[PTR_ADD]](p0) :: (store (s32) into unknown-address + 4) + ; UNALIGNED-NEXT: PseudoRET %2:_(s32) = COPY $x10 %3:_(s32) = COPY $x11 %0:_(s64) = G_MERGE_VALUES %2(s32), %3(s32) @@ -153,6 +191,14 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $x11 ; CHECK-NEXT: G_STORE [[COPY]](p0), [[COPY1]](p0) :: (store (p0), align 8) ; CHECK-NEXT: PseudoRET + ; + ; UNALIGNED-LABEL: name: store_ptr + ; UNALIGNED: liveins: $x10, $x11 + ; UNALIGNED-NEXT: {{ $}} + ; UNALIGNED-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10 + ; UNALIGNED-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $x11 + ; UNALIGNED-NEXT: G_STORE [[COPY]](p0), [[COPY1]](p0) :: (store (p0), align 8) + ; UNALIGNED-NEXT: PseudoRET %0:_(p0) = COPY $x10 %1:_(p0) = COPY $x11 G_STORE %0(p0), %1(p0) :: (store (p0), align 8) @@ -190,6 +236,14 @@ body: | ; CHECK-NEXT: G_STORE [[COPY]](s32), [[COPY1]](p0) :: (store (s8)) ; CHECK-NEXT: G_STORE [[LSHR]](s32), [[PTR_ADD]](p0) :: (store (s8) into unknown-address + 1) ; CHECK-NEXT: PseudoRET + ; + ; UNALIGNED-LABEL: name: store_i16_unaligned + ; UNALIGNED: liveins: $x10, $x11 + ; UNALIGNED-NEXT: {{ $}} + ; UNALIGNED-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10 + ; UNALIGNED-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $x11 + ; UNALIGNED-NEXT: G_STORE [[COPY]](s32), [[COPY1]](p0) :: (store (s16), align 1) + ; UNALIGNED-NEXT: PseudoRET %2:_(s32) = COPY $x10 %0:_(s16) = G_TRUNC %2(s32) %1:_(p0) = COPY $x11 @@ -238,6 +292,14 @@ body: | ; CHECK-NEXT: G_STORE [[LSHR]](s32), [[PTR_ADD]](p0) :: (store (s8) into unknown-address + 2) ; CHECK-NEXT: G_STORE [[LSHR2]](s32), [[PTR_ADD2]](p0) :: (store (s8) into unknown-address + 3) ; CHECK-NEXT: PseudoRET + ; + ; UNALIGNED-LABEL: name: store_i32_unaligned + ; UNALIGNED: liveins: $x10, $x11 + ; UNALIGNED-NEXT: {{ $}} + ; UNALIGNED-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10 + ; UNALIGNED-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $x11 + ; UNALIGNED-NEXT: G_STORE [[COPY]](s32), [[COPY1]](p0) :: (store (s32), align 1) + ; UNALIGNED-NEXT: PseudoRET %0:_(s32) = COPY $x10 %1:_(p0) = COPY $x11 G_STORE %0(s32), %1(p0) :: (store (s32), align 1) @@ -273,6 +335,14 @@ body: | ; CHECK-NEXT: G_STORE [[COPY2]](s32), [[COPY1]](p0) :: (store (s16)) ; CHECK-NEXT: G_STORE [[LSHR]](s32), [[PTR_ADD]](p0) :: (store (s16) into unknown-address + 2) ; CHECK-NEXT: PseudoRET + ; + ; UNALIGNED-LABEL: name: store_i32_align2 + ; UNALIGNED: liveins: $x10, $x11 + ; UNALIGNED-NEXT: {{ $}} + ; UNALIGNED-NEXT: [[COPY:%[0-9]+]]:_(s32) = 
COPY $x10 + ; UNALIGNED-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $x11 + ; UNALIGNED-NEXT: G_STORE [[COPY]](s32), [[COPY1]](p0) :: (store (s32), align 2) + ; UNALIGNED-NEXT: PseudoRET %0:_(s32) = COPY $x10 %1:_(p0) = COPY $x11 G_STORE %0(s32), %1(p0) :: (store (s32), align 2) diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-store-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-store-rv64.mir index 85055561c4f927..860bc932d8560b 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-store-rv64.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-store-rv64.mir @@ -1,6 +1,8 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=riscv64 -run-pass=legalizer %s -o - \ -# RUN: | FileCheck %s +# RUN: | FileCheck %s +# RUN: llc -mtriple=riscv64 -mattr=+unaligned-scalar-mem -run-pass=legalizer %s -o - \ +# RUN: | FileCheck %s --check-prefix=UNALIGNED --- name: store_i8 @@ -27,6 +29,15 @@ body: | ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) ; CHECK-NEXT: G_STORE [[TRUNC]](s32), [[COPY1]](p0) :: (store (s8)) ; CHECK-NEXT: PseudoRET + ; + ; UNALIGNED-LABEL: name: store_i8 + ; UNALIGNED: liveins: $x10, $x11 + ; UNALIGNED-NEXT: {{ $}} + ; UNALIGNED-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 + ; UNALIGNED-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $x11 + ; UNALIGNED-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) + ; UNALIGNED-NEXT: G_STORE [[TRUNC]](s32), [[COPY1]](p0) :: (store (s8)) + ; UNALIGNED-NEXT: PseudoRET %2:_(s64) = COPY $x10 %0:_(s8) = G_TRUNC %2(s64) %1:_(p0) = COPY $x11 @@ -59,6 +70,15 @@ body: | ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) ; CHECK-NEXT: G_STORE [[TRUNC]](s32), [[COPY1]](p0) :: (store (s16)) ; CHECK-NEXT: PseudoRET + ; + ; UNALIGNED-LABEL: name: store_i16 + ; UNALIGNED: liveins: $x10, $x11 + ; UNALIGNED-NEXT: {{ $}} + ; UNALIGNED-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 + ; UNALIGNED-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $x11 + ; UNALIGNED-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) + ; UNALIGNED-NEXT: G_STORE [[TRUNC]](s32), [[COPY1]](p0) :: (store (s16)) + ; UNALIGNED-NEXT: PseudoRET %2:_(s64) = COPY $x10 %0:_(s16) = G_TRUNC %2(s64) %1:_(p0) = COPY $x11 @@ -91,6 +111,15 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $x11 ; CHECK-NEXT: G_STORE [[TRUNC]](s32), [[COPY1]](p0) :: (store (s32)) ; CHECK-NEXT: PseudoRET + ; + ; UNALIGNED-LABEL: name: store_i32 + ; UNALIGNED: liveins: $x10, $x11 + ; UNALIGNED-NEXT: {{ $}} + ; UNALIGNED-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 + ; UNALIGNED-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) + ; UNALIGNED-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $x11 + ; UNALIGNED-NEXT: G_STORE [[TRUNC]](s32), [[COPY1]](p0) :: (store (s32)) + ; UNALIGNED-NEXT: PseudoRET %2:_(s64) = COPY $x10 %0:_(s32) = G_TRUNC %2(s64) %1:_(p0) = COPY $x11 @@ -121,6 +150,14 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $x11 ; CHECK-NEXT: G_STORE [[COPY]](s64), [[COPY1]](p0) :: (store (s64)) ; CHECK-NEXT: PseudoRET + ; + ; UNALIGNED-LABEL: name: store_i64 + ; UNALIGNED: liveins: $x10, $x11 + ; UNALIGNED-NEXT: {{ $}} + ; UNALIGNED-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 + ; UNALIGNED-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $x11 + ; UNALIGNED-NEXT: G_STORE [[COPY]](s64), [[COPY1]](p0) :: (store (s64)) + ; UNALIGNED-NEXT: PseudoRET %0:_(s64) = COPY $x10 %1:_(p0) = COPY $x11 G_STORE %0(s64), %1(p0) :: (store (s64)) @@ -150,6 +187,14 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $x11 ; CHECK-NEXT: 
G_STORE [[COPY]](s64), [[COPY1]](p0) :: (store (s64)) ; CHECK-NEXT: PseudoRET + ; + ; UNALIGNED-LABEL: name: store_i128 + ; UNALIGNED: liveins: $x10, $x11 + ; UNALIGNED-NEXT: {{ $}} + ; UNALIGNED-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 + ; UNALIGNED-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $x11 + ; UNALIGNED-NEXT: G_STORE [[COPY]](s64), [[COPY1]](p0) :: (store (s64)) + ; UNALIGNED-NEXT: PseudoRET %0:_(s64) = COPY $x10 %1:_(p0) = COPY $x11 G_STORE %0(s64), %1(p0) :: (store (s64)) @@ -179,6 +224,14 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $x11 ; CHECK-NEXT: G_STORE [[COPY]](p0), [[COPY1]](p0) :: (store (p0)) ; CHECK-NEXT: PseudoRET + ; + ; UNALIGNED-LABEL: name: store_ptr + ; UNALIGNED: liveins: $x10, $x11 + ; UNALIGNED-NEXT: {{ $}} + ; UNALIGNED-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10 + ; UNALIGNED-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $x11 + ; UNALIGNED-NEXT: G_STORE [[COPY]](p0), [[COPY1]](p0) :: (store (p0)) + ; UNALIGNED-NEXT: PseudoRET %0:_(p0) = COPY $x10 %1:_(p0) = COPY $x11 G_STORE %0(p0), %1(p0) :: (store (p0)) @@ -217,6 +270,15 @@ body: | ; CHECK-NEXT: G_STORE [[TRUNC]](s32), [[COPY1]](p0) :: (store (s8)) ; CHECK-NEXT: G_STORE [[LSHR]](s32), [[PTR_ADD]](p0) :: (store (s8) into unknown-address + 1) ; CHECK-NEXT: PseudoRET + ; + ; UNALIGNED-LABEL: name: store_i16_unaligned + ; UNALIGNED: liveins: $x10, $x11 + ; UNALIGNED-NEXT: {{ $}} + ; UNALIGNED-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 + ; UNALIGNED-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $x11 + ; UNALIGNED-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) + ; UNALIGNED-NEXT: G_STORE [[TRUNC]](s32), [[COPY1]](p0) :: (store (s16), align 1) + ; UNALIGNED-NEXT: PseudoRET %2:_(s64) = COPY $x10 %0:_(s16) = G_TRUNC %2(s64) %1:_(p0) = COPY $x11 @@ -267,6 +329,15 @@ body: | ; CHECK-NEXT: G_STORE [[LSHR]](s32), [[PTR_ADD]](p0) :: (store (s8) into unknown-address + 2) ; CHECK-NEXT: G_STORE [[LSHR2]](s32), [[PTR_ADD2]](p0) :: (store (s8) into unknown-address + 3) ; CHECK-NEXT: PseudoRET + ; + ; UNALIGNED-LABEL: name: store_i32_unaligned + ; UNALIGNED: liveins: $x10, $x11 + ; UNALIGNED-NEXT: {{ $}} + ; UNALIGNED-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 + ; UNALIGNED-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) + ; UNALIGNED-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $x11 + ; UNALIGNED-NEXT: G_STORE [[TRUNC]](s32), [[COPY1]](p0) :: (store (s32), align 1) + ; UNALIGNED-NEXT: PseudoRET %2:_(s64) = COPY $x10 %0:_(s32) = G_TRUNC %2(s64) %1:_(p0) = COPY $x11 @@ -305,6 +376,15 @@ body: | ; CHECK-NEXT: G_STORE [[COPY2]](s32), [[COPY1]](p0) :: (store (s16)) ; CHECK-NEXT: G_STORE [[LSHR]](s32), [[PTR_ADD]](p0) :: (store (s16) into unknown-address + 2) ; CHECK-NEXT: PseudoRET + ; + ; UNALIGNED-LABEL: name: store_i32_align2 + ; UNALIGNED: liveins: $x10, $x11 + ; UNALIGNED-NEXT: {{ $}} + ; UNALIGNED-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 + ; UNALIGNED-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) + ; UNALIGNED-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $x11 + ; UNALIGNED-NEXT: G_STORE [[TRUNC]](s32), [[COPY1]](p0) :: (store (s32), align 2) + ; UNALIGNED-NEXT: PseudoRET %2:_(s64) = COPY $x10 %0:_(s32) = G_TRUNC %2(s64) %1:_(p0) = COPY $x11 @@ -353,6 +433,14 @@ body: | ; CHECK-NEXT: G_STORE [[TRUNC1]](s32), [[PTR_ADD]](p0) :: (store (s16) into unknown-address + 4) ; CHECK-NEXT: G_STORE [[LSHR2]](s32), [[PTR_ADD2]](p0) :: (store (s16) into unknown-address + 6) ; CHECK-NEXT: PseudoRET + ; + ; UNALIGNED-LABEL: name: store_i64_align2 + ; UNALIGNED: liveins: $x10, $x11 + ; UNALIGNED-NEXT: {{ $}} + ; UNALIGNED-NEXT: 
[[COPY:%[0-9]+]]:_(s64) = COPY $x10 + ; UNALIGNED-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $x11 + ; UNALIGNED-NEXT: G_STORE [[COPY]](s64), [[COPY1]](p0) :: (store (s64), align 2) + ; UNALIGNED-NEXT: PseudoRET %0:_(s64) = COPY $x10 %1:_(p0) = COPY $x11 G_STORE %0(s64), %1(p0) :: (store (s64), align 2) From 2383bc8216bd7d373bb48337199c09d40922adf2 Mon Sep 17 00:00:00 2001 From: Adrian Prantl Date: Wed, 18 Sep 2024 15:54:01 -0700 Subject: [PATCH 120/321] [lldb] Update SocketTestUtilities.cpp to use CloneableECError --- lldb/unittests/TestingSupport/Host/SocketTestUtilities.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lldb/unittests/TestingSupport/Host/SocketTestUtilities.cpp b/lldb/unittests/TestingSupport/Host/SocketTestUtilities.cpp index 86fb5b05ea8009..86aed292ec01f8 100644 --- a/lldb/unittests/TestingSupport/Host/SocketTestUtilities.cpp +++ b/lldb/unittests/TestingSupport/Host/SocketTestUtilities.cpp @@ -104,14 +104,13 @@ static bool CheckIPSupport(llvm::StringRef Proto, llvm::StringRef Addr) { bool HasProtocolError = false; handleAllErrors( std::move(Err), - [&](std::unique_ptr ECErr) { + [&](std::unique_ptr ECErr) { std::error_code ec = ECErr->convertToErrorCode(); if (ec == std::make_error_code(std::errc::address_family_not_supported) || ec == std::make_error_code(std::errc::address_not_available)) HasProtocolError = true; - }, - [](const llvm::ErrorInfoBase &) {}); + }); if (HasProtocolError) { GTEST_LOG_(WARNING) << llvm::formatv( From 38752ffd417103621232e6ba6ba70e970e0d6356 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Wed, 18 Sep 2024 16:10:16 -0700 Subject: [PATCH 121/321] [sanitizer] Adjust size for begin/start mismatch (#109079) Follow up to 51d913af827567e6a0999609e7e624a422781870. --- .../lib/sanitizer_common/sanitizer_tls_get_addr.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_tls_get_addr.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_tls_get_addr.cpp index 0db8547268f4e2..6f3b6af3c58474 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_tls_get_addr.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_tls_get_addr.cpp @@ -15,6 +15,7 @@ #include "sanitizer_allocator_interface.h" #include "sanitizer_atomic.h" #include "sanitizer_common/sanitizer_common.h" +#include "sanitizer_common/sanitizer_internal_defs.h" #include "sanitizer_flags.h" #include "sanitizer_platform_interceptors.h" @@ -116,10 +117,14 @@ SANITIZER_INTERFACE_WEAK_DEF(uptr, __sanitizer_get_dtls_size, const void *start = __sanitizer_get_allocated_begin(tls_begin); if (!start) return 0; + CHECK_LE(start, tls_begin); uptr tls_size = __sanitizer_get_allocated_size(start); VReport(2, "__tls_get_addr: glibc DTLS suspected; tls={%p,0x%zx}\n", tls_begin, tls_size); - return tls_size; + uptr offset = + (reinterpret_cast(tls_begin) - reinterpret_cast(start)); + CHECK_LE(offset, tls_size); + return tls_size - offset; } DTLS::DTV *DTLS_on_tls_get_addr(void *arg_void, void *res, From a0bb2e21c10bebcdb6bc6b8bc18f74dcf7c4b8b2 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Wed, 18 Sep 2024 16:19:35 -0700 Subject: [PATCH 122/321] [NFC][sanitizer] Move `InitTlsSize` into `InitializePlatformEarly` (#108921) --- compiler-rt/lib/asan/asan_rtl.cpp | 3 --- compiler-rt/lib/dfsan/dfsan.cpp | 2 ++ compiler-rt/lib/hwasan/hwasan.cpp | 4 ++-- compiler-rt/lib/lsan/lsan.cpp | 2 +- compiler-rt/lib/memprof/memprof_rtl.cpp | 3 --- compiler-rt/lib/msan/msan.cpp | 3 ++- .../lib/sanitizer_common/sanitizer_common_nolibc.cpp | 1 + 
compiler-rt/lib/sanitizer_common/sanitizer_fuchsia.cpp | 1 - compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp | 4 +--- .../lib/sanitizer_common/sanitizer_linux_libcdep.cpp | 7 +++++-- compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp | 3 --- compiler-rt/lib/sanitizer_common/sanitizer_win.cpp | 3 --- compiler-rt/lib/tsan/rtl/tsan_platform_linux.cpp | 1 - compiler-rt/lib/ubsan/ubsan_init.cpp | 2 +- 14 files changed, 15 insertions(+), 24 deletions(-) diff --git a/compiler-rt/lib/asan/asan_rtl.cpp b/compiler-rt/lib/asan/asan_rtl.cpp index a390802af28d09..19c6c210b564c5 100644 --- a/compiler-rt/lib/asan/asan_rtl.cpp +++ b/compiler-rt/lib/asan/asan_rtl.cpp @@ -478,9 +478,6 @@ static bool AsanInitInternal() { if (flags()->start_deactivated) AsanDeactivate(); - // interceptors - InitTlsSize(); - // Create main thread. AsanThread *main_thread = CreateMainThread(); CHECK_EQ(0, main_thread->tid()); diff --git a/compiler-rt/lib/dfsan/dfsan.cpp b/compiler-rt/lib/dfsan/dfsan.cpp index 1972a07d15ac51..886e93e5fa8139 100644 --- a/compiler-rt/lib/dfsan/dfsan.cpp +++ b/compiler-rt/lib/dfsan/dfsan.cpp @@ -1262,6 +1262,8 @@ static void DFsanInit(int argc, char **argv, char **envp) { CheckASLR(); + InitializePlatformEarly(); + if (!InitShadowWithReExec(dfsan_get_track_origins())) { Printf("FATAL: DataflowSanitizer can not mmap the shadow memory.\n"); DumpProcessMap(); diff --git a/compiler-rt/lib/hwasan/hwasan.cpp b/compiler-rt/lib/hwasan/hwasan.cpp index ccdc0b4bc21bd3..24384d8b4d2cf1 100644 --- a/compiler-rt/lib/hwasan/hwasan.cpp +++ b/compiler-rt/lib/hwasan/hwasan.cpp @@ -357,8 +357,6 @@ __attribute__((constructor(0))) void __hwasan_init() { hwasan_init_is_running = 1; SanitizerToolName = "HWAddressSanitizer"; - InitTlsSize(); - CacheBinaryName(); InitializeFlags(); @@ -367,6 +365,8 @@ __attribute__((constructor(0))) void __hwasan_init() { __sanitizer_set_report_path(common_flags()->log_path); + InitializePlatformEarly(); + AndroidTestTlsSlot(); DisableCoreDumperIfNecessary(); diff --git a/compiler-rt/lib/lsan/lsan.cpp b/compiler-rt/lib/lsan/lsan.cpp index 7a27b600f203f7..798294b499e2f0 100644 --- a/compiler-rt/lib/lsan/lsan.cpp +++ b/compiler-rt/lib/lsan/lsan.cpp @@ -92,10 +92,10 @@ extern "C" void __lsan_init() { CacheBinaryName(); AvoidCVE_2016_2143(); InitializeFlags(); + InitializePlatformEarly(); InitCommonLsan(); InitializeAllocator(); ReplaceSystemMalloc(); - InitTlsSize(); InitializeInterceptors(); InitializeThreads(); InstallDeadlySignalHandlers(LsanOnDeadlySignal); diff --git a/compiler-rt/lib/memprof/memprof_rtl.cpp b/compiler-rt/lib/memprof/memprof_rtl.cpp index cf4bde808bfad6..2cc6c2df5a6fe4 100644 --- a/compiler-rt/lib/memprof/memprof_rtl.cpp +++ b/compiler-rt/lib/memprof/memprof_rtl.cpp @@ -213,9 +213,6 @@ static void MemprofInitInternal() { InitializeCoverage(common_flags()->coverage, common_flags()->coverage_dir); - // interceptors - InitTlsSize(); - // Create main thread. MemprofThread *main_thread = CreateMainThread(); CHECK_EQ(0, main_thread->tid()); diff --git a/compiler-rt/lib/msan/msan.cpp b/compiler-rt/lib/msan/msan.cpp index 2ee05f43ec5e56..6c27ab21eeebfd 100644 --- a/compiler-rt/lib/msan/msan.cpp +++ b/compiler-rt/lib/msan/msan.cpp @@ -457,10 +457,11 @@ void __msan_init() { __sanitizer_set_report_path(common_flags()->log_path); + InitializePlatformEarly(); + InitializeInterceptors(); InstallAtForkHandler(); CheckASLR(); - InitTlsSize(); InstallDeadlySignalHandlers(MsanOnDeadlySignal); InstallAtExitHandler(); // Needs __cxa_atexit interceptor. 
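
The recurring edit in this patch is visible above: each runtime (asan, dfsan, hwasan, lsan, memprof, msan) now reaches platform facts through InitializePlatformEarly, which on Linux subsumes InitTlsSize, before interceptors and thread creation run. A hedged sketch of the ordering contract this enforces; the names mirror the sanitizer code, but the bodies are stand-ins.

#include <cassert>
#include <cstddef>

static size_t g_tls_size = 0; // cached once, early

void InitializePlatformEarly() {
  // Stand-in for the real work: querying glibc's _dl_get_tls_static_info
  // via dlsym and caching the static TLS size.
  g_tls_size = 4096;
}

void CreateMainThread() {
  // Later consumers rely on the cached value instead of computing it
  // lazily, which could allocate at an unsafe point.
  assert(g_tls_size != 0 && "InitializePlatformEarly() must run first");
}

int main() {
  InitializePlatformEarly(); // must precede interceptors and threads
  CreateMainThread();
  return 0;
}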
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_common_nolibc.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_common_nolibc.cpp index 7d88575160c6c6..e49285f22dff99 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_common_nolibc.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_common_nolibc.cpp @@ -22,6 +22,7 @@ namespace __sanitizer { #if !SANITIZER_WINDOWS # if SANITIZER_LINUX void LogMessageOnPrintf(const char *str) {} +void InitTlsSize() {} # endif void WriteToSyslog(const char *buffer) {} void Abort() { internal__exit(1); } diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_fuchsia.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_fuchsia.cpp index a67b2a8725eca8..75dcf546729f6e 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_fuchsia.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_fuchsia.cpp @@ -94,7 +94,6 @@ void DisableCoreDumperIfNecessary() {} void InstallDeadlySignalHandlers(SignalHandlerType handler) {} void SetAlternateSignalStack() {} void UnsetAlternateSignalStack() {} -void InitTlsSize() {} bool SignalContext::IsStackOverflow() const { return false; } void SignalContext::DumpAllRegisters(void *context) { UNIMPLEMENTED(); } diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp index 6359f4348e3c48..1c637d109649b6 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp @@ -2672,9 +2672,7 @@ static void GetPcSpBp(void *context, uptr *pc, uptr *sp, uptr *bp) { void SignalContext::InitPcSpBp() { GetPcSpBp(context, &pc, &sp, &bp); } -void InitializePlatformEarly() { - // Do nothing. -} +void InitializePlatformEarly() { InitTlsSize(); } void CheckASLR() { # if SANITIZER_NETBSD diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_linux_libcdep.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_linux_libcdep.cpp index 53add5a9b16423..055d5e9473131c 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_linux_libcdep.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_linux_libcdep.cpp @@ -228,9 +228,12 @@ void InitTlsSize() { # if defined(__aarch64__) || defined(__x86_64__) || \ defined(__powerpc64__) || defined(__loongarch__) - void *get_tls_static_info = dlsym(RTLD_DEFAULT, "_dl_get_tls_static_info"); + auto *get_tls_static_info = (void (*)(size_t *, size_t *))dlsym( + RTLD_DEFAULT, "_dl_get_tls_static_info"); size_t tls_align; - ((void (*)(size_t *, size_t *))get_tls_static_info)(&g_tls_size, &tls_align); + // Can be null if static link. 
+ if (get_tls_static_info) + get_tls_static_info(&g_tls_size, &tls_align); # endif } # else diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp index 2a36104e6f9f29..26d2e8d4ed7680 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp @@ -545,9 +545,6 @@ uptr GetTlsSize() { return 0; } -void InitTlsSize() { -} - uptr TlsBaseAddr() { uptr segbase = 0; #if defined(__x86_64__) diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_win.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_win.cpp index 2c8f8343519ed8..7cee571314868e 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_win.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_win.cpp @@ -873,9 +873,6 @@ uptr GetTlsSize() { return 0; } -void InitTlsSize() { -} - void GetThreadStackAndTls(bool main, uptr *stk_begin, uptr *stk_end, uptr *tls_begin, uptr *tls_end) { # if SANITIZER_GO diff --git a/compiler-rt/lib/tsan/rtl/tsan_platform_linux.cpp b/compiler-rt/lib/tsan/rtl/tsan_platform_linux.cpp index 621c679a05db45..3e08a1bece98f0 100644 --- a/compiler-rt/lib/tsan/rtl/tsan_platform_linux.cpp +++ b/compiler-rt/lib/tsan/rtl/tsan_platform_linux.cpp @@ -418,7 +418,6 @@ void InitializePlatform() { Die(); } - InitTlsSize(); #endif // !SANITIZER_GO } diff --git a/compiler-rt/lib/ubsan/ubsan_init.cpp b/compiler-rt/lib/ubsan/ubsan_init.cpp index 5802d58896f0fe..aea7ca00e3cb3f 100644 --- a/compiler-rt/lib/ubsan/ubsan_init.cpp +++ b/compiler-rt/lib/ubsan/ubsan_init.cpp @@ -43,8 +43,8 @@ static void CommonStandaloneInit() { SanitizerToolName = GetSanititizerToolName(); CacheBinaryName(); InitializeFlags(); - __sanitizer::InitializePlatformEarly(); __sanitizer_set_report_path(common_flags()->log_path); + __sanitizer::InitializePlatformEarly(); AndroidLogInit(); InitializeCoverage(common_flags()->coverage, common_flags()->coverage_dir); CommonInit(); From cf02d8bbb6dfed17081fbdbf44e2071aea1af728 Mon Sep 17 00:00:00 2001 From: Adrian Prantl Date: Wed, 18 Sep 2024 16:23:18 -0700 Subject: [PATCH 123/321] [lldb] Store ECError as CloneableECError in Status --- lldb/source/Utility/Status.cpp | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/lldb/source/Utility/Status.cpp b/lldb/source/Utility/Status.cpp index f557cb540b5655..a659456b9b1b39 100644 --- a/lldb/source/Utility/Status.cpp +++ b/lldb/source/Utility/Status.cpp @@ -144,13 +144,19 @@ static llvm::Error CloneError(const llvm::Error &error) { return llvm::make_error(e.message(), e.convertToErrorCode(), true); }; - visitErrors(error, [&](const llvm::ErrorInfoBase &e) { + llvm::visitErrors(error, [&](const llvm::ErrorInfoBase &e) { result = joinErrors(std::move(result), clone(e)); }); return result; } -Status Status::FromError(llvm::Error error) { return Status(std::move(error)); } +Status Status::FromError(llvm::Error error) { + if (error.isA()) { + std::error_code ec = llvm::errorToErrorCode(std::move(error)); + return Status::FromError(llvm::make_error(ec)); + } + return Status(std::move(error)); +} llvm::Error Status::ToError() const { return CloneError(m_error); } From d1544da533378c1fbb81f08f20270d58d34587ca Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Wed, 18 Sep 2024 16:29:18 -0700 Subject: [PATCH 124/321] [NFC][sanitizer] Move InitTlsSize (#108922) Move after ThreadDescriptorSizeFallback to be able to use it. 
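
Looking back at the Status::FromError change in patch 123: a plain llvm::ECError has a protected constructor and no lldb-side clone support, so storing one in Status would break the copy semantics the class needs; the fix unwraps the error code and rewraps it in CloneableECError on entry. The following is a hedged sketch of that normalization step, shaped after the Status.h declarations reverted later in this series, not a drop-in LLDB implementation.

#include "llvm/Support/Error.h"

namespace sketch {
// Minimal cloneable carrier for a std::error_code.
class CloneableECError : public llvm::ErrorInfo<CloneableECError> {
public:
  static char ID;
  CloneableECError(std::error_code ec) : EC(ec) {}
  std::error_code convertToErrorCode() const override { return EC; }
  void log(llvm::raw_ostream &OS) const override { OS << EC.message(); }

private:
  std::error_code EC;
};
char CloneableECError::ID;

// Normalize incoming errors: unwrap a plain ECError into its error_code
// and rewrap it in the cloneable type before long-term storage.
llvm::Error normalize(llvm::Error error) {
  if (error.isA<llvm::ECError>()) {
    std::error_code ec = llvm::errorToErrorCode(std::move(error));
    return llvm::make_error<CloneableECError>(ec);
  }
  return error;
}
} // namespace sketch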
--- .../sanitizer_linux_libcdep.cpp | 48 ++++++++++--------- 1 file changed, 25 insertions(+), 23 deletions(-) diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_linux_libcdep.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_linux_libcdep.cpp index 055d5e9473131c..6b43fea507a401 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_linux_libcdep.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_linux_libcdep.cpp @@ -209,7 +209,6 @@ bool SetEnv(const char *name, const char *value) { __attribute__((unused)) static int g_use_dlpi_tls_data; # if SANITIZER_GLIBC && !SANITIZER_GO - static void GetGLibcVersion(int *major, int *minor, int *patch) { const char *p = gnu_get_libc_version(); *major = internal_simple_strtoll(p, &p, 10); @@ -218,26 +217,6 @@ static void GetGLibcVersion(int *major, int *minor, int *patch) { *minor = (*p == '.') ? internal_simple_strtoll(p + 1, &p, 10) : 0; *patch = (*p == '.') ? internal_simple_strtoll(p + 1, &p, 10) : 0; } - -__attribute__((unused)) static size_t g_tls_size; - -void InitTlsSize() { - int major, minor, patch; - GetGLibcVersion(&major, &minor, &patch); - g_use_dlpi_tls_data = major == 2 && minor >= 25; - -# if defined(__aarch64__) || defined(__x86_64__) || \ - defined(__powerpc64__) || defined(__loongarch__) - auto *get_tls_static_info = (void (*)(size_t *, size_t *))dlsym( - RTLD_DEFAULT, "_dl_get_tls_static_info"); - size_t tls_align; - // Can be null if static link. - if (get_tls_static_info) - get_tls_static_info(&g_tls_size, &tls_align); -# endif -} -# else -void InitTlsSize() {} # endif // SANITIZER_GLIBC && !SANITIZER_GO // On glibc x86_64, ThreadDescriptorSize() needs to be precise due to the usage @@ -341,6 +320,28 @@ uptr ThreadDescriptorSize() { return val; } +# if SANITIZER_GLIBC +__attribute__((unused)) static size_t g_tls_size; +# endif + +void InitTlsSize() { +# if SANITIZER_GLIBC + int major, minor, patch; + GetGLibcVersion(&major, &minor, &patch); + g_use_dlpi_tls_data = major == 2 && minor >= 25; + +# if defined(__aarch64__) || defined(__x86_64__) || \ + defined(__powerpc64__) || defined(__loongarch__) + auto *get_tls_static_info = (void (*)(size_t *, size_t *))dlsym( + RTLD_DEFAULT, "_dl_get_tls_static_info"); + size_t tls_align; + // Can be null if static link. 
+ if (get_tls_static_info) + get_tls_static_info(&g_tls_size, &tls_align); +# endif +# endif // SANITIZER_GLIBC +} + # if defined(__mips__) || defined(__powerpc64__) || SANITIZER_RISCV64 || \ SANITIZER_LOONGARCH64 // TlsPreTcbSize includes size of struct pthread_descr and size of tcb @@ -361,8 +362,9 @@ static uptr TlsPreTcbSize() { return kTlsPreTcbSize; } # endif - -# endif +# else // (SANITIZER_FREEBSD || SANITIZER_GLIBC) && !SANITIZER_GO +void InitTlsSize() {} +# endif // (SANITIZER_FREEBSD || SANITIZER_GLIBC) && !SANITIZER_GO # if (SANITIZER_FREEBSD || SANITIZER_LINUX || SANITIZER_SOLARIS) && \ !SANITIZER_ANDROID && !SANITIZER_GO From c86b1b0f44509585390c8df09b41d707e6a14011 Mon Sep 17 00:00:00 2001 From: Adrian Prantl Date: Wed, 18 Sep 2024 16:39:04 -0700 Subject: [PATCH 125/321] [lldb] Add handling of ECError to unit test to validate a hypothesis on the bots --- .../unittests/TestingSupport/Host/SocketTestUtilities.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/lldb/unittests/TestingSupport/Host/SocketTestUtilities.cpp b/lldb/unittests/TestingSupport/Host/SocketTestUtilities.cpp index 86aed292ec01f8..9777555d57ffff 100644 --- a/lldb/unittests/TestingSupport/Host/SocketTestUtilities.cpp +++ b/lldb/unittests/TestingSupport/Host/SocketTestUtilities.cpp @@ -110,6 +110,14 @@ static bool CheckIPSupport(llvm::StringRef Proto, llvm::StringRef Addr) { std::make_error_code(std::errc::address_family_not_supported) || ec == std::make_error_code(std::errc::address_not_available)) HasProtocolError = true; + }, + [&](std::unique_ptr ECErr) { + // FIXME: This code path should not be reachable. + std::error_code ec = ECErr->convertToErrorCode(); + if (ec == + std::make_error_code(std::errc::address_family_not_supported) || + ec == std::make_error_code(std::errc::address_not_available)) + HasProtocolError = true; }); if (HasProtocolError) { GTEST_LOG_(WARNING) From 999313debe8a87760b128e4469f17ec0ce1a4a8f Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Wed, 18 Sep 2024 16:45:43 -0700 Subject: [PATCH 126/321] [sanitizer] Switch from lazy `ThreadDescriptorSize` (#108923) `ThreadDescriptorSize` uses `dlsym` which may use malloc in unexpected time. It's relatively easy to init size from the main init. --- .../sanitizer_linux_libcdep.cpp | 30 +++++++++---------- .../tests/sanitizer_linux_test.cpp | 2 ++ 2 files changed, 17 insertions(+), 15 deletions(-) diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_linux_libcdep.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_linux_libcdep.cpp index 6b43fea507a401..aa156acd7b657a 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_linux_libcdep.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_linux_libcdep.cpp @@ -224,7 +224,7 @@ static void GetGLibcVersion(int *major, int *minor, int *patch) { // to get the pointer to thread-specific data keys in the thread control block. # if (SANITIZER_FREEBSD || SANITIZER_GLIBC) && !SANITIZER_GO // sizeof(struct pthread) from glibc. -static atomic_uintptr_t thread_descriptor_size; +static uptr thread_descriptor_size; // FIXME: Implementation is very GLIBC specific, but it's used by FREEBSD. static uptr ThreadDescriptorSizeFallback() { @@ -305,20 +305,7 @@ static uptr ThreadDescriptorSizeFallback() { # endif } -uptr ThreadDescriptorSize() { - uptr val = atomic_load_relaxed(&thread_descriptor_size); - if (val) - return val; - // _thread_db_sizeof_pthread is a GLIBC_PRIVATE symbol that is exported in - // glibc 2.34 and later. 
- if (unsigned *psizeof = static_cast( - dlsym(RTLD_DEFAULT, "_thread_db_sizeof_pthread"))) - val = *psizeof; - if (!val) - val = ThreadDescriptorSizeFallback(); - atomic_store_relaxed(&thread_descriptor_size, val); - return val; -} +uptr ThreadDescriptorSize() { return thread_descriptor_size; } # if SANITIZER_GLIBC __attribute__((unused)) static size_t g_tls_size; @@ -330,6 +317,15 @@ void InitTlsSize() { GetGLibcVersion(&major, &minor, &patch); g_use_dlpi_tls_data = major == 2 && minor >= 25; + if (major == 2 && minor >= 34) { + // _thread_db_sizeof_pthread is a GLIBC_PRIVATE symbol that is exported in + // glibc 2.34 and later. + if (unsigned *psizeof = static_cast( + dlsym(RTLD_DEFAULT, "_thread_db_sizeof_pthread"))) { + thread_descriptor_size = *psizeof; + } + } + # if defined(__aarch64__) || defined(__x86_64__) || \ defined(__powerpc64__) || defined(__loongarch__) auto *get_tls_static_info = (void (*)(size_t *, size_t *))dlsym( @@ -339,7 +335,11 @@ void InitTlsSize() { if (get_tls_static_info) get_tls_static_info(&g_tls_size, &tls_align); # endif + # endif // SANITIZER_GLIBC + + if (!thread_descriptor_size) + thread_descriptor_size = ThreadDescriptorSizeFallback(); } # if defined(__mips__) || defined(__powerpc64__) || SANITIZER_RISCV64 || \ diff --git a/compiler-rt/lib/sanitizer_common/tests/sanitizer_linux_test.cpp b/compiler-rt/lib/sanitizer_common/tests/sanitizer_linux_test.cpp index 338c4d3bab2b04..b286ab72a5c795 100644 --- a/compiler-rt/lib/sanitizer_common/tests/sanitizer_linux_test.cpp +++ b/compiler-rt/lib/sanitizer_common/tests/sanitizer_linux_test.cpp @@ -202,6 +202,8 @@ TEST(SanitizerLinux, ThreadDescriptorSize) { void *result; ASSERT_EQ(0, pthread_create(&tid, 0, thread_descriptor_size_test_func, 0)); ASSERT_EQ(0, pthread_join(tid, &result)); + EXPECT_EQ(0u, ThreadDescriptorSize()); + InitTlsSize(); EXPECT_EQ((uptr)result, ThreadDescriptorSize()); } # endif From 05a292caa57de13a6d0bc25dee2be25c2a0dcee3 Mon Sep 17 00:00:00 2001 From: Adrian Prantl Date: Wed, 18 Sep 2024 16:50:32 -0700 Subject: [PATCH 127/321] Revert "[lldb] Add handling of ECError to unit test to validate a hypothesis on the bots" This reverts commit c86b1b0f44509585390c8df09b41d707e6a14011. --- .../unittests/TestingSupport/Host/SocketTestUtilities.cpp | 8 -------- 1 file changed, 8 deletions(-) diff --git a/lldb/unittests/TestingSupport/Host/SocketTestUtilities.cpp b/lldb/unittests/TestingSupport/Host/SocketTestUtilities.cpp index 9777555d57ffff..86aed292ec01f8 100644 --- a/lldb/unittests/TestingSupport/Host/SocketTestUtilities.cpp +++ b/lldb/unittests/TestingSupport/Host/SocketTestUtilities.cpp @@ -110,14 +110,6 @@ static bool CheckIPSupport(llvm::StringRef Proto, llvm::StringRef Addr) { std::make_error_code(std::errc::address_family_not_supported) || ec == std::make_error_code(std::errc::address_not_available)) HasProtocolError = true; - }, - [&](std::unique_ptr ECErr) { - // FIXME: This code path should not be reachable. - std::error_code ec = ECErr->convertToErrorCode(); - if (ec == - std::make_error_code(std::errc::address_family_not_supported) || - ec == std::make_error_code(std::errc::address_not_available)) - HasProtocolError = true; }); if (HasProtocolError) { GTEST_LOG_(WARNING) From 79a69cb06665859658677b9ade4a1a262490f8c1 Mon Sep 17 00:00:00 2001 From: Adrian Prantl Date: Wed, 18 Sep 2024 17:27:38 -0700 Subject: [PATCH 128/321] Revert "[lldb] Store ECError as CloneableECError in Status" This reverts commit cf02d8bbb6dfed17081fbdbf44e2071aea1af728. 
--- lldb/source/Utility/Status.cpp | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/lldb/source/Utility/Status.cpp b/lldb/source/Utility/Status.cpp index a659456b9b1b39..f557cb540b5655 100644 --- a/lldb/source/Utility/Status.cpp +++ b/lldb/source/Utility/Status.cpp @@ -144,19 +144,13 @@ static llvm::Error CloneError(const llvm::Error &error) { return llvm::make_error(e.message(), e.convertToErrorCode(), true); }; - llvm::visitErrors(error, [&](const llvm::ErrorInfoBase &e) { + visitErrors(error, [&](const llvm::ErrorInfoBase &e) { result = joinErrors(std::move(result), clone(e)); }); return result; } -Status Status::FromError(llvm::Error error) { - if (error.isA()) { - std::error_code ec = llvm::errorToErrorCode(std::move(error)); - return Status::FromError(llvm::make_error(ec)); - } - return Status(std::move(error)); -} +Status Status::FromError(llvm::Error error) { return Status(std::move(error)); } llvm::Error Status::ToError() const { return CloneError(m_error); } From 8b456b436f996b4c6dfee0abc704b9219d43f7d2 Mon Sep 17 00:00:00 2001 From: Adrian Prantl Date: Wed, 18 Sep 2024 17:27:40 -0700 Subject: [PATCH 129/321] Revert "[lldb] Update SocketTestUtilities.cpp to use CloneableECError" This reverts commit 2383bc8216bd7d373bb48337199c09d40922adf2. --- lldb/unittests/TestingSupport/Host/SocketTestUtilities.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lldb/unittests/TestingSupport/Host/SocketTestUtilities.cpp b/lldb/unittests/TestingSupport/Host/SocketTestUtilities.cpp index 86aed292ec01f8..86fb5b05ea8009 100644 --- a/lldb/unittests/TestingSupport/Host/SocketTestUtilities.cpp +++ b/lldb/unittests/TestingSupport/Host/SocketTestUtilities.cpp @@ -104,13 +104,14 @@ static bool CheckIPSupport(llvm::StringRef Proto, llvm::StringRef Addr) { bool HasProtocolError = false; handleAllErrors( std::move(Err), - [&](std::unique_ptr ECErr) { + [&](std::unique_ptr ECErr) { std::error_code ec = ECErr->convertToErrorCode(); if (ec == std::make_error_code(std::errc::address_family_not_supported) || ec == std::make_error_code(std::errc::address_not_available)) HasProtocolError = true; - }); + }, + [](const llvm::ErrorInfoBase &) {}); if (HasProtocolError) { GTEST_LOG_(WARNING) << llvm::formatv( From 27303736512c32bb87b67a4ecf0b17881a8811c6 Mon Sep 17 00:00:00 2001 From: Adrian Prantl Date: Wed, 18 Sep 2024 17:27:42 -0700 Subject: [PATCH 130/321] Revert "Add noexcept qualifier to placate g++" This reverts commit b4a8e877ee3002a8cfd613f7950afcbe1d98821c. --- lldb/source/Utility/Status.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/lldb/source/Utility/Status.cpp b/lldb/source/Utility/Status.cpp index f557cb540b5655..faa8d3a83c7ed1 100644 --- a/lldb/source/Utility/Status.cpp +++ b/lldb/source/Utility/Status.cpp @@ -48,7 +48,7 @@ char ExpressionError::ID; namespace { /// A std::error_code category for eErrorTypeGeneric. class LLDBGenericCategory : public std::error_category { - const char *name() const noexcept override { return "LLDBGenericCategory"; } + const char *name() const override { return "LLDBGenericCategory"; } std::string message(int __ev) const override { return "generic LLDB error"; }; }; LLDBGenericCategory &lldb_generic_category() { @@ -58,9 +58,7 @@ LLDBGenericCategory &lldb_generic_category() { /// A std::error_code category for eErrorTypeExpression. 
 class ExpressionCategory : public std::error_category {
-  const char *name() const noexcept override {
-    return "LLDBExpressionCategory";
-  }
+  const char *name() const override { return "LLDBExpressionCategory"; }
   std::string message(int __ev) const override {
     return ExpressionResultAsCString(
         static_cast<lldb::ExpressionResults>(__ev));

From cb6d53198e39838ba6f9d2974c4f4317057d1556 Mon Sep 17 00:00:00 2001
From: Adrian Prantl <aprantl@apple.com>
Date: Wed, 18 Sep 2024 17:27:44 -0700
Subject: [PATCH 131/321] Revert "[lldb] Change the implementation of Status to
 store an llvm::Error (NFC) (#106774)"

This reverts commit 06939fa2e140a171132275ec0ea1857d20c5dbdd.
---
 lldb/include/lldb/Utility/Status.h            |  85 +-----
 .../Python/PythonDataObjects.cpp              |  31 +--
 lldb/source/Utility/Status.cpp                | 253 ++++++-------------
 .../Host/SocketTestUtilities.cpp              |  16 +-
 lldb/unittests/Utility/StatusTest.cpp         |   8 -
 5 files changed, 108 insertions(+), 285 deletions(-)

diff --git a/lldb/include/lldb/Utility/Status.h b/lldb/include/lldb/Utility/Status.h
index 4a09c38ce62f1b..795c830b965173 100644
--- a/lldb/include/lldb/Utility/Status.h
+++ b/lldb/include/lldb/Utility/Status.h
@@ -28,69 +28,6 @@ namespace lldb_private {

 const char *ExpressionResultAsCString(lldb::ExpressionResults result);

-/// Going a bit against the spirit of llvm::Error,
-/// lldb_private::Status need to store errors long-term and sometimes
-/// copy them. This base class defines an interface for this
-/// operation.
-class CloneableError
-    : public llvm::ErrorInfo<CloneableError> {
-public:
-  using llvm::ErrorInfo<CloneableError>::ErrorInfo;
-  CloneableError() : ErrorInfo() {}
-  virtual std::unique_ptr<CloneableError> Clone() const = 0;
-  static char ID;
-};
-
-/// Common base class for all error-code errors.
-class CloneableECError
-    : public llvm::ErrorInfo<CloneableECError, CloneableError> {
-public:
-  using llvm::ErrorInfo<CloneableECError, CloneableError>::ErrorInfo;
-  CloneableECError() = delete;
-  CloneableECError(std::error_code ec) : ErrorInfo(), EC(ec) {}
-  std::error_code convertToErrorCode() const override { return EC; }
-  void log(llvm::raw_ostream &OS) const override { OS << EC.message(); }
-  std::unique_ptr<CloneableError> Clone() const override;
-  static char ID;
-
-protected:
-  std::error_code EC;
-};
-
-/// FIXME: Move these declarations closer to where they're used.
-class MachKernelError
-    : public llvm::ErrorInfo<MachKernelError, CloneableECError> {
-public:
-  using llvm::ErrorInfo<MachKernelError, CloneableECError>::ErrorInfo;
-  MachKernelError(std::error_code ec) : ErrorInfo(ec) {}
-  std::string message() const override;
-  std::unique_ptr<CloneableError> Clone() const override;
-  static char ID;
-};
-
-class Win32Error : public llvm::ErrorInfo<Win32Error, CloneableECError> {
-public:
-  using llvm::ErrorInfo<Win32Error, CloneableECError>::ErrorInfo;
-  Win32Error(std::error_code ec, const llvm::Twine &msg = {}) : ErrorInfo(ec) {}
-  std::string message() const override;
-  std::unique_ptr<CloneableError> Clone() const override;
-  static char ID;
-};
-
-class ExpressionError
-    : public llvm::ErrorInfo<ExpressionError, CloneableECError> {
-public:
-  using llvm::ErrorInfo<ExpressionError, CloneableECError>::ErrorInfo;
-  ExpressionError(std::error_code ec, std::string msg = {})
-      : ErrorInfo(ec), m_string(msg) {}
-  std::unique_ptr<CloneableError> Clone() const override;
-  std::string message() const override { return m_string; }
-  static char ID;
-
-protected:
-  std::string m_string;
-};
-
 /// \class Status Status.h "lldb/Utility/Status.h" An error handling class.
 ///
 /// This class is designed to be able to hold any error code that can be
@@ -163,7 +100,9 @@ class Status {
   }

   static Status FromExpressionError(lldb::ExpressionResults result,
-                                    std::string msg);
+                                    std::string msg) {
+    return Status(result, lldb::eErrorTypeExpression, msg);
+  }

   /// Set the current error to errno.
 ///
@@ -176,7 +115,6 @@ class Status {
   const Status &operator=(Status &&);
   /// Avoid using this in new code. Migrate APIs to llvm::Expected instead.
   static Status FromError(llvm::Error error);
-  /// FIXME: Replace this with a takeError() method.
   llvm::Error ToError() const;

   /// Don't call this function in new code. Instead, redesign the API
@@ -211,20 +149,12 @@ class Status {

   /// Access the error value.
   ///
-  /// If the internally stored \ref llvm::Error is an \ref
-  /// llvm::ErrorList then this returns the error value of the first
-  /// error.
-  ///
   /// \return
   ///     The error value.
   ValueType GetError() const;

   /// Access the error type.
   ///
-  /// If the internally stored \ref llvm::Error is an \ref
-  /// llvm::ErrorList then this returns the error value of the first
-  /// error.
-  ///
   /// \return
   ///     The error type enumeration value.
   lldb::ErrorType GetType() const;
@@ -240,9 +170,12 @@ class Status {
   bool Success() const;

 protected:
-  Status(llvm::Error error) : m_error(std::move(error)) {}
-  llvm::Error m_error;
-  /// TODO: Replace this with just calling toString(m_error).
+  Status(llvm::Error error);
+  /// Status code as an integer value.
+  ValueType m_code = 0;
+  /// The type of the above error code.
+  lldb::ErrorType m_type = lldb::eErrorTypeInvalid;
+  /// A string representation of the error code.
   mutable std::string m_string;
 };

diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/PythonDataObjects.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/PythonDataObjects.cpp
index 6ddd00df3a2180..24cf3430006329 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/PythonDataObjects.cpp
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/PythonDataObjects.cpp
@@ -993,8 +993,8 @@ void PythonException::Restore() {
 }

 PythonException::~PythonException() {
-  Py_XDECREF(m_exception);
   Py_XDECREF(m_exception_type);
+  Py_XDECREF(m_exception);
   Py_XDECREF(m_traceback);
   Py_XDECREF(m_repr_bytes);
 }
@@ -1108,10 +1108,9 @@ template <typename Base> class OwnedPythonFile : public Base {
       py_error = Status::FromError(r.takeError());
     }
     base_error = Base::Close();
-    // Cloning since the wrapped exception may still reference the PyThread.
     if (py_error.Fail())
-      return py_error.Clone();
-    return base_error.Clone();
+      return py_error;
+    return base_error;
   };

   PyObject *GetPythonObject() const {
@@ -1197,8 +1196,7 @@ class PythonIOFile : public OwnedPythonFile<File> {
      return Flush();
    auto r = m_py_obj.CallMethod("close");
    if (!r)
-      // Cloning since the wrapped exception may still reference the PyThread.
-      return Status::FromError(r.takeError()).Clone();
+      return Status::FromError(r.takeError());
    return Status();
  }

@@ -1206,8 +1204,7 @@ class PythonIOFile : public OwnedPythonFile<File> {
   Status Flush() override {
     GIL takeGIL;
     auto r = m_py_obj.CallMethod("flush");
     if (!r)
-      // Cloning since the wrapped exception may still reference the PyThread.
-      return Status::FromError(r.takeError()).Clone();
+      return Status::FromError(r.takeError());
     return Status();
   }

@@ -1243,8 +1240,7 @@ class BinaryPythonFile : public PythonIOFile {
     PyObject *pybuffer_p = PyMemoryView_FromMemory(
        const_cast<char *>((const char *)buf), num_bytes, PyBUF_READ);
     if (!pybuffer_p)
-      // Cloning since the wrapped exception may still reference the PyThread.
-      return Status::FromError(llvm::make_error<PythonException>()).Clone();
+      return Status::FromError(llvm::make_error<PythonException>());
     auto pybuffer = Take<PythonObject>(pybuffer_p);
     num_bytes = 0;
     auto bytes_written = As<long long>(m_py_obj.CallMethod("write", pybuffer));
@@ -1264,8 +1260,7 @@ class BinaryPythonFile : public PythonIOFile {
     auto pybuffer_obj =
         m_py_obj.CallMethod("read", (unsigned long long)num_bytes);
     if (!pybuffer_obj)
-      // Cloning since the wrapped exception may still reference the PyThread.
-      return Status::FromError(pybuffer_obj.takeError()).Clone();
+      return Status::FromError(pybuffer_obj.takeError());
     num_bytes = 0;
     if (pybuffer_obj.get().IsNone()) {
       // EOF
@@ -1274,8 +1269,7 @@ class BinaryPythonFile : public PythonIOFile {
     }
     auto pybuffer = PythonBuffer::Create(pybuffer_obj.get());
     if (!pybuffer)
-      // Cloning since the wrapped exception may still reference the PyThread.
-      return Status::FromError(pybuffer.takeError()).Clone();
+      return Status::FromError(pybuffer.takeError());
     memcpy(buf, pybuffer.get().get().buf, pybuffer.get().get().len);
     num_bytes = pybuffer.get().get().len;
     return Status();
@@ -1306,8 +1300,7 @@ class TextPythonFile : public PythonIOFile {
     auto bytes_written =
         As<long long>(m_py_obj.CallMethod("write", pystring.get()));
     if (!bytes_written)
-      // Cloning since the wrapped exception may still reference the PyThread.
-      return Status::FromError(bytes_written.takeError()).Clone();
+      return Status::FromError(bytes_written.takeError());
     if (bytes_written.get() < 0)
       return Status::FromErrorString(
           ".write() method returned a negative number!");
@@ -1328,16 +1321,14 @@ class TextPythonFile : public PythonIOFile {
     auto pystring = As<PythonString>(
        m_py_obj.CallMethod("read", (unsigned long long)num_chars));
     if (!pystring)
-      // Cloning since the wrapped exception may still reference the PyThread.
-      return Status::FromError(pystring.takeError()).Clone();
+      return Status::FromError(pystring.takeError());
     if (pystring.get().IsNone()) {
       // EOF
       return Status();
     }
     auto stringref = pystring.get().AsUTF8();
     if (!stringref)
-      // Cloning since the wrapped exception may still reference the PyThread.
-      return Status::FromError(stringref.takeError()).Clone();
+      return Status::FromError(stringref.takeError());
     num_bytes = stringref.get().size();
     memcpy(buf, stringref.get().begin(), num_bytes);
     return Status();
diff --git a/lldb/source/Utility/Status.cpp b/lldb/source/Utility/Status.cpp
index a659456b9b1b39..4af3af5fba0185 100644
--- a/lldb/source/Utility/Status.cpp
+++ b/lldb/source/Utility/Status.cpp
@@ -8,8 +8,6 @@

 #include "lldb/Utility/Status.h"

-#include "lldb/Utility/LLDBLog.h"
-#include "lldb/Utility/Log.h"
 #include "lldb/Utility/VASPrintf.h"
 #include "lldb/lldb-defines.h"
 #include "lldb/lldb-enumerations.h"
@@ -39,78 +37,48 @@ class raw_ostream;
 using namespace lldb;
 using namespace lldb_private;

-char CloneableError::ID;
-char CloneableECError::ID;
-char MachKernelError::ID;
-char Win32Error::ID;
-char ExpressionError::ID;
-
-namespace {
-/// A std::error_code category for eErrorTypeGeneric.
-class LLDBGenericCategory : public std::error_category {
-  const char *name() const override { return "LLDBGenericCategory"; }
-  std::string message(int __ev) const override { return "generic LLDB error"; };
-};
-LLDBGenericCategory &lldb_generic_category() {
-  static LLDBGenericCategory g_generic_category;
-  return g_generic_category;
-}
-
-/// A std::error_code category for eErrorTypeExpression.
-class ExpressionCategory : public std::error_category {
-  const char *name() const override { return "LLDBExpressionCategory"; }
-  std::string message(int __ev) const override {
-    return ExpressionResultAsCString(
-        static_cast<lldb::ExpressionResults>(__ev));
-  };
-};
-ExpressionCategory &expression_category() {
-  static ExpressionCategory g_expression_category;
-  return g_expression_category;
-}
-} // namespace
-
-Status::Status() : m_error(llvm::Error::success()) {}
-
-static llvm::Error ErrorFromEnums(Status::ValueType err, ErrorType type,
-                                  std::string msg) {
-  switch (type) {
-  case eErrorTypeMachKernel:
-    return llvm::make_error<MachKernelError>(
-        std::error_code(err, std::system_category()));
-  case eErrorTypeWin32:
-    return llvm::make_error<Win32Error>(
-        std::error_code(err, std::system_category()));
-  case eErrorTypePOSIX:
-    if (msg.empty())
-      return llvm::errorCodeToError(
-          std::error_code(err, std::generic_category()));
-    return llvm::createStringError(
-        std::move(msg), std::error_code(err, std::generic_category()));
-  default:
-    return llvm::createStringError(
-        std::move(msg), std::error_code(err, lldb_generic_category()));
-  }
-}
+Status::Status() {}

 Status::Status(ValueType err, ErrorType type, std::string msg)
-    : m_error(ErrorFromEnums(err, type, msg)) {}
+    : m_code(err), m_type(type), m_string(std::move(msg)) {}

-// This logic is confusing because C++ calls the traditional (posix) errno codes
+// This logic is confusing because c++ calls the traditional (posix) errno codes
 // "generic errors", while we use the term "generic" to mean completely
 // arbitrary (text-based) errors.
 Status::Status(std::error_code EC)
-    : m_error(!EC ? llvm::Error::success() : llvm::errorCodeToError(EC)) {}
+    : m_code(EC.value()),
+      m_type(EC.category() == std::generic_category() ? eErrorTypePOSIX
+                                                      : eErrorTypeGeneric),
+      m_string(EC.message()) {}

 Status::Status(std::string err_str)
-    : m_error(
-          llvm::createStringError(llvm::inconvertibleErrorCode(), err_str)) {}
+    : m_code(LLDB_GENERIC_ERROR), m_type(eErrorTypeGeneric),
+      m_string(std::move(err_str)) {}

-const Status &Status::operator=(Status &&other) {
-  Clear();
-  llvm::consumeError(std::move(m_error));
-  m_error = std::move(other.m_error);
-  return *this;
+Status::Status(llvm::Error error) {
+  if (!error) {
+    Clear();
+    return;
+  }
+
+  // if the error happens to be a errno error, preserve the error code
+  error = llvm::handleErrors(
+      std::move(error), [&](std::unique_ptr<llvm::ECError> e) -> llvm::Error {
+        std::error_code ec = e->convertToErrorCode();
+        if (ec.category() == std::generic_category()) {
+          m_code = ec.value();
+          m_type = ErrorType::eErrorTypePOSIX;
+          return llvm::Error::success();
+        }
+        return llvm::Error(std::move(e));
+      });
+
+  // Otherwise, just preserve the message
+  if (error) {
+    m_code = LLDB_GENERIC_ERROR;
+    m_type = eErrorTypeGeneric;
+    m_string = llvm::toString(std::move(error));
+  }
 }

 Status Status::FromErrorStringWithFormat(const char *format, ...) {
@@ -126,33 +94,25 @@ Status Status::FromErrorStringWithFormat(const char *format, ...) {
   return Status(string);
 }

-Status Status::FromExpressionError(lldb::ExpressionResults result,
-                                   std::string msg) {
-  return Status(llvm::make_error<ExpressionError>(
-      std::error_code(result, expression_category()), msg));
-}
+Status Status::FromError(llvm::Error error) { return Status(std::move(error)); }

-/// Creates a deep copy of all known errors and converts all other
-/// errors to a new llvm::StringError.
-static llvm::Error CloneError(const llvm::Error &error) {
-  llvm::Error result = llvm::Error::success();
-  auto clone = [](const llvm::ErrorInfoBase &e) {
-    if (e.isA<CloneableError>())
-      return llvm::Error(static_cast<const CloneableError &>(e).Clone());
-    return llvm::make_error<llvm::StringError>(e.message(),
-                                               e.convertToErrorCode(), true);
-  };
-  visitErrors(error, [&](const llvm::ErrorInfoBase &e) {
-    result = joinErrors(std::move(result), clone(e));
-  });
-  return result;
+llvm::Error Status::ToError() const {
+  if (Success())
+    return llvm::Error::success();
+  if (m_type == ErrorType::eErrorTypePOSIX)
+    return llvm::errorCodeToError(
+        std::error_code(m_code, std::generic_category()));
+  return llvm::createStringError(AsCString());
 }

-Status Status::FromError(llvm::Error error) { return Status(std::move(error)); }
-
-llvm::Error Status::ToError() const { return CloneError(m_error); }
+Status::~Status() = default;

-Status::~Status() { llvm::consumeError(std::move(m_error)); }
+const Status &Status::operator=(Status &&other) {
+  m_code = other.m_code;
+  m_type = other.m_type;
+  m_string = std::move(other.m_string);
+  return *this;
+}

 #ifdef _WIN32
 static std::string RetrieveWin32ErrorString(uint32_t error_code) {
@@ -180,37 +140,6 @@ static std::string RetrieveWin32ErrorString(uint32_t error_code) {
 }
 #endif

-std::string MachKernelError::message() const {
-#if defined(__APPLE__)
-  if (const char *s = ::mach_error_string(convertToErrorCode().value()))
-    return s;
-#endif
-  return "MachKernelError";
-}
-
-std::string Win32Error::message() const {
-#if defined(_WIN32)
-  return RetrieveWin32ErrorString(convertToErrorCode().value());
-#endif
-  return "Win32Error";
-}
-
-std::unique_ptr<CloneableError> CloneableECError::Clone() const {
-  return std::make_unique<CloneableECError>(convertToErrorCode());
-}
-
-std::unique_ptr<CloneableError> MachKernelError::Clone() const {
-  return std::make_unique<MachKernelError>(convertToErrorCode());
-}
-
-std::unique_ptr<CloneableError> Win32Error::Clone() const {
-  return std::make_unique<Win32Error>(convertToErrorCode());
-}
-
-std::unique_ptr<CloneableError> ExpressionError::Clone() const {
-  return std::make_unique<ExpressionError>(convertToErrorCode(), message());
-}
-
 // Get the error value as a NULL C string. The error string will be fetched and
 // cached on demand. The cached error string value will remain until the error
 // value is changed or cleared.
@@ -218,12 +147,29 @@ const char *Status::AsCString(const char *default_error_str) const {
   if (Success())
     return nullptr;

-  m_string = llvm::toStringWithoutConsuming(m_error);
-  // Backwards compatibility with older implementations of Status.
-  if (m_error.isA<llvm::ECError>())
-    if (!m_string.empty() && m_string[m_string.size() - 1] == '\n')
-      m_string.pop_back();
+  if (m_string.empty()) {
+    switch (m_type) {
+    case eErrorTypeMachKernel:
+#if defined(__APPLE__)
+      if (const char *s = ::mach_error_string(m_code))
+        m_string.assign(s);
+#endif
+      break;
+
+    case eErrorTypePOSIX:
+      m_string = llvm::sys::StrError(m_code);
+      break;
+
+    case eErrorTypeWin32:
+#if defined(_WIN32)
+      m_string = RetrieveWin32ErrorString(m_code);
+#endif
+      break;
+    default:
+      break;
+    }
+  }
   if (m_string.empty()) {
     if (default_error_str)
       m_string.assign(default_error_str);
@@ -235,64 +181,29 @@ const char *Status::AsCString(const char *default_error_str) const {

 // Clear the error and any cached error string that it might contain.
 void Status::Clear() {
-  if (m_error)
-    LLDB_LOG_ERRORV(GetLog(LLDBLog::API), std::move(m_error),
-                    "dropping error {0}");
-  m_error = llvm::Error::success();
+  m_code = 0;
+  m_type = eErrorTypeInvalid;
+  m_string.clear();
 }

-Status::ValueType Status::GetError() const {
-  Status::ValueType result = 0;
-  llvm::visitErrors(m_error, [&](const llvm::ErrorInfoBase &error) {
-    // Return the first only.
-    if (result)
-      return;
-    std::error_code ec = error.convertToErrorCode();
-    result = ec.value();
-  });
-  return result;
-}
+// Access the error value.
+Status::ValueType Status::GetError() const { return m_code; }

 // Access the error type.
-ErrorType Status::GetType() const {
-  ErrorType result = eErrorTypeInvalid;
-  llvm::visitErrors(m_error, [&](const llvm::ErrorInfoBase &error) {
-    // Return the first only.
-    if (result != eErrorTypeInvalid)
-      return;
-    if (error.isA<MachKernelError>())
-      result = eErrorTypeMachKernel;
-    else if (error.isA<Win32Error>())
-      result = eErrorTypeWin32;
-    else if (error.isA<ExpressionError>())
-      result = eErrorTypeExpression;
-    else if (error.convertToErrorCode().category() == std::generic_category())
-      result = eErrorTypePOSIX;
-    else if (error.convertToErrorCode().category() == lldb_generic_category() ||
-             error.convertToErrorCode() == llvm::inconvertibleErrorCode())
-      result = eErrorTypeGeneric;
-    else
-      result = eErrorTypeInvalid;
-  });
-  return result;
-}
+ErrorType Status::GetType() const { return m_type; }

-bool Status::Fail() const {
-  // Note that this does not clear the checked flag in
-  // m_error. Otherwise we'd need to make this thread-safe.
-  return m_error.isA<llvm::ErrorInfoBase>();
-}
+// Returns true if this object contains a value that describes an error or
+// otherwise non-success result.
+bool Status::Fail() const { return m_code != 0; }

 Status Status::FromErrno() {
-  std::error_code ec = llvm::errnoAsErrorCode();
-  if (ec)
-    return Status::FromError(llvm::make_error<CloneableECError>(ec));
-  return Status();
+  // Update the error value to be "errno" and update the type to be "POSIX".
+  return Status(errno, eErrorTypePOSIX);
 }

 // Returns true if the error code in this object is considered a successful
 // return value.
-bool Status::Success() const { return !Fail(); }
+bool Status::Success() const { return m_code == 0; }

 void llvm::format_provider<lldb_private::Status>::format(
     const lldb_private::Status &error, llvm::raw_ostream &OS,
diff --git a/lldb/unittests/TestingSupport/Host/SocketTestUtilities.cpp b/lldb/unittests/TestingSupport/Host/SocketTestUtilities.cpp
index 86fb5b05ea8009..2455a4f6f5d490 100644
--- a/lldb/unittests/TestingSupport/Host/SocketTestUtilities.cpp
+++ b/lldb/unittests/TestingSupport/Host/SocketTestUtilities.cpp
@@ -102,16 +102,12 @@ static bool CheckIPSupport(llvm::StringRef Proto, llvm::StringRef Addr) {
                  Proto, Err)
                  .str();
   bool HasProtocolError = false;
-  handleAllErrors(
-      std::move(Err),
-      [&](std::unique_ptr<llvm::ECError> ECErr) {
-        std::error_code ec = ECErr->convertToErrorCode();
-        if (ec ==
-            std::make_error_code(std::errc::address_family_not_supported) ||
-            ec == std::make_error_code(std::errc::address_not_available))
-          HasProtocolError = true;
-      },
-      [](const llvm::ErrorInfoBase &) {});
+  handleAllErrors(std::move(Err), [&](std::unique_ptr<llvm::ECError> ECErr) {
+    std::error_code ec = ECErr->convertToErrorCode();
+    if (ec == std::make_error_code(std::errc::address_family_not_supported) ||
+        ec == std::make_error_code(std::errc::address_not_available))
+      HasProtocolError = true;
+  });
   if (HasProtocolError) {
     GTEST_LOG_(WARNING) << llvm::formatv(
diff --git a/lldb/unittests/Utility/StatusTest.cpp b/lldb/unittests/Utility/StatusTest.cpp
index e37c94ac17f2d0..be4f2beebcdb52 100644
--- a/lldb/unittests/Utility/StatusTest.cpp
+++ b/lldb/unittests/Utility/StatusTest.cpp
@@ -70,14 +70,6 @@ TEST(StatusTest, ErrorConversion) {
   llvm::Error foo = Status::FromErrorString("foo").ToError();
   EXPECT_TRUE(bool(foo));
   EXPECT_EQ("foo", llvm::toString(std::move(foo)));
-
-  llvm::Error eperm = llvm::errorCodeToError({EPERM, std::generic_category()});
-  llvm::Error eintr = llvm::errorCodeToError({EINTR, std::generic_category()});
-  llvm::Error elist = llvm::joinErrors(std::move(eperm), std::move(eintr));
-  elist = llvm::joinErrors(std::move(elist), llvm::createStringError("foo"));
-  Status list = Status::FromError(std::move(elist));
-  EXPECT_EQ((int)list.GetError(), EPERM);
-  EXPECT_EQ(list.GetType(), eErrorTypePOSIX);
 }

 #ifdef _WIN32

From 6dcde731eb13aeaa8296504ad8178d62c8ecd3eb Mon Sep 17 00:00:00 2001
From: Adrian Prantl <aprantl@apple.com>
Date: Wed, 18 Sep 2024 17:27:46 -0700
Subject: [PATCH 132/321] Revert "[lldb] Only send "posix" error codes through
 the gdb-remote protocol"

This reverts commit a7c174502aef45b2d33291129cce10c085fef944.
---
 .../Process/gdb-remote/GDBRemoteCommunicationServer.cpp   | 7 +++----
 .../gdb-remote/GDBRemoteCommunicationServerTest.cpp       | 3 +--
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServer.cpp b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServer.cpp
index d4aa90b2c7731a..9b72cb00352821 100644
--- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServer.cpp
+++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServer.cpp
@@ -103,14 +103,13 @@ GDBRemoteCommunicationServer::SendErrorResponse(uint8_t err) {

 GDBRemoteCommunication::PacketResult
 GDBRemoteCommunicationServer::SendErrorResponse(const Status &error) {
-  uint8_t code = error.GetType() == eErrorTypePOSIX ? error.GetError() : 0xff;
   if (m_send_error_strings) {
     lldb_private::StreamString packet;
-    packet.Printf("E%2.2x;", code);
+    packet.Printf("E%2.2x;", static_cast<uint8_t>(error.GetError()));
     packet.PutStringAsRawHex8(error.AsCString());
     return SendPacketNoLock(packet.GetString());
-  }
-  return SendErrorResponse(code);
+  } else
+    return SendErrorResponse(error.GetError());
 }

 GDBRemoteCommunication::PacketResult
diff --git a/lldb/unittests/Process/gdb-remote/GDBRemoteCommunicationServerTest.cpp b/lldb/unittests/Process/gdb-remote/GDBRemoteCommunicationServerTest.cpp
index ba9ca6ea73e3be..69ca1720c04fc9 100644
--- a/lldb/unittests/Process/gdb-remote/GDBRemoteCommunicationServerTest.cpp
+++ b/lldb/unittests/Process/gdb-remote/GDBRemoteCommunicationServerTest.cpp
@@ -12,7 +12,6 @@
 #include "Plugins/Process/gdb-remote/GDBRemoteCommunicationServer.h"
 #include "lldb/Utility/Connection.h"
 #include "lldb/Utility/UnimplementedError.h"
-#include "lldb/lldb-enumerations.h"

 namespace lldb_private {
 namespace process_gdb_remote {
@@ -26,7 +25,7 @@ TEST(GDBRemoteCommunicationServerTest, SendErrorResponse_ErrorNumber) {

 TEST(GDBRemoteCommunicationServerTest, SendErrorResponse_Status) {
   MockServerWithMockConnection server;
-  Status status(0x42, lldb::eErrorTypePOSIX, "Test error message");
+  Status status(0x42, lldb::eErrorTypeGeneric, "Test error message");
   server.SendErrorResponse(status);

   EXPECT_THAT(

From 4e659c6ca3cb6fb24e84b8744516478096f23c73 Mon Sep 17 00:00:00 2001
From: Vitaly Buka <vitalybuka@google.com>
Date: Wed, 18 Sep 2024 17:28:49 -0700
Subject: [PATCH 133/321] [NFC][sanitizer] Use InitializePlatformEarly() in
 test (#109224)

Fix windows test after #108921.
---
 .../lib/sanitizer_common/tests/sanitizer_common_test.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/compiler-rt/lib/sanitizer_common/tests/sanitizer_common_test.cpp b/compiler-rt/lib/sanitizer_common/tests/sanitizer_common_test.cpp
index 7fd6bad4c0e6c1..111e55ef36bfb8 100644
--- a/compiler-rt/lib/sanitizer_common/tests/sanitizer_common_test.cpp
+++ b/compiler-rt/lib/sanitizer_common/tests/sanitizer_common_test.cpp
@@ -236,12 +236,12 @@ static void *WorkerThread(void *arg) {
 }

 TEST(SanitizerCommon, ThreadStackTlsMain) {
-  InitTlsSize();
+  InitializePlatformEarly();
   TestThreadInfo(true);
 }

 TEST(SanitizerCommon, ThreadStackTlsWorker) {
-  InitTlsSize();
+  InitializePlatformEarly();
   pthread_t t;
   PTHREAD_CREATE(&t, 0, WorkerThread, 0);
   PTHREAD_JOIN(t, 0);

From 1bda7ba12ebccc663dff224e3bf2e4ad2235c05f Mon Sep 17 00:00:00 2001
From: vporpo <vporpodas@google.com>
Date: Wed, 18 Sep 2024 17:29:59 -0700
Subject: [PATCH 134/321] [SandboxIR] Add Instruction::isStackSaveRestoreIntrinsic()
 and isMemDepCandidate() (#109212)

These are helper functions to be used by the vectorizer's dependency
graph.
---
 llvm/include/llvm/SandboxIR/SandboxIR.h    | 22 +++++++
 llvm/unittests/SandboxIR/SandboxIRTest.cpp | 71 ++++++++++++++++++++
 2 files changed, 93 insertions(+)

diff --git a/llvm/include/llvm/SandboxIR/SandboxIR.h b/llvm/include/llvm/SandboxIR/SandboxIR.h
index c01516aa9d31ac..e12063c6147c27 100644
--- a/llvm/include/llvm/SandboxIR/SandboxIR.h
+++ b/llvm/include/llvm/SandboxIR/SandboxIR.h
@@ -105,6 +105,8 @@
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instruction.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/PatternMatch.h"
 #include "llvm/IR/User.h"
 #include "llvm/IR/Value.h"
 #include "llvm/SandboxIR/Tracker.h"
@@ -1943,6 +1945,26 @@ class Instruction : public sandboxir::User {
   /// LangRef.html for the meaning of these flags.
   void copyFastMathFlags(FastMathFlags FMF);

+  bool isStackSaveOrRestoreIntrinsic() const {
+    auto *I = cast<llvm::Instruction>(Val);
+    return match(I,
+                 PatternMatch::m_Intrinsic<Intrinsic::stacksave>()) ||
+           match(I, PatternMatch::m_Intrinsic<Intrinsic::stackrestore>());
+  }
+
+  /// We consider \p I as a Memory Dependency Candidate instruction if it
+  /// reads/write memory or if it has side-effects. This is used by the
+  /// dependency graph.
+  bool isMemDepCandidate() const {
+    auto *I = cast<llvm::Instruction>(Val);
+    return I->mayReadOrWriteMemory() &&
+           (!isa<llvm::IntrinsicInst>(I) ||
+            (cast<llvm::IntrinsicInst>(I)->getIntrinsicID() !=
+                 Intrinsic::sideeffect &&
+             cast<llvm::IntrinsicInst>(I)->getIntrinsicID() !=
+                 Intrinsic::pseudoprobe));
+  }
+
 #ifndef NDEBUG
   void dumpOS(raw_ostream &OS) const override;
 #endif
diff --git a/llvm/unittests/SandboxIR/SandboxIRTest.cpp b/llvm/unittests/SandboxIR/SandboxIRTest.cpp
index 8807716a52738f..312705caad1a6e 100644
--- a/llvm/unittests/SandboxIR/SandboxIRTest.cpp
+++ b/llvm/unittests/SandboxIR/SandboxIRTest.cpp
@@ -1702,6 +1702,77 @@ define void @foo(i8 %v1) {
   EXPECT_EQ(I0->getNextNode(), Ret);
 }

+TEST_F(SandboxIRTest, Instruction_isStackSaveOrRestoreIntrinsic) {
+  parseIR(C, R"IR(
+declare void @llvm.sideeffect()
+define void @foo(i8 %v1, ptr %ptr) {
+  %add = add i8 %v1, %v1
+  %stacksave = call ptr @llvm.stacksave()
+  call void @llvm.stackrestore(ptr %stacksave)
+  call void @llvm.sideeffect()
+  ret void
+}
+)IR");
+  llvm::Function *LLVMF = &*M->getFunction("foo");
+  sandboxir::Context Ctx(C);
+  sandboxir::Function *F = Ctx.createFunction(LLVMF);
+  auto *BB = &*F->begin();
+  auto It = BB->begin();
+  auto *Add = cast<sandboxir::BinaryOperator>(&*It++);
+  auto *StackSave = cast<sandboxir::CallInst>(&*It++);
+  auto *StackRestore = cast<sandboxir::CallInst>(&*It++);
+  auto *Other = cast<sandboxir::CallInst>(&*It++);
+  auto *Ret = cast<sandboxir::ReturnInst>(&*It++);
+
+  EXPECT_FALSE(Add->isStackSaveOrRestoreIntrinsic());
+  EXPECT_TRUE(StackSave->isStackSaveOrRestoreIntrinsic());
+  EXPECT_TRUE(StackRestore->isStackSaveOrRestoreIntrinsic());
+  EXPECT_FALSE(Other->isStackSaveOrRestoreIntrinsic());
+  EXPECT_FALSE(Ret->isStackSaveOrRestoreIntrinsic());
+}
+
+TEST_F(SandboxIRTest, Instruction_isMemDepCandidate) {
+  parseIR(C, R"IR(
+declare void @llvm.fake.use(...)
+declare void @llvm.sideeffect()
+declare void @llvm.pseudoprobe(i64, i64, i32, i64)
+declare void @bar()
+define void @foo(i8 %v1, ptr %ptr) {
+  %add0 = add i8 %v1, %v1
+  %ld0 = load i8, ptr %ptr
+  store i8 %v1, ptr %ptr
+  call void @llvm.sideeffect()
+  call void @llvm.pseudoprobe(i64 42, i64 1, i32 0, i64 -1)
+  call void @llvm.fake.use(ptr %ptr)
+  call void @bar()
+  ret void
+}
+)IR");
+  llvm::Function *LLVMF = &*M->getFunction("foo");
+  sandboxir::Context Ctx(C);
+  sandboxir::Function *F = Ctx.createFunction(LLVMF);
+  auto *Arg = F->getArg(0);
+  auto *BB = &*F->begin();
+  auto It = BB->begin();
+  auto *Add0 = cast<sandboxir::BinaryOperator>(&*It++);
+  auto *Ld0 = cast<sandboxir::LoadInst>(&*It++);
+  auto *St0 = cast<sandboxir::StoreInst>(&*It++);
+  auto *SideEffect0 = cast<sandboxir::CallInst>(&*It++);
+  auto *PseudoProbe0 = cast<sandboxir::CallInst>(&*It++);
+  auto *OtherIntrinsic0 = cast<sandboxir::CallInst>(&*It++);
+  auto *CallBar = cast<sandboxir::CallInst>(&*It++);
+  auto *Ret = cast<sandboxir::ReturnInst>(&*It++);
+
+  EXPECT_FALSE(Add0->isMemDepCandidate());
+  EXPECT_TRUE(Ld0->isMemDepCandidate());
+  EXPECT_TRUE(St0->isMemDepCandidate());
+  EXPECT_FALSE(SideEffect0->isMemDepCandidate());
+  EXPECT_FALSE(PseudoProbe0->isMemDepCandidate());
+  EXPECT_TRUE(OtherIntrinsic0->isMemDepCandidate());
+  EXPECT_TRUE(CallBar->isMemDepCandidate());
+  EXPECT_FALSE(Ret->isMemDepCandidate());
+}
+
 TEST_F(SandboxIRTest, VAArgInst) {
   parseIR(C, R"IR(
 define void @foo(ptr %va) {

From 258fc7f582877d3bc2a26e62da4f50e467d8c640 Mon Sep 17 00:00:00 2001
From: ofAlpaca
Date: Thu, 19 Sep 2024 08:33:37 +0800
Subject: [PATCH 135/321] [Clang] Fix -ast-dump-decl-types crashes on concepts
 (#108142)

Resolve #94928

This PR adds `if (TD->getTemplatedDecl())` to prevent `InnerD` becoming
`nullptr`, suggested by @firstmoonlight.

I also added the `-ast-dump-decl-types` option and declared-type `CHECK`
lines to the test case `clang/test/AST/ast-dump-concepts.cpp`.

---------

Co-authored-by: Aaron Ballman <aaron@aaronballman.com>
---
 clang/docs/ReleaseNotes.rst                    |  2 ++
 clang/lib/Frontend/ASTConsumers.cpp            |  3 ++-
 clang/test/AST/ast-dump-concepts.cpp           | 19 ++++++++++++++++---
 3 files changed, 20 insertions(+), 4 deletions(-)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index d10b284310071e..b8816d7b555e87 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -427,6 +427,8 @@ Miscellaneous Clang Crashes Fixed
 - Fixed a crash when function has more than 65536 parameters.
   Now a diagnostic is emitted. (#GH35741)

+- Fixed ``-ast-dump`` crashes on code involving ``concept`` with ``-ast-dump-decl-types``. (#GH94928)
+
 OpenACC Specific Changes
 ------------------------

diff --git a/clang/lib/Frontend/ASTConsumers.cpp b/clang/lib/Frontend/ASTConsumers.cpp
index 7b58eaa04df95a..a6e35452b4fbe6 100644
--- a/clang/lib/Frontend/ASTConsumers.cpp
+++ b/clang/lib/Frontend/ASTConsumers.cpp
@@ -101,7 +101,8 @@ namespace {
       if (DumpDeclTypes) {
         Decl *InnerD = D;
         if (auto *TD = dyn_cast<TemplateDecl>(D))
-          InnerD = TD->getTemplatedDecl();
+          if (Decl *TempD = TD->getTemplatedDecl())
+            InnerD = TempD;

         // FIXME: Support OutputFormat in type dumping.
         // FIXME: Support combining -ast-dump-decl-types with -ast-dump-lookups.
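The one-line guard above exists because a ConceptDecl is a TemplateDecl whose getTemplatedDecl() returns null, so the old unconditional assignment handed the type dumper a null declaration. A hedged sketch of the same fall-back pattern outside the patch context (it assumes clang's AST headers; the helper name is invented for illustration):

    #include "clang/AST/DeclTemplate.h"

    // Prefer the templated declaration when one exists; otherwise keep the
    // template itself (a concept has no templated declaration to dump).
    static const clang::Decl *declForTypeDump(const clang::Decl *D) {
      if (const auto *TD = llvm::dyn_cast<clang::TemplateDecl>(D))
        if (const clang::Decl *Templated = TD->getTemplatedDecl())
          return Templated;
      return D;
    }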
diff --git a/clang/test/AST/ast-dump-concepts.cpp b/clang/test/AST/ast-dump-concepts.cpp index a5e0673c241ef4..84d981d2ab8dec 100644 --- a/clang/test/AST/ast-dump-concepts.cpp +++ b/clang/test/AST/ast-dump-concepts.cpp @@ -1,9 +1,9 @@ -// RUN: %clang_cc1 -triple x86_64-unknown-unknown -std=c++2a -ast-dump -ast-dump-filter Foo %s | FileCheck -strict-whitespace %s +// RUN: %clang_cc1 -triple x86_64-unknown-unknown -std=c++2a -ast-dump -ast-dump-decl-types -ast-dump-filter Foo %s | FileCheck -strict-whitespace %s // Test with serialization: // RUN: %clang_cc1 -std=c++20 -triple x86_64-unknown-unknown -emit-pch -o %t %s // RUN: %clang_cc1 -x c++ -std=c++20 -triple x86_64-unknown-unknown -include-pch %t \ -// RUN: -ast-dump-all -ast-dump-filter Foo /dev/null \ +// RUN: -ast-dump-all -ast-dump-decl-types -ast-dump-filter Foo /dev/null \ // RUN: | FileCheck --strict-whitespace %s template @@ -56,6 +56,9 @@ struct Foo { // CHECK: CXXFoldExpr {{.*}} template ... Ts> Foo(); + + // CHECK:InjectedClassNameType + // CHECK-NEXT: CXXRecord {{.*}} 'Foo' }; namespace GH82628 { @@ -75,20 +78,28 @@ template concept Foo = C; // CHECK: TemplateTypeParmDecl {{.*}} Concept {{.*}} 'C' (UsingShadow {{.*}} 'C') +// CHECK: QualType +// CHECK-NEXT: `-BuiltinType {{.*}} 'bool' template constexpr bool FooVar = false; // CHECK: ConceptSpecializationExpr {{.*}} UsingShadow {{.*}} 'C' +// CHECK: QualType +// CHECK-NEXT: `-BuiltinType {{.*}} 'bool' template requires C constexpr bool FooVar2 = true; // CHECK: SimpleRequirement // CHECK-NEXT: `-ConceptSpecializationExpr {{.*}} UsingShadow {{.*}} 'C' +// CHECK: QualType +// CHECK-NEXT: `-BuiltinType {{.*}} 'bool' template requires requires (T) { C; } constexpr bool FooVar3 = true; // CHECK: NonTypeTemplateParmDecl // CHECK-NEXT: `-ConceptSpecializationExpr {{.*}} UsingShadow {{.*}} 'C' +// CHECK: QualType +// CHECK-NEXT: `-BuiltinType {{.*}} 'bool' template constexpr bool FooVar4 = bool(T()); @@ -97,7 +108,9 @@ constexpr bool FooVar4 = bool(T()); // CHECK: NonTypeTemplateParmDecl {{.*}} depth 0 index 1 U // CHECK-NEXT: `-ConceptSpecializationExpr {{.*}} UsingShadow {{.*}} 'C' // CHECK: |-TemplateTypeParmDecl {{.*}} Concept {{.*}} 'C' (UsingShadow {{.*}} 'C') depth 0 index 2 V:auto - +// CHECK: FunctionProtoType +// CHECK: `-Concept {{.*}} 'C' +// CHECK: `-TemplateTypeParm {{.*}} 'V:auto' template auto FooFunc(C auto V) -> C decltype(auto) { // FIXME: TypeLocs inside of the function body cannot be dumped via -ast-dump for now. From 8f3fb5d982db63572c11dd602780218ec45df986 Mon Sep 17 00:00:00 2001 From: Xing Guo Date: Thu, 19 Sep 2024 08:39:47 +0800 Subject: [PATCH 136/321] [Doc] Improve documentation for JITLink. (#109163) This patch improves the documentation for JITLink by fixing some typos, correcting indentations and fixing out-dated code examples. --- llvm/docs/JITLink.rst | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/llvm/docs/JITLink.rst b/llvm/docs/JITLink.rst index b0a0dc77880dfd..89027123f47500 100644 --- a/llvm/docs/JITLink.rst +++ b/llvm/docs/JITLink.rst @@ -11,7 +11,7 @@ Introduction This document aims to provide a high-level overview of the design and API of the JITLink library. It assumes some familiarity with linking and relocatable object files, but should not require deep expertise. If you know -what a section, symbol, and relocation are you should find this document +what a section, symbol, and relocation are then you should find this document accessible. 
 If it is not, please submit a patch (:doc:`Contributing`) or file a bug
 (:doc:`HowToSubmitABug`).

@@ -56,7 +56,7 @@ and optimizations that were not possible under MCJIT or RuntimeDyld.
 ObjectLinkingLayer Plugins
 --------------------------

-The ``ObjectLinkingLayer::Plugin`` class provides the following methods: 
+The ``ObjectLinkingLayer::Plugin`` class provides the following methods:

 * ``modifyPassConfig`` is called each time a LinkGraph is about to be linked.
   It can be overridden to install JITLink *Passes* to run during the link
   process.

   .. code-block:: c++

     void modifyPassConfig(MaterializationResponsibility &MR,
-                          const Triple &TT,
+                          jitlink::LinkGraph &G,
                           jitlink::PassConfiguration &Config)

 * ``notifyLoaded`` is called before the link begins, and can be overridden to
@@ -97,7 +97,7 @@ The ``ObjectLinkingLayer::Plugin`` class provides the following methods:

   .. code-block:: c++

-    Error notifyRemovingResources(ResourceKey K)
+    Error notifyRemovingResources(JITDylib &JD, ResourceKey K)

 * ``notifyTransferringResources`` is called if/when a request is made to
   transfer tracking of any resources associated with ``ResourceKey``

   .. code-block:: c++

-    void notifyTransferringResources(ResourceKey DstKey,
+    void notifyTransferringResources(JITDylib &JD, ResourceKey DstKey,
                                      ResourceKey SrcKey)

 Plugin authors are required to implement the ``notifyFailed``,
@@ -126,7 +126,7 @@ calling the ``addPlugin`` method [1]_. E.g.

     // Add passes to print the set of defined symbols after dead-stripping.
     void modifyPassConfig(MaterializationResponsibility &MR,
-                          const Triple &TT,
+                          jitlink::LinkGraph &G,
                           jitlink::PassConfiguration &Config) override {
       Config.PostPrunePasses.push_back([this](jitlink::LinkGraph &G) {
         return printAllSymbols(G);
@@ -137,10 +137,10 @@ calling the ``addPlugin`` method [1]_. E.g.
     Error notifyFailed(MaterializationResponsibility &MR) override {
       return Error::success();
     }
-    Error notifyRemovingResources(ResourceKey K) override {
+    Error notifyRemovingResources(JITDylib &JD, ResourceKey K) override {
       return Error::success();
     }
-    void notifyTransferringResources(ResourceKey DstKey,
+    void notifyTransferringResources(JITDylib &JD, ResourceKey DstKey,
                                      ResourceKey SrcKey) override {}

     // JITLink pass to print all defined symbols in G.
@@ -407,7 +407,7 @@ and utilities relevant to the linking process:
   * ``getPointerSize`` returns the size of a pointer (in bytes) in the
     executor process.

-  * ``getEndinaness`` returns the endianness of the executor process.
+  * ``getEndianness`` returns the endianness of the executor process.

   * ``allocateString`` copies data from a given ``llvm::Twine`` into the
     link graph's internal allocator. This can be used to ensure that content
@@ -802,7 +802,7 @@ for them by an ``ObjectLinkingLayer`` instance, but they can be created manually

 ``ObjectLinkingLayer`` usually creates ``LinkGraphs``.

 #. ``createLinkGraph_<ObjectFormat>_<Architecture>`` can be used when
-    both the object format and architecture are known ahead of time.
+   both the object format and architecture are known ahead of time.

 #. ``createLinkGraph_<ObjectFormat>`` can be used when the object format is
    known ahead of time, but the architecture is not.
In this case the From 104f3c180644c8872eaad0b3fcf6a6b948d92a71 Mon Sep 17 00:00:00 2001 From: Slava Zakharin Date: Wed, 18 Sep 2024 17:41:33 -0700 Subject: [PATCH 137/321] Reland "[flang][runtime] Use cuda::std::complex in F18 runtime CUDA build. (#109078)" (#109207) `std::complex` operators do not work for the CUDA device compilation of F18 runtime. This change makes use of `cuda::std::complex` from `libcudacxx`. `cuda::std::complex` does not have specializations for `long double`, so the change is accompanied with a clean-up for `long double` usage. Additional change on top of #109078 is to use `cuda::std::complex` only for the device compilation, otherwise the host compilation fails because `libcudacxx` may not support `long double` specialization at all (depending on the compiler). --- flang/include/flang/Common/float80.h | 43 ++++ flang/include/flang/Runtime/complex.h | 33 +++ flang/include/flang/Runtime/cpp-type.h | 9 +- .../flang/Runtime/matmul-instances.inc | 6 +- flang/include/flang/Runtime/numeric.h | 32 +-- flang/include/flang/Runtime/reduce.h | 214 +++++++++++------- flang/include/flang/Runtime/reduction.h | 112 ++++----- .../include/flang/Runtime/transformational.h | 20 +- flang/runtime/complex-powi.cpp | 23 +- flang/runtime/complex-reduction.c | 8 +- flang/runtime/dot-product.cpp | 21 +- flang/runtime/extrema.cpp | 10 +- flang/runtime/matmul-transpose.cpp | 17 -- flang/runtime/matmul.cpp | 34 +-- flang/runtime/numeric.cpp | 36 +-- flang/runtime/product.cpp | 15 +- flang/runtime/random.cpp | 2 +- flang/runtime/reduce.cpp | 180 ++++++++------- flang/runtime/reduction-templates.h | 4 +- flang/runtime/sum.cpp | 22 +- flang/runtime/transformational.cpp | 8 +- flang/unittests/Runtime/Numeric.cpp | 4 +- flang/unittests/Runtime/Transformational.cpp | 10 +- 23 files changed, 482 insertions(+), 381 deletions(-) create mode 100644 flang/include/flang/Common/float80.h create mode 100644 flang/include/flang/Runtime/complex.h diff --git a/flang/include/flang/Common/float80.h b/flang/include/flang/Common/float80.h new file mode 100644 index 00000000000000..1838f7b13c8bb2 --- /dev/null +++ b/flang/include/flang/Common/float80.h @@ -0,0 +1,43 @@ +/*===-- flang/Common/float80.h --------------------------------------*- C -*-=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===----------------------------------------------------------------------===*/ + +/* This header is usable in both C and C++ code. + * Isolates build compiler checks to determine if the 80-bit + * floating point format is supported via a particular C type. + * It defines CFloat80Type and CppFloat80Type aliases for this + * C type. + */ + +#ifndef FORTRAN_COMMON_FLOAT80_H_ +#define FORTRAN_COMMON_FLOAT80_H_ + +#include "api-attrs.h" +#include + +#if LDBL_MANT_DIG == 64 +#undef HAS_FLOAT80 +#define HAS_FLOAT80 1 +#endif + +#if defined(RT_DEVICE_COMPILATION) && defined(__CUDACC__) +/* + * 'long double' is treated as 'double' in the CUDA device code, + * and there is no support for 80-bit floating point format. + * This is probably true for most offload devices, so RT_DEVICE_COMPILATION + * check should be enough. For the time being, guard it with __CUDACC__ + * as well. 
+ */ +#undef HAS_FLOAT80 +#endif + +#if HAS_FLOAT80 +typedef long double CFloat80Type; +typedef long double CppFloat80Type; +#endif + +#endif /* FORTRAN_COMMON_FLOAT80_H_ */ diff --git a/flang/include/flang/Runtime/complex.h b/flang/include/flang/Runtime/complex.h new file mode 100644 index 00000000000000..be477d244155bd --- /dev/null +++ b/flang/include/flang/Runtime/complex.h @@ -0,0 +1,33 @@ +//===-- include/flang/Runtime/complex.h -------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// A single way to expose C++ complex class in files that can be used +// in F18 runtime build. With inclusion of this file std::complex +// and the related names become available, though, they may correspond +// to alternative definitions (e.g. from cuda::std namespace). + +#ifndef FORTRAN_RUNTIME_COMPLEX_H +#define FORTRAN_RUNTIME_COMPLEX_H + +#include "flang/Common/api-attrs.h" + +#if RT_USE_LIBCUDACXX && defined(RT_DEVICE_COMPILATION) +#include +namespace Fortran::runtime::rtcmplx { +using cuda::std::complex; +using cuda::std::conj; +} // namespace Fortran::runtime::rtcmplx +#else // !(RT_USE_LIBCUDACXX && defined(RT_DEVICE_COMPILATION)) +#include +namespace Fortran::runtime::rtcmplx { +using std::complex; +using std::conj; +} // namespace Fortran::runtime::rtcmplx +#endif // !(RT_USE_LIBCUDACXX && defined(RT_DEVICE_COMPILATION)) + +#endif // FORTRAN_RUNTIME_COMPLEX_H diff --git a/flang/include/flang/Runtime/cpp-type.h b/flang/include/flang/Runtime/cpp-type.h index fe21dd544cf7d8..aef0fbd7ede586 100644 --- a/flang/include/flang/Runtime/cpp-type.h +++ b/flang/include/flang/Runtime/cpp-type.h @@ -13,8 +13,9 @@ #include "flang/Common/Fortran.h" #include "flang/Common/float128.h" +#include "flang/Common/float80.h" #include "flang/Common/uint128.h" -#include +#include "flang/Runtime/complex.h" #include #if __cplusplus >= 202302 #include @@ -70,9 +71,9 @@ template <> struct CppTypeForHelper { using type = double; #endif }; -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 template <> struct CppTypeForHelper { - using type = long double; + using type = CppFloat80Type; }; #endif #if __STDCPP_FLOAT128_T__ @@ -89,7 +90,7 @@ template <> struct CppTypeForHelper { #endif template struct CppTypeForHelper { - using type = std::complex>; + using type = rtcmplx::complex>; }; template <> struct CppTypeForHelper { diff --git a/flang/include/flang/Runtime/matmul-instances.inc b/flang/include/flang/Runtime/matmul-instances.inc index 32c6ab06d25219..88e3067ca029d4 100644 --- a/flang/include/flang/Runtime/matmul-instances.inc +++ b/flang/include/flang/Runtime/matmul-instances.inc @@ -111,7 +111,7 @@ FOREACH_MATMUL_TYPE_PAIR(MATMUL_DIRECT_INSTANCE) FOREACH_MATMUL_TYPE_PAIR_WITH_INT16(MATMUL_INSTANCE) FOREACH_MATMUL_TYPE_PAIR_WITH_INT16(MATMUL_DIRECT_INSTANCE) -#if MATMUL_FORCE_ALL_TYPES || LDBL_MANT_DIG == 64 +#if MATMUL_FORCE_ALL_TYPES || HAS_FLOAT80 MATMUL_INSTANCE(Integer, 16, Real, 10) MATMUL_INSTANCE(Integer, 16, Complex, 10) MATMUL_INSTANCE(Real, 10, Integer, 16) @@ -133,7 +133,7 @@ MATMUL_DIRECT_INSTANCE(Complex, 16, Integer, 16) #endif #endif // MATMUL_FORCE_ALL_TYPES || (defined __SIZEOF_INT128__ && !AVOID_NATIVE_UINT128_T) -#if MATMUL_FORCE_ALL_TYPES || LDBL_MANT_DIG == 64 +#if MATMUL_FORCE_ALL_TYPES || HAS_FLOAT80 #define 
FOREACH_MATMUL_TYPE_PAIR_WITH_REAL10(macro) \ macro(Integer, 1, Real, 10) \ macro(Integer, 1, Complex, 10) \ @@ -193,7 +193,7 @@ MATMUL_DIRECT_INSTANCE(Complex, 10, Complex, 16) MATMUL_DIRECT_INSTANCE(Complex, 16, Real, 10) MATMUL_DIRECT_INSTANCE(Complex, 16, Complex, 10) #endif -#endif // MATMUL_FORCE_ALL_TYPES || LDBL_MANT_DIG == 64 +#endif // MATMUL_FORCE_ALL_TYPES || HAS_FLOAT80 #if MATMUL_FORCE_ALL_TYPES || (LDBL_MANT_DIG == 113 || HAS_FLOAT128) #define FOREACH_MATMUL_TYPE_PAIR_WITH_REAL16(macro) \ diff --git a/flang/include/flang/Runtime/numeric.h b/flang/include/flang/Runtime/numeric.h index 84a5a7cd7a361c..c3923ee2e0d889 100644 --- a/flang/include/flang/Runtime/numeric.h +++ b/flang/include/flang/Runtime/numeric.h @@ -44,7 +44,7 @@ CppTypeFor RTDECL(Ceiling8_8)( CppTypeFor RTDECL(Ceiling8_16)( CppTypeFor); #endif -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 CppTypeFor RTDECL(Ceiling10_1)( CppTypeFor); CppTypeFor RTDECL(Ceiling10_2)( @@ -78,7 +78,7 @@ CppTypeFor RTDECL(ErfcScaled4)( CppTypeFor); CppTypeFor RTDECL(ErfcScaled8)( CppTypeFor); -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 CppTypeFor RTDECL(ErfcScaled10)( CppTypeFor); #endif @@ -96,7 +96,7 @@ CppTypeFor RTDECL(Exponent8_4)( CppTypeFor); CppTypeFor RTDECL(Exponent8_8)( CppTypeFor); -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 CppTypeFor RTDECL(Exponent10_4)( CppTypeFor); CppTypeFor RTDECL(Exponent10_8)( @@ -134,7 +134,7 @@ CppTypeFor RTDECL(Floor8_8)( CppTypeFor RTDECL(Floor8_16)( CppTypeFor); #endif -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 CppTypeFor RTDECL(Floor10_1)( CppTypeFor); CppTypeFor RTDECL(Floor10_2)( @@ -168,7 +168,7 @@ CppTypeFor RTDECL(Fraction4)( CppTypeFor); CppTypeFor RTDECL(Fraction8)( CppTypeFor); -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 CppTypeFor RTDECL(Fraction10)( CppTypeFor); #endif @@ -180,7 +180,7 @@ CppTypeFor RTDECL(Fraction16)( // ISNAN / IEEE_IS_NAN bool RTDECL(IsNaN4)(CppTypeFor); bool RTDECL(IsNaN8)(CppTypeFor); -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 bool RTDECL(IsNaN10)(CppTypeFor); #endif #if LDBL_MANT_DIG == 113 || HAS_FLOAT128 @@ -212,7 +212,7 @@ CppTypeFor RTDECL(ModReal4)( CppTypeFor RTDECL(ModReal8)( CppTypeFor, CppTypeFor, const char *sourceFile = nullptr, int sourceLine = 0); -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 CppTypeFor RTDECL(ModReal10)( CppTypeFor, CppTypeFor, const char *sourceFile = nullptr, int sourceLine = 0); @@ -247,7 +247,7 @@ CppTypeFor RTDECL(ModuloReal4)( CppTypeFor RTDECL(ModuloReal8)( CppTypeFor, CppTypeFor, const char *sourceFile = nullptr, int sourceLine = 0); -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 CppTypeFor RTDECL(ModuloReal10)( CppTypeFor, CppTypeFor, const char *sourceFile = nullptr, int sourceLine = 0); @@ -283,7 +283,7 @@ CppTypeFor RTDECL(Nint8_8)( CppTypeFor RTDECL(Nint8_16)( CppTypeFor); #endif -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 CppTypeFor RTDECL(Nint10_1)( CppTypeFor); CppTypeFor RTDECL(Nint10_2)( @@ -319,7 +319,7 @@ CppTypeFor RTDECL(Nearest4)( CppTypeFor, bool positive); CppTypeFor RTDECL(Nearest8)( CppTypeFor, bool positive); -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 CppTypeFor RTDECL(Nearest10)( CppTypeFor, bool positive); #endif @@ -333,7 +333,7 @@ CppTypeFor RTDECL(RRSpacing4)( CppTypeFor); CppTypeFor RTDECL(RRSpacing8)( CppTypeFor); -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 CppTypeFor RTDECL(RRSpacing10)( CppTypeFor); #endif @@ -347,7 +347,7 @@ CppTypeFor RTDECL(SetExponent4)( CppTypeFor, std::int64_t); CppTypeFor RTDECL(SetExponent8)( CppTypeFor, std::int64_t); -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 CppTypeFor RTDECL(SetExponent10)( 
CppTypeFor, std::int64_t); #endif @@ -361,7 +361,7 @@ CppTypeFor RTDECL(Scale4)( CppTypeFor, std::int64_t); CppTypeFor RTDECL(Scale8)( CppTypeFor, std::int64_t); -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 CppTypeFor RTDECL(Scale10)( CppTypeFor, std::int64_t); #endif @@ -410,7 +410,7 @@ CppTypeFor RTDECL(Spacing4)( CppTypeFor); CppTypeFor RTDECL(Spacing8)( CppTypeFor); -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 CppTypeFor RTDECL(Spacing10)( CppTypeFor); #endif @@ -425,7 +425,7 @@ CppTypeFor RTDECL(FPow4i)( CppTypeFor RTDECL(FPow8i)( CppTypeFor b, CppTypeFor e); -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 CppTypeFor RTDECL(FPow10i)( CppTypeFor b, CppTypeFor e); @@ -442,7 +442,7 @@ CppTypeFor RTDECL(FPow4k)( CppTypeFor RTDECL(FPow8k)( CppTypeFor b, CppTypeFor e); -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 CppTypeFor RTDECL(FPow10k)( CppTypeFor b, CppTypeFor e); diff --git a/flang/include/flang/Runtime/reduce.h b/flang/include/flang/Runtime/reduce.h index 60f54c393b4bbd..c016b37f9592a1 100644 --- a/flang/include/flang/Runtime/reduce.h +++ b/flang/include/flang/Runtime/reduce.h @@ -188,22 +188,26 @@ void RTDECL(ReduceReal8DimValue)(Descriptor &result, const Descriptor &array, ValueReductionOperation, const char *source, int line, int dim, const Descriptor *mask = nullptr, const double *identity = nullptr, bool ordered = true); -#if LDBL_MANT_DIG == 64 -long double RTDECL(ReduceReal10Ref)(const Descriptor &, - ReferenceReductionOperation, const char *source, int line, - int dim = 0, const Descriptor *mask = nullptr, - const long double *identity = nullptr, bool ordered = true); -long double RTDECL(ReduceReal10Value)(const Descriptor &, - ValueReductionOperation, const char *source, int line, - int dim = 0, const Descriptor *mask = nullptr, - const long double *identity = nullptr, bool ordered = true); +#if HAS_FLOAT80 +CppTypeFor RTDECL(ReduceReal10Ref)(const Descriptor &, + ReferenceReductionOperation>, + const char *source, int line, int dim = 0, const Descriptor *mask = nullptr, + const CppTypeFor *identity = nullptr, + bool ordered = true); +CppTypeFor RTDECL(ReduceReal10Value)(const Descriptor &, + ValueReductionOperation>, + const char *source, int line, int dim = 0, const Descriptor *mask = nullptr, + const CppTypeFor *identity = nullptr, + bool ordered = true); void RTDECL(ReduceReal10DimRef)(Descriptor &result, const Descriptor &array, - ReferenceReductionOperation, const char *source, int line, - int dim, const Descriptor *mask = nullptr, - const long double *identity = nullptr, bool ordered = true); + ReferenceReductionOperation>, + const char *source, int line, int dim, const Descriptor *mask = nullptr, + const CppTypeFor *identity = nullptr, + bool ordered = true); void RTDECL(ReduceReal10DimValue)(Descriptor &result, const Descriptor &array, - ValueReductionOperation, const char *source, int line, int dim, - const Descriptor *mask = nullptr, const long double *identity = nullptr, + ValueReductionOperation>, + const char *source, int line, int dim, const Descriptor *mask = nullptr, + const CppTypeFor *identity = nullptr, bool ordered = true); #endif #if LDBL_MANT_DIG == 113 || HAS_FLOAT128 @@ -225,112 +229,152 @@ void RTDECL(ReduceReal16DimValue)(Descriptor &result, const Descriptor &array, const CppFloat128Type *identity = nullptr, bool ordered = true); #endif -void RTDECL(CppReduceComplex2Ref)(std::complex &, const Descriptor &, - ReferenceReductionOperation>, const char *source, - int line, int dim = 0, const Descriptor *mask = nullptr, - const std::complex *identity = nullptr, bool 
ordered = true); -void RTDECL(CppReduceComplex2Value)(std::complex &, const Descriptor &, - ValueReductionOperation>, const char *source, int line, - int dim = 0, const Descriptor *mask = nullptr, - const std::complex *identity = nullptr, bool ordered = true); +void RTDECL(CppReduceComplex2Ref)(CppTypeFor &, + const Descriptor &, + ReferenceReductionOperation>, + const char *source, int line, int dim = 0, const Descriptor *mask = nullptr, + const CppTypeFor *identity = nullptr, + bool ordered = true); +void RTDECL(CppReduceComplex2Value)(CppTypeFor &, + const Descriptor &, + ValueReductionOperation>, + const char *source, int line, int dim = 0, const Descriptor *mask = nullptr, + const CppTypeFor *identity = nullptr, + bool ordered = true); void RTDECL(CppReduceComplex2DimRef)(Descriptor &result, - const Descriptor &array, ReferenceReductionOperation>, + const Descriptor &array, + ReferenceReductionOperation>, const char *source, int line, int dim, const Descriptor *mask = nullptr, - const std::complex *identity = nullptr, bool ordered = true); + const CppTypeFor *identity = nullptr, + bool ordered = true); void RTDECL(CppReduceComplex2DimValue)(Descriptor &result, - const Descriptor &array, ValueReductionOperation>, + const Descriptor &array, + ValueReductionOperation>, const char *source, int line, int dim, const Descriptor *mask = nullptr, - const std::complex *identity = nullptr, bool ordered = true); -void RTDECL(CppReduceComplex3Ref)(std::complex &, const Descriptor &, - ReferenceReductionOperation>, const char *source, - int line, int dim = 0, const Descriptor *mask = nullptr, - const std::complex *identity = nullptr, bool ordered = true); -void RTDECL(CppReduceComplex3Value)(std::complex &, const Descriptor &, - ValueReductionOperation>, const char *source, int line, - int dim = 0, const Descriptor *mask = nullptr, - const std::complex *identity = nullptr, bool ordered = true); + const CppTypeFor *identity = nullptr, + bool ordered = true); +void RTDECL(CppReduceComplex3Ref)(CppTypeFor &, + const Descriptor &, + ReferenceReductionOperation>, + const char *source, int line, int dim = 0, const Descriptor *mask = nullptr, + const CppTypeFor *identity = nullptr, + bool ordered = true); +void RTDECL(CppReduceComplex3Value)(CppTypeFor &, + const Descriptor &, + ValueReductionOperation>, + const char *source, int line, int dim = 0, const Descriptor *mask = nullptr, + const CppTypeFor *identity = nullptr, + bool ordered = true); void RTDECL(CppReduceComplex3DimRef)(Descriptor &result, - const Descriptor &array, ReferenceReductionOperation>, + const Descriptor &array, + ReferenceReductionOperation>, const char *source, int line, int dim, const Descriptor *mask = nullptr, - const std::complex *identity = nullptr, bool ordered = true); + const CppTypeFor *identity = nullptr, + bool ordered = true); void RTDECL(CppReduceComplex3DimValue)(Descriptor &result, - const Descriptor &array, ValueReductionOperation>, + const Descriptor &array, + ValueReductionOperation>, const char *source, int line, int dim, const Descriptor *mask = nullptr, - const std::complex *identity = nullptr, bool ordered = true); -void RTDECL(CppReduceComplex4Ref)(std::complex &, const Descriptor &, - ReferenceReductionOperation>, const char *source, - int line, int dim = 0, const Descriptor *mask = nullptr, - const std::complex *identity = nullptr, bool ordered = true); -void RTDECL(CppReduceComplex4Value)(std::complex &, const Descriptor &, - ValueReductionOperation>, const char *source, int line, - int dim = 0, const 
Descriptor *mask = nullptr, - const std::complex *identity = nullptr, bool ordered = true); + const CppTypeFor *identity = nullptr, + bool ordered = true); +void RTDECL(CppReduceComplex4Ref)(CppTypeFor &, + const Descriptor &, + ReferenceReductionOperation>, + const char *source, int line, int dim = 0, const Descriptor *mask = nullptr, + const CppTypeFor *identity = nullptr, + bool ordered = true); +void RTDECL(CppReduceComplex4Value)(CppTypeFor &, + const Descriptor &, + ValueReductionOperation>, + const char *source, int line, int dim = 0, const Descriptor *mask = nullptr, + const CppTypeFor *identity = nullptr, + bool ordered = true); void RTDECL(CppReduceComplex4DimRef)(Descriptor &result, - const Descriptor &array, ReferenceReductionOperation>, + const Descriptor &array, + ReferenceReductionOperation>, const char *source, int line, int dim, const Descriptor *mask = nullptr, - const std::complex *identity = nullptr, bool ordered = true); + const CppTypeFor *identity = nullptr, + bool ordered = true); void RTDECL(CppReduceComplex4DimValue)(Descriptor &result, - const Descriptor &array, ValueReductionOperation>, + const Descriptor &array, + ValueReductionOperation>, const char *source, int line, int dim, const Descriptor *mask = nullptr, - const std::complex *identity = nullptr, bool ordered = true); -void RTDECL(CppReduceComplex8Ref)(std::complex &, const Descriptor &, - ReferenceReductionOperation>, const char *source, - int line, int dim = 0, const Descriptor *mask = nullptr, - const std::complex *identity = nullptr, bool ordered = true); -void RTDECL(CppReduceComplex8Value)(std::complex &, const Descriptor &, - ValueReductionOperation>, const char *source, int line, - int dim = 0, const Descriptor *mask = nullptr, - const std::complex *identity = nullptr, bool ordered = true); + const CppTypeFor *identity = nullptr, + bool ordered = true); +void RTDECL(CppReduceComplex8Ref)(CppTypeFor &, + const Descriptor &, + ReferenceReductionOperation>, + const char *source, int line, int dim = 0, const Descriptor *mask = nullptr, + const CppTypeFor *identity = nullptr, + bool ordered = true); +void RTDECL(CppReduceComplex8Value)(CppTypeFor &, + const Descriptor &, + ValueReductionOperation>, + const char *source, int line, int dim = 0, const Descriptor *mask = nullptr, + const CppTypeFor *identity = nullptr, + bool ordered = true); void RTDECL(CppReduceComplex8DimRef)(Descriptor &result, - const Descriptor &array, ReferenceReductionOperation>, + const Descriptor &array, + ReferenceReductionOperation>, const char *source, int line, int dim, const Descriptor *mask = nullptr, - const std::complex *identity = nullptr, bool ordered = true); + const CppTypeFor *identity = nullptr, + bool ordered = true); void RTDECL(CppReduceComplex8DimValue)(Descriptor &result, - const Descriptor &array, ValueReductionOperation>, + const Descriptor &array, + ValueReductionOperation>, const char *source, int line, int dim, const Descriptor *mask = nullptr, - const std::complex *identity = nullptr, bool ordered = true); -#if LDBL_MANT_DIG == 64 -void RTDECL(CppReduceComplex10Ref)(std::complex &, - const Descriptor &, ReferenceReductionOperation>, + const CppTypeFor *identity = nullptr, + bool ordered = true); +#if HAS_FLOAT80 +void RTDECL(CppReduceComplex10Ref)(CppTypeFor &, + const Descriptor &, + ReferenceReductionOperation>, const char *source, int line, int dim = 0, const Descriptor *mask = nullptr, - const std::complex *identity = nullptr, bool ordered = true); -void RTDECL(CppReduceComplex10Value)(std::complex &, 
- const Descriptor &, ValueReductionOperation>, + const CppTypeFor *identity = nullptr, + bool ordered = true); +void RTDECL(CppReduceComplex10Value)(CppTypeFor &, + const Descriptor &, + ValueReductionOperation>, const char *source, int line, int dim = 0, const Descriptor *mask = nullptr, - const std::complex *identity = nullptr, bool ordered = true); + const CppTypeFor *identity = nullptr, + bool ordered = true); void RTDECL(CppReduceComplex10DimRef)(Descriptor &result, const Descriptor &array, - ReferenceReductionOperation>, const char *source, - int line, int dim, const Descriptor *mask = nullptr, - const std::complex *identity = nullptr, bool ordered = true); + ReferenceReductionOperation>, + const char *source, int line, int dim, const Descriptor *mask = nullptr, + const CppTypeFor *identity = nullptr, + bool ordered = true); void RTDECL(CppReduceComplex10DimValue)(Descriptor &result, - const Descriptor &array, ValueReductionOperation>, + const Descriptor &array, + ValueReductionOperation>, const char *source, int line, int dim, const Descriptor *mask = nullptr, - const std::complex *identity = nullptr, bool ordered = true); + const CppTypeFor *identity = nullptr, + bool ordered = true); #endif #if LDBL_MANT_DIG == 113 || HAS_FLOAT128 -void RTDECL(CppReduceComplex16Ref)(std::complex &, +void RTDECL(CppReduceComplex16Ref)(CppTypeFor &, const Descriptor &, - ReferenceReductionOperation>, + ReferenceReductionOperation>, const char *source, int line, int dim = 0, const Descriptor *mask = nullptr, - const std::complex *identity = nullptr, + const CppTypeFor *identity = nullptr, bool ordered = true); -void RTDECL(CppReduceComplex16Value)(std::complex &, - const Descriptor &, ValueReductionOperation>, +void RTDECL(CppReduceComplex16Value)(CppTypeFor &, + const Descriptor &, + ValueReductionOperation>, const char *source, int line, int dim = 0, const Descriptor *mask = nullptr, - const std::complex *identity = nullptr, + const CppTypeFor *identity = nullptr, bool ordered = true); void RTDECL(CppReduceComplex16DimRef)(Descriptor &result, const Descriptor &array, - ReferenceReductionOperation>, + ReferenceReductionOperation>, const char *source, int line, int dim, const Descriptor *mask = nullptr, - const std::complex *identity = nullptr, + const CppTypeFor *identity = nullptr, bool ordered = true); void RTDECL(CppReduceComplex16DimValue)(Descriptor &result, const Descriptor &array, - ValueReductionOperation>, const char *source, - int line, int dim, const Descriptor *mask = nullptr, - const std::complex *identity = nullptr, + ValueReductionOperation>, + const char *source, int line, int dim, const Descriptor *mask = nullptr, + const CppTypeFor *identity = nullptr, bool ordered = true); #endif diff --git a/flang/include/flang/Runtime/reduction.h b/flang/include/flang/Runtime/reduction.h index 97986c12e8a10e..7eafacee69d034 100644 --- a/flang/include/flang/Runtime/reduction.h +++ b/flang/include/flang/Runtime/reduction.h @@ -68,34 +68,35 @@ float RTDECL(SumReal4)(const Descriptor &, const char *source, int line, int dim = 0, const Descriptor *mask = nullptr); double RTDECL(SumReal8)(const Descriptor &, const char *source, int line, int dim = 0, const Descriptor *mask = nullptr); -#if LDBL_MANT_DIG == 64 -long double RTDECL(SumReal10)(const Descriptor &, const char *source, int line, - int dim = 0, const Descriptor *mask = nullptr); +#if HAS_FLOAT80 +CppTypeFor RTDECL(SumReal10)(const Descriptor &, + const char *source, int line, int dim = 0, + const Descriptor *mask = nullptr); #endif #if 
LDBL_MANT_DIG == 113 || HAS_FLOAT128 CppFloat128Type RTDECL(SumReal16)(const Descriptor &, const char *source, int line, int dim = 0, const Descriptor *mask = nullptr); #endif -void RTDECL(CppSumComplex2)(std::complex &, const Descriptor &, - const char *source, int line, int dim = 0, +void RTDECL(CppSumComplex2)(CppTypeFor &, + const Descriptor &, const char *source, int line, int dim = 0, const Descriptor *mask = nullptr); -void RTDECL(CppSumComplex3)(std::complex &, const Descriptor &, - const char *source, int line, int dim = 0, +void RTDECL(CppSumComplex3)(CppTypeFor &, + const Descriptor &, const char *source, int line, int dim = 0, const Descriptor *mask = nullptr); -void RTDECL(CppSumComplex4)(std::complex &, const Descriptor &, - const char *source, int line, int dim = 0, +void RTDECL(CppSumComplex4)(CppTypeFor &, + const Descriptor &, const char *source, int line, int dim = 0, const Descriptor *mask = nullptr); -void RTDECL(CppSumComplex8)(std::complex &, const Descriptor &, - const char *source, int line, int dim = 0, +void RTDECL(CppSumComplex8)(CppTypeFor &, + const Descriptor &, const char *source, int line, int dim = 0, const Descriptor *mask = nullptr); -#if LDBL_MANT_DIG == 64 -void RTDECL(CppSumComplex10)(std::complex &, const Descriptor &, - const char *source, int line, int dim = 0, +#if HAS_FLOAT80 +void RTDECL(CppSumComplex10)(CppTypeFor &, + const Descriptor &, const char *source, int line, int dim = 0, const Descriptor *mask = nullptr); #endif #if LDBL_MANT_DIG == 113 || HAS_FLOAT128 -void RTDECL(CppSumComplex16)(std::complex &, +void RTDECL(CppSumComplex16)(CppTypeFor &, const Descriptor &, const char *source, int line, int dim = 0, const Descriptor *mask = nullptr); #endif @@ -128,34 +129,35 @@ float RTDECL(ProductReal4)(const Descriptor &, const char *source, int line, int dim = 0, const Descriptor *mask = nullptr); double RTDECL(ProductReal8)(const Descriptor &, const char *source, int line, int dim = 0, const Descriptor *mask = nullptr); -#if LDBL_MANT_DIG == 64 -long double RTDECL(ProductReal10)(const Descriptor &, const char *source, - int line, int dim = 0, const Descriptor *mask = nullptr); +#if HAS_FLOAT80 +CppTypeFor RTDECL(ProductReal10)(const Descriptor &, + const char *source, int line, int dim = 0, + const Descriptor *mask = nullptr); #endif #if LDBL_MANT_DIG == 113 || HAS_FLOAT128 CppFloat128Type RTDECL(ProductReal16)(const Descriptor &, const char *source, int line, int dim = 0, const Descriptor *mask = nullptr); #endif -void RTDECL(CppProductComplex2)(std::complex &, const Descriptor &, - const char *source, int line, int dim = 0, +void RTDECL(CppProductComplex2)(CppTypeFor &, + const Descriptor &, const char *source, int line, int dim = 0, const Descriptor *mask = nullptr); -void RTDECL(CppProductComplex3)(std::complex &, const Descriptor &, - const char *source, int line, int dim = 0, +void RTDECL(CppProductComplex3)(CppTypeFor &, + const Descriptor &, const char *source, int line, int dim = 0, const Descriptor *mask = nullptr); -void RTDECL(CppProductComplex4)(std::complex &, const Descriptor &, - const char *source, int line, int dim = 0, +void RTDECL(CppProductComplex4)(CppTypeFor &, + const Descriptor &, const char *source, int line, int dim = 0, const Descriptor *mask = nullptr); -void RTDECL(CppProductComplex8)(std::complex &, const Descriptor &, - const char *source, int line, int dim = 0, +void RTDECL(CppProductComplex8)(CppTypeFor &, + const Descriptor &, const char *source, int line, int dim = 0, const Descriptor *mask = nullptr); -#if 
LDBL_MANT_DIG == 64 -void RTDECL(CppProductComplex10)(std::complex &, +#if HAS_FLOAT80 +void RTDECL(CppProductComplex10)(CppTypeFor &, const Descriptor &, const char *source, int line, int dim = 0, const Descriptor *mask = nullptr); #endif #if LDBL_MANT_DIG == 113 || HAS_FLOAT128 -void RTDECL(CppProductComplex16)(std::complex &, +void RTDECL(CppProductComplex16)(CppTypeFor &, const Descriptor &, const char *source, int line, int dim = 0, const Descriptor *mask = nullptr); #endif @@ -307,9 +309,10 @@ float RTDECL(MaxvalReal4)(const Descriptor &, const char *source, int line, int dim = 0, const Descriptor *mask = nullptr); double RTDECL(MaxvalReal8)(const Descriptor &, const char *source, int line, int dim = 0, const Descriptor *mask = nullptr); -#if LDBL_MANT_DIG == 64 -long double RTDECL(MaxvalReal10)(const Descriptor &, const char *source, - int line, int dim = 0, const Descriptor *mask = nullptr); +#if HAS_FLOAT80 +CppTypeFor RTDECL(MaxvalReal10)(const Descriptor &, + const char *source, int line, int dim = 0, + const Descriptor *mask = nullptr); #endif #if LDBL_MANT_DIG == 113 || HAS_FLOAT128 CppFloat128Type RTDECL(MaxvalReal16)(const Descriptor &, const char *source, @@ -338,9 +341,10 @@ float RTDECL(MinvalReal4)(const Descriptor &, const char *source, int line, int dim = 0, const Descriptor *mask = nullptr); double RTDECL(MinvalReal8)(const Descriptor &, const char *source, int line, int dim = 0, const Descriptor *mask = nullptr); -#if LDBL_MANT_DIG == 64 -long double RTDECL(MinvalReal10)(const Descriptor &, const char *source, - int line, int dim = 0, const Descriptor *mask = nullptr); +#if HAS_FLOAT80 +CppTypeFor RTDECL(MinvalReal10)(const Descriptor &, + const char *source, int line, int dim = 0, + const Descriptor *mask = nullptr); #endif #if LDBL_MANT_DIG == 113 || HAS_FLOAT128 CppFloat128Type RTDECL(MinvalReal16)(const Descriptor &, const char *source, @@ -363,12 +367,12 @@ float RTDECL(Norm2_4)( const Descriptor &, const char *source, int line, int dim = 0); double RTDECL(Norm2_8)( const Descriptor &, const char *source, int line, int dim = 0); -#if LDBL_MANT_DIG == 64 -long double RTDECL(Norm2_10)( +#if HAS_FLOAT80 +CppTypeFor RTDECL(Norm2_10)( const Descriptor &, const char *source, int line, int dim = 0); #endif #if LDBL_MANT_DIG == 113 || HAS_FLOAT128 -long double RTDECL(Norm2_16)( +CppFloat128Type RTDECL(Norm2_16)( const Descriptor &, const char *source, int line, int dim = 0); void RTDECL(Norm2DimReal16)( Descriptor &, const Descriptor &, int dim, const char *source, int line); @@ -413,29 +417,33 @@ float RTDECL(DotProductReal4)(const Descriptor &, const Descriptor &, const char *source = nullptr, int line = 0); double RTDECL(DotProductReal8)(const Descriptor &, const Descriptor &, const char *source = nullptr, int line = 0); -#if LDBL_MANT_DIG == 64 -long double RTDECL(DotProductReal10)(const Descriptor &, const Descriptor &, - const char *source = nullptr, int line = 0); +#if HAS_FLOAT80 +CppTypeFor RTDECL(DotProductReal10)(const Descriptor &, + const Descriptor &, const char *source = nullptr, int line = 0); #endif #if LDBL_MANT_DIG == 113 || HAS_FLOAT128 CppFloat128Type RTDECL(DotProductReal16)(const Descriptor &, const Descriptor &, const char *source = nullptr, int line = 0); #endif -void RTDECL(CppDotProductComplex2)(std::complex &, const Descriptor &, - const Descriptor &, const char *source = nullptr, int line = 0); -void RTDECL(CppDotProductComplex3)(std::complex &, const Descriptor &, - const Descriptor &, const char *source = nullptr, int line = 0); -void 
RTDECL(CppDotProductComplex4)(std::complex &, const Descriptor &, - const Descriptor &, const char *source = nullptr, int line = 0); -void RTDECL(CppDotProductComplex8)(std::complex &, const Descriptor &, - const Descriptor &, const char *source = nullptr, int line = 0); -#if LDBL_MANT_DIG == 64 -void RTDECL(CppDotProductComplex10)(std::complex &, +void RTDECL(CppDotProductComplex2)(CppTypeFor &, + const Descriptor &, const Descriptor &, const char *source = nullptr, + int line = 0); +void RTDECL(CppDotProductComplex3)(CppTypeFor &, + const Descriptor &, const Descriptor &, const char *source = nullptr, + int line = 0); +void RTDECL(CppDotProductComplex4)(CppTypeFor &, + const Descriptor &, const Descriptor &, const char *source = nullptr, + int line = 0); +void RTDECL(CppDotProductComplex8)(CppTypeFor &, + const Descriptor &, const Descriptor &, const char *source = nullptr, + int line = 0); +#if HAS_FLOAT80 +void RTDECL(CppDotProductComplex10)(CppTypeFor &, const Descriptor &, const Descriptor &, const char *source = nullptr, int line = 0); #endif #if LDBL_MANT_DIG == 113 || HAS_FLOAT128 -void RTDECL(CppDotProductComplex16)(std::complex &, +void RTDECL(CppDotProductComplex16)(CppTypeFor &, const Descriptor &, const Descriptor &, const char *source = nullptr, int line = 0); #endif diff --git a/flang/include/flang/Runtime/transformational.h b/flang/include/flang/Runtime/transformational.h index a39b872f376a69..faeaa1baa39ae2 100644 --- a/flang/include/flang/Runtime/transformational.h +++ b/flang/include/flang/Runtime/transformational.h @@ -45,10 +45,12 @@ void RTDECL(BesselJn_4)(Descriptor &result, int32_t n1, int32_t n2, float x, void RTDECL(BesselJn_8)(Descriptor &result, int32_t n1, int32_t n2, double x, double bn2, double bn2_1, const char *sourceFile = nullptr, int line = 0); -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 void RTDECL(BesselJn_10)(Descriptor &result, int32_t n1, int32_t n2, - long double x, long double bn2, long double bn2_1, - const char *sourceFile = nullptr, int line = 0); + CppTypeFor x, + CppTypeFor bn2, + CppTypeFor bn2_1, const char *sourceFile = nullptr, + int line = 0); #endif #if LDBL_MANT_DIG == 113 || HAS_FLOAT128 @@ -69,7 +71,7 @@ void RTDECL(BesselJnX0_4)(Descriptor &result, int32_t n1, int32_t n2, void RTDECL(BesselJnX0_8)(Descriptor &result, int32_t n1, int32_t n2, const char *sourceFile = nullptr, int line = 0); -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 void RTDECL(BesselJnX0_10)(Descriptor &result, int32_t n1, int32_t n2, const char *sourceFile = nullptr, int line = 0); #endif @@ -91,10 +93,12 @@ void RTDECL(BesselYn_4)(Descriptor &result, int32_t n1, int32_t n2, float x, void RTDECL(BesselYn_8)(Descriptor &result, int32_t n1, int32_t n2, double x, double bn1, double bn1_1, const char *sourceFile = nullptr, int line = 0); -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 void RTDECL(BesselYn_10)(Descriptor &result, int32_t n1, int32_t n2, - long double x, long double bn1, long double bn1_1, - const char *sourceFile = nullptr, int line = 0); + CppTypeFor x, + CppTypeFor bn1, + CppTypeFor bn1_1, const char *sourceFile = nullptr, + int line = 0); #endif #if LDBL_MANT_DIG == 113 || HAS_FLOAT128 @@ -115,7 +119,7 @@ void RTDECL(BesselYnX0_4)(Descriptor &result, int32_t n1, int32_t n2, void RTDECL(BesselYnX0_8)(Descriptor &result, int32_t n1, int32_t n2, const char *sourceFile = nullptr, int line = 0); -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 void RTDECL(BesselYnX0_10)(Descriptor &result, int32_t n1, int32_t n2, const char *sourceFile = nullptr, int line = 0); #endif 
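The recurring substitution in the headers above — std::complex<T> becoming CppTypeFor<TypeCategory::Complex, KIND>, and LDBL_MANT_DIG == 64 guards becoming HAS_FLOAT80 — funnels every complex-valued entry point through a single alias that can resolve differently per build. The sketch below shows the shape of that indirection; it is a simplified stand-in, not the contents of flang/Runtime/cpp-type.h, and the CppTypeForHelper specializations and the cuda::std::complex device branch are assumptions for illustration. What the patch itself establishes is the rtcmplx namespace, the CppTypeFor<TypeCategory::Complex, N> spelling, and that the device build stops touching std::complex directly.

#include <complex>
#if defined(__CUDACC__)
#include <cuda/std/complex> // assumption: libcudacxx on the include path
#endif

namespace rtcmplx {
#if defined(__CUDACC__)
// Device builds pick a complex type whose operators are __device__-safe.
template <typename T> using complex = cuda::std::complex<T>;
#else
template <typename T> using complex = std::complex<T>;
#endif
} // namespace rtcmplx

// Category/kind mapping, trimmed to the complex kinds used here.
enum class TypeCategory { Integer, Real, Complex, Character, Logical, Derived };

template <TypeCategory CAT, int KIND> struct CppTypeForHelper {};
template <> struct CppTypeForHelper<TypeCategory::Complex, 4> {
  using type = rtcmplx::complex<float>;
};
template <> struct CppTypeForHelper<TypeCategory::Complex, 8> {
  using type = rtcmplx::complex<double>;
};
template <TypeCategory CAT, int KIND>
using CppTypeFor = typename CppTypeForHelper<CAT, KIND>::type;

// One declaration now serves host and device builds alike:
void CppSumComplex4(CppTypeFor<TypeCategory::Complex, 4> &result,
    const char *source, int line);

With the alias in place, device compilation of the reductions below (dot-product, matmul, product, sum) no longer calls __host__-only std::complex operators, which is why the later hunks can delete the RT_DIAG_PUSH / RT_DIAG_DISABLE_CALL_HOST_FROM_DEVICE_WARN / RT_DIAG_POP brackets outright instead of relocating them.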
diff --git a/flang/runtime/complex-powi.cpp b/flang/runtime/complex-powi.cpp index 77031e40242791..d7a63724b96c8f 100644 --- a/flang/runtime/complex-powi.cpp +++ b/flang/runtime/complex-powi.cpp @@ -7,11 +7,13 @@ * ===-----------------------------------------------------------------------=== */ #include "flang/Common/float128.h" +#include "flang/Runtime/cpp-type.h" #include "flang/Runtime/entry-names.h" #include #include #include +namespace Fortran::runtime { #ifdef __clang_major__ #pragma clang diagnostic ignored "-Wc99-extensions" #endif @@ -114,35 +116,35 @@ extern "C" Qcomplex RTNAME(cqpowk)(Qcomplex base, std::int64_t exp) { // MSVC doesn't allow including or in C++17 mode to get // the Windows definitions of these structs so just redefine here. struct Fcomplex { - float re; - float im; + CppTypeFor re; + CppTypeFor im; }; struct Dcomplex { - double re; - double im; + CppTypeFor re; + CppTypeFor im; }; extern "C" Fcomplex RTNAME(cpowi)(Fcomplex base, std::int32_t exp) { - auto cppbase = *(std::complex *)(&base); + auto cppbase = *(CppTypeFor *)(&base); auto cppres = tgpowi(cppbase, exp); return *(Fcomplex *)(&cppres); } extern "C" Dcomplex RTNAME(zpowi)(Dcomplex base, std::int32_t exp) { - auto cppbase = *(std::complex *)(&base); + auto cppbase = *(CppTypeFor *)(&base); auto cppres = tgpowi(cppbase, exp); return *(Dcomplex *)(&cppres); } extern "C" Fcomplex RTNAME(cpowk)(Fcomplex base, std::int64_t exp) { - auto cppbase = *(std::complex *)(&base); + auto cppbase = *(CppTypeFor *)(&base); auto cppres = tgpowi(cppbase, exp); return *(Fcomplex *)(&cppres); } extern "C" Dcomplex RTNAME(zpowk)(Dcomplex base, std::int64_t exp) { - auto cppbase = *(std::complex *)(&base); + auto cppbase = *(CppTypeFor *)(&base); auto cppres = tgpowi(cppbase, exp); return *(Dcomplex *)(&cppres); } @@ -154,15 +156,16 @@ struct Qcomplex { }; extern "C" Dcomplex RTNAME(cqpowi)(Qcomplex base, std::int32_t exp) { - auto cppbase = *(std::complex *)(&base); + auto cppbase = *(rtcmplx::complex *)(&base); auto cppres = tgpowi(cppbase, exp); return *(Qcomplex *)(&cppres); } extern "C" Dcomplex RTNAME(cqpowk)(Qcomplex base, std::int64_t exp) { - auto cppbase = *(std::complex *)(&base); + auto cppbase = *(rtcmplx::complex *)(&base); auto cppres = tgpowi(cppbase, exp); return *(Qcomplex *)(&cppres); } #endif #endif +} // namespace Fortran::runtime diff --git a/flang/runtime/complex-reduction.c b/flang/runtime/complex-reduction.c index 37ce3fa410016b..232c5452488f1a 100644 --- a/flang/runtime/complex-reduction.c +++ b/flang/runtime/complex-reduction.c @@ -119,7 +119,7 @@ ADAPT_REDUCTION(SumComplex4, float_Complex_t, CppComplexFloat, CMPLXF, REDUCTION_ARGS, REDUCTION_ARG_NAMES) ADAPT_REDUCTION(SumComplex8, double_Complex_t, CppComplexDouble, CMPLX, REDUCTION_ARGS, REDUCTION_ARG_NAMES) -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 ADAPT_REDUCTION(SumComplex10, long_double_Complex_t, CppComplexLongDouble, CMPLXL, REDUCTION_ARGS, REDUCTION_ARG_NAMES) #endif @@ -133,7 +133,7 @@ ADAPT_REDUCTION(ProductComplex4, float_Complex_t, CppComplexFloat, CMPLXF, REDUCTION_ARGS, REDUCTION_ARG_NAMES) ADAPT_REDUCTION(ProductComplex8, double_Complex_t, CppComplexDouble, CMPLX, REDUCTION_ARGS, REDUCTION_ARG_NAMES) -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 ADAPT_REDUCTION(ProductComplex10, long_double_Complex_t, CppComplexLongDouble, CMPLXL, REDUCTION_ARGS, REDUCTION_ARG_NAMES) #endif @@ -147,7 +147,7 @@ ADAPT_REDUCTION(DotProductComplex4, float_Complex_t, CppComplexFloat, CMPLXF, DOT_PRODUCT_ARGS, DOT_PRODUCT_ARG_NAMES) 
ADAPT_REDUCTION(DotProductComplex8, double_Complex_t, CppComplexDouble, CMPLX, DOT_PRODUCT_ARGS, DOT_PRODUCT_ARG_NAMES) -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 ADAPT_REDUCTION(DotProductComplex10, long_double_Complex_t, CppComplexLongDouble, CMPLXL, DOT_PRODUCT_ARGS, DOT_PRODUCT_ARG_NAMES) #endif @@ -173,7 +173,7 @@ ADAPT_REDUCTION(ReduceComplex8Ref, double_Complex_t, CppComplexDouble, CMPLX, ADAPT_REDUCTION(ReduceComplex8Value, double_Complex_t, CppComplexDouble, CMPLX, RARGS, REDUCE_ARG_NAMES) #undef RARGS -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 #define RARGS REDUCE_ARGS(long_double_Complex_t, long_double_Complex_t_ref_op) ADAPT_REDUCTION(ReduceComplex10Ref, long_double_Complex_t, CppComplexLongDouble, CMPLXL, RARGS, REDUCE_ARG_NAMES) diff --git a/flang/runtime/dot-product.cpp b/flang/runtime/dot-product.cpp index 977698269bcb46..aafef379fad43c 100644 --- a/flang/runtime/dot-product.cpp +++ b/flang/runtime/dot-product.cpp @@ -21,11 +21,6 @@ namespace Fortran::runtime { // Beware: DOT_PRODUCT of COMPLEX data uses the complex conjugate of the first // argument; MATMUL does not. -// Suppress the warnings about calling __host__-only std::complex operators, -// defined in C++ STD header files, from __device__ code. -RT_DIAG_PUSH -RT_DIAG_DISABLE_CALL_HOST_FROM_DEVICE_WARN - // General accumulator for any type and stride; this is not used for // contiguous numeric vectors. template @@ -42,7 +37,7 @@ class Accumulator { const XT &xElement{*x_.Element(&xAt)}; const YT &yElement{*y_.Element(&yAt)}; if constexpr (RCAT == TypeCategory::Complex) { - sum_ += std::conj(static_cast(xElement)) * + sum_ += rtcmplx::conj(static_cast(xElement)) * static_cast(yElement); } else { sum_ += static_cast(xElement) * static_cast(yElement); @@ -77,9 +72,9 @@ static inline RT_API_ATTRS CppTypeFor DoDotProduct( // TODO: call BLAS-1 SDOT or SDSDOT } else if constexpr (std::is_same_v) { // TODO: call BLAS-1 DDOT - } else if constexpr (std::is_same_v>) { + } else if constexpr (std::is_same_v>) { // TODO: call BLAS-1 CDOTC - } else if constexpr (std::is_same_v>) { + } else if constexpr (std::is_same_v>) { // TODO: call BLAS-1 ZDOTC } } @@ -89,12 +84,12 @@ static inline RT_API_ATTRS CppTypeFor DoDotProduct( AccumType accum{}; if constexpr (RCAT == TypeCategory::Complex) { for (SubscriptValue j{0}; j < n; ++j) { - // std::conj() may instantiate its argument twice, + // conj() may instantiate its argument twice, // so xp has to be incremented separately. 
// This is a workaround for an alleged bug in clang, // that shows up as: // warning: multiple unsequenced modifications to 'xp' - accum += std::conj(static_cast(*xp)) * + accum += rtcmplx::conj(static_cast(*xp)) * static_cast(*yp++); xp++; } @@ -117,8 +112,6 @@ static inline RT_API_ATTRS CppTypeFor DoDotProduct( return static_cast(accumulator.GetResult()); } -RT_DIAG_POP - template struct DotProduct { using Result = CppTypeFor; template struct DP1 { @@ -197,7 +190,7 @@ CppTypeFor RTDEF(DotProductReal8)( const Descriptor &x, const Descriptor &y, const char *source, int line) { return DotProduct{}(x, y, source, line); } -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 CppTypeFor RTDEF(DotProductReal10)( const Descriptor &x, const Descriptor &y, const char *source, int line) { return DotProduct{}(x, y, source, line); @@ -218,7 +211,7 @@ void RTDEF(CppDotProductComplex8)(CppTypeFor &result, const Descriptor &x, const Descriptor &y, const char *source, int line) { result = DotProduct{}(x, y, source, line); } -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 void RTDEF(CppDotProductComplex10)( CppTypeFor &result, const Descriptor &x, const Descriptor &y, const char *source, int line) { diff --git a/flang/runtime/extrema.cpp b/flang/runtime/extrema.cpp index d6e9633372f524..2658709b7de86b 100644 --- a/flang/runtime/extrema.cpp +++ b/flang/runtime/extrema.cpp @@ -236,7 +236,7 @@ void RTDEF(MaxlocReal8)(Descriptor &result, const Descriptor &x, int kind, TotalNumericMaxOrMinLoc( "MAXLOC", result, x, kind, source, line, mask, back); } -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 void RTDEF(MaxlocReal10)(Descriptor &result, const Descriptor &x, int kind, const char *source, int line, const Descriptor *mask, bool back) { TotalNumericMaxOrMinLoc( @@ -292,7 +292,7 @@ void RTDEF(MinlocReal8)(Descriptor &result, const Descriptor &x, int kind, TotalNumericMaxOrMinLoc( "MINLOC", result, x, kind, source, line, mask, back); } -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 void RTDEF(MinlocReal10)(Descriptor &result, const Descriptor &x, int kind, const char *source, int line, const Descriptor *mask, bool back) { TotalNumericMaxOrMinLoc( @@ -614,7 +614,7 @@ CppTypeFor RTDEF(MaxvalReal8)(const Descriptor &x, return TotalNumericMaxOrMin( x, source, line, dim, mask, "MAXVAL"); } -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 CppTypeFor RTDEF(MaxvalReal10)(const Descriptor &x, const char *source, int line, int dim, const Descriptor *mask) { return TotalNumericMaxOrMin( @@ -674,7 +674,7 @@ CppTypeFor RTDEF(MinvalReal8)(const Descriptor &x, return TotalNumericMaxOrMin( x, source, line, dim, mask, "MINVAL"); } -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 CppTypeFor RTDEF(MinvalReal10)(const Descriptor &x, const char *source, int line, int dim, const Descriptor *mask) { return TotalNumericMaxOrMin( @@ -730,7 +730,7 @@ CppTypeFor RTDEF(Norm2_8)( return GetTotalReduction( x, source, line, dim, nullptr, Norm2Accumulator<8>{x}, "NORM2"); } -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 CppTypeFor RTDEF(Norm2_10)( const Descriptor &x, const char *source, int line, int dim) { return GetTotalReduction( diff --git a/flang/runtime/matmul-transpose.cpp b/flang/runtime/matmul-transpose.cpp index 283472650a1c69..bafa05056bebc4 100644 --- a/flang/runtime/matmul-transpose.cpp +++ b/flang/runtime/matmul-transpose.cpp @@ -32,11 +32,6 @@ namespace { using namespace Fortran::runtime; -// Suppress the warnings about calling __host__-only std::complex operators, -// defined in C++ STD header files, from __device__ code. 
-RT_DIAG_PUSH -RT_DIAG_DISABLE_CALL_HOST_FROM_DEVICE_WARN - // Contiguous numeric TRANSPOSE(matrix)*matrix multiplication // TRANSPOSE(matrix(n, rows)) * matrix(n,cols) -> // matrix(rows, n) * matrix(n,cols) -> matrix(rows,cols) @@ -91,8 +86,6 @@ inline static RT_API_ATTRS void MatrixTransposedTimesMatrix( } } -RT_DIAG_POP - template inline static RT_API_ATTRS void MatrixTransposedTimesMatrixHelper( CppTypeFor *RESTRICT product, SubscriptValue rows, @@ -118,9 +111,6 @@ inline static RT_API_ATTRS void MatrixTransposedTimesMatrixHelper( } } -RT_DIAG_PUSH -RT_DIAG_DISABLE_CALL_HOST_FROM_DEVICE_WARN - // Contiguous numeric matrix*vector multiplication // matrix(rows,n) * column vector(n) -> column vector(rows) // Straightforward algorithm: @@ -158,8 +148,6 @@ inline static RT_API_ATTRS void MatrixTransposedTimesVector( } } -RT_DIAG_POP - template inline static RT_API_ATTRS void MatrixTransposedTimesVectorHelper( CppTypeFor *RESTRICT product, SubscriptValue rows, @@ -174,9 +162,6 @@ inline static RT_API_ATTRS void MatrixTransposedTimesVectorHelper( } } -RT_DIAG_PUSH -RT_DIAG_DISABLE_CALL_HOST_FROM_DEVICE_WARN - // Implements an instance of MATMUL for given argument types. template @@ -341,8 +326,6 @@ inline static RT_API_ATTRS void DoMatmulTranspose( } } -RT_DIAG_POP - template struct MatmulTransposeHelper { diff --git a/flang/runtime/matmul.cpp b/flang/runtime/matmul.cpp index 252557e2f9e7ad..a5737a9bc62075 100644 --- a/flang/runtime/matmul.cpp +++ b/flang/runtime/matmul.cpp @@ -31,11 +31,6 @@ namespace { using namespace Fortran::runtime; -// Suppress the warnings about calling __host__-only std::complex operators, -// defined in C++ STD header files, from __device__ code. -RT_DIAG_PUSH -RT_DIAG_DISABLE_CALL_HOST_FROM_DEVICE_WARN - // General accumulator for any type and stride; this is not used for // contiguous numeric cases. template @@ -112,8 +107,6 @@ inline RT_API_ATTRS void MatrixTimesMatrix( } } -RT_DIAG_POP - template inline RT_API_ATTRS void MatrixTimesMatrixHelper( CppTypeFor *RESTRICT product, SubscriptValue rows, @@ -139,9 +132,6 @@ inline RT_API_ATTRS void MatrixTimesMatrixHelper( } } -RT_DIAG_PUSH -RT_DIAG_DISABLE_CALL_HOST_FROM_DEVICE_WARN - // Contiguous numeric matrix*vector multiplication // matrix(rows,n) * column vector(n) -> column vector(rows) // Straightforward algorithm: @@ -179,8 +169,6 @@ inline RT_API_ATTRS void MatrixTimesVector( } } -RT_DIAG_POP - template inline RT_API_ATTRS void MatrixTimesVectorHelper( CppTypeFor *RESTRICT product, SubscriptValue rows, @@ -194,9 +182,6 @@ inline RT_API_ATTRS void MatrixTimesVectorHelper( } } -RT_DIAG_PUSH -RT_DIAG_DISABLE_CALL_HOST_FROM_DEVICE_WARN - // Contiguous numeric vector*matrix multiplication // row vector(n) * matrix(n,cols) -> row vector(cols) // Straightforward algorithm: @@ -235,8 +220,6 @@ inline RT_API_ATTRS void VectorTimesMatrix( } } -RT_DIAG_POP - template inline RT_API_ATTRS void VectorTimesMatrixHelper( @@ -251,9 +234,6 @@ inline RT_API_ATTRS void VectorTimesMatrixHelper( } } -RT_DIAG_PUSH -RT_DIAG_DISABLE_CALL_HOST_FROM_DEVICE_WARN - // Implements an instance of MATMUL for given argument types. template @@ -344,9 +324,9 @@ static inline RT_API_ATTRS void DoMatmul( // TODO: try using CUTLASS for device. 
} else if constexpr (std::is_same_v) { // TODO: call BLAS-3 DGEMM - } else if constexpr (std::is_same_v>) { + } else if constexpr (std::is_same_v>) { // TODO: call BLAS-3 CGEMM - } else if constexpr (std::is_same_v>) { + } else if constexpr (std::is_same_v>) { // TODO: call BLAS-3 ZGEMM } } @@ -361,9 +341,9 @@ static inline RT_API_ATTRS void DoMatmul( // TODO: call BLAS-2 SGEMV(x,y) } else if constexpr (std::is_same_v) { // TODO: call BLAS-2 DGEMV(x,y) - } else if constexpr (std::is_same_v>) { + } else if constexpr (std::is_same_v>) { // TODO: call BLAS-2 CGEMV(x,y) - } else if constexpr (std::is_same_v>) { + } else if constexpr (std::is_same_v>) { // TODO: call BLAS-2 ZGEMV(x,y) } } @@ -377,9 +357,9 @@ static inline RT_API_ATTRS void DoMatmul( // TODO: call BLAS-2 SGEMV(y,x) } else if constexpr (std::is_same_v) { // TODO: call BLAS-2 DGEMV(y,x) - } else if constexpr (std::is_same_v>) { + } else if constexpr (std::is_same_v>) { // TODO: call BLAS-2 CGEMV(y,x) - } else if constexpr (std::is_same_v>) { + } else if constexpr (std::is_same_v>) { // TODO: call BLAS-2 ZGEMV(y,x) } } @@ -441,8 +421,6 @@ static inline RT_API_ATTRS void DoMatmul( } } -RT_DIAG_POP - template struct MatmulHelper { diff --git a/flang/runtime/numeric.cpp b/flang/runtime/numeric.cpp index 9a8ddc6615564d..23f8da3f81f176 100644 --- a/flang/runtime/numeric.cpp +++ b/flang/runtime/numeric.cpp @@ -144,7 +144,7 @@ inline RT_API_ATTRS CppTypeFor SelectedRealKind( #ifdef FLANG_RUNTIME_NO_REAL_3 mask &= ~(1 << 3); #endif -#if LDBL_MANT_DIG < 64 || defined FLANG_RUNTIME_NO_REAL_10 +#if !HAS_FLOAT80 || defined FLANG_RUNTIME_NO_REAL_10 mask &= ~(1 << 10); #endif #if LDBL_MANT_DIG < 64 || defined FLANG_RUNTIME_NO_REAL_16 @@ -276,7 +276,7 @@ CppTypeFor RTDEF(Ceiling8_16)( return Ceiling>(x); } #endif -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 CppTypeFor RTDEF(Ceiling10_1)( CppTypeFor x) { return Ceiling>(x); @@ -332,7 +332,7 @@ CppTypeFor RTDEF(ErfcScaled8)( CppTypeFor x) { return ErfcScaled(x); } -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 CppTypeFor RTDEF(ErfcScaled10)( CppTypeFor x) { return ErfcScaled(x); @@ -361,7 +361,7 @@ CppTypeFor RTDEF(Exponent8_8)( CppTypeFor x) { return Exponent>(x); } -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 CppTypeFor RTDEF(Exponent10_4)( CppTypeFor x) { return Exponent>(x); @@ -416,7 +416,7 @@ CppTypeFor RTDEF(Floor8_16)( return Floor>(x); } #endif -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 CppTypeFor RTDEF(Floor10_1)( CppTypeFor x) { return Floor>(x); @@ -472,7 +472,7 @@ CppTypeFor RTDEF(Fraction8)( CppTypeFor x) { return Fraction(x); } -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 CppTypeFor RTDEF(Fraction10)( CppTypeFor x) { return Fraction(x); @@ -485,7 +485,7 @@ bool RTDEF(IsFinite4)(CppTypeFor x) { bool RTDEF(IsFinite8)(CppTypeFor x) { return std::isfinite(x); } -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 bool RTDEF(IsFinite10)(CppTypeFor x) { return std::isfinite(x); } @@ -501,7 +501,7 @@ bool RTDEF(IsNaN4)(CppTypeFor x) { bool RTDEF(IsNaN8)(CppTypeFor x) { return std::isnan(x); } -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 bool RTDEF(IsNaN10)(CppTypeFor x) { return std::isnan(x); } @@ -553,7 +553,7 @@ CppTypeFor RTDEF(ModReal8)( const char *sourceFile, int sourceLine) { return RealMod(x, p, sourceFile, sourceLine); } -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 CppTypeFor RTDEF(ModReal10)( CppTypeFor x, CppTypeFor p, const char *sourceFile, int sourceLine) { @@ -603,7 +603,7 @@ CppTypeFor RTDEF(ModuloReal8)( const char *sourceFile, int sourceLine) { return RealMod(x, p, sourceFile, sourceLine); } -#if 
LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 CppTypeFor RTDEF(ModuloReal10)( CppTypeFor x, CppTypeFor p, const char *sourceFile, int sourceLine) { @@ -619,7 +619,7 @@ CppTypeFor RTDEF(Nearest8)( CppTypeFor x, bool positive) { return Nearest<53>(x, positive); } -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 CppTypeFor RTDEF(Nearest10)( CppTypeFor x, bool positive) { return Nearest<64>(x, positive); @@ -670,7 +670,7 @@ CppTypeFor RTDEF(Nint8_16)( return Nint>(x); } #endif -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 CppTypeFor RTDEF(Nint10_1)( CppTypeFor x) { return Nint>(x); @@ -726,7 +726,7 @@ CppTypeFor RTDEF(RRSpacing8)( CppTypeFor x) { return RRSpacing<53>(x); } -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 CppTypeFor RTDEF(RRSpacing10)( CppTypeFor x) { return RRSpacing<64>(x); @@ -741,7 +741,7 @@ CppTypeFor RTDEF(SetExponent8)( CppTypeFor x, std::int64_t p) { return SetExponent(x, p); } -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 CppTypeFor RTDEF(SetExponent10)( CppTypeFor x, std::int64_t p) { return SetExponent(x, p); @@ -756,7 +756,7 @@ CppTypeFor RTDEF(Scale8)( CppTypeFor x, std::int64_t p) { return Scale(x, p); } -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 CppTypeFor RTDEF(Scale10)( CppTypeFor x, std::int64_t p) { return Scale(x, p); @@ -876,7 +876,7 @@ CppTypeFor RTDEF(Spacing8)( CppTypeFor x) { return Spacing<53>(x); } -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 CppTypeFor RTDEF(Spacing10)( CppTypeFor x) { return Spacing<64>(x); @@ -893,7 +893,7 @@ CppTypeFor RTDEF(FPow8i)( CppTypeFor e) { return FPowI(b, e); } -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 CppTypeFor RTDEF(FPow10i)( CppTypeFor b, CppTypeFor e) { @@ -918,7 +918,7 @@ CppTypeFor RTDEF(FPow8k)( CppTypeFor e) { return FPowI(b, e); } -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 CppTypeFor RTDEF(FPow10k)( CppTypeFor b, CppTypeFor e) { diff --git a/flang/runtime/product.cpp b/flang/runtime/product.cpp index 7fc0fcd3b107de..39b40d82b05401 100644 --- a/flang/runtime/product.cpp +++ b/flang/runtime/product.cpp @@ -36,16 +36,11 @@ template class NonComplexProductAccumulator { INTERMEDIATE product_{1}; }; -// Suppress the warnings about calling __host__-only std::complex operators, -// defined in C++ STD header files, from __device__ code. 
-RT_DIAG_PUSH -RT_DIAG_DISABLE_CALL_HOST_FROM_DEVICE_WARN - template class ComplexProductAccumulator { public: explicit RT_API_ATTRS ComplexProductAccumulator(const Descriptor &array) : array_{array} {} - RT_API_ATTRS void Reinitialize() { product_ = std::complex{1, 0}; } + RT_API_ATTRS void Reinitialize() { product_ = rtcmplx::complex{1, 0}; } template RT_API_ATTRS void GetResult(A *p, int /*zeroBasedDim*/ = -1) const { using ResultPart = typename A::value_type; @@ -60,11 +55,9 @@ template class ComplexProductAccumulator { private: const Descriptor &array_; - std::complex product_{1, 0}; + rtcmplx::complex product_{1, 0}; }; -RT_DIAG_POP - extern "C" { RT_EXT_API_GROUP_BEGIN @@ -116,7 +109,7 @@ CppTypeFor RTDEF(ProductReal8)(const Descriptor &x, NonComplexProductAccumulator>{x}, "PRODUCT"); } -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 CppTypeFor RTDEF(ProductReal10)(const Descriptor &x, const char *source, int line, int dim, const Descriptor *mask) { return GetTotalReduction(x, source, line, dim, mask, @@ -147,7 +140,7 @@ void RTDEF(CppProductComplex8)(CppTypeFor &result, mask, ComplexProductAccumulator>{x}, "PRODUCT"); } -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 void RTDEF(CppProductComplex10)(CppTypeFor &result, const Descriptor &x, const char *source, int line, int dim, const Descriptor *mask) { diff --git a/flang/runtime/random.cpp b/flang/runtime/random.cpp index 69de9b8c96fb5d..9ec961fd058745 100644 --- a/flang/runtime/random.cpp +++ b/flang/runtime/random.cpp @@ -66,7 +66,7 @@ void RTNAME(RandomNumber)( return; case 10: if constexpr (HasCppTypeFor) { -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 Generate, 64>(harvest); return; #endif diff --git a/flang/runtime/reduce.cpp b/flang/runtime/reduce.cpp index 2f4bb6ea159cf4..6b62e1cf1e76f1 100644 --- a/flang/runtime/reduce.cpp +++ b/flang/runtime/reduce.cpp @@ -395,45 +395,49 @@ void RTDEF(ReduceReal8DimValue)(Descriptor &result, const Descriptor &array, PartialReduction(result, array, array.ElementBytes(), dim, mask, terminator, "REDUCE", accumulator); } -#if LDBL_MANT_DIG == 64 -long double RTDEF(ReduceReal10Ref)(const Descriptor &array, - ReferenceReductionOperation operation, const char *source, - int line, int dim, const Descriptor *mask, const long double *identity, - bool ordered) { +#if HAS_FLOAT80 +CppTypeFor RTDEF(ReduceReal10Ref)( + const Descriptor &array, + ReferenceReductionOperation> operation, + const char *source, int line, int dim, const Descriptor *mask, + const CppTypeFor *identity, bool ordered) { Terminator terminator{source, line}; return GetTotalReduction(array, source, line, dim, mask, - ReduceAccumulator{ + ReduceAccumulator, false>{ array, operation, identity, terminator}, "REDUCE"); } -long double RTDEF(ReduceReal10Value)(const Descriptor &array, - ValueReductionOperation operation, const char *source, - int line, int dim, const Descriptor *mask, const long double *identity, - bool ordered) { +CppTypeFor RTDEF(ReduceReal10Value)( + const Descriptor &array, + ValueReductionOperation> operation, + const char *source, int line, int dim, const Descriptor *mask, + const CppTypeFor *identity, bool ordered) { Terminator terminator{source, line}; return GetTotalReduction(array, source, line, dim, mask, - ReduceAccumulator{ + ReduceAccumulator, true>{ array, operation, identity, terminator}, "REDUCE"); } void RTDEF(ReduceReal10DimRef)(Descriptor &result, const Descriptor &array, - ReferenceReductionOperation operation, const char *source, - int line, int dim, const Descriptor *mask, const long double *identity, - bool 
ordered) { + ReferenceReductionOperation> operation, + const char *source, int line, int dim, const Descriptor *mask, + const CppTypeFor *identity, bool ordered) { Terminator terminator{source, line}; - using Accumulator = ReduceAccumulator; + using Accumulator = + ReduceAccumulator, false>; Accumulator accumulator{array, operation, identity, terminator}; PartialReduction(result, array, array.ElementBytes(), dim, mask, terminator, "REDUCE", accumulator); } void RTDEF(ReduceReal10DimValue)(Descriptor &result, const Descriptor &array, - ValueReductionOperation operation, const char *source, - int line, int dim, const Descriptor *mask, const long double *identity, - bool ordered) { + ValueReductionOperation> operation, + const char *source, int line, int dim, const Descriptor *mask, + const CppTypeFor *identity, bool ordered) { Terminator terminator{source, line}; - using Accumulator = ReduceAccumulator; + using Accumulator = + ReduceAccumulator, true>; Accumulator accumulator{array, operation, identity, terminator}; PartialReduction(result, array, array.ElementBytes(), dim, mask, terminator, "REDUCE", accumulator); @@ -484,187 +488,199 @@ void RTDEF(ReduceReal16DimValue)(Descriptor &result, const Descriptor &array, } #endif -void RTDEF(CppReduceComplex4Ref)(std::complex &result, +void RTDEF(CppReduceComplex4Ref)(CppTypeFor &result, const Descriptor &array, - ReferenceReductionOperation> operation, + ReferenceReductionOperation> operation, const char *source, int line, int dim, const Descriptor *mask, - const std::complex *identity, bool ordered) { + const CppTypeFor *identity, bool ordered) { Terminator terminator{source, line}; result = GetTotalReduction(array, source, line, dim, mask, - ReduceAccumulator, false>{ + ReduceAccumulator, false>{ array, operation, identity, terminator}, "REDUCE"); } -void RTDEF(CppReduceComplex4Value)(std::complex &result, +void RTDEF(CppReduceComplex4Value)(CppTypeFor &result, const Descriptor &array, - ValueReductionOperation> operation, const char *source, - int line, int dim, const Descriptor *mask, - const std::complex *identity, bool ordered) { + ValueReductionOperation> operation, + const char *source, int line, int dim, const Descriptor *mask, + const CppTypeFor *identity, bool ordered) { Terminator terminator{source, line}; result = GetTotalReduction(array, source, line, dim, mask, - ReduceAccumulator, true>{ + ReduceAccumulator, true>{ array, operation, identity, terminator}, "REDUCE"); } void RTDEF(CppReduceComplex4DimRef)(Descriptor &result, const Descriptor &array, - ReferenceReductionOperation> operation, + ReferenceReductionOperation> operation, const char *source, int line, int dim, const Descriptor *mask, - const std::complex *identity, bool ordered) { + const CppTypeFor *identity, bool ordered) { Terminator terminator{source, line}; - using Accumulator = ReduceAccumulator, false>; + using Accumulator = + ReduceAccumulator, false>; Accumulator accumulator{array, operation, identity, terminator}; PartialReduction(result, array, array.ElementBytes(), dim, mask, terminator, "REDUCE", accumulator); } void RTDEF(CppReduceComplex4DimValue)(Descriptor &result, const Descriptor &array, - ValueReductionOperation> operation, const char *source, - int line, int dim, const Descriptor *mask, - const std::complex *identity, bool ordered) { + ValueReductionOperation> operation, + const char *source, int line, int dim, const Descriptor *mask, + const CppTypeFor *identity, bool ordered) { Terminator terminator{source, line}; - using Accumulator = 
ReduceAccumulator, true>; + using Accumulator = + ReduceAccumulator, true>; Accumulator accumulator{array, operation, identity, terminator}; PartialReduction(result, array, array.ElementBytes(), dim, mask, terminator, "REDUCE", accumulator); } -void RTDEF(CppReduceComplex8Ref)(std::complex &result, +void RTDEF(CppReduceComplex8Ref)(CppTypeFor &result, const Descriptor &array, - ReferenceReductionOperation> operation, + ReferenceReductionOperation> operation, const char *source, int line, int dim, const Descriptor *mask, - const std::complex *identity, bool ordered) { + const CppTypeFor *identity, bool ordered) { Terminator terminator{source, line}; result = GetTotalReduction(array, source, line, dim, mask, - ReduceAccumulator, false>{ + ReduceAccumulator, false>{ array, operation, identity, terminator}, "REDUCE"); } -void RTDEF(CppReduceComplex8Value)(std::complex &result, +void RTDEF(CppReduceComplex8Value)(CppTypeFor &result, const Descriptor &array, - ValueReductionOperation> operation, const char *source, - int line, int dim, const Descriptor *mask, - const std::complex *identity, bool ordered) { + ValueReductionOperation> operation, + const char *source, int line, int dim, const Descriptor *mask, + const CppTypeFor *identity, bool ordered) { Terminator terminator{source, line}; result = GetTotalReduction(array, source, line, dim, mask, - ReduceAccumulator, true>{ + ReduceAccumulator, true>{ array, operation, identity, terminator}, "REDUCE"); } void RTDEF(CppReduceComplex8DimRef)(Descriptor &result, const Descriptor &array, - ReferenceReductionOperation> operation, + ReferenceReductionOperation> operation, const char *source, int line, int dim, const Descriptor *mask, - const std::complex *identity, bool ordered) { + const CppTypeFor *identity, bool ordered) { Terminator terminator{source, line}; - using Accumulator = ReduceAccumulator, false>; + using Accumulator = + ReduceAccumulator, false>; Accumulator accumulator{array, operation, identity, terminator}; PartialReduction(result, array, array.ElementBytes(), dim, mask, terminator, "REDUCE", accumulator); } void RTDEF(CppReduceComplex8DimValue)(Descriptor &result, const Descriptor &array, - ValueReductionOperation> operation, const char *source, - int line, int dim, const Descriptor *mask, - const std::complex *identity, bool ordered) { + ValueReductionOperation> operation, + const char *source, int line, int dim, const Descriptor *mask, + const CppTypeFor *identity, bool ordered) { Terminator terminator{source, line}; - using Accumulator = ReduceAccumulator, true>; + using Accumulator = + ReduceAccumulator, true>; Accumulator accumulator{array, operation, identity, terminator}; PartialReduction(result, array, array.ElementBytes(), dim, mask, terminator, "REDUCE", accumulator); } -#if LDBL_MANT_DIG == 64 -void RTDEF(CppReduceComplex10Ref)(std::complex &result, +#if HAS_FLOAT80 +void RTDEF(CppReduceComplex10Ref)(CppTypeFor &result, const Descriptor &array, - ReferenceReductionOperation> operation, + ReferenceReductionOperation> + operation, const char *source, int line, int dim, const Descriptor *mask, - const std::complex *identity, bool ordered) { + const CppTypeFor *identity, bool ordered) { Terminator terminator{source, line}; result = GetTotalReduction(array, source, line, dim, mask, - ReduceAccumulator, false>{ + ReduceAccumulator, false>{ array, operation, identity, terminator}, "REDUCE"); } -void RTDEF(CppReduceComplex10Value)(std::complex &result, - const Descriptor &array, - ValueReductionOperation> operation, +void 
RTDEF(CppReduceComplex10Value)( + CppTypeFor &result, const Descriptor &array, + ValueReductionOperation> operation, const char *source, int line, int dim, const Descriptor *mask, - const std::complex *identity, bool ordered) { + const CppTypeFor *identity, bool ordered) { Terminator terminator{source, line}; result = GetTotalReduction(array, source, line, dim, mask, - ReduceAccumulator, true>{ + ReduceAccumulator, true>{ array, operation, identity, terminator}, "REDUCE"); } void RTDEF(CppReduceComplex10DimRef)(Descriptor &result, const Descriptor &array, - ReferenceReductionOperation> operation, + ReferenceReductionOperation> + operation, const char *source, int line, int dim, const Descriptor *mask, - const std::complex *identity, bool ordered) { + const CppTypeFor *identity, bool ordered) { Terminator terminator{source, line}; - using Accumulator = ReduceAccumulator, false>; + using Accumulator = + ReduceAccumulator, false>; Accumulator accumulator{array, operation, identity, terminator}; PartialReduction(result, array, array.ElementBytes(), dim, mask, terminator, "REDUCE", accumulator); } void RTDEF(CppReduceComplex10DimValue)(Descriptor &result, const Descriptor &array, - ValueReductionOperation> operation, + ValueReductionOperation> operation, const char *source, int line, int dim, const Descriptor *mask, - const std::complex *identity, bool ordered) { + const CppTypeFor *identity, bool ordered) { Terminator terminator{source, line}; - using Accumulator = ReduceAccumulator, true>; + using Accumulator = + ReduceAccumulator, true>; Accumulator accumulator{array, operation, identity, terminator}; PartialReduction(result, array, array.ElementBytes(), dim, mask, terminator, "REDUCE", accumulator); } #endif #if LDBL_MANT_DIG == 113 || HAS_FLOAT128 -void RTDEF(CppReduceComplex16Ref)(std::complex &result, +void RTDEF(CppReduceComplex16Ref)(CppTypeFor &result, const Descriptor &array, - ReferenceReductionOperation> operation, + ReferenceReductionOperation> + operation, const char *source, int line, int dim, const Descriptor *mask, - const std::complex *identity, bool ordered) { + const CppTypeFor *identity, bool ordered) { Terminator terminator{source, line}; result = GetTotalReduction(array, source, line, dim, mask, - ReduceAccumulator, false>{ + ReduceAccumulator, false>{ array, operation, identity, terminator}, "REDUCE"); } -void RTDEF(CppReduceComplex16Value)(std::complex &result, - const Descriptor &array, - ValueReductionOperation> operation, +void RTDEF(CppReduceComplex16Value)( + CppTypeFor &result, const Descriptor &array, + ValueReductionOperation> operation, const char *source, int line, int dim, const Descriptor *mask, - const std::complex *identity, bool ordered) { + const CppTypeFor *identity, bool ordered) { Terminator terminator{source, line}; result = GetTotalReduction(array, source, line, dim, mask, - ReduceAccumulator, true>{ + ReduceAccumulator, true>{ array, operation, identity, terminator}, "REDUCE"); } void RTDEF(CppReduceComplex16DimRef)(Descriptor &result, const Descriptor &array, - ReferenceReductionOperation> operation, + ReferenceReductionOperation> + operation, const char *source, int line, int dim, const Descriptor *mask, - const std::complex *identity, bool ordered) { + const CppTypeFor *identity, bool ordered) { Terminator terminator{source, line}; - using Accumulator = ReduceAccumulator, false>; + using Accumulator = + ReduceAccumulator, false>; Accumulator accumulator{array, operation, identity, terminator}; PartialReduction(result, array, 
array.ElementBytes(), dim, mask, terminator, "REDUCE", accumulator); } void RTDEF(CppReduceComplex16DimValue)(Descriptor &result, const Descriptor &array, - ValueReductionOperation> operation, + ValueReductionOperation> operation, const char *source, int line, int dim, const Descriptor *mask, - const std::complex *identity, bool ordered) { + const CppTypeFor *identity, bool ordered) { Terminator terminator{source, line}; - using Accumulator = ReduceAccumulator, true>; + using Accumulator = + ReduceAccumulator, true>; Accumulator accumulator{array, operation, identity, terminator}; PartialReduction(result, array, array.ElementBytes(), dim, mask, terminator, "REDUCE", accumulator); diff --git a/flang/runtime/reduction-templates.h b/flang/runtime/reduction-templates.h index a51404c9637620..6b7d57f98384ae 100644 --- a/flang/runtime/reduction-templates.h +++ b/flang/runtime/reduction-templates.h @@ -321,8 +321,8 @@ RT_VAR_GROUP_BEGIN static constexpr RT_CONST_VAR_ATTRS int Norm2LargestLDKind { #if LDBL_MANT_DIG == 113 || HAS_FLOAT128 16 -#elif LDBL_MANT_DIG == 64 - 10 +#elif HAS_FLOAT80 + 10 #else 8 #endif diff --git a/flang/runtime/sum.cpp b/flang/runtime/sum.cpp index 63d8c9029a0ef5..88c6c914e1e243 100644 --- a/flang/runtime/sum.cpp +++ b/flang/runtime/sum.cpp @@ -141,18 +141,18 @@ CppTypeFor RTDEF(SumReal8)(const Descriptor &x, return GetTotalReduction( x, source, line, dim, mask, RealSumAccumulator{x}, "SUM"); } -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 CppTypeFor RTDEF(SumReal10)(const Descriptor &x, const char *source, int line, int dim, const Descriptor *mask) { - return GetTotalReduction( - x, source, line, dim, mask, RealSumAccumulator{x}, "SUM"); + return GetTotalReduction(x, source, line, dim, mask, + RealSumAccumulator>{x}, "SUM"); } #endif #if LDBL_MANT_DIG == 113 || HAS_FLOAT128 CppTypeFor RTDEF(SumReal16)(const Descriptor &x, const char *source, int line, int dim, const Descriptor *mask) { - return GetTotalReduction( - x, source, line, dim, mask, RealSumAccumulator{x}, "SUM"); + return GetTotalReduction(x, source, line, dim, mask, + RealSumAccumulator>{x}, "SUM"); } #endif @@ -168,20 +168,22 @@ void RTDEF(CppSumComplex8)(CppTypeFor &result, result = GetTotalReduction( x, source, line, dim, mask, ComplexSumAccumulator{x}, "SUM"); } -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 void RTDEF(CppSumComplex10)(CppTypeFor &result, const Descriptor &x, const char *source, int line, int dim, const Descriptor *mask) { - result = GetTotalReduction( - x, source, line, dim, mask, ComplexSumAccumulator{x}, "SUM"); + result = + GetTotalReduction(x, source, line, dim, mask, + ComplexSumAccumulator>{x}, "SUM"); } #endif #if LDBL_MANT_DIG == 113 || HAS_FLOAT128 void RTDEF(CppSumComplex16)(CppTypeFor &result, const Descriptor &x, const char *source, int line, int dim, const Descriptor *mask) { - result = GetTotalReduction( - x, source, line, dim, mask, ComplexSumAccumulator{x}, "SUM"); + result = + GetTotalReduction(x, source, line, dim, mask, + ComplexSumAccumulator>{x}, "SUM"); } #endif diff --git a/flang/runtime/transformational.cpp b/flang/runtime/transformational.cpp index b6b204be4418c9..0ce18171274e42 100644 --- a/flang/runtime/transformational.cpp +++ b/flang/runtime/transformational.cpp @@ -342,7 +342,7 @@ void RTDEF(BesselJn_8)(Descriptor &result, int32_t n1, int32_t n2, result, n1, n2, x, bn2, bn2_1, sourceFile, line); } -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 void RTDEF(BesselJn_10)(Descriptor &result, int32_t n1, int32_t n2, CppTypeFor x, CppTypeFor bn2, @@ -375,7 +375,7 @@ void 
RTDEF(BesselJnX0_8)(Descriptor &result, int32_t n1, int32_t n2, DoBesselJnX0(result, n1, n2, sourceFile, line); } -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 void RTDEF(BesselJnX0_10)(Descriptor &result, int32_t n1, int32_t n2, const char *sourceFile, int line) { DoBesselJnX0(result, n1, n2, sourceFile, line); @@ -405,7 +405,7 @@ void RTDEF(BesselYn_8)(Descriptor &result, int32_t n1, int32_t n2, result, n1, n2, x, bn1, bn1_1, sourceFile, line); } -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 void RTDEF(BesselYn_10)(Descriptor &result, int32_t n1, int32_t n2, CppTypeFor x, CppTypeFor bn1, @@ -438,7 +438,7 @@ void RTDEF(BesselYnX0_8)(Descriptor &result, int32_t n1, int32_t n2, DoBesselYnX0(result, n1, n2, sourceFile, line); } -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 void RTDEF(BesselYnX0_10)(Descriptor &result, int32_t n1, int32_t n2, const char *sourceFile, int line) { DoBesselYnX0(result, n1, n2, sourceFile, line); diff --git a/flang/unittests/Runtime/Numeric.cpp b/flang/unittests/Runtime/Numeric.cpp index 799756aab3839a..3e574c06b091e8 100644 --- a/flang/unittests/Runtime/Numeric.cpp +++ b/flang/unittests/Runtime/Numeric.cpp @@ -34,7 +34,7 @@ TEST(Numeric, Floor) { TEST(Numeric, Erfc_scaled) { EXPECT_NEAR(RTNAME(ErfcScaled4)(Real<4>{20.0}), 0.02817434874, 1.0e-8); EXPECT_NEAR(RTNAME(ErfcScaled8)(Real<8>{20.0}), 0.02817434874, 1.0e-11); -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 EXPECT_NEAR(RTNAME(ErfcScaled10)(Real<10>{20.0}), 0.02817434874, 1.0e-8); #endif } @@ -295,7 +295,7 @@ TEST(Numeric, FPowI) { EXPECT_EQ(RTNAME(FPow8k)(Real<8>{-3}, Int<8>{3}), Real<8>{-27}); EXPECT_EQ(RTNAME(FPow8k)(Real<8>{-2}, Int<8>{-3}), Real<8>{-0.125}); -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 EXPECT_EQ(RTNAME(FPow10i)(Real<10>{0}, Int<4>{0}), Real<10>{1}); EXPECT_EQ(RTNAME(FPow10i)(Real<10>{0.3}, Int<4>{0}), Real<10>{1}); EXPECT_EQ(RTNAME(FPow10i)(Real<10>{2}, Int<4>{-1}), Real<10>{0.5}); diff --git a/flang/unittests/Runtime/Transformational.cpp b/flang/unittests/Runtime/Transformational.cpp index 5836e70c740f9a..b36ea0a60c670c 100644 --- a/flang/unittests/Runtime/Transformational.cpp +++ b/flang/unittests/Runtime/Transformational.cpp @@ -108,7 +108,7 @@ template static void testBesselJnX0(BesselX0FuncType rtFunc) { static void testBesselJn() { testBesselJn<4>(RTNAME(BesselJn_4)); testBesselJn<8>(RTNAME(BesselJn_8)); -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 testBesselJn<10>(RTNAME(BesselJn_10)); #endif #if LDBL_MANT_DIG == 113 || HAS_FLOAT128 @@ -117,7 +117,7 @@ static void testBesselJn() { testBesselJnX0<4>(RTNAME(BesselJnX0_4)); testBesselJnX0<8>(RTNAME(BesselJnX0_8)); -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 testBesselJnX0<10>(RTNAME(BesselJnX0_10)); #endif #if LDBL_MANT_DIG == 113 || HAS_FLOAT128 @@ -201,7 +201,7 @@ template static void testBesselYnX0(BesselX0FuncType rtFunc) { static void testBesselYn() { testBesselYn<4>(RTNAME(BesselYn_4)); testBesselYn<8>(RTNAME(BesselYn_8)); -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 testBesselYn<10>(RTNAME(BesselYn_10)); #endif #if LDBL_MANT_DIG == 113 || HAS_FLOAT128 @@ -210,7 +210,7 @@ static void testBesselYn() { testBesselYnX0<4>(RTNAME(BesselYnX0_4)); testBesselYnX0<8>(RTNAME(BesselYnX0_8)); -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 testBesselYnX0<10>(RTNAME(BesselYnX0_10)); #endif #if LDBL_MANT_DIG == 113 || HAS_FLOAT128 @@ -523,7 +523,7 @@ TEST(Transformational, Unpack) { result.Destroy(); } -#if LDBL_MANT_DIG == 64 +#if HAS_FLOAT80 // Make sure the destination descriptor is created by the runtime // with proper element size, when REAL*10 maps to 'long double'. 
#define Real10CppType long double From 9f5139ccee0ca6134edeb61d15dd4ae123f5149d Mon Sep 17 00:00:00 2001 From: Vasileios Porpodas Date: Wed, 18 Sep 2024 17:55:57 -0700 Subject: [PATCH 138/321] [SandboxIR] Fix unused variable build error --- llvm/unittests/SandboxIR/SandboxIRTest.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/unittests/SandboxIR/SandboxIRTest.cpp b/llvm/unittests/SandboxIR/SandboxIRTest.cpp index 312705caad1a6e..1c946d6cb05a82 100644 --- a/llvm/unittests/SandboxIR/SandboxIRTest.cpp +++ b/llvm/unittests/SandboxIR/SandboxIRTest.cpp @@ -1751,7 +1751,6 @@ define void @foo(i8 %v1, ptr %ptr) { llvm::Function *LLVMF = &*M->getFunction("foo"); sandboxir::Context Ctx(C); sandboxir::Function *F = Ctx.createFunction(LLVMF); - auto *Arg = F->getArg(0); auto *BB = &*F->begin(); auto It = BB->begin(); auto *Add0 = cast(&*It++); From 615bd9ee60ca213d0e93a7ddc5c1bf48418952e9 Mon Sep 17 00:00:00 2001 From: jimingham Date: Wed, 18 Sep 2024 18:10:43 -0700 Subject: [PATCH 139/321] Add docs and an example use of the scripted command get_flags API. (#109176) The API is present, and we even have a test for it, but it isn't documented so no one probably knows you can set requirements for your scripted commands. This just adds docs and uses it appropriately in the `framestats` example command. --- lldb/docs/use/python-reference.rst | 6 ++++++ lldb/examples/python/cmdtemplate.py | 3 +++ 2 files changed, 9 insertions(+) diff --git a/lldb/docs/use/python-reference.rst b/lldb/docs/use/python-reference.rst index 041e541a96f083..b12048f1af067d 100644 --- a/lldb/docs/use/python-reference.rst +++ b/lldb/docs/use/python-reference.rst @@ -562,6 +562,12 @@ which should implement the following interface: this call should return the short help text for this command[1] def get_long_help(self): this call should return the long help text for this command[1] + def get_flags(self): + this will be called when the command is added to the command interpreter, + and should return a flag field made from or-ing together the appropriate + elements of the lldb.CommandFlags enum to specify the requirements of this command. + The CommandInterpreter will make sure all these requirements are met, and will + return the standard lldb error if they are not.[1] def get_repeat_command(self, command): The auto-repeat command is what will get executed when the user types just a return at the next prompt after this command is run. Even if your command diff --git a/lldb/examples/python/cmdtemplate.py b/lldb/examples/python/cmdtemplate.py index 9a96888508b6f2..b6a21cba7113e6 100644 --- a/lldb/examples/python/cmdtemplate.py +++ b/lldb/examples/python/cmdtemplate.py @@ -25,6 +25,9 @@ def register_lldb_command(cls, debugger, module_name): '--help" for detailed help.'.format(cls.program) ) + def get_flags(self): + return lldb.eCommandRequiresFrame | lldb.eCommandProcessMustBePaused + def setup_command_definition(self): self.ov_parser.add_option( From d21a43579e36af4aa90bf541aa8bab33e7500297 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 18 Sep 2024 17:24:43 -0700 Subject: [PATCH 140/321] [LegalizeVectorOps][RISCV] Don't scalarize FNEG in ExpandFNEG if FSUB is marked Promote. We have a special check that tries to determine if vector FP operations are supported for the type to determine whether to scalarize or not. If FP arithmetic would be promoted, don't unroll. This improves Zvfhmin codegen on RISC-V. 
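To make the one-line predicate change concrete before the diff: expanding FNEG only ever executes an integer XOR against the sign-bit mask, so an FSUB that would merely be *promoted* (as v8f16 FSUB is under Zvfhmin, to v8f32) is no reason to scalarize. The toy model below is a sketch under stated assumptions — ToyTLI-style helpers, canExpandFNEG, and the table values are invented for illustration, not LLVM API; the real check is the isOperationLegalOrCustom -> isOperationLegalOrCustomOrPromote change in VectorLegalizer::ExpandFNEG shown in the hunk that follows.

#include <cstdint>

// Toy model of LLVM's LegalizeAction answer for one (opcode, type) query.
enum class LegalizeAction { Legal, Custom, Promote, Expand };

static bool legalOrCustom(LegalizeAction a) {
  return a == LegalizeAction::Legal || a == LegalizeAction::Custom;
}
static bool legalOrCustomOrPromote(LegalizeAction a) {
  return legalOrCustom(a) || a == LegalizeAction::Promote;
}

// Post-patch condition: expand FNEG as a sign-bit XOR unless the integer
// XOR is unavailable or FP arithmetic on the type is truly absent.
static bool canExpandFNEG(LegalizeAction xorOnIntVT, LegalizeAction fsubOnVT,
    bool scalableVT) {
  return legalOrCustom(xorOnIntVT) &&
      (legalOrCustomOrPromote(fsubOnVT) || scalableVT);
}

// The expansion itself, modeled per f16 lane: flip bit 15, which is what
// the vectorized vxor.vx in the updated test does across the register.
static uint16_t fnegF16Bits(uint16_t bits) { return bits ^ 0x8000u; }

int main() {
  // Zvfhmin-like setting (values assumed for illustration): XOR on v8i16
  // is Legal, FSUB on v8f16 is Promote. The old check scalarized; the new
  // one expands, and the negation is a pure bit flip.
  (void)fnegF16Bits(0x3C00u); // +1.0h -> -1.0h
  return canExpandFNEG(LegalizeAction::Legal, LegalizeAction::Promote,
             /*scalableVT=*/false)
      ? 0
      : 1;
}

The effect is visible in the test churn below: each scalar flh/fmv.x.h/xor/vslide1down sequence collapses into a single vxor.vx over the whole vector.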
--- .../SelectionDAG/LegalizeVectorOps.cpp | 2 +- .../CodeGen/RISCV/rvv/fixed-vectors-fp.ll | 1509 ++++++----------- 2 files changed, 539 insertions(+), 972 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index 3dc5affacc5a76..5d433204d5da08 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -1806,7 +1806,7 @@ SDValue VectorLegalizer::ExpandFNEG(SDNode *Node) { // FIXME: The FSUB check is here to force unrolling v1f64 vectors on AArch64. if (!TLI.isOperationLegalOrCustom(ISD::XOR, IntVT) || - !(TLI.isOperationLegalOrCustom(ISD::FSUB, VT) || VT.isScalableVector())) + !(TLI.isOperationLegalOrCustomOrPromote(ISD::FSUB, VT) || VT.isScalableVector())) return SDValue(); SDLoc DL(Node); diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll index b5c40fbfaac6c9..5ab8eab091c2e4 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll @@ -428,50 +428,11 @@ define void @fneg_v8f16(ptr %x) { ; ; ZVFHMIN-LABEL: fneg_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: addi sp, sp, -16 -; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vle16.v v8, (a0) -; ZVFHMIN-NEXT: mv a1, sp -; ZVFHMIN-NEXT: vse16.v v8, (a1) -; ZVFHMIN-NEXT: flh fa5, 2(sp) -; ZVFHMIN-NEXT: flh fa4, 0(sp) -; ZVFHMIN-NEXT: flh fa3, 4(sp) -; ZVFHMIN-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-NEXT: fmv.x.h a2, fa4 -; ZVFHMIN-NEXT: lui a3, 1048568 -; ZVFHMIN-NEXT: fmv.x.h a4, fa3 -; ZVFHMIN-NEXT: flh fa5, 6(sp) -; ZVFHMIN-NEXT: lui a5, 8 -; ZVFHMIN-NEXT: xor a2, a2, a5 -; ZVFHMIN-NEXT: vmv.v.x v8, a2 -; ZVFHMIN-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-NEXT: flh fa5, 10(sp) -; ZVFHMIN-NEXT: xor a1, a1, a3 -; ZVFHMIN-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-NEXT: xor a4, a4, a3 -; ZVFHMIN-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-NEXT: flh fa5, 8(sp) -; ZVFHMIN-NEXT: vslide1down.vx v8, v8, a4 -; ZVFHMIN-NEXT: xor a2, a2, a3 -; ZVFHMIN-NEXT: vslide1down.vx v8, v8, a2 -; ZVFHMIN-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-NEXT: flh fa5, 12(sp) -; ZVFHMIN-NEXT: xor a1, a1, a3 -; ZVFHMIN-NEXT: xor a2, a2, a5 -; ZVFHMIN-NEXT: vmv.v.x v9, a2 -; ZVFHMIN-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-NEXT: flh fa5, 14(sp) -; ZVFHMIN-NEXT: vslide1down.vx v9, v9, a1 -; ZVFHMIN-NEXT: xor a2, a2, a3 -; ZVFHMIN-NEXT: vslide1down.vx v9, v9, a2 -; ZVFHMIN-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-NEXT: xor a1, a1, a3 -; ZVFHMIN-NEXT: vmv.v.i v0, 15 -; ZVFHMIN-NEXT: vslide1down.vx v9, v9, a1 -; ZVFHMIN-NEXT: vslidedown.vi v9, v8, 4, v0.t -; ZVFHMIN-NEXT: vse16.v v9, (a0) -; ZVFHMIN-NEXT: addi sp, sp, 16 +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 +; ZVFHMIN-NEXT: vse16.v v8, (a0) ; ZVFHMIN-NEXT: ret %a = load <8 x half>, ptr %x %b = fneg <8 x half> %a @@ -490,52 +451,11 @@ define void @fneg_v6f16(ptr %x) { ; ; ZVFHMIN-LABEL: fneg_v6f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: addi sp, sp, -16 -; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 ; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vle16.v v8, (a0) -; ZVFHMIN-NEXT: mv a1, sp -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vse16.v v8, (a1) -; ZVFHMIN-NEXT: flh fa5, 2(sp) -; ZVFHMIN-NEXT: flh fa4, 0(sp) -; ZVFHMIN-NEXT: flh fa3, 4(sp) -; ZVFHMIN-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-NEXT: fmv.x.h a2, fa4 -; ZVFHMIN-NEXT: lui a3, 1048568 -; 
ZVFHMIN-NEXT: fmv.x.h a4, fa3 -; ZVFHMIN-NEXT: flh fa5, 6(sp) -; ZVFHMIN-NEXT: lui a5, 8 -; ZVFHMIN-NEXT: xor a2, a2, a5 -; ZVFHMIN-NEXT: vmv.v.x v8, a2 -; ZVFHMIN-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-NEXT: flh fa5, 10(sp) -; ZVFHMIN-NEXT: xor a1, a1, a3 -; ZVFHMIN-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-NEXT: xor a4, a4, a3 -; ZVFHMIN-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-NEXT: flh fa5, 8(sp) -; ZVFHMIN-NEXT: vslide1down.vx v8, v8, a4 -; ZVFHMIN-NEXT: xor a2, a2, a3 -; ZVFHMIN-NEXT: vslide1down.vx v8, v8, a2 -; ZVFHMIN-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-NEXT: flh fa5, 12(sp) -; ZVFHMIN-NEXT: xor a1, a1, a3 -; ZVFHMIN-NEXT: xor a2, a2, a5 -; ZVFHMIN-NEXT: vmv.v.x v9, a2 -; ZVFHMIN-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-NEXT: flh fa5, 14(sp) -; ZVFHMIN-NEXT: vslide1down.vx v9, v9, a1 -; ZVFHMIN-NEXT: xor a2, a2, a3 -; ZVFHMIN-NEXT: vslide1down.vx v9, v9, a2 -; ZVFHMIN-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-NEXT: xor a1, a1, a3 -; ZVFHMIN-NEXT: vmv.v.i v0, 15 -; ZVFHMIN-NEXT: vslide1down.vx v9, v9, a1 -; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, mu -; ZVFHMIN-NEXT: vslidedown.vi v9, v8, 4, v0.t -; ZVFHMIN-NEXT: vse16.v v9, (a0) -; ZVFHMIN-NEXT: addi sp, sp, 16 +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 +; ZVFHMIN-NEXT: vse16.v v8, (a0) ; ZVFHMIN-NEXT: ret %a = load <6 x half>, ptr %x %b = fneg <6 x half> %a @@ -2013,48 +1933,50 @@ define void @copysign_neg_v8f16(ptr %x, ptr %y) { ; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, mu ; ZVFHMIN-ZFH-RV32-NEXT: vle16.v v8, (a1) ; ZVFHMIN-ZFH-RV32-NEXT: vle16.v v9, (a0) +; ZVFHMIN-ZFH-RV32-NEXT: lui a1, 8 +; ZVFHMIN-ZFH-RV32-NEXT: vxor.vx v8, v8, a1 +; ZVFHMIN-ZFH-RV32-NEXT: mv a1, sp +; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v9, (a1) ; ZVFHMIN-ZFH-RV32-NEXT: addi a1, sp, 16 ; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v8, (a1) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 30(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 28(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa3, 24(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa2, 26(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 2(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 18(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa3, 0(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa2, 16(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa5, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa3, fa2 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 4(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa3, 20(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa2, 6(sp) ; ZVFHMIN-ZFH-RV32-NEXT: flh fa1, 22(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa0, 20(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh ft0, 16(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh ft1, 18(sp) -; ZVFHMIN-ZFH-RV32-NEXT: mv a1, sp -; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v9, (a1) -; ZVFHMIN-ZFH-RV32-NEXT: flh ft2, 2(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh ft3, 0(sp) -; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h ft1, ft2, ft1 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, ft1 -; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h ft0, ft3, ft0 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a2, ft0 -; ZVFHMIN-ZFH-RV32-NEXT: flh ft0, 4(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa4, fa3 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa2, fa1 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 10(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa3, 26(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 8(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa2, 24(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa4, fa4, fa3 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a5, fa4 ; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa5, fa2 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a2, fa5 +; 
ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 12(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 28(sp) ; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-ZFH-RV32-NEXT: flh ft1, 6(sp) -; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h fa0, ft0, fa0 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h fa1, ft1, fa1 -; ZVFHMIN-ZFH-RV32-NEXT: flh fa0, 10(sp) -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa1 -; ZVFHMIN-ZFH-RV32-NEXT: flh fa1, 8(sp) -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h fa2, fa0, fa2 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa2 -; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h fa3, fa1, fa3 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a2, fa3 -; ZVFHMIN-ZFH-RV32-NEXT: flh fa3, 12(sp) +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a3 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa5, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 14(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 30(sp) ; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a5 ; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a1 -; ZVFHMIN-ZFH-RV32-NEXT: flh fa2, 14(sp) -; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h fa4, fa3, fa4 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa4 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a1 -; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h fa5, fa2, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa5, fa4 ; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa5 ; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.i v0, 15 ; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a1 @@ -2070,48 +1992,50 @@ define void @copysign_neg_v8f16(ptr %x, ptr %y) { ; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, mu ; ZVFHMIN-ZFH-RV64-NEXT: vle16.v v8, (a1) ; ZVFHMIN-ZFH-RV64-NEXT: vle16.v v9, (a0) +; ZVFHMIN-ZFH-RV64-NEXT: lui a1, 8 +; ZVFHMIN-ZFH-RV64-NEXT: vxor.vx v8, v8, a1 +; ZVFHMIN-ZFH-RV64-NEXT: mv a1, sp +; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v9, (a1) ; ZVFHMIN-ZFH-RV64-NEXT: addi a1, sp, 16 ; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v8, (a1) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 30(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 28(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa3, 24(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa2, 26(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 2(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 18(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa3, 0(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa2, 16(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa5, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa3, fa2 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 4(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa3, 20(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa2, 6(sp) ; ZVFHMIN-ZFH-RV64-NEXT: flh fa1, 22(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa0, 20(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh ft0, 16(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh ft1, 18(sp) -; ZVFHMIN-ZFH-RV64-NEXT: mv a1, sp -; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v9, (a1) -; ZVFHMIN-ZFH-RV64-NEXT: flh ft2, 2(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh ft3, 0(sp) -; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h ft1, ft2, ft1 -; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, ft1 -; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h ft0, ft3, ft0 -; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a2, ft0 -; ZVFHMIN-ZFH-RV64-NEXT: flh ft0, 4(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa4, fa3 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa2, fa1 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 10(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa3, 26(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 
8(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa2, 24(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa4, fa4, fa3 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a5, fa4 ; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa5, fa2 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 12(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 28(sp) ; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-ZFH-RV64-NEXT: flh ft1, 6(sp) -; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h fa0, ft0, fa0 -; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h fa1, ft1, fa1 -; ZVFHMIN-ZFH-RV64-NEXT: flh fa0, 10(sp) -; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa1 -; ZVFHMIN-ZFH-RV64-NEXT: flh fa1, 8(sp) -; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h fa2, fa0, fa2 -; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa2 -; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h fa3, fa1, fa3 -; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a2, fa3 -; ZVFHMIN-ZFH-RV64-NEXT: flh fa3, 12(sp) +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a3 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa5, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 14(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 30(sp) ; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a5 ; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a1 -; ZVFHMIN-ZFH-RV64-NEXT: flh fa2, 14(sp) -; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h fa4, fa3, fa4 -; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa4 -; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a1 -; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h fa5, fa2, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa5, fa4 ; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa5 ; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.i v0, 15 ; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a1 @@ -2127,83 +2051,76 @@ define void @copysign_neg_v8f16(ptr %x, ptr %y) { ; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, mu ; ZVFHMIN-ZFHIN-RV32-NEXT: vle16.v v8, (a1) ; ZVFHMIN-ZFHIN-RV32-NEXT: vle16.v v9, (a0) -; ZVFHMIN-ZFHIN-RV32-NEXT: addi a1, sp, 16 -; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a1) -; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 30(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa4, 28(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 24(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a2, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa4, 26(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa3, 22(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: lui a1, 8 +; ZVFHMIN-ZFHIN-RV32-NEXT: vxor.vx v8, v8, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: mv a2, sp +; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v9, (a2) +; ZVFHMIN-ZFHIN-RV32-NEXT: addi a2, sp, 16 +; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a2) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 2(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa4, 18(sp) ; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a3, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 20(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a5, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a6, fa3 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 0(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a4, fa4 ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa4, 16(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a7, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: not a1, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: not a2, a2 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t0, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 18(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: not a4, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: not t1, a5 -; ZVFHMIN-ZFHIN-RV32-NEXT: not a6, a6 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a3, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: mv a5, sp -; 
ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v9, (a5) -; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 2(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: not a7, a7 -; ZVFHMIN-ZFHIN-RV32-NEXT: not t0, t0 -; ZVFHMIN-ZFHIN-RV32-NEXT: not t2, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa3, 4(sp) ; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a5, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: lui t3, 8 -; ZVFHMIN-ZFHIN-RV32-NEXT: addi a3, t3, -1 -; ZVFHMIN-ZFHIN-RV32-NEXT: and t4, a5, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 0(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: lui a5, 1048568 -; ZVFHMIN-ZFHIN-RV32-NEXT: and t2, t2, a5 -; ZVFHMIN-ZFHIN-RV32-NEXT: or t2, t4, t2 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t4, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: and t4, t4, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: and t0, t0, t3 -; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 4(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: or t0, t4, t0 -; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v8, t0 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, t2 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 20(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a6, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a7, fa3 +; ZVFHMIN-ZFHIN-RV32-NEXT: addi a2, a1, -1 ; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t0, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: and t0, t0, a3 ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 6(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: and a7, a7, a5 -; ZVFHMIN-ZFHIN-RV32-NEXT: or a7, t0, a7 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a7 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a7, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a7, a7, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: and t1, a3, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: lui a3, 1048568 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a4, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t2, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 22(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: or a4, t1, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a5, a5, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a6, a6, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t1, fa5 ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 10(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: and a6, a6, a5 -; ZVFHMIN-ZFHIN-RV32-NEXT: or a6, a7, a6 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a6 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a5, a5, a6 +; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v8, a5 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 26(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: and a5, a7, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a6, t0, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a5, a5, a6 ; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a6, fa5 ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 8(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: and a6, a6, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a7, t1, a5 -; ZVFHMIN-ZFHIN-RV32-NEXT: or a6, a6, a7 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a7, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a7, a7, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a4, t3 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a5 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a5, t2, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a7, t1, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t0, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 24(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: or a5, a5, a7 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a5 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a4, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a5, fa5 ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 12(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: or a4, a7, a4 -; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v9, a4 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a6 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a4, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a4, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a6, a6, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a4, a4, a6 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a1, a5, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a5, fa5 +; 
ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 28(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: and a6, t0, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a1, a6, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa5 ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 14(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: and a2, a2, a5 -; ZVFHMIN-ZFHIN-RV32-NEXT: or a2, a4, a2 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a2 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a2, a2, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a1, a1, a5 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a5, a5, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a1, a1, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 30(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: or a1, a5, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a2, a4, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a1, a1, a3 ; ZVFHMIN-ZFHIN-RV32-NEXT: or a1, a2, a1 ; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.i v0, 15 ; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a1 @@ -2219,83 +2136,76 @@ define void @copysign_neg_v8f16(ptr %x, ptr %y) { ; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, mu ; ZVFHMIN-ZFHIN-RV64-NEXT: vle16.v v8, (a1) ; ZVFHMIN-ZFHIN-RV64-NEXT: vle16.v v9, (a0) -; ZVFHMIN-ZFHIN-RV64-NEXT: addi a1, sp, 16 -; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v8, (a1) -; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 30(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa4, 28(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 24(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a2, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa4, 26(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa3, 22(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: lui a1, 8 +; ZVFHMIN-ZFHIN-RV64-NEXT: vxor.vx v8, v8, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: mv a2, sp +; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v9, (a2) +; ZVFHMIN-ZFHIN-RV64-NEXT: addi a2, sp, 16 +; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v8, (a2) +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 2(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa4, 18(sp) ; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a3, fa5 -; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 20(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a5, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a6, fa3 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 0(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a4, fa4 ; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa4, 16(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a7, fa5 -; ZVFHMIN-ZFHIN-RV64-NEXT: not a1, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: not a2, a2 -; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h t0, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 18(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: not a4, a3 -; ZVFHMIN-ZFHIN-RV64-NEXT: not t1, a5 -; ZVFHMIN-ZFHIN-RV64-NEXT: not a6, a6 -; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a3, fa5 -; ZVFHMIN-ZFHIN-RV64-NEXT: mv a5, sp -; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v9, (a5) -; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 2(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: not a7, a7 -; ZVFHMIN-ZFHIN-RV64-NEXT: not t0, t0 -; ZVFHMIN-ZFHIN-RV64-NEXT: not t2, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa3, 4(sp) ; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a5, fa5 -; ZVFHMIN-ZFHIN-RV64-NEXT: lui t3, 8 -; ZVFHMIN-ZFHIN-RV64-NEXT: addiw a3, t3, -1 -; ZVFHMIN-ZFHIN-RV64-NEXT: and t4, a5, a3 -; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 0(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: lui a5, 1048568 -; ZVFHMIN-ZFHIN-RV64-NEXT: and t2, t2, a5 -; ZVFHMIN-ZFHIN-RV64-NEXT: or t2, t4, t2 -; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h t4, fa5 -; ZVFHMIN-ZFHIN-RV64-NEXT: and t4, t4, a3 -; ZVFHMIN-ZFHIN-RV64-NEXT: and t0, t0, t3 -; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 
4(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: or t0, t4, t0 -; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.x v8, t0 -; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, t2 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 20(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a6, fa4 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a7, fa3 +; ZVFHMIN-ZFHIN-RV64-NEXT: addiw a2, a1, -1 ; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h t0, fa5 -; ZVFHMIN-ZFHIN-RV64-NEXT: and t0, t0, a3 ; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 6(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: and a7, a7, a5 -; ZVFHMIN-ZFHIN-RV64-NEXT: or a7, t0, a7 -; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a7 -; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a7, fa5 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a7, a7, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: and t1, a3, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: lui a3, 1048568 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a4, a4, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h t2, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 22(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: or a4, t1, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a5, a5, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a6, a6, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h t1, fa5 ; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 10(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: and a6, a6, a5 -; ZVFHMIN-ZFHIN-RV64-NEXT: or a6, a7, a6 -; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a6 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a5, a5, a6 +; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.x v8, a5 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 26(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: and a5, a7, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a6, t0, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a5, a5, a6 ; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a6, fa5 ; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 8(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: and a6, a6, a3 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a7, t1, a5 -; ZVFHMIN-ZFHIN-RV64-NEXT: or a6, a6, a7 -; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a7, fa5 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a7, a7, a3 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a4, a4, t3 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a5 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a5, t2, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a7, t1, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h t0, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 24(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: or a5, a5, a7 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a5 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a4, a4, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a5, fa5 ; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 12(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: or a4, a7, a4 -; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.x v9, a4 -; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a6 -; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a4, fa5 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a4, a4, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a6, a6, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a4, a4, a6 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a1, a5, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a5, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 28(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: and a6, t0, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a1, a6, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a1, fa5 ; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 14(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: and a2, a2, a5 -; ZVFHMIN-ZFHIN-RV64-NEXT: or a2, a4, a2 -; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a2 -; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a2, a2, a3 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a1, a1, a5 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a5, a5, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a1, a1, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 30(sp) 
+; ZVFHMIN-ZFHIN-RV64-NEXT: or a1, a5, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a2, a4, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a1, a1, a3 ; ZVFHMIN-ZFHIN-RV64-NEXT: or a1, a2, a1 ; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.i v0, 15 ; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a1 @@ -2328,49 +2238,51 @@ define void @copysign_neg_v6f16(ptr %x, ptr %y) { ; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 6, e16, mf2, ta, ma ; ZVFHMIN-ZFH-RV32-NEXT: vle16.v v8, (a1) ; ZVFHMIN-ZFH-RV32-NEXT: vle16.v v9, (a0) -; ZVFHMIN-ZFH-RV32-NEXT: addi a1, sp, 16 +; ZVFHMIN-ZFH-RV32-NEXT: lui a1, 8 ; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v8, (a1) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 30(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 28(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa3, 24(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa2, 26(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa1, 22(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa0, 20(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh ft0, 16(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh ft1, 18(sp) +; ZVFHMIN-ZFH-RV32-NEXT: vxor.vx v8, v8, a1 ; ZVFHMIN-ZFH-RV32-NEXT: mv a1, sp ; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v9, (a1) -; ZVFHMIN-ZFH-RV32-NEXT: flh ft2, 2(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh ft3, 0(sp) -; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h ft1, ft2, ft1 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, ft1 -; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h ft0, ft3, ft0 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a2, ft0 -; ZVFHMIN-ZFH-RV32-NEXT: flh ft0, 4(sp) +; ZVFHMIN-ZFH-RV32-NEXT: addi a1, sp, 16 +; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 2(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 18(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa3, 0(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa2, 16(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa5, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa3, fa2 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 4(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa3, 20(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa2, 6(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa1, 22(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa4, fa3 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa2, fa1 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 10(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa3, 26(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 8(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa2, 24(sp) +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa4, fa4, fa3 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a5, fa4 ; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa5, fa2 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 12(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 28(sp) ; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-ZFH-RV32-NEXT: flh ft1, 6(sp) -; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h fa0, ft0, fa0 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h fa1, ft1, fa1 -; ZVFHMIN-ZFH-RV32-NEXT: flh fa0, 10(sp) -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa1 -; ZVFHMIN-ZFH-RV32-NEXT: flh fa1, 8(sp) -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h fa2, fa0, fa2 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa2 -; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h fa3, fa1, fa3 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a2, fa3 -; ZVFHMIN-ZFH-RV32-NEXT: flh fa3, 12(sp) +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a3 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, 
v8, a4 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa5, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 14(sp) +; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 30(sp) ; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a5 ; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a1 -; ZVFHMIN-ZFH-RV32-NEXT: flh fa2, 14(sp) -; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h fa4, fa3, fa4 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa4 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a1 -; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h fa5, fa2, fa5 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa5, fa4 ; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa5 ; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.i v0, 15 ; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a1 @@ -2387,49 +2299,51 @@ define void @copysign_neg_v6f16(ptr %x, ptr %y) { ; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 6, e16, mf2, ta, ma ; ZVFHMIN-ZFH-RV64-NEXT: vle16.v v8, (a1) ; ZVFHMIN-ZFH-RV64-NEXT: vle16.v v9, (a0) -; ZVFHMIN-ZFH-RV64-NEXT: addi a1, sp, 16 +; ZVFHMIN-ZFH-RV64-NEXT: lui a1, 8 ; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v8, (a1) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 30(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 28(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa3, 24(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa2, 26(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa1, 22(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa0, 20(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh ft0, 16(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh ft1, 18(sp) +; ZVFHMIN-ZFH-RV64-NEXT: vxor.vx v8, v8, a1 ; ZVFHMIN-ZFH-RV64-NEXT: mv a1, sp ; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v9, (a1) -; ZVFHMIN-ZFH-RV64-NEXT: flh ft2, 2(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh ft3, 0(sp) -; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h ft1, ft2, ft1 -; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, ft1 -; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h ft0, ft3, ft0 -; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a2, ft0 -; ZVFHMIN-ZFH-RV64-NEXT: flh ft0, 4(sp) +; ZVFHMIN-ZFH-RV64-NEXT: addi a1, sp, 16 +; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 2(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 18(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa3, 0(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa2, 16(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa5, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa3, fa2 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 4(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa3, 20(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa2, 6(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa1, 22(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa4, fa3 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa2, fa1 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 10(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa3, 26(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 8(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa2, 24(sp) +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa4, fa4, fa3 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a5, fa4 ; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa5, fa2 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a2, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 12(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 28(sp) ; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-ZFH-RV64-NEXT: flh ft1, 6(sp) -; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h fa0, ft0, fa0 -; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h fa1, ft1, fa1 -; ZVFHMIN-ZFH-RV64-NEXT: flh fa0, 10(sp) -; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa1 -; ZVFHMIN-ZFH-RV64-NEXT: flh fa1, 
8(sp) -; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h fa2, fa0, fa2 -; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa2 -; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h fa3, fa1, fa3 -; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a2, fa3 -; ZVFHMIN-ZFH-RV64-NEXT: flh fa3, 12(sp) +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a3 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa5, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 14(sp) +; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 30(sp) ; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a5 ; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a1 -; ZVFHMIN-ZFH-RV64-NEXT: flh fa2, 14(sp) -; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h fa4, fa3, fa4 -; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa4 -; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a1 -; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h fa5, fa2, fa5 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa5, fa4 ; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa5 ; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.i v0, 15 ; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a1 @@ -2446,84 +2360,77 @@ define void @copysign_neg_v6f16(ptr %x, ptr %y) { ; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 6, e16, mf2, ta, ma ; ZVFHMIN-ZFHIN-RV32-NEXT: vle16.v v8, (a1) ; ZVFHMIN-ZFHIN-RV32-NEXT: vle16.v v9, (a0) -; ZVFHMIN-ZFHIN-RV32-NEXT: addi a1, sp, 16 +; ZVFHMIN-ZFHIN-RV32-NEXT: lui a1, 8 ; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a1) -; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 30(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa4, 28(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 24(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a2, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa4, 26(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa3, 22(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: vxor.vx v8, v8, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: mv a2, sp +; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v9, (a2) +; ZVFHMIN-ZFHIN-RV32-NEXT: addi a2, sp, 16 +; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a2) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 2(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa4, 18(sp) ; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a3, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 20(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a5, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a6, fa3 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 0(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a4, fa4 ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa4, 16(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a7, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: not a1, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: not a2, a2 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t0, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 18(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: not a4, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: not t1, a5 -; ZVFHMIN-ZFHIN-RV32-NEXT: not a6, a6 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a3, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: mv a5, sp -; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v9, (a5) -; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 2(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: not a7, a7 -; ZVFHMIN-ZFHIN-RV32-NEXT: not t0, t0 -; ZVFHMIN-ZFHIN-RV32-NEXT: not t2, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa3, 4(sp) ; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a5, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: lui t3, 8 -; ZVFHMIN-ZFHIN-RV32-NEXT: addi a3, t3, -1 -; ZVFHMIN-ZFHIN-RV32-NEXT: and t4, a5, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 0(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: lui a5, 1048568 -; ZVFHMIN-ZFHIN-RV32-NEXT: and t2, t2, a5 -; ZVFHMIN-ZFHIN-RV32-NEXT: or t2, t4, t2 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t4, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: and t4, t4, a3 -; 
ZVFHMIN-ZFHIN-RV32-NEXT: and t0, t0, t3 -; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 4(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: or t0, t4, t0 -; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v8, t0 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, t2 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 20(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a6, fa4 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a7, fa3 +; ZVFHMIN-ZFHIN-RV32-NEXT: addi a2, a1, -1 ; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t0, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: and t0, t0, a3 ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 6(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: and a7, a7, a5 -; ZVFHMIN-ZFHIN-RV32-NEXT: or a7, t0, a7 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a7 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a7, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a7, a7, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: and t1, a3, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: lui a3, 1048568 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a4, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t2, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 22(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: or a4, t1, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a5, a5, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a6, a6, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t1, fa5 ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 10(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: and a6, a6, a5 -; ZVFHMIN-ZFHIN-RV32-NEXT: or a6, a7, a6 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a6 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a5, a5, a6 +; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v8, a5 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 26(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: and a5, a7, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a6, t0, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a5, a5, a6 ; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a6, fa5 ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 8(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: and a6, a6, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a7, t1, a5 -; ZVFHMIN-ZFHIN-RV32-NEXT: or a6, a6, a7 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a7, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a7, a7, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a4, t3 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a5 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a5, t2, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a7, t1, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h t0, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 24(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: or a5, a5, a7 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a5 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a4, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a5, fa5 ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 12(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: or a4, a7, a4 -; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v9, a4 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a6 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a4, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a4, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a6, a6, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a4, a4, a6 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a1, a5, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a5, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 28(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: and a6, t0, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a1, a6, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa5 ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 14(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: and a2, a2, a5 -; ZVFHMIN-ZFHIN-RV32-NEXT: or a2, a4, a2 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a2 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a2, a2, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a1, a1, a5 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a5, a5, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a1, a1, a3 +; 
ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 30(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: or a1, a5, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a2, a4, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a1, a1, a3 ; ZVFHMIN-ZFHIN-RV32-NEXT: or a1, a2, a1 ; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.i v0, 15 ; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a1 @@ -2540,84 +2447,77 @@ define void @copysign_neg_v6f16(ptr %x, ptr %y) { ; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 6, e16, mf2, ta, ma ; ZVFHMIN-ZFHIN-RV64-NEXT: vle16.v v8, (a1) ; ZVFHMIN-ZFHIN-RV64-NEXT: vle16.v v9, (a0) -; ZVFHMIN-ZFHIN-RV64-NEXT: addi a1, sp, 16 +; ZVFHMIN-ZFHIN-RV64-NEXT: lui a1, 8 ; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v8, (a1) -; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 30(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa4, 28(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 24(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a2, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa4, 26(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa3, 22(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: vxor.vx v8, v8, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: mv a2, sp +; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v9, (a2) +; ZVFHMIN-ZFHIN-RV64-NEXT: addi a2, sp, 16 +; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v8, (a2) +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 2(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa4, 18(sp) ; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a3, fa5 -; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 20(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a5, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a6, fa3 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 0(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a4, fa4 ; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa4, 16(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a7, fa5 -; ZVFHMIN-ZFHIN-RV64-NEXT: not a1, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: not a2, a2 -; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h t0, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 18(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: not a4, a3 -; ZVFHMIN-ZFHIN-RV64-NEXT: not t1, a5 -; ZVFHMIN-ZFHIN-RV64-NEXT: not a6, a6 -; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a3, fa5 -; ZVFHMIN-ZFHIN-RV64-NEXT: mv a5, sp -; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v9, (a5) -; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 2(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: not a7, a7 -; ZVFHMIN-ZFHIN-RV64-NEXT: not t0, t0 -; ZVFHMIN-ZFHIN-RV64-NEXT: not t2, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa3, 4(sp) ; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a5, fa5 -; ZVFHMIN-ZFHIN-RV64-NEXT: lui t3, 8 -; ZVFHMIN-ZFHIN-RV64-NEXT: addiw a3, t3, -1 -; ZVFHMIN-ZFHIN-RV64-NEXT: and t4, a5, a3 -; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 0(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: lui a5, 1048568 -; ZVFHMIN-ZFHIN-RV64-NEXT: and t2, t2, a5 -; ZVFHMIN-ZFHIN-RV64-NEXT: or t2, t4, t2 -; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h t4, fa5 -; ZVFHMIN-ZFHIN-RV64-NEXT: and t4, t4, a3 -; ZVFHMIN-ZFHIN-RV64-NEXT: and t0, t0, t3 -; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 4(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: or t0, t4, t0 -; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.x v8, t0 -; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, t2 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 20(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a6, fa4 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a7, fa3 +; ZVFHMIN-ZFHIN-RV64-NEXT: addiw a2, a1, -1 ; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h t0, fa5 -; ZVFHMIN-ZFHIN-RV64-NEXT: and t0, t0, a3 ; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 6(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: and a7, a7, a5 -; ZVFHMIN-ZFHIN-RV64-NEXT: or a7, t0, a7 -; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a7 -; 
ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a7, fa5 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a7, a7, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: and t1, a3, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: lui a3, 1048568 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a4, a4, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h t2, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 22(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: or a4, t1, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a5, a5, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a6, a6, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h t1, fa5 ; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 10(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: and a6, a6, a5 -; ZVFHMIN-ZFHIN-RV64-NEXT: or a6, a7, a6 -; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a6 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a5, a5, a6 +; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.x v8, a5 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 26(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: and a5, a7, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a6, t0, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a5, a5, a6 ; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a6, fa5 ; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 8(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: and a6, a6, a3 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a7, t1, a5 -; ZVFHMIN-ZFHIN-RV64-NEXT: or a6, a6, a7 -; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a7, fa5 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a7, a7, a3 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a4, a4, t3 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a5 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a5, t2, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a7, t1, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h t0, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 24(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: or a5, a5, a7 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a5 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a4, a4, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a5, fa5 ; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 12(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: or a4, a7, a4 -; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.x v9, a4 -; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a6 -; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a4, fa5 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a4, a4, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a6, a6, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a4, a4, a6 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a1, a5, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a5, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 28(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: and a6, t0, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a1, a6, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a1, fa5 ; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 14(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: and a2, a2, a5 -; ZVFHMIN-ZFHIN-RV64-NEXT: or a2, a4, a2 -; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a2 -; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a2, a2, a3 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a1, a1, a5 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a5, a5, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a1, a1, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a4, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 30(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: or a1, a5, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a2, a4, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a1, a1, a3 ; ZVFHMIN-ZFHIN-RV64-NEXT: or a1, a2, a1 ; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.i v0, 15 ; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a1 @@ -2698,26 +2598,28 @@ define void @copysign_neg_trunc_v4f16_v4f32(ptr %x, ptr %y) { ; ZVFHMIN-ZFH-RV32-NEXT: mv a1, sp ; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v8, (a1) ; ZVFHMIN-ZFH-RV32-NEXT: vfncvt.f.f.w v8, v9 +; 
ZVFHMIN-ZFH-RV32-NEXT: lui a1, 8 +; ZVFHMIN-ZFH-RV32-NEXT: vxor.vx v8, v8, a1 ; ZVFHMIN-ZFH-RV32-NEXT: addi a1, sp, 8 ; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v8, (a1) ; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 2(sp) ; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 10(sp) -; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h fa5, fa5, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa5, fa4 ; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 0(sp) ; ZVFHMIN-ZFH-RV32-NEXT: flh fa3, 8(sp) ; ZVFHMIN-ZFH-RV32-NEXT: flh fa2, 4(sp) ; ZVFHMIN-ZFH-RV32-NEXT: flh fa1, 12(sp) ; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h fa5, fa4, fa3 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa4, fa3 ; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h fa5, fa2, fa1 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa2, fa1 ; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a3, fa5 ; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 6(sp) ; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 14(sp) ; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v8, a2 ; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a1 ; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a3 -; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h fa5, fa5, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa5, fa4 ; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa5 ; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a1 ; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v8, (a0) @@ -2734,26 +2636,28 @@ define void @copysign_neg_trunc_v4f16_v4f32(ptr %x, ptr %y) { ; ZVFHMIN-ZFH-RV64-NEXT: mv a1, sp ; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v8, (a1) ; ZVFHMIN-ZFH-RV64-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-ZFH-RV64-NEXT: lui a1, 8 +; ZVFHMIN-ZFH-RV64-NEXT: vxor.vx v8, v8, a1 ; ZVFHMIN-ZFH-RV64-NEXT: addi a1, sp, 8 ; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v8, (a1) ; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 2(sp) ; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 10(sp) -; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h fa5, fa5, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa5, fa4 ; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 0(sp) ; ZVFHMIN-ZFH-RV64-NEXT: flh fa3, 8(sp) ; ZVFHMIN-ZFH-RV64-NEXT: flh fa2, 4(sp) ; ZVFHMIN-ZFH-RV64-NEXT: flh fa1, 12(sp) ; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h fa5, fa4, fa3 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa4, fa3 ; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h fa5, fa2, fa1 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa2, fa1 ; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a3, fa5 ; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 6(sp) ; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 14(sp) ; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.x v8, a2 ; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a1 ; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a3 -; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h fa5, fa5, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa5, fa4 ; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa5 ; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a1 ; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v8, (a0) @@ -2770,44 +2674,41 @@ define void @copysign_neg_trunc_v4f16_v4f32(ptr %x, ptr %y) { ; ZVFHMIN-ZFHIN-RV32-NEXT: mv a1, sp ; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a1) ; ZVFHMIN-ZFHIN-RV32-NEXT: vfncvt.f.f.w v8, v9 -; ZVFHMIN-ZFHIN-RV32-NEXT: addi a1, sp, 8 -; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFHIN-RV32-NEXT: lui a1, 8 +; ZVFHMIN-ZFHIN-RV32-NEXT: vxor.vx v8, v8, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: addi a2, sp, 8 +; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a2) ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 2(sp) ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa4, 10(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: lui a2, 8 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa3, 0(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a2, fa5 ; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a3, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: flh 
fa5, 0(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: addi a4, a2, -1 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a1, a1, a4 -; ZVFHMIN-ZFHIN-RV32-NEXT: not a3, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a5, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a4, fa3 ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 8(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: addi a5, a1, -1 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a2, a2, a5 ; ZVFHMIN-ZFHIN-RV32-NEXT: lui a6, 1048568 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a3, a3, a6 -; ZVFHMIN-ZFHIN-RV32-NEXT: or a1, a1, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a7, fa5 ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 4(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: and a5, a5, a4 -; ZVFHMIN-ZFHIN-RV32-NEXT: not a3, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a2, a3, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a3, a3, a6 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a2, a2, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a1, a7, a1 ; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a3, fa5 ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 12(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: or a2, a5, a2 -; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v8, a2 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a4, a5 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a1, a4, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v8, a1 ; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa5 ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 6(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: and a3, a3, a4 -; ZVFHMIN-ZFHIN-RV32-NEXT: not a1, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a3, a3, a5 ; ZVFHMIN-ZFHIN-RV32-NEXT: and a1, a1, a6 ; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a2, fa5 ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 14(sp) ; ZVFHMIN-ZFHIN-RV32-NEXT: or a1, a3, a1 ; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a2, a2, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a2, a2, a5 ; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: not a1, a1 ; ZVFHMIN-ZFHIN-RV32-NEXT: and a1, a1, a6 ; ZVFHMIN-ZFHIN-RV32-NEXT: or a1, a2, a1 ; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a1 @@ -2825,44 +2726,41 @@ define void @copysign_neg_trunc_v4f16_v4f32(ptr %x, ptr %y) { ; ZVFHMIN-ZFHIN-RV64-NEXT: mv a1, sp ; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v8, (a1) ; ZVFHMIN-ZFHIN-RV64-NEXT: vfncvt.f.f.w v8, v9 -; ZVFHMIN-ZFHIN-RV64-NEXT: addi a1, sp, 8 -; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFHIN-RV64-NEXT: lui a1, 8 +; ZVFHMIN-ZFHIN-RV64-NEXT: vxor.vx v8, v8, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: addi a2, sp, 8 +; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v8, (a2) ; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 2(sp) ; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa4, 10(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-ZFHIN-RV64-NEXT: lui a2, 8 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa3, 0(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a2, fa5 ; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a3, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 0(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: addiw a4, a2, -1 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a1, a1, a4 -; ZVFHMIN-ZFHIN-RV64-NEXT: not a3, a3 -; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a5, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a4, fa3 ; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 8(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: addiw a5, a1, -1 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a2, a2, a5 ; ZVFHMIN-ZFHIN-RV64-NEXT: lui a6, 1048568 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a3, a3, a6 -; ZVFHMIN-ZFHIN-RV64-NEXT: or a1, a1, a3 -; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a7, fa5 ; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 4(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: and a5, a5, a4 -; ZVFHMIN-ZFHIN-RV64-NEXT: not a3, a3 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a2, a3, a2 +; 
ZVFHMIN-ZFHIN-RV64-NEXT: and a3, a3, a6 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a2, a2, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a1, a7, a1 ; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a3, fa5 ; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 12(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: or a2, a5, a2 -; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.x v8, a2 -; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a4, a4, a5 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a1, a4, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.x v8, a1 ; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a1, fa5 ; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 6(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: and a3, a3, a4 -; ZVFHMIN-ZFHIN-RV64-NEXT: not a1, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a3, a3, a5 ; ZVFHMIN-ZFHIN-RV64-NEXT: and a1, a1, a6 ; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a2, fa5 ; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 14(sp) ; ZVFHMIN-ZFHIN-RV64-NEXT: or a1, a3, a1 ; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a2, a2, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a2, a2, a5 ; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-ZFHIN-RV64-NEXT: not a1, a1 ; ZVFHMIN-ZFHIN-RV64-NEXT: and a1, a1, a6 ; ZVFHMIN-ZFHIN-RV64-NEXT: or a1, a2, a1 ; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a1 @@ -2903,26 +2801,28 @@ define void @copysign_neg_trunc_v3f16_v3f32(ptr %x, ptr %y) { ; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma ; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v8, (a1) ; ZVFHMIN-ZFH-RV32-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-ZFH-RV32-NEXT: lui a1, 8 +; ZVFHMIN-ZFH-RV32-NEXT: vxor.vx v8, v8, a1 ; ZVFHMIN-ZFH-RV32-NEXT: addi a1, sp, 8 ; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v8, (a1) ; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 2(sp) ; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 10(sp) -; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h fa5, fa5, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa5, fa4 ; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 0(sp) ; ZVFHMIN-ZFH-RV32-NEXT: flh fa3, 8(sp) ; ZVFHMIN-ZFH-RV32-NEXT: flh fa2, 4(sp) ; ZVFHMIN-ZFH-RV32-NEXT: flh fa1, 12(sp) ; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h fa5, fa4, fa3 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa4, fa3 ; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h fa5, fa2, fa1 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa2, fa1 ; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a3, fa5 ; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 6(sp) ; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 14(sp) ; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v8, a2 ; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a1 ; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a3 -; ZVFHMIN-ZFH-RV32-NEXT: fsgnjn.h fa5, fa5, fa4 +; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa5, fa4 ; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa5 ; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a1 ; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 3, e16, mf4, ta, ma @@ -2941,26 +2841,28 @@ define void @copysign_neg_trunc_v3f16_v3f32(ptr %x, ptr %y) { ; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 4, e16, mf4, ta, ma ; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v8, (a1) ; ZVFHMIN-ZFH-RV64-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-ZFH-RV64-NEXT: lui a1, 8 +; ZVFHMIN-ZFH-RV64-NEXT: vxor.vx v8, v8, a1 ; ZVFHMIN-ZFH-RV64-NEXT: addi a1, sp, 8 ; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v8, (a1) ; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 2(sp) ; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 10(sp) -; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h fa5, fa5, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa5, fa4 ; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 0(sp) ; ZVFHMIN-ZFH-RV64-NEXT: flh fa3, 8(sp) ; ZVFHMIN-ZFH-RV64-NEXT: flh fa2, 4(sp) ; ZVFHMIN-ZFH-RV64-NEXT: flh fa1, 12(sp) ; ZVFHMIN-ZFH-RV64-NEXT: 
fmv.x.h a1, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h fa5, fa4, fa3 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa4, fa3 ; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h fa5, fa2, fa1 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa2, fa1 ; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a3, fa5 ; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 6(sp) ; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 14(sp) ; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.x v8, a2 ; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a1 ; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a3 -; ZVFHMIN-ZFH-RV64-NEXT: fsgnjn.h fa5, fa5, fa4 +; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa5, fa4 ; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa5 ; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a1 ; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 3, e16, mf4, ta, ma @@ -2979,44 +2881,41 @@ define void @copysign_neg_trunc_v3f16_v3f32(ptr %x, ptr %y) { ; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma ; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a1) ; ZVFHMIN-ZFHIN-RV32-NEXT: vfncvt.f.f.w v8, v9 -; ZVFHMIN-ZFHIN-RV32-NEXT: addi a1, sp, 8 -; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFHIN-RV32-NEXT: lui a1, 8 +; ZVFHMIN-ZFHIN-RV32-NEXT: vxor.vx v8, v8, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: addi a2, sp, 8 +; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a2) ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 2(sp) ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa4, 10(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: lui a2, 8 +; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa3, 0(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a2, fa5 ; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a3, fa4 -; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 0(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: addi a4, a2, -1 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a1, a1, a4 -; ZVFHMIN-ZFHIN-RV32-NEXT: not a3, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a5, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a4, fa3 ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 8(sp) +; ZVFHMIN-ZFHIN-RV32-NEXT: addi a5, a1, -1 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a2, a2, a5 ; ZVFHMIN-ZFHIN-RV32-NEXT: lui a6, 1048568 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a3, a3, a6 -; ZVFHMIN-ZFHIN-RV32-NEXT: or a1, a1, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a7, fa5 ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 4(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: and a5, a5, a4 -; ZVFHMIN-ZFHIN-RV32-NEXT: not a3, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a2, a3, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a3, a3, a6 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a2, a2, a3 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a1, a7, a1 ; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a3, fa5 ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 12(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: or a2, a5, a2 -; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v8, a2 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a4, a5 +; ZVFHMIN-ZFHIN-RV32-NEXT: or a1, a4, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v8, a1 ; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa5 ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 6(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: and a3, a3, a4 -; ZVFHMIN-ZFHIN-RV32-NEXT: not a1, a1 +; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a2 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a3, a3, a5 ; ZVFHMIN-ZFHIN-RV32-NEXT: and a1, a1, a6 ; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a2, fa5 ; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 14(sp) ; ZVFHMIN-ZFHIN-RV32-NEXT: or a1, a3, a1 ; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a2, a2, a4 +; ZVFHMIN-ZFHIN-RV32-NEXT: and a2, a2, a5 ; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: not a1, a1 ; ZVFHMIN-ZFHIN-RV32-NEXT: and a1, a1, a6 ; ZVFHMIN-ZFHIN-RV32-NEXT: or a1, a2, a1 ; 
ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a1 @@ -3036,44 +2935,41 @@ define void @copysign_neg_trunc_v3f16_v3f32(ptr %x, ptr %y) { ; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 4, e16, mf4, ta, ma ; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v8, (a1) ; ZVFHMIN-ZFHIN-RV64-NEXT: vfncvt.f.f.w v8, v9 -; ZVFHMIN-ZFHIN-RV64-NEXT: addi a1, sp, 8 -; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v8, (a1) +; ZVFHMIN-ZFHIN-RV64-NEXT: lui a1, 8 +; ZVFHMIN-ZFHIN-RV64-NEXT: vxor.vx v8, v8, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: addi a2, sp, 8 +; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v8, (a2) ; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 2(sp) ; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa4, 10(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-ZFHIN-RV64-NEXT: lui a2, 8 +; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa3, 0(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a2, fa5 ; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a3, fa4 -; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 0(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: addiw a4, a2, -1 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a1, a1, a4 -; ZVFHMIN-ZFHIN-RV64-NEXT: not a3, a3 -; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a5, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a4, fa3 ; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 8(sp) +; ZVFHMIN-ZFHIN-RV64-NEXT: addiw a5, a1, -1 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a2, a2, a5 ; ZVFHMIN-ZFHIN-RV64-NEXT: lui a6, 1048568 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a3, a3, a6 -; ZVFHMIN-ZFHIN-RV64-NEXT: or a1, a1, a3 -; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a3, fa5 +; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a7, fa5 ; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 4(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: and a5, a5, a4 -; ZVFHMIN-ZFHIN-RV64-NEXT: not a3, a3 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a2, a3, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a3, a3, a6 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a2, a2, a3 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a1, a7, a1 ; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a3, fa5 ; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 12(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: or a2, a5, a2 -; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.x v8, a2 -; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a4, a4, a5 +; ZVFHMIN-ZFHIN-RV64-NEXT: or a1, a4, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.x v8, a1 ; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a1, fa5 ; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 6(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: and a3, a3, a4 -; ZVFHMIN-ZFHIN-RV64-NEXT: not a1, a1 +; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a2 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a3, a3, a5 ; ZVFHMIN-ZFHIN-RV64-NEXT: and a1, a1, a6 ; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a2, fa5 ; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 14(sp) ; ZVFHMIN-ZFHIN-RV64-NEXT: or a1, a3, a1 ; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a2, a2, a4 +; ZVFHMIN-ZFHIN-RV64-NEXT: and a2, a2, a5 ; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-ZFHIN-RV64-NEXT: not a1, a1 ; ZVFHMIN-ZFHIN-RV64-NEXT: and a1, a1, a6 ; ZVFHMIN-ZFHIN-RV64-NEXT: or a1, a2, a1 ; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a1 @@ -3336,59 +3232,20 @@ define void @fmsub_v8f16(ptr %x, ptr %y, ptr %z) { ; ; ZVFHMIN-LABEL: fmsub_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: addi sp, sp, -16 -; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, mu +; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vle16.v v8, (a2) ; ZVFHMIN-NEXT: vle16.v v9, (a0) ; ZVFHMIN-NEXT: vle16.v v10, (a1) -; ZVFHMIN-NEXT: mv a1, sp -; ZVFHMIN-NEXT: vse16.v v8, (a1) -; ZVFHMIN-NEXT: flh fa5, 2(sp) -; ZVFHMIN-NEXT: flh fa4, 0(sp) -; ZVFHMIN-NEXT: flh fa3, 4(sp) -; ZVFHMIN-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-NEXT: fmv.x.h a2, fa4 -; ZVFHMIN-NEXT: lui a3, 1048568 -; 
ZVFHMIN-NEXT: fmv.x.h a4, fa3 -; ZVFHMIN-NEXT: flh fa5, 6(sp) -; ZVFHMIN-NEXT: lui a5, 8 -; ZVFHMIN-NEXT: xor a2, a2, a5 -; ZVFHMIN-NEXT: vmv.v.x v8, a2 -; ZVFHMIN-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-NEXT: flh fa5, 10(sp) -; ZVFHMIN-NEXT: xor a1, a1, a3 -; ZVFHMIN-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-NEXT: xor a4, a4, a3 -; ZVFHMIN-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-NEXT: flh fa5, 8(sp) -; ZVFHMIN-NEXT: vslide1down.vx v8, v8, a4 -; ZVFHMIN-NEXT: xor a2, a2, a3 -; ZVFHMIN-NEXT: vslide1down.vx v8, v8, a2 -; ZVFHMIN-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-NEXT: flh fa5, 12(sp) -; ZVFHMIN-NEXT: xor a1, a1, a3 -; ZVFHMIN-NEXT: xor a2, a2, a5 -; ZVFHMIN-NEXT: vmv.v.x v11, a2 -; ZVFHMIN-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-NEXT: flh fa5, 14(sp) -; ZVFHMIN-NEXT: vslide1down.vx v11, v11, a1 -; ZVFHMIN-NEXT: xor a2, a2, a3 -; ZVFHMIN-NEXT: vslide1down.vx v11, v11, a2 -; ZVFHMIN-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-NEXT: xor a1, a1, a3 -; ZVFHMIN-NEXT: vmv.v.i v0, 15 -; ZVFHMIN-NEXT: vslide1down.vx v11, v11, a1 -; ZVFHMIN-NEXT: vslidedown.vi v11, v8, 4, v0.t -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v11 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9 +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v9, v11, v8 +; ZVFHMIN-NEXT: vfmadd.vv v9, v8, v11 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: vse16.v v8, (a0) -; ZVFHMIN-NEXT: addi sp, sp, 16 ; ZVFHMIN-NEXT: ret %a = load <8 x half>, ptr %x %b = load <8 x half>, ptr %y @@ -3412,60 +3269,21 @@ define void @fmsub_v6f16(ptr %x, ptr %y, ptr %z) { ; ; ZVFHMIN-LABEL: fmsub_v6f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: addi sp, sp, -16 -; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 ; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vle16.v v8, (a2) ; ZVFHMIN-NEXT: vle16.v v9, (a0) ; ZVFHMIN-NEXT: vle16.v v10, (a1) -; ZVFHMIN-NEXT: mv a1, sp -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, mu -; ZVFHMIN-NEXT: vse16.v v8, (a1) -; ZVFHMIN-NEXT: flh fa5, 2(sp) -; ZVFHMIN-NEXT: flh fa4, 0(sp) -; ZVFHMIN-NEXT: flh fa3, 4(sp) -; ZVFHMIN-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-NEXT: fmv.x.h a2, fa4 -; ZVFHMIN-NEXT: lui a3, 1048568 -; ZVFHMIN-NEXT: fmv.x.h a4, fa3 -; ZVFHMIN-NEXT: flh fa5, 6(sp) -; ZVFHMIN-NEXT: lui a5, 8 -; ZVFHMIN-NEXT: xor a2, a2, a5 -; ZVFHMIN-NEXT: vmv.v.x v8, a2 -; ZVFHMIN-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-NEXT: flh fa5, 10(sp) -; ZVFHMIN-NEXT: xor a1, a1, a3 -; ZVFHMIN-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-NEXT: xor a4, a4, a3 -; ZVFHMIN-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-NEXT: flh fa5, 8(sp) -; ZVFHMIN-NEXT: vslide1down.vx v8, v8, a4 -; ZVFHMIN-NEXT: xor a2, a2, a3 -; ZVFHMIN-NEXT: vslide1down.vx v8, v8, a2 -; ZVFHMIN-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-NEXT: flh fa5, 12(sp) -; ZVFHMIN-NEXT: xor a1, a1, a3 -; ZVFHMIN-NEXT: xor a2, a2, a5 -; ZVFHMIN-NEXT: vmv.v.x v11, a2 -; ZVFHMIN-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-NEXT: flh fa5, 14(sp) -; ZVFHMIN-NEXT: vslide1down.vx v11, v11, a1 -; ZVFHMIN-NEXT: xor a2, a2, a3 -; ZVFHMIN-NEXT: vslide1down.vx v11, v11, a2 -; ZVFHMIN-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-NEXT: xor a1, a1, a3 -; ZVFHMIN-NEXT: vmv.v.i v0, 15 -; ZVFHMIN-NEXT: vslide1down.vx v11, v11, a1 -; ZVFHMIN-NEXT: vslidedown.vi v11, v8, 4, v0.t -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v11 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9 +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-NEXT: 
vxor.vx v8, v8, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v9, v11, v8 +; ZVFHMIN-NEXT: vfmadd.vv v9, v8, v11 ; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: vse16.v v8, (a0) -; ZVFHMIN-NEXT: addi sp, sp, 16 ; ZVFHMIN-NEXT: ret %a = load <6 x half>, ptr %x %b = load <6 x half>, ptr %y @@ -3854,187 +3672,14 @@ define void @fneg_v16f16(ptr %x) { ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; -; ZVFHMIN-RV32-LABEL: fneg_v16f16: -; ZVFHMIN-RV32: # %bb.0: -; ZVFHMIN-RV32-NEXT: addi sp, sp, -64 -; ZVFHMIN-RV32-NEXT: .cfi_def_cfa_offset 64 -; ZVFHMIN-RV32-NEXT: sw ra, 60(sp) # 4-byte Folded Spill -; ZVFHMIN-RV32-NEXT: sw s0, 56(sp) # 4-byte Folded Spill -; ZVFHMIN-RV32-NEXT: .cfi_offset ra, -4 -; ZVFHMIN-RV32-NEXT: .cfi_offset s0, -8 -; ZVFHMIN-RV32-NEXT: addi s0, sp, 64 -; ZVFHMIN-RV32-NEXT: .cfi_def_cfa s0, 0 -; ZVFHMIN-RV32-NEXT: andi sp, sp, -32 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 16, e16, m1, ta, mu -; ZVFHMIN-RV32-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV32-NEXT: mv a1, sp -; ZVFHMIN-RV32-NEXT: vse16.v v8, (a1) -; ZVFHMIN-RV32-NEXT: flh fa5, 2(sp) -; ZVFHMIN-RV32-NEXT: flh fa4, 0(sp) -; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-RV32-NEXT: flh fa5, 4(sp) -; ZVFHMIN-RV32-NEXT: flh fa3, 6(sp) -; ZVFHMIN-RV32-NEXT: fmv.x.h a3, fa4 -; ZVFHMIN-RV32-NEXT: flh fa4, 8(sp) -; ZVFHMIN-RV32-NEXT: fmv.x.h a4, fa5 -; ZVFHMIN-RV32-NEXT: fmv.x.h a5, fa3 -; ZVFHMIN-RV32-NEXT: flh fa5, 10(sp) -; ZVFHMIN-RV32-NEXT: fmv.x.h a6, fa4 -; ZVFHMIN-RV32-NEXT: lui a1, 1048568 -; ZVFHMIN-RV32-NEXT: xor a2, a2, a1 -; ZVFHMIN-RV32-NEXT: fmv.x.h a7, fa5 -; ZVFHMIN-RV32-NEXT: flh fa5, 12(sp) -; ZVFHMIN-RV32-NEXT: lui t0, 8 -; ZVFHMIN-RV32-NEXT: xor a3, a3, t0 -; ZVFHMIN-RV32-NEXT: vmv.v.x v8, a3 -; ZVFHMIN-RV32-NEXT: fmv.x.h a3, fa5 -; ZVFHMIN-RV32-NEXT: flh fa5, 14(sp) -; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a2 -; ZVFHMIN-RV32-NEXT: xor a4, a4, a1 -; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a4 -; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-RV32-NEXT: flh fa5, 18(sp) -; ZVFHMIN-RV32-NEXT: xor a5, a5, a1 -; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a5 -; ZVFHMIN-RV32-NEXT: xor a4, a6, a1 -; ZVFHMIN-RV32-NEXT: fmv.x.h a5, fa5 -; ZVFHMIN-RV32-NEXT: flh fa5, 16(sp) -; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a4 -; ZVFHMIN-RV32-NEXT: xor a4, a7, a1 -; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a4 -; ZVFHMIN-RV32-NEXT: fmv.x.h a4, fa5 -; ZVFHMIN-RV32-NEXT: flh fa5, 20(sp) -; ZVFHMIN-RV32-NEXT: xor a3, a3, a1 -; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a3 -; ZVFHMIN-RV32-NEXT: xor a2, a2, a1 -; ZVFHMIN-RV32-NEXT: fmv.x.h a3, fa5 -; ZVFHMIN-RV32-NEXT: flh fa5, 22(sp) -; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a2 -; ZVFHMIN-RV32-NEXT: xor a2, a4, t0 -; ZVFHMIN-RV32-NEXT: vmv.v.x v9, a2 -; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-RV32-NEXT: flh fa5, 24(sp) -; ZVFHMIN-RV32-NEXT: xor a5, a5, a1 -; ZVFHMIN-RV32-NEXT: vslide1down.vx v9, v9, a5 -; ZVFHMIN-RV32-NEXT: xor a3, a3, a1 -; ZVFHMIN-RV32-NEXT: fmv.x.h a4, fa5 -; ZVFHMIN-RV32-NEXT: flh fa5, 26(sp) -; ZVFHMIN-RV32-NEXT: vslide1down.vx v9, v9, a3 -; ZVFHMIN-RV32-NEXT: xor a2, a2, a1 -; ZVFHMIN-RV32-NEXT: vslide1down.vx v9, v9, a2 -; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-RV32-NEXT: flh fa5, 28(sp) -; ZVFHMIN-RV32-NEXT: xor a4, a4, a1 -; ZVFHMIN-RV32-NEXT: vslide1down.vx v9, v9, a4 -; ZVFHMIN-RV32-NEXT: xor a2, a2, a1 -; ZVFHMIN-RV32-NEXT: fmv.x.h a3, fa5 
-; ZVFHMIN-RV32-NEXT: flh fa5, 30(sp) -; ZVFHMIN-RV32-NEXT: vslide1down.vx v9, v9, a2 -; ZVFHMIN-RV32-NEXT: xor a3, a3, a1 -; ZVFHMIN-RV32-NEXT: vslide1down.vx v9, v9, a3 -; ZVFHMIN-RV32-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-RV32-NEXT: xor a1, a2, a1 -; ZVFHMIN-RV32-NEXT: li a2, 255 -; ZVFHMIN-RV32-NEXT: vmv.s.x v0, a2 -; ZVFHMIN-RV32-NEXT: vslide1down.vx v9, v9, a1 -; ZVFHMIN-RV32-NEXT: vslidedown.vi v9, v8, 8, v0.t -; ZVFHMIN-RV32-NEXT: vse16.v v9, (a0) -; ZVFHMIN-RV32-NEXT: addi sp, s0, -64 -; ZVFHMIN-RV32-NEXT: lw ra, 60(sp) # 4-byte Folded Reload -; ZVFHMIN-RV32-NEXT: lw s0, 56(sp) # 4-byte Folded Reload -; ZVFHMIN-RV32-NEXT: addi sp, sp, 64 -; ZVFHMIN-RV32-NEXT: ret -; -; ZVFHMIN-RV64-LABEL: fneg_v16f16: -; ZVFHMIN-RV64: # %bb.0: -; ZVFHMIN-RV64-NEXT: addi sp, sp, -64 -; ZVFHMIN-RV64-NEXT: .cfi_def_cfa_offset 64 -; ZVFHMIN-RV64-NEXT: sd ra, 56(sp) # 8-byte Folded Spill -; ZVFHMIN-RV64-NEXT: sd s0, 48(sp) # 8-byte Folded Spill -; ZVFHMIN-RV64-NEXT: .cfi_offset ra, -8 -; ZVFHMIN-RV64-NEXT: .cfi_offset s0, -16 -; ZVFHMIN-RV64-NEXT: addi s0, sp, 64 -; ZVFHMIN-RV64-NEXT: .cfi_def_cfa s0, 0 -; ZVFHMIN-RV64-NEXT: andi sp, sp, -32 -; ZVFHMIN-RV64-NEXT: vsetivli zero, 16, e16, m1, ta, mu -; ZVFHMIN-RV64-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV64-NEXT: mv a1, sp -; ZVFHMIN-RV64-NEXT: vse16.v v8, (a1) -; ZVFHMIN-RV64-NEXT: flh fa5, 2(sp) -; ZVFHMIN-RV64-NEXT: flh fa4, 0(sp) -; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-RV64-NEXT: flh fa5, 4(sp) -; ZVFHMIN-RV64-NEXT: flh fa3, 6(sp) -; ZVFHMIN-RV64-NEXT: fmv.x.h a3, fa4 -; ZVFHMIN-RV64-NEXT: flh fa4, 8(sp) -; ZVFHMIN-RV64-NEXT: fmv.x.h a4, fa5 -; ZVFHMIN-RV64-NEXT: fmv.x.h a5, fa3 -; ZVFHMIN-RV64-NEXT: flh fa5, 10(sp) -; ZVFHMIN-RV64-NEXT: fmv.x.h a6, fa4 -; ZVFHMIN-RV64-NEXT: lui a1, 1048568 -; ZVFHMIN-RV64-NEXT: xor a2, a2, a1 -; ZVFHMIN-RV64-NEXT: fmv.x.h a7, fa5 -; ZVFHMIN-RV64-NEXT: flh fa5, 12(sp) -; ZVFHMIN-RV64-NEXT: lui t0, 8 -; ZVFHMIN-RV64-NEXT: xor a3, a3, t0 -; ZVFHMIN-RV64-NEXT: vmv.v.x v8, a3 -; ZVFHMIN-RV64-NEXT: fmv.x.h a3, fa5 -; ZVFHMIN-RV64-NEXT: flh fa5, 14(sp) -; ZVFHMIN-RV64-NEXT: vslide1down.vx v8, v8, a2 -; ZVFHMIN-RV64-NEXT: xor a4, a4, a1 -; ZVFHMIN-RV64-NEXT: vslide1down.vx v8, v8, a4 -; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-RV64-NEXT: flh fa5, 18(sp) -; ZVFHMIN-RV64-NEXT: xor a5, a5, a1 -; ZVFHMIN-RV64-NEXT: vslide1down.vx v8, v8, a5 -; ZVFHMIN-RV64-NEXT: xor a4, a6, a1 -; ZVFHMIN-RV64-NEXT: fmv.x.h a5, fa5 -; ZVFHMIN-RV64-NEXT: flh fa5, 16(sp) -; ZVFHMIN-RV64-NEXT: vslide1down.vx v8, v8, a4 -; ZVFHMIN-RV64-NEXT: xor a4, a7, a1 -; ZVFHMIN-RV64-NEXT: vslide1down.vx v8, v8, a4 -; ZVFHMIN-RV64-NEXT: fmv.x.h a4, fa5 -; ZVFHMIN-RV64-NEXT: flh fa5, 20(sp) -; ZVFHMIN-RV64-NEXT: xor a3, a3, a1 -; ZVFHMIN-RV64-NEXT: vslide1down.vx v8, v8, a3 -; ZVFHMIN-RV64-NEXT: xor a2, a2, a1 -; ZVFHMIN-RV64-NEXT: fmv.x.h a3, fa5 -; ZVFHMIN-RV64-NEXT: flh fa5, 22(sp) -; ZVFHMIN-RV64-NEXT: vslide1down.vx v8, v8, a2 -; ZVFHMIN-RV64-NEXT: xor a2, a4, t0 -; ZVFHMIN-RV64-NEXT: vmv.v.x v9, a2 -; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-RV64-NEXT: flh fa5, 24(sp) -; ZVFHMIN-RV64-NEXT: xor a5, a5, a1 -; ZVFHMIN-RV64-NEXT: vslide1down.vx v9, v9, a5 -; ZVFHMIN-RV64-NEXT: xor a3, a3, a1 -; ZVFHMIN-RV64-NEXT: fmv.x.h a4, fa5 -; ZVFHMIN-RV64-NEXT: flh fa5, 26(sp) -; ZVFHMIN-RV64-NEXT: vslide1down.vx v9, v9, a3 -; ZVFHMIN-RV64-NEXT: xor a2, a2, a1 -; ZVFHMIN-RV64-NEXT: vslide1down.vx v9, v9, a2 -; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-RV64-NEXT: flh fa5, 28(sp) -; ZVFHMIN-RV64-NEXT: xor a4, a4, a1 -; ZVFHMIN-RV64-NEXT: vslide1down.vx v9, 
v9, a4 -; ZVFHMIN-RV64-NEXT: xor a2, a2, a1 -; ZVFHMIN-RV64-NEXT: fmv.x.h a3, fa5 -; ZVFHMIN-RV64-NEXT: flh fa5, 30(sp) -; ZVFHMIN-RV64-NEXT: vslide1down.vx v9, v9, a2 -; ZVFHMIN-RV64-NEXT: xor a3, a3, a1 -; ZVFHMIN-RV64-NEXT: vslide1down.vx v9, v9, a3 -; ZVFHMIN-RV64-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-RV64-NEXT: xor a1, a2, a1 -; ZVFHMIN-RV64-NEXT: li a2, 255 -; ZVFHMIN-RV64-NEXT: vmv.s.x v0, a2 -; ZVFHMIN-RV64-NEXT: vslide1down.vx v9, v9, a1 -; ZVFHMIN-RV64-NEXT: vslidedown.vi v9, v8, 8, v0.t -; ZVFHMIN-RV64-NEXT: vse16.v v9, (a0) -; ZVFHMIN-RV64-NEXT: addi sp, s0, -64 -; ZVFHMIN-RV64-NEXT: ld ra, 56(sp) # 8-byte Folded Reload -; ZVFHMIN-RV64-NEXT: ld s0, 48(sp) # 8-byte Folded Reload -; ZVFHMIN-RV64-NEXT: addi sp, sp, 64 -; ZVFHMIN-RV64-NEXT: ret +; ZVFHMIN-LABEL: fneg_v16f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m1, ta, ma +; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 +; ZVFHMIN-NEXT: vse16.v v8, (a0) +; ZVFHMIN-NEXT: ret %a = load <16 x half>, ptr %x %b = fneg <16 x half> %a store <16 x half> %b, ptr %x @@ -5286,60 +4931,21 @@ define void @fmsub_vf_v8f16(ptr %x, ptr %y, half %z) { ; ; ZVFHMIN-LABEL: fmsub_vf_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: addi sp, sp, -16 -; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, mu -; ZVFHMIN-NEXT: vle16.v v9, (a1) +; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vle16.v v8, (a0) -; ZVFHMIN-NEXT: mv a1, sp -; ZVFHMIN-NEXT: vse16.v v9, (a1) -; ZVFHMIN-NEXT: flh fa5, 2(sp) -; ZVFHMIN-NEXT: flh fa4, 0(sp) +; ZVFHMIN-NEXT: vle16.v v9, (a1) ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-NEXT: flh fa5, 4(sp) -; ZVFHMIN-NEXT: fmv.x.h a3, fa4 -; ZVFHMIN-NEXT: vmv.v.x v9, a1 -; ZVFHMIN-NEXT: lui a1, 1048568 -; ZVFHMIN-NEXT: fmv.x.h a4, fa5 -; ZVFHMIN-NEXT: flh fa5, 6(sp) -; ZVFHMIN-NEXT: lui a5, 8 -; ZVFHMIN-NEXT: xor a3, a3, a5 -; ZVFHMIN-NEXT: vmv.v.x v10, a3 -; ZVFHMIN-NEXT: fmv.x.h a3, fa5 -; ZVFHMIN-NEXT: flh fa5, 10(sp) -; ZVFHMIN-NEXT: xor a2, a2, a1 -; ZVFHMIN-NEXT: vslide1down.vx v10, v10, a2 -; ZVFHMIN-NEXT: xor a4, a4, a1 -; ZVFHMIN-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-NEXT: flh fa5, 8(sp) -; ZVFHMIN-NEXT: vslide1down.vx v10, v10, a4 -; ZVFHMIN-NEXT: xor a3, a3, a1 -; ZVFHMIN-NEXT: vslide1down.vx v10, v10, a3 -; ZVFHMIN-NEXT: fmv.x.h a3, fa5 -; ZVFHMIN-NEXT: flh fa5, 12(sp) -; ZVFHMIN-NEXT: xor a2, a2, a1 -; ZVFHMIN-NEXT: xor a3, a3, a5 -; ZVFHMIN-NEXT: vmv.v.x v11, a3 -; ZVFHMIN-NEXT: fmv.x.h a3, fa5 -; ZVFHMIN-NEXT: flh fa5, 14(sp) -; ZVFHMIN-NEXT: vslide1down.vx v11, v11, a2 -; ZVFHMIN-NEXT: xor a3, a3, a1 -; ZVFHMIN-NEXT: vslide1down.vx v11, v11, a3 -; ZVFHMIN-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-NEXT: xor a1, a2, a1 -; ZVFHMIN-NEXT: vmv.v.i v0, 15 -; ZVFHMIN-NEXT: vslide1down.vx v11, v11, a1 -; ZVFHMIN-NEXT: vslidedown.vi v11, v10, 4, v0.t -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v11 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 +; ZVFHMIN-NEXT: vmv.v.x v10, a1 +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vxor.vx v9, v9, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v8, v11, v10 +; ZVFHMIN-NEXT: vfmadd.vv v8, v9, v11 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v8 ; ZVFHMIN-NEXT: vse16.v v9, (a0) -; ZVFHMIN-NEXT: addi sp, sp, 16 ; ZVFHMIN-NEXT: ret %a = load <8 x half>, ptr %x %b = 
load <8 x half>, ptr %y @@ -5363,65 +4969,26 @@ define void @fmsub_vf_v6f16(ptr %x, ptr %y, half %z) { ; ; ZVFHMIN-LABEL: fmsub_vf_v6f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: addi sp, sp, -16 -; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 ; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vle16.v v9, (a1) ; ZVFHMIN-NEXT: vle16.v v8, (a0) -; ZVFHMIN-NEXT: mv a1, sp -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, mu -; ZVFHMIN-NEXT: vse16.v v9, (a1) -; ZVFHMIN-NEXT: flh fa5, 2(sp) -; ZVFHMIN-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-NEXT: flh fa5, 0(sp) -; ZVFHMIN-NEXT: fmv.x.h a3, fa5 -; ZVFHMIN-NEXT: li a4, 192 -; ZVFHMIN-NEXT: vmv.s.x v0, a4 -; ZVFHMIN-NEXT: fmv.x.h a4, fa5 -; ZVFHMIN-NEXT: flh fa5, 4(sp) -; ZVFHMIN-NEXT: vmv.v.x v9, a1 -; ZVFHMIN-NEXT: vmerge.vxm v9, v9, a3, v0 -; ZVFHMIN-NEXT: lui a1, 1048568 -; ZVFHMIN-NEXT: fmv.x.h a3, fa5 -; ZVFHMIN-NEXT: flh fa5, 6(sp) -; ZVFHMIN-NEXT: lui a5, 8 -; ZVFHMIN-NEXT: xor a4, a4, a5 -; ZVFHMIN-NEXT: vmv.v.x v10, a4 -; ZVFHMIN-NEXT: fmv.x.h a4, fa5 -; ZVFHMIN-NEXT: flh fa5, 10(sp) -; ZVFHMIN-NEXT: xor a2, a2, a1 -; ZVFHMIN-NEXT: vslide1down.vx v10, v10, a2 -; ZVFHMIN-NEXT: xor a3, a3, a1 -; ZVFHMIN-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-NEXT: flh fa5, 8(sp) -; ZVFHMIN-NEXT: vslide1down.vx v10, v10, a3 -; ZVFHMIN-NEXT: xor a4, a4, a1 -; ZVFHMIN-NEXT: vslide1down.vx v10, v10, a4 -; ZVFHMIN-NEXT: fmv.x.h a3, fa5 -; ZVFHMIN-NEXT: flh fa5, 12(sp) -; ZVFHMIN-NEXT: xor a2, a2, a1 -; ZVFHMIN-NEXT: xor a3, a3, a5 -; ZVFHMIN-NEXT: vmv.v.x v11, a3 -; ZVFHMIN-NEXT: fmv.x.h a3, fa5 -; ZVFHMIN-NEXT: flh fa5, 14(sp) -; ZVFHMIN-NEXT: vslide1down.vx v11, v11, a2 -; ZVFHMIN-NEXT: xor a3, a3, a1 -; ZVFHMIN-NEXT: vslide1down.vx v11, v11, a3 -; ZVFHMIN-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-NEXT: xor a1, a2, a1 -; ZVFHMIN-NEXT: vmv.v.i v0, 15 -; ZVFHMIN-NEXT: vslide1down.vx v11, v11, a1 -; ZVFHMIN-NEXT: vslidedown.vi v11, v10, 4, v0.t -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v11 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9 +; ZVFHMIN-NEXT: vle16.v v9, (a1) +; ZVFHMIN-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-NEXT: li a2, 192 +; ZVFHMIN-NEXT: vmv.s.x v0, a2 +; ZVFHMIN-NEXT: fmv.x.h a2, fa0 +; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v10, a2 +; ZVFHMIN-NEXT: vmerge.vxm v10, v10, a1, v0 +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vxor.vx v9, v9, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v10 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v9, v11, v10 ; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: vse16.v v8, (a0) -; ZVFHMIN-NEXT: addi sp, sp, 16 ; ZVFHMIN-NEXT: ret %a = load <6 x half>, ptr %x %b = load <6 x half>, ptr %y From e494e2a29449a5ce7fce16b5dc1d0033b1ba69e8 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 18 Sep 2024 18:29:32 -0700 Subject: [PATCH 141/321] [MachineVerifier] Improve G_EXTRACT_SUBVECTOR checking (#109202) Check that the destination of G_EXTRACT_SUBVECTOR is smaller than the source. Improve wording of error messages. 
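
For reference, the combined rule the verifier now enforces for
G_EXTRACT_SUBVECTOR can be summarized by the following standalone C++
sketch. It is illustrative only: the helper name `isLegalExtract` is made
up, but the ElementCount calls mirror the ones used in the hunks below.

  #include "llvm/Support/TypeSize.h"
  #include <cstdint>

  // Returns true when extracting DstEC elements at index Idx from a
  // source with SrcEC elements satisfies the verifier's checks.
  static bool isLegalExtract(llvm::ElementCount DstEC,
                             llvm::ElementCount SrcEC, uint64_t Idx) {
    // The destination must be smaller than the source vector ...
    if (llvm::ElementCount::isKnownGT(DstEC, SrcEC))
      return false;
    uint64_t DstMin = DstEC.getKnownMinValue();
    uint64_t SrcMin = SrcEC.getKnownMinValue();
    // ... the index must be a multiple of the destination length ...
    if (Idx % DstMin != 0)
      return false;
    // ... and the extract must not overrun the source vector.
    return Idx < SrcMin && Idx + DstMin <= SrcMin;
  }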
--- llvm/lib/CodeGen/MachineVerifier.cpp | 15 ++++++++++----- .../test_g_extract_subvector.mir | 17 ++++++++++------- 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/llvm/lib/CodeGen/MachineVerifier.cpp b/llvm/lib/CodeGen/MachineVerifier.cpp index 1fcbeeec6f64cc..651de06cfac25d 100644 --- a/llvm/lib/CodeGen/MachineVerifier.cpp +++ b/llvm/lib/CodeGen/MachineVerifier.cpp @@ -1769,7 +1769,7 @@ void MachineVerifier::verifyPreISelGenericInstruction(const MachineInstr *MI) { } if (!SrcTy.isVector()) { - report("First source must be a vector", MI); + report("Source must be a vector", MI); break; } @@ -1783,6 +1783,12 @@ void MachineVerifier::verifyPreISelGenericInstruction(const MachineInstr *MI) { break; } + if (ElementCount::isKnownGT(DstTy.getElementCount(), + SrcTy.getElementCount())) { + report("Destination vector must be smaller than source vector", MI); + break; + } + uint64_t Idx = IndexOp.getImm(); uint64_t DstMinLen = DstTy.getElementCount().getKnownMinValue(); if (Idx % DstMinLen != 0) { @@ -1793,10 +1799,9 @@ void MachineVerifier::verifyPreISelGenericInstruction(const MachineInstr *MI) { } uint64_t SrcMinLen = SrcTy.getElementCount().getKnownMinValue(); - if (SrcTy.isScalable() == DstTy.isScalable() && - (Idx >= SrcMinLen || Idx + DstMinLen > SrcMinLen)) { - report("Source type and index must not cause extract to overrun to the " - "destination type", + if (Idx >= SrcMinLen || Idx + DstMinLen > SrcMinLen) { + report("Destination type and index must not cause extract to overrun the " + "source vector", MI); break; } diff --git a/llvm/test/MachineVerifier/test_g_extract_subvector.mir b/llvm/test/MachineVerifier/test_g_extract_subvector.mir index 6a0b7ebfb4b0b2..3b9c8314f7ceca 100644 --- a/llvm/test/MachineVerifier/test_g_extract_subvector.mir +++ b/llvm/test/MachineVerifier/test_g_extract_subvector.mir @@ -10,6 +10,7 @@ body: | %1:_() = G_IMPLICIT_DEF %2:_() = G_IMPLICIT_DEF + ; CHECK: generic instruction must use register operands ; CHECK: G_EXTRACT_SUBVECTOR first source must be a register %3:_() = G_EXTRACT_SUBVECTOR 1, 0 @@ -19,7 +20,7 @@ body: | ; CHECK: Destination type must be a vector %5:_(s32) = G_EXTRACT_SUBVECTOR %2, 0 - ; CHECK: First source must be a vector + ; CHECK: Source must be a vector %6:_() = G_EXTRACT_SUBVECTOR %0, 0 %7:_() = G_IMPLICIT_DEF @@ -27,27 +28,27 @@ body: | ; CHECK: Element type of vectors must be the same %8:_() = G_EXTRACT_SUBVECTOR %7, 0 - ; CHECK: Index must be a multiple of the destination vector's minimum vector length + ; CHECK: Destination vector must be smaller than source vector %9:_() = G_EXTRACT_SUBVECTOR %1, 3 - ; CHECK: Index must be a multiple of the destination vector's minimum vector length + ; CHECK: Destination vector must be smaller than source vector %10:_() = G_EXTRACT_SUBVECTOR %1, 2 - ; CHECK: Source type and index must not cause extract to overrun to the destination type + ; CHECK: Destination type and index must not cause extract to overrun the source vector %11:_() = G_EXTRACT_SUBVECTOR %1, 4 %12:_() = G_IMPLICIT_DEF - ; CHECK: Source type and index must not cause extract to overrun to the destination type + ; CHECK: Destination type and index must not cause extract to overrun the source vector %13:_() = G_EXTRACT_SUBVECTOR %12, 3 %14:_(<2 x s32>) = G_IMPLICIT_DEF %15:_(<4 x s32>) = G_IMPLICIT_DEF - ; CHECK: Source type and index must not cause extract to overrun to the destination type + ; CHECK: Destination type and index must not cause extract to overrun the source vector %16:_(<2 x s32>) = 
G_EXTRACT_SUBVECTOR %14, 4

-    ; CHECK: Source type and index must not cause extract to overrun to the destination type
+    ; CHECK: Destination type and index must not cause extract to overrun the source vector
     %17:_(<3 x s32>) = G_EXTRACT_SUBVECTOR %15, 3

     ; CHECK: Vector types must both be fixed or both be scalable
@@ -56,5 +57,7 @@ body: |
     ; CHECK: Vector types must both be fixed or both be scalable
     %19:_(<2 x s32>) = G_EXTRACT_SUBVECTOR %12, 0

+    ; CHECK: Index must be a multiple of the destination vector's minimum vector length
+    %20:_() = G_EXTRACT_SUBVECTOR %12, 1
 ...

From 009398b3b37f7d653b4371120944f74cad934992 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Wed, 18 Sep 2024 18:31:16 -0700
Subject: [PATCH 142/321] [MachineVerifier] Improve checks for
 G_INSERT_SUBVECTOR. (#109209)

-Improve messages.
-Remove redundant checks that are handled in generic code.
-Add check that the subvector is smaller than the vector.
-Add check that the insert does not overrun the vector being inserted into.
---
 llvm/lib/CodeGen/MachineVerifier.cpp          | 32 ++++++++++++-------
 .../test_g_insert_subvector.mir               | 28 +++++++++++++---
 2 files changed, 45 insertions(+), 15 deletions(-)

diff --git a/llvm/lib/CodeGen/MachineVerifier.cpp b/llvm/lib/CodeGen/MachineVerifier.cpp
index 651de06cfac25d..27664207d1e696 100644
--- a/llvm/lib/CodeGen/MachineVerifier.cpp
+++ b/llvm/lib/CodeGen/MachineVerifier.cpp
@@ -1710,7 +1710,6 @@ void MachineVerifier::verifyPreISelGenericInstruction(const MachineInstr *MI) {
     }

     LLT DstTy = MRI->getType(MI->getOperand(0).getReg());
-    LLT Src0Ty = MRI->getType(Src0Op.getReg());
     LLT Src1Ty = MRI->getType(Src1Op.getReg());

     if (!DstTy.isVector()) {
@@ -1718,33 +1717,44 @@ void MachineVerifier::verifyPreISelGenericInstruction(const MachineInstr *MI) {
       break;
     }

-    if (!Src0Ty.isVector()) {
-      report("First source must be a vector", MI);
+    if (!Src1Ty.isVector()) {
+      report("Second source must be a vector", MI);
       break;
     }

-    if (!Src1Ty.isVector()) {
-      report("Second source must be a vector", MI);
+    if (DstTy.getElementType() != Src1Ty.getElementType()) {
+      report("Element type of vectors must be the same", MI);
       break;
     }

-    if (DstTy != Src0Ty) {
-      report("Destination type must match the first source vector type", MI);
+    if (Src1Ty.isScalable() != DstTy.isScalable()) {
+      report("Vector types must both be fixed or both be scalable", MI);
       break;
     }

-    if (Src0Ty.getElementType() != Src1Ty.getElementType()) {
-      report("Element type of source vectors must be the same", MI);
+    if (ElementCount::isKnownGT(Src1Ty.getElementCount(),
+                                DstTy.getElementCount())) {
+      report("Second source must be smaller than destination vector", MI);
       break;
     }

-    if (IndexOp.getImm() != 0 &&
-        IndexOp.getImm() % Src1Ty.getElementCount().getKnownMinValue() != 0) {
+    uint64_t Idx = IndexOp.getImm();
+    uint64_t Src1MinLen = Src1Ty.getElementCount().getKnownMinValue();
+    if (IndexOp.getImm() % Src1MinLen != 0) {
       report("Index must be a multiple of the second source vector's "
              "minimum vector length",
              MI);
       break;
     }
+
+    uint64_t DstMinLen = DstTy.getElementCount().getKnownMinValue();
+    if (Idx >= DstMinLen || Idx + Src1MinLen > DstMinLen) {
+      report("Subvector type and index must not cause insert to overrun the "
+             "vector being inserted into",
+             MI);
+      break;
+    }
+
     break;
   }
   case TargetOpcode::G_EXTRACT_SUBVECTOR: {
diff --git a/llvm/test/MachineVerifier/test_g_insert_subvector.mir b/llvm/test/MachineVerifier/test_g_insert_subvector.mir
index 62ddd28919b205..84cb249d74eb79 100644
--- a/llvm/test/MachineVerifier/test_g_insert_subvector.mir
+++ 
b/llvm/test/MachineVerifier/test_g_insert_subvector.mir @@ -11,9 +11,11 @@ body: | %1:_() = G_IMPLICIT_DEF %2:_() = G_IMPLICIT_DEF + ; CHECK: generic instruction must use register operands ; CHECK: G_INSERT_SUBVECTOR first source must be a register %3:_() = G_INSERT_SUBVECTOR 1, %2, 0 + ; CHECK: generic instruction must use register operands ; CHECK: G_INSERT_SUBVECTOR second source must be a register %4:_() = G_INSERT_SUBVECTOR %1, 1, 0 @@ -23,18 +25,18 @@ body: | ; CHECK: Destination type must be a vector %6:_(s32) = G_INSERT_SUBVECTOR %1, %2, 0 - ; CHECK: First source must be a vector + ; CHECK: Type mismatch in generic instruction %7:_() = G_INSERT_SUBVECTOR %0, %2, 0 ; CHECK: Second source must be a vector %8:_() = G_INSERT_SUBVECTOR %1, %0, 0 - ; CHECK: Destination type must match the first source vector type + ; CHECK: Type mismatch in generic instruction %9:_() = G_INSERT_SUBVECTOR %2, %1, 0 %10:_() = G_IMPLICIT_DEF - ; CHECK: Element type of source vectors must be the same + ; CHECK: Element type of vectors must be the same %11:_() = G_INSERT_SUBVECTOR %1, %10, 0 %12:_() = G_IMPLICIT_DEF @@ -43,5 +45,23 @@ body: | %13:_() = G_INSERT_SUBVECTOR %12, %1, 3 ; CHECK: Index must be a multiple of the second source vector's minimum vector length - %13:_() = G_INSERT_SUBVECTOR %12, %1, 1 + %14:_() = G_INSERT_SUBVECTOR %12, %1, 1 + + %15:_() = G_IMPLICIT_DEF + + ; CHECK: Second source must be smaller than destination vector + %16:_() = G_INSERT_SUBVECTOR %1, %15, 0 + + ; CHECK: Subvector type and index must not cause insert to overrun the vector being inserted into + %17:_() = G_INSERT_SUBVECTOR %12, %1, 4 + + %18:_() = G_IMPLICIT_DEF + + ; CHECK: Subvector type and index must not cause insert to overrun the vector being inserted into + %19:_() = G_INSERT_SUBVECTOR %18, %1, 2 + + %20:_(<2 x s32>) = G_IMPLICIT_DEF + + ; CHECK: Vector types must both be fixed or both be scalable + %21:_() = G_INSERT_SUBVECTOR %12, %20, 2 ... From 87dc3e89e72bb8d42d742c6a916f5fdee0bf853b Mon Sep 17 00:00:00 2001 From: Jianjian Guan Date: Thu, 19 Sep 2024 10:33:36 +0800 Subject: [PATCH 143/321] [mlir][LLVMIR] Add more vector predication intrinsic ops (#107663) This revision adds vector predication smax, smin, umax and umin intrinsic ops. 
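
As a usage sketch (not part of this change), the new ops can be built from
C++ like any other generated LLVM dialect operation. This assumes the usual
ODS convention that the `LLVM_VPSMaxOp` definition below yields the class
`mlir::LLVM::VPSMaxOp` with the standard generated builders; the helper
name is made up.

  #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
  #include "mlir/IR/Builders.h"

  // Emits "llvm.intr.vp.smax"(%lhs, %rhs, %mask, %evl); on translation to
  // LLVM IR this becomes a call to the @llvm.vp.smax.* intrinsic.
  static mlir::Value buildVPSMax(mlir::OpBuilder &b, mlir::Location loc,
                                 mlir::Value lhs, mlir::Value rhs,
                                 mlir::Value mask, mlir::Value evl) {
    return b.create<mlir::LLVM::VPSMaxOp>(loc, lhs.getType(), lhs, rhs,
                                          mask, evl);
  }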
--- .../mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td | 4 ++++ mlir/test/Dialect/LLVMIR/roundtrip.mlir | 19 +++++++++++++++++++ mlir/test/Target/LLVMIR/Import/intrinsic.ll | 12 ++++++++++++ .../test/Target/LLVMIR/llvmir-intrinsics.mlir | 16 ++++++++++++++++ 4 files changed, 51 insertions(+) diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td index 3822eb3b3f1f6c..5031426033aea1 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td @@ -1156,6 +1156,10 @@ def LLVM_VPShlOp : LLVM_VPBinaryI<"shl">; def LLVM_VPOrOp : LLVM_VPBinaryI<"or">; def LLVM_VPAndOp : LLVM_VPBinaryI<"and">; def LLVM_VPXorOp : LLVM_VPBinaryI<"xor">; +def LLVM_VPSMaxOp : LLVM_VPBinaryI<"smax">; +def LLVM_VPSMinOp : LLVM_VPBinaryI<"smin">; +def LLVM_VPUMaxOp : LLVM_VPBinaryI<"umax">; +def LLVM_VPUMinOp : LLVM_VPBinaryI<"umin">; // Float Binary def LLVM_VPFAddOp : LLVM_VPBinaryF<"fadd">; diff --git a/mlir/test/Dialect/LLVMIR/roundtrip.mlir b/mlir/test/Dialect/LLVMIR/roundtrip.mlir index ff16bb0f857dda..0b251b81e97870 100644 --- a/mlir/test/Dialect/LLVMIR/roundtrip.mlir +++ b/mlir/test/Dialect/LLVMIR/roundtrip.mlir @@ -729,3 +729,22 @@ llvm.func @test_notail() -> i32 { %0 = llvm.call notail @tail_call_target() : () -> i32 llvm.return %0 : i32 } + +// CHECK-LABEL: @vector_predication_intrinsics +// CHECK-SAME: (%[[ARG0:.*]]: vector<8xi32>, %[[ARG1:.*]]: vector<8xi32>, %[[ARG2:.*]]: vector<8xi1>, %[[ARG3:.*]]: i32) +llvm.func @vector_predication_intrinsics(%A: vector<8xi32>, %B: vector<8xi32>, + %mask: vector<8xi1>, %evl: i32) { + // CHECK-NEXT: "llvm.intr.vp.smax"(%[[ARG0]], %[[ARG1]], %[[ARG2]], %[[ARG3]]) + "llvm.intr.vp.smax" (%A, %B, %mask, %evl) : + (vector<8xi32>, vector<8xi32>, vector<8xi1>, i32) -> vector<8xi32> + // CHECK-NEXT: "llvm.intr.vp.smin"(%[[ARG0]], %[[ARG1]], %[[ARG2]], %[[ARG3]]) + "llvm.intr.vp.smin" (%A, %B, %mask, %evl) : + (vector<8xi32>, vector<8xi32>, vector<8xi1>, i32) -> vector<8xi32> + // CHECK-NEXT: "llvm.intr.vp.umax"(%[[ARG0]], %[[ARG1]], %[[ARG2]], %[[ARG3]]) + "llvm.intr.vp.umax" (%A, %B, %mask, %evl) : + (vector<8xi32>, vector<8xi32>, vector<8xi1>, i32) -> vector<8xi32> + // CHECK-NEXT: "llvm.intr.vp.umin"(%[[ARG0]], %[[ARG1]], %[[ARG2]], %[[ARG3]]) + "llvm.intr.vp.umin" (%A, %B, %mask, %evl) : + (vector<8xi32>, vector<8xi32>, vector<8xi1>, i32) -> vector<8xi32> + llvm.return +} diff --git a/mlir/test/Target/LLVMIR/Import/intrinsic.ll b/mlir/test/Target/LLVMIR/Import/intrinsic.ll index 0fa82cef0a0f5a..2fc2c3c6c32ffa 100644 --- a/mlir/test/Target/LLVMIR/Import/intrinsic.ll +++ b/mlir/test/Target/LLVMIR/Import/intrinsic.ll @@ -897,6 +897,14 @@ define void @vector_predication_intrinsics(<8 x i32> %0, <8 x i32> %1, <8 x floa %59 = call <8 x ptr> @llvm.vp.inttoptr.v8p0.v8i64(<8 x i64> %4, <8 x i1> %11, i32 %12) ; CHECK: "llvm.intr.vp.fmuladd"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (vector<8xf32>, vector<8xf32>, vector<8xf32>, vector<8xi1>, i32) -> vector<8xf32> %60 = call <8 x float> @llvm.vp.fmuladd.v8f32(<8 x float> %2, <8 x float> %3, <8 x float> %3, <8 x i1> %11, i32 %12) + ; CHECK: "llvm.intr.vp.smax"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (vector<8xi32>, vector<8xi32>, vector<8xi1>, i32) -> vector<8xi32> + %61 = call <8 x i32> @llvm.vp.smax.v8i32(<8 x i32> %0, <8 x i32> %1, <8 x i1> %11, i32 %12) + ; CHECK: "llvm.intr.vp.smin"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (vector<8xi32>, vector<8xi32>, vector<8xi1>, i32) -> vector<8xi32> + %62 = call <8 x i32> 
@llvm.vp.smin.v8i32(<8 x i32> %0, <8 x i32> %1, <8 x i1> %11, i32 %12) + ; CHECK: "llvm.intr.vp.umax"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (vector<8xi32>, vector<8xi32>, vector<8xi1>, i32) -> vector<8xi32> + %63 = call <8 x i32> @llvm.vp.umax.v8i32(<8 x i32> %0, <8 x i32> %1, <8 x i1> %11, i32 %12) + ; CHECK: "llvm.intr.vp.umin"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (vector<8xi32>, vector<8xi32>, vector<8xi1>, i32) -> vector<8xi32> + %64 = call <8 x i32> @llvm.vp.umin.v8i32(<8 x i32> %0, <8 x i32> %1, <8 x i1> %11, i32 %12) ret void } @@ -1113,6 +1121,10 @@ declare <8 x float> @llvm.vp.frem.v8f32(<8 x float>, <8 x float>, <8 x i1>, i32) declare <8 x float> @llvm.vp.fneg.v8f32(<8 x float>, <8 x i1>, i32) declare <8 x float> @llvm.vp.fma.v8f32(<8 x float>, <8 x float>, <8 x float>, <8 x i1>, i32) declare <8 x float> @llvm.vp.fmuladd.v8f32(<8 x float>, <8 x float>, <8 x float>, <8 x i1>, i32) +declare <8 x i32> @llvm.vp.smax.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) +declare <8 x i32> @llvm.vp.smin.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) +declare <8 x i32> @llvm.vp.umax.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) +declare <8 x i32> @llvm.vp.umin.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) declare i32 @llvm.vp.reduce.add.v8i32(i32, <8 x i32>, <8 x i1>, i32) declare i32 @llvm.vp.reduce.mul.v8i32(i32, <8 x i32>, <8 x i1>, i32) declare i32 @llvm.vp.reduce.and.v8i32(i32, <8 x i32>, <8 x i1>, i32) diff --git a/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir b/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir index e2eadf14fc97e9..de0dc8d21584fe 100644 --- a/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir +++ b/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir @@ -798,6 +798,18 @@ llvm.func @vector_predication_intrinsics(%A: vector<8xi32>, %B: vector<8xi32>, // CHECK: call <8 x i32> @llvm.vp.xor.v8i32 "llvm.intr.vp.xor" (%A, %B, %mask, %evl) : (vector<8xi32>, vector<8xi32>, vector<8xi1>, i32) -> vector<8xi32> + // CHECK: call <8 x i32> @llvm.vp.smax.v8i32 + "llvm.intr.vp.smax" (%A, %B, %mask, %evl) : + (vector<8xi32>, vector<8xi32>, vector<8xi1>, i32) -> vector<8xi32> + // CHECK: call <8 x i32> @llvm.vp.smin.v8i32 + "llvm.intr.vp.smin" (%A, %B, %mask, %evl) : + (vector<8xi32>, vector<8xi32>, vector<8xi1>, i32) -> vector<8xi32> + // CHECK: call <8 x i32> @llvm.vp.umax.v8i32 + "llvm.intr.vp.umax" (%A, %B, %mask, %evl) : + (vector<8xi32>, vector<8xi32>, vector<8xi1>, i32) -> vector<8xi32> + // CHECK: call <8 x i32> @llvm.vp.umin.v8i32 + "llvm.intr.vp.umin" (%A, %B, %mask, %evl) : + (vector<8xi32>, vector<8xi32>, vector<8xi1>, i32) -> vector<8xi32> // CHECK: call <8 x float> @llvm.vp.fadd.v8f32 "llvm.intr.vp.fadd" (%C, %D, %mask, %evl) : @@ -1123,6 +1135,10 @@ llvm.func @experimental_constrained_fptrunc(%s: f64, %v: vector<4xf32>) { // CHECK-DAG: declare <8 x i32> @llvm.vp.or.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) // CHECK-DAG: declare <8 x i32> @llvm.vp.and.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) // CHECK-DAG: declare <8 x i32> @llvm.vp.xor.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) +// CHECK-DAG: declare <8 x i32> @llvm.vp.smax.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) +// CHECK-DAG: declare <8 x i32> @llvm.vp.smin.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) +// CHECK-DAG: declare <8 x i32> @llvm.vp.umax.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) +// CHECK-DAG: declare <8 x i32> @llvm.vp.umin.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) // CHECK-DAG: declare <8 x float> @llvm.vp.fadd.v8f32(<8 x float>, <8 x float>, <8 x i1>, i32) // CHECK-DAG: declare <8 x float> @llvm.vp.fsub.v8f32(<8 x float>, <8 x 
float>, <8 x i1>, i32)
// CHECK-DAG: declare <8 x float> @llvm.vp.fmul.v8f32(<8 x float>, <8 x float>, <8 x i1>, i32)

From c9aa9d53b6f92d9780430ab8239ea9117574c95d Mon Sep 17 00:00:00 2001
From: Gedare Bloom
Date: Wed, 18 Sep 2024 20:44:09 -0600
Subject: [PATCH 144/321] [clang-format] Fix regression in BAS_AlwaysBreak
 for-await (#108634)

Fixes #108589.
---
 clang/lib/Format/ContinuationIndenter.cpp | 3 ++-
 clang/unittests/Format/FormatTestJS.cpp   | 5 +++++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/clang/lib/Format/ContinuationIndenter.cpp b/clang/lib/Format/ContinuationIndenter.cpp
index f29f8796ea9290..4e9ae41b566f49 100644
--- a/clang/lib/Format/ContinuationIndenter.cpp
+++ b/clang/lib/Format/ContinuationIndenter.cpp
@@ -809,7 +809,8 @@ void ContinuationIndenter::addTokenOnCurrentLine(LineState &State, bool DryRun,
       if (Tok.Previous->isIf())
         return Style.AlignAfterOpenBracket == FormatStyle::BAS_AlwaysBreak;
       return !Tok.Previous->isOneOf(TT_CastRParen, tok::kw_for, tok::kw_while,
-                                    tok::kw_switch);
+                                    tok::kw_switch) &&
+             !(Style.isJavaScript() && Tok.Previous->is(Keywords.kw_await));
     };
     auto IsFunctionCallParen = [](const FormatToken &Tok) {
       return Tok.is(tok::l_paren) && Tok.ParameterCount > 0 && Tok.Previous &&
diff --git a/clang/unittests/Format/FormatTestJS.cpp b/clang/unittests/Format/FormatTestJS.cpp
index 57c021c76867f7..4b15e7b7da3393 100644
--- a/clang/unittests/Format/FormatTestJS.cpp
+++ b/clang/unittests/Format/FormatTestJS.cpp
@@ -2870,6 +2870,11 @@ TEST_F(FormatTestJS, BreakAfterOpenBracket) {
   verifyFormat("failedUserIds.push(await subscriptioxxxxxxxxxxxxnSubset.map(\n"
               "    subscxxxxxxxxxxxxription => subscription.getUserId()));",
               Style);
+  verifyFormat("for await (const packageId of ops.api.iterateEmbeddedFiles(\n"
+              "    this.getFileId().getDriveFile(),\n"
+              ")) {\n"
+              "}",
+              Style);
 }
 } // namespace format

From cdf447baa50e837961384fab1e4d087da30b6f3d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?=
 =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?=
 =?UTF-8?q?=E3=83=B3=29?=
Date: Wed, 18 Sep 2024 20:22:06 -0700
Subject: [PATCH 145/321] [flang][cuda] Add function to allocate and
 deallocate device module variable (#109213)

This patch adds new runtime entry points that perform the allocation and
deallocation of module allocatable variables with CUDA attributes. When the
allocation is initiated on the host, the descriptor on the device is
synchronized. Both descriptors point to the same data on the device.

This is the first PR of a stack.
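
A minimal host-side usage sketch follows; it mirrors the unit test added
below, with the device-descriptor setup and error handling elided. The
helper name `allocateOnHost` and the example bounds are made up.

  #include "flang/Runtime/CUDA/allocatable.h"
  #include "flang/Runtime/allocatable.h"

  using namespace Fortran::runtime;

  // Allocates a device allocatable from host code; besides allocating,
  // the CUF entry point copies the updated host descriptor to its device
  // counterpart so the two stay in sync.
  static void allocateOnHost(Descriptor &desc) {
    // Shape the descriptor: dimension 0 gets bounds 1:10.
    RTNAME(AllocatableSetBounds)(desc, 0, 1, 10);
    RTNAME(CUFAllocatableAllocate)(
        desc, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__);
  }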
--- .../include/flang/Runtime/CUDA/allocatable.h | 34 +++++++++ flang/include/flang/Runtime/CUDA/allocator.h | 11 --- flang/include/flang/Runtime/CUDA/common.h | 30 ++++++++ flang/include/flang/Runtime/CUDA/descriptor.h | 13 +++- flang/include/flang/Runtime/CUDA/memory.h | 4 -- .../Optimizer/Transforms/CufOpConversion.cpp | 1 + flang/runtime/CUDA/CMakeLists.txt | 4 ++ flang/runtime/CUDA/allocatable.cpp | 71 +++++++++++++++++++ flang/runtime/CUDA/allocator.cpp | 1 + flang/runtime/CUDA/descriptor.cpp | 22 ++++++ flang/unittests/Runtime/CUDA/Allocatable.cpp | 60 ++++++++++++++++ flang/unittests/Runtime/CUDA/CMakeLists.txt | 8 +++ 12 files changed, 242 insertions(+), 17 deletions(-) create mode 100644 flang/include/flang/Runtime/CUDA/allocatable.h create mode 100644 flang/include/flang/Runtime/CUDA/common.h create mode 100644 flang/runtime/CUDA/allocatable.cpp create mode 100644 flang/unittests/Runtime/CUDA/Allocatable.cpp diff --git a/flang/include/flang/Runtime/CUDA/allocatable.h b/flang/include/flang/Runtime/CUDA/allocatable.h new file mode 100644 index 00000000000000..e986ad910a3f3a --- /dev/null +++ b/flang/include/flang/Runtime/CUDA/allocatable.h @@ -0,0 +1,34 @@ +//===-- include/flang/Runtime/CUDA/allocatable.h ----------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef FORTRAN_RUNTIME_CUDA_ALLOCATABLE_H_ +#define FORTRAN_RUNTIME_CUDA_ALLOCATABLE_H_ + +#include "flang/Runtime/descriptor.h" +#include "flang/Runtime/entry-names.h" + +namespace Fortran::runtime::cuda { + +extern "C" { + +/// Perform allocation of the descriptor with synchronization of it when +/// necessary. +int RTDECL(CUFAllocatableAllocate)(Descriptor &, bool hasStat = false, + const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr, + int sourceLine = 0); + +/// Perform deallocation of the descriptor with synchronization of it when +/// necessary. +int RTDECL(CUFAllocatableDeallocate)(Descriptor &, bool hasStat = false, + const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr, + int sourceLine = 0); + +} // extern "C" + +} // namespace Fortran::runtime::cuda +#endif // FORTRAN_RUNTIME_CUDA_ALLOCATABLE_H_ diff --git a/flang/include/flang/Runtime/CUDA/allocator.h b/flang/include/flang/Runtime/CUDA/allocator.h index 4527c9f18fa054..06bda81c6f75ad 100644 --- a/flang/include/flang/Runtime/CUDA/allocator.h +++ b/flang/include/flang/Runtime/CUDA/allocator.h @@ -12,17 +12,6 @@ #include "flang/Runtime/descriptor.h" #include "flang/Runtime/entry-names.h" -#define CUDA_REPORT_IF_ERROR(expr) \ - [](cudaError_t err) { \ - if (err == cudaSuccess) \ - return; \ - const char *name = cudaGetErrorName(err); \ - if (!name) \ - name = ""; \ - Terminator terminator{__FILE__, __LINE__}; \ - terminator.Crash("'%s' failed with '%s'", #expr, name); \ - }(expr) - namespace Fortran::runtime::cuda { extern "C" { diff --git a/flang/include/flang/Runtime/CUDA/common.h b/flang/include/flang/Runtime/CUDA/common.h new file mode 100644 index 00000000000000..cb8681da161f0d --- /dev/null +++ b/flang/include/flang/Runtime/CUDA/common.h @@ -0,0 +1,30 @@ +//===-- include/flang/Runtime/CUDA/common.h ------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef FORTRAN_RUNTIME_CUDA_COMMON_H_ +#define FORTRAN_RUNTIME_CUDA_COMMON_H_ + +#include "flang/Runtime/descriptor.h" +#include "flang/Runtime/entry-names.h" + +static constexpr unsigned kHostToDevice = 0; +static constexpr unsigned kDeviceToHost = 1; +static constexpr unsigned kDeviceToDevice = 2; + +#define CUDA_REPORT_IF_ERROR(expr) \ + [](cudaError_t err) { \ + if (err == cudaSuccess) \ + return; \ + const char *name = cudaGetErrorName(err); \ + if (!name) \ + name = ""; \ + Terminator terminator{__FILE__, __LINE__}; \ + terminator.Crash("'%s' failed with '%s'", #expr, name); \ + }(expr) + +#endif // FORTRAN_RUNTIME_CUDA_COMMON_H_ diff --git a/flang/include/flang/Runtime/CUDA/descriptor.h b/flang/include/flang/Runtime/CUDA/descriptor.h index d593989420420f..93791012fdcc73 100644 --- a/flang/include/flang/Runtime/CUDA/descriptor.h +++ b/flang/include/flang/Runtime/CUDA/descriptor.h @@ -17,14 +17,23 @@ namespace Fortran::runtime::cuda { extern "C" { -// Allocate a descriptor in managed. +/// Allocate a descriptor in managed. Descriptor *RTDECL(CUFAllocDesciptor)( std::size_t, const char *sourceFile = nullptr, int sourceLine = 0); -// Deallocate a descriptor allocated in managed or unified memory. +/// Deallocate a descriptor allocated in managed or unified memory. void RTDECL(CUFFreeDesciptor)( Descriptor *, const char *sourceFile = nullptr, int sourceLine = 0); +/// Retrieve the device pointer from the host one. +void *RTDECL(CUFGetDeviceAddress)( + void *hostPtr, const char *sourceFile = nullptr, int sourceLine = 0); + +/// Sync the \p src descriptor to the \p dst descriptor. 
+void RTDECL(CUFDescriptorSync)(Descriptor *dst, const Descriptor *src, + const char *sourceFile = nullptr, int sourceLine = 0); + } // extern "C" + } // namespace Fortran::runtime::cuda #endif // FORTRAN_RUNTIME_CUDA_DESCRIPTOR_H_ diff --git a/flang/include/flang/Runtime/CUDA/memory.h b/flang/include/flang/Runtime/CUDA/memory.h index 8fd51129e81fe0..33947248dc4831 100644 --- a/flang/include/flang/Runtime/CUDA/memory.h +++ b/flang/include/flang/Runtime/CUDA/memory.h @@ -13,10 +13,6 @@ #include "flang/Runtime/entry-names.h" #include -static constexpr unsigned kHostToDevice = 0; -static constexpr unsigned kDeviceToHost = 1; -static constexpr unsigned kDeviceToDevice = 2; - namespace Fortran::runtime::cuda { extern "C" { diff --git a/flang/lib/Optimizer/Transforms/CufOpConversion.cpp b/flang/lib/Optimizer/Transforms/CufOpConversion.cpp index 03a1eb74343b43..2dc37f4df3aeec 100644 --- a/flang/lib/Optimizer/Transforms/CufOpConversion.cpp +++ b/flang/lib/Optimizer/Transforms/CufOpConversion.cpp @@ -14,6 +14,7 @@ #include "flang/Optimizer/Dialect/FIROps.h" #include "flang/Optimizer/HLFIR/HLFIROps.h" #include "flang/Optimizer/Support/DataLayout.h" +#include "flang/Runtime/CUDA/common.h" #include "flang/Runtime/CUDA/descriptor.h" #include "flang/Runtime/CUDA/memory.h" #include "flang/Runtime/allocatable.h" diff --git a/flang/runtime/CUDA/CMakeLists.txt b/flang/runtime/CUDA/CMakeLists.txt index 490bb369b572f6..803ff01b945dc4 100644 --- a/flang/runtime/CUDA/CMakeLists.txt +++ b/flang/runtime/CUDA/CMakeLists.txt @@ -15,8 +15,12 @@ set(CUFRT_LIBNAME CufRuntime_cuda_${CUDAToolkit_VERSION_MAJOR}) add_flang_library(${CUFRT_LIBNAME} allocator.cpp + allocatable.cpp descriptor.cpp memory.cpp + + LINK_COMPONENTS + Support ) if (BUILD_SHARED_LIBS) diff --git a/flang/runtime/CUDA/allocatable.cpp b/flang/runtime/CUDA/allocatable.cpp new file mode 100644 index 00000000000000..cc0c647c6c9529 --- /dev/null +++ b/flang/runtime/CUDA/allocatable.cpp @@ -0,0 +1,71 @@ +//===-- runtime/CUDA/allocatable.cpp --------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "flang/Runtime/CUDA/allocatable.h" +#include "../stat.h" +#include "../terminator.h" +#include "flang/Runtime/CUDA/common.h" +#include "flang/Runtime/CUDA/descriptor.h" +#include "flang/Runtime/allocatable.h" +#include "llvm/Support/ErrorHandling.h" + +#include "cuda_runtime.h" + +namespace Fortran::runtime::cuda { + +extern "C" { +RT_EXT_API_GROUP_BEGIN + +int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, bool hasStat, + const Descriptor *errMsg, const char *sourceFile, int sourceLine) { + if (desc.HasAddendum()) { + Terminator terminator{sourceFile, sourceLine}; + // TODO: This require a bit more work to set the correct type descriptor + // address + terminator.Crash( + "not yet implemented: CUDA descriptor allocation with addendum"); + } + // Perform the standard allocation. + int stat{RTNAME(AllocatableAllocate)( + desc, hasStat, errMsg, sourceFile, sourceLine)}; +#ifndef RT_DEVICE_COMPILATION + // Descriptor synchronization is only done when the allocation is done + // from the host. 
+  if (stat == StatOk) {
+    void *deviceAddr{
+        RTNAME(CUFGetDeviceAddress)((void *)&desc, sourceFile, sourceLine)};
+    RTNAME(CUFDescriptorSync)
+    ((Descriptor *)deviceAddr, &desc, sourceFile, sourceLine);
+  }
+#endif
+  return stat;
+}
+
+int RTDEF(CUFAllocatableDeallocate)(Descriptor &desc, bool hasStat,
+    const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
+  // Perform the standard deallocation.
+  int stat{RTNAME(AllocatableDeallocate)(
+      desc, hasStat, errMsg, sourceFile, sourceLine)};
+#ifndef RT_DEVICE_COMPILATION
+  // Descriptor synchronization is only done when the deallocation is done
+  // from the host.
+  if (stat == StatOk) {
+    void *deviceAddr{
+        RTNAME(CUFGetDeviceAddress)((void *)&desc, sourceFile, sourceLine)};
+    RTNAME(CUFDescriptorSync)
+    ((Descriptor *)deviceAddr, &desc, sourceFile, sourceLine);
+  }
+#endif
+  return stat;
+}
+
+RT_EXT_API_GROUP_END
+
+} // extern "C"
+
+} // namespace Fortran::runtime::cuda
diff --git a/flang/runtime/CUDA/allocator.cpp b/flang/runtime/CUDA/allocator.cpp
index d4a473d58e86cd..85b3daf65a8ba4 100644
--- a/flang/runtime/CUDA/allocator.cpp
+++ b/flang/runtime/CUDA/allocator.cpp
@@ -13,6 +13,7 @@
 #include "../type-info.h"
 #include "flang/Common/Fortran.h"
 #include "flang/ISO_Fortran_binding_wrapper.h"
+#include "flang/Runtime/CUDA/common.h"
 #include "flang/Runtime/allocator-registry.h"

 #include "cuda_runtime.h"
diff --git a/flang/runtime/CUDA/descriptor.cpp b/flang/runtime/CUDA/descriptor.cpp
index 1031b1e601b646..7ce1429cd94d4a 100644
--- a/flang/runtime/CUDA/descriptor.cpp
+++ b/flang/runtime/CUDA/descriptor.cpp
@@ -7,7 +7,11 @@
 //===----------------------------------------------------------------------===//

 #include "flang/Runtime/CUDA/descriptor.h"
+#include "../terminator.h"
 #include "flang/Runtime/CUDA/allocator.h"
+#include "flang/Runtime/CUDA/common.h"
+
+#include "cuda_runtime.h"

 namespace Fortran::runtime::cuda {
 extern "C" {
@@ -23,6 +27,24 @@ void RTDEF(CUFFreeDesciptor)(
   CUFFreeManaged(reinterpret_cast<void *>(desc));
 }

+void *RTDEF(CUFGetDeviceAddress)(
+    void *hostPtr, const char *sourceFile, int sourceLine) {
+  Terminator terminator{sourceFile, sourceLine};
+  void *p;
+  CUDA_REPORT_IF_ERROR(cudaGetSymbolAddress((void **)&p, hostPtr));
+  if (!p) {
+    terminator.Crash("Could not retrieve symbol's address");
+  }
+  return p;
+}
+
+void RTDEF(CUFDescriptorSync)(Descriptor *dst, const Descriptor *src,
+    const char *sourceFile, int sourceLine) {
+  std::size_t count{src->SizeInBytes()};
+  CUDA_REPORT_IF_ERROR(cudaMemcpy(
+      (void *)dst, (const void *)src, count, cudaMemcpyHostToDevice));
+}
+
 RT_EXT_API_GROUP_END
 }
 } // namespace Fortran::runtime::cuda
diff --git a/flang/unittests/Runtime/CUDA/Allocatable.cpp b/flang/unittests/Runtime/CUDA/Allocatable.cpp
new file mode 100644
index 00000000000000..0f7eb27789316c
--- /dev/null
+++ b/flang/unittests/Runtime/CUDA/Allocatable.cpp
@@ -0,0 +1,60 @@
+//===-- flang/unittests/Runtime/CUDA/Allocatable.cpp ---------------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "flang/Runtime/allocatable.h" +#include "gtest/gtest.h" +#include "../../../runtime/terminator.h" +#include "flang/Common/Fortran.h" +#include "flang/Runtime/CUDA/allocator.h" +#include "flang/Runtime/CUDA/common.h" +#include "flang/Runtime/CUDA/descriptor.h" +#include "flang/Runtime/allocator-registry.h" + +#include "cuda_runtime.h" + +using namespace Fortran::runtime; +using namespace Fortran::runtime::cuda; + +static OwningPtr createAllocatable( + Fortran::common::TypeCategory tc, int kind, int rank = 1) { + return Descriptor::Create(TypeCode{tc, kind}, kind, nullptr, rank, nullptr, + CFI_attribute_allocatable); +} + +TEST(AllocatableCUFTest, SimpleDeviceAllocatable) { + using Fortran::common::TypeCategory; + RTNAME(CUFRegisterAllocator)(); + // REAL(4), DEVICE, ALLOCATABLE :: a(:) + auto a{createAllocatable(TypeCategory::Real, 4)}; + a->SetAllocIdx(kDeviceAllocatorPos); + EXPECT_EQ((int)kDeviceAllocatorPos, a->GetAllocIdx()); + EXPECT_FALSE(a->HasAddendum()); + RTNAME(AllocatableSetBounds)(*a, 0, 1, 10); + + // Emulate a device descriptor for the purpose of unit testing part of the + // code. + Descriptor *device_desc; + CUDA_REPORT_IF_ERROR(cudaMalloc(&device_desc, a->SizeInBytes())); + + RTNAME(AllocatableAllocate) + (*a, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__); + EXPECT_TRUE(a->IsAllocated()); + RTNAME(CUFDescriptorSync)(device_desc, a.get(), __FILE__, __LINE__); + cudaDeviceSynchronize(); + + EXPECT_EQ(cudaSuccess, cudaGetLastError()); + + RTNAME(AllocatableDeallocate) + (*a, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__); + EXPECT_FALSE(a->IsAllocated()); + + RTNAME(CUFDescriptorSync)(device_desc, a.get(), __FILE__, __LINE__); + cudaDeviceSynchronize(); + + EXPECT_EQ(cudaSuccess, cudaGetLastError()); +} diff --git a/flang/unittests/Runtime/CUDA/CMakeLists.txt b/flang/unittests/Runtime/CUDA/CMakeLists.txt index ed0caece3d15db..30fb8c220233c0 100644 --- a/flang/unittests/Runtime/CUDA/CMakeLists.txt +++ b/flang/unittests/Runtime/CUDA/CMakeLists.txt @@ -1,11 +1,19 @@ if (FLANG_CUF_RUNTIME) add_flang_unittest(FlangCufRuntimeTests + Allocatable.cpp AllocatorCUF.cpp ) +if (BUILD_SHARED_LIBS) + set(CUDA_RT_TARGET CUDA::cudart) +else() + set(CUDA_RT_TARGET CUDA::cudart_static) +endif() + target_link_libraries(FlangCufRuntimeTests PRIVATE + ${CUDA_RT_TARGET} CufRuntime_cuda_${CUDAToolkit_VERSION_MAJOR} FortranRuntime ) From 56015da593b646489c43263625cd2a8ceb7ef906 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Wed, 18 Sep 2024 20:45:26 -0700 Subject: [PATCH 146/321] [LLVM][TableGen] Change RegisterBankEmitter to use const RecordKeeper (#109195) Change RegisterBankEmitter to use const RecordKeeper. 
This is part of an effort to have better const correctness in TableGen
backends:

https://discourse.llvm.org/t/psa-planned-changes-to-tablegen-getallderiveddefinitions-api-potential-downstream-breakages/81089
---
 llvm/utils/TableGen/RegisterBankEmitter.cpp | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/llvm/utils/TableGen/RegisterBankEmitter.cpp b/llvm/utils/TableGen/RegisterBankEmitter.cpp
index 6872f16df4724e..460f286543b176 100644
--- a/llvm/utils/TableGen/RegisterBankEmitter.cpp
+++ b/llvm/utils/TableGen/RegisterBankEmitter.cpp
@@ -107,18 +107,18 @@ class RegisterBank {

 class RegisterBankEmitter {
 private:
-  CodeGenTarget Target;
-  RecordKeeper &Records;
+  const CodeGenTarget Target;
+  const RecordKeeper &Records;

   void emitHeader(raw_ostream &OS, const StringRef TargetName,
-                  const std::vector<RegisterBank> &Banks);
+                  ArrayRef<RegisterBank> Banks);
   void emitBaseClassDefinition(raw_ostream &OS, const StringRef TargetName,
-                               const std::vector<RegisterBank> &Banks);
+                               ArrayRef<RegisterBank> Banks);
   void emitBaseClassImplementation(raw_ostream &OS, const StringRef TargetName,
-                                   std::vector<RegisterBank> &Banks);
+                                   ArrayRef<RegisterBank> Banks);

 public:
-  RegisterBankEmitter(RecordKeeper &R) : Target(R), Records(R) {}
+  RegisterBankEmitter(const RecordKeeper &R) : Target(R), Records(R) {}

   void run(raw_ostream &OS);
 };
@@ -129,7 +129,7 @@ class RegisterBankEmitter {
 /// variables.
 void RegisterBankEmitter::emitHeader(raw_ostream &OS,
                                      const StringRef TargetName,
-                                     const std::vector<RegisterBank> &Banks) {
+                                     ArrayRef<RegisterBank> Banks) {
   // RegisterBankInfo.h
   OS << "namespace llvm {\n"
      << "namespace " << TargetName << " {\n"
@@ -147,8 +147,7 @@ void RegisterBankEmitter::emitHeader(raw_ostream &OS,

 /// Emit declarations of the GenRegisterBankInfo class.
 void RegisterBankEmitter::emitBaseClassDefinition(
-    raw_ostream &OS, const StringRef TargetName,
-    const std::vector<RegisterBank> &Banks) {
+    raw_ostream &OS, const StringRef TargetName, ArrayRef<RegisterBank> Banks) {
   OS << "private:\n"
      << "  static const RegisterBank *RegBanks[];\n"
      << "  static const unsigned Sizes[];\n\n"
@@ -218,7 +217,7 @@ static void visitRegisterBankClasses(
 }

 void RegisterBankEmitter::emitBaseClassImplementation(
-    raw_ostream &OS, StringRef TargetName, std::vector<RegisterBank> &Banks) {
+    raw_ostream &OS, StringRef TargetName, ArrayRef<RegisterBank> Banks) {
   const CodeGenRegBank &RegisterClassHierarchy = Target.getRegBank();
   const CodeGenHwModes &CGH = Target.getHwModes();

From 156035ed4dc6910105393be8981ecb3098299c5d Mon Sep 17 00:00:00 2001
From: Valentin Clement
Date: Wed, 18 Sep 2024 15:42:19 -0700
Subject: [PATCH 147/321] [flang][cuda] Convert module allocation/deallocation
 to runtime calls

Convert `cuf.allocate` and `cuf.deallocate` to the runtime entry points
added in #109213.

Was reviewed in https://github.com/llvm/llvm-project/pull/109214 but the
parent branch was closed for some reason.
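
In rough terms, the patterns now pick the runtime entry point as sketched
below; only allocatables backed by module variables (and not pinned ones)
take the CUF-specific entry points that keep the device descriptor in
sync. The helper is a made-up summary, not code from this patch; both
symbol names appear in the FIR test updates below.

  #include "llvm/ADT/StringRef.h"

  // Summarizes the dispatch performed by the rewrite patterns.
  static llvm::StringRef allocEntryPoint(bool isModuleVariable,
                                         bool isPinned) {
    if (isModuleVariable && !isPinned)
      return "_FortranACUFAllocatableAllocate"; // also syncs descriptors
    return "_FortranAAllocatableAllocate";      // standard local path
  }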
--- .../Optimizer/Transforms/CufOpConversion.cpp | 59 +++++++++++-------- flang/test/Fir/CUDA/cuda-allocate.fir | 40 ++++++++++++- 2 files changed, 74 insertions(+), 25 deletions(-) diff --git a/flang/lib/Optimizer/Transforms/CufOpConversion.cpp b/flang/lib/Optimizer/Transforms/CufOpConversion.cpp index 2dc37f4df3aeec..ac796e83b07078 100644 --- a/flang/lib/Optimizer/Transforms/CufOpConversion.cpp +++ b/flang/lib/Optimizer/Transforms/CufOpConversion.cpp @@ -14,6 +14,7 @@ #include "flang/Optimizer/Dialect/FIROps.h" #include "flang/Optimizer/HLFIR/HLFIROps.h" #include "flang/Optimizer/Support/DataLayout.h" +#include "flang/Runtime/CUDA/allocatable.h" #include "flang/Runtime/CUDA/common.h" #include "flang/Runtime/CUDA/descriptor.h" #include "flang/Runtime/CUDA/memory.h" @@ -35,13 +36,19 @@ using namespace Fortran::runtime::cuda; namespace { template -static bool needDoubleDescriptor(OpTy op) { +static bool isPinned(OpTy op) { + if (op.getDataAttr() && *op.getDataAttr() == cuf::DataAttribute::Pinned) + return true; + return false; +} + +template +static bool hasDoubleDescriptors(OpTy op) { if (auto declareOp = mlir::dyn_cast_or_null(op.getBox().getDefiningOp())) { if (mlir::isa_and_nonnull( declareOp.getMemref().getDefiningOp())) { - if (declareOp.getDataAttr() && - *declareOp.getDataAttr() == cuf::DataAttribute::Pinned) + if (isPinned(declareOp)) return false; return true; } @@ -49,8 +56,7 @@ static bool needDoubleDescriptor(OpTy op) { op.getBox().getDefiningOp())) { if (mlir::isa_and_nonnull( declareOp.getMemref().getDefiningOp())) { - if (declareOp.getDataAttr() && - *declareOp.getDataAttr() == cuf::DataAttribute::Pinned) + if (isPinned(declareOp)) return false; return true; } @@ -108,17 +114,22 @@ struct CufAllocateOpConversion if (op.getPinned()) return mlir::failure(); - // TODO: Allocation of module variable will need more work as the descriptor - // will be duplicated and needs to be synced after allocation. - if (needDoubleDescriptor(op)) - return mlir::failure(); + auto mod = op->getParentOfType(); + fir::FirOpBuilder builder(rewriter, mod); + mlir::Location loc = op.getLoc(); + + if (hasDoubleDescriptors(op)) { + // Allocation for module variable are done with custom runtime entry point + // so the descriptors can be synchronized. + mlir::func::FuncOp func = + fir::runtime::getRuntimeFunc( + loc, builder); + return convertOpToCall(op, rewriter, func); + } // Allocation for local descriptor falls back on the standard runtime // AllocatableAllocate as the dedicated allocator is set in the descriptor // before the call. - auto mod = op->template getParentOfType(); - fir::FirOpBuilder builder(rewriter, mod); - mlir::Location loc = op.getLoc(); mlir::func::FuncOp func = fir::runtime::getRuntimeFunc(loc, builder); @@ -133,17 +144,23 @@ struct CufDeallocateOpConversion mlir::LogicalResult matchAndRewrite(cuf::DeallocateOp op, mlir::PatternRewriter &rewriter) const override { - // TODO: Allocation of module variable will need more work as the descriptor - // will be duplicated and needs to be synced after allocation. - if (needDoubleDescriptor(op)) - return mlir::failure(); - // Deallocation for local descriptor falls back on the standard runtime - // AllocatableDeallocate as the dedicated deallocator is set in the - // descriptor before the call. 
auto mod = op->getParentOfType(); fir::FirOpBuilder builder(rewriter, mod); mlir::Location loc = op.getLoc(); + + if (hasDoubleDescriptors(op)) { + // Deallocation for module variable are done with custom runtime entry + // point so the descriptors can be synchronized. + mlir::func::FuncOp func = + fir::runtime::getRuntimeFunc( + loc, builder); + return convertOpToCall(op, rewriter, func); + } + + // Deallocation for local descriptor falls back on the standard runtime + // AllocatableDeallocate as the dedicated deallocator is set in the + // descriptor before the call. mlir::func::FuncOp func = fir::runtime::getRuntimeFunc(loc, builder); @@ -448,10 +465,6 @@ class CufOpConversion : public fir::impl::CufOpConversionBase { } return true; }); - target.addDynamicallyLegalOp( - [](::cuf::AllocateOp op) { return needDoubleDescriptor(op); }); - target.addDynamicallyLegalOp( - [](::cuf::DeallocateOp op) { return needDoubleDescriptor(op); }); target.addDynamicallyLegalOp( [](::cuf::DataTransferOp op) { mlir::Type srcTy = fir::unwrapRefType(op.getSrc().getType()); diff --git a/flang/test/Fir/CUDA/cuda-allocate.fir b/flang/test/Fir/CUDA/cuda-allocate.fir index 1c17e7447e5c97..65c68bb69301af 100644 --- a/flang/test/Fir/CUDA/cuda-allocate.fir +++ b/flang/test/Fir/CUDA/cuda-allocate.fir @@ -54,8 +54,14 @@ func.func @_QPsub3() { } // CHECK-LABEL: func.func @_QPsub3() -// CHECK: cuf.allocate -// CHECK: cuf.deallocate +// CHECK: %[[A_ADDR:.*]] = fir.address_of(@_QMmod1Ea) : !fir.ref>>> +// CHECK: %[[A:.*]]:2 = hlfir.declare %[[A_ADDR]] {data_attr = #cuf.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QMmod1Ea"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) + +// CHECK: %[[A_BOX:.*]] = fir.convert %[[A]]#1 : (!fir.ref>>>) -> !fir.ref> +// CHECK: fir.call @_FortranACUFAllocatableAllocate(%[[A_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref>, i1, !fir.box, !fir.ref, i32) -> i32 + +// CHECK: %[[A_BOX:.*]] = fir.convert %[[A]]#1 : (!fir.ref>>>) -> !fir.ref> +// CHECK: fir.call @_FortranACUFAllocatableDeallocate(%[[A_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref>, i1, !fir.box, !fir.ref, i32) -> i32 func.func @_QPsub4() attributes {cuf.proc_attr = #cuf.cuda_proc} { %0 = cuf.alloc !fir.box>> {bindc_name = "a", data_attr = #cuf.cuda, uniq_name = "_QFsub1Ea"} -> !fir.ref>>> @@ -95,4 +101,34 @@ func.func @_QPsub5() { // CHECK: fir.call @_FortranAAllocatableAllocate({{.*}}) : (!fir.ref>, i1, !fir.box, !fir.ref, i32) -> i32 // CHECK: fir.call @_FortranAAllocatableDeallocate({{.*}}) : (!fir.ref>, i1, !fir.box, !fir.ref, i32) -> i32 + +fir.global @_QMdataEb {data_attr = #cuf.cuda} : !fir.box>> { + %c0 = arith.constant 0 : index + %0 = fir.zero_bits !fir.heap> + %1 = fir.shape %c0 : (index) -> !fir.shape<1> + %2 = fir.embox %0(%1) {allocator_idx = 2 : i32} : (!fir.heap>, !fir.shape<1>) -> !fir.box>> + fir.has_value %2 : !fir.box>> +} + +func.func @_QQsub6() attributes {fir.bindc_name = "test"} { + %c0_i32 = arith.constant 0 : i32 + %c10_i32 = arith.constant 10 : i32 + %c1 = arith.constant 1 : index + %0 = fir.address_of(@_QMdataEb) : !fir.ref>>> + %1:2 = hlfir.declare %0 {data_attr = #cuf.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QMdataEb"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) + %2 = fir.convert %1#1 : (!fir.ref>>>) -> !fir.ref> + %3 = fir.convert %c1 : (index) -> i64 + %4 = fir.convert %c10_i32 : (i32) -> i64 + %5 = fir.call @_FortranAAllocatableSetBounds(%2, %c0_i32, %3, %4) fastmath : (!fir.ref>, i32, i64, i64) -> none + %6 = cuf.allocate %1#1 : !fir.ref>>> {data_attr = 
#cuf.cuda} -> i32
+  return
+}
+
+// CHECK-LABEL: func.func @_QQsub6() attributes {fir.bindc_name = "test"}
+// CHECK: %[[B_ADDR:.*]] = fir.address_of(@_QMdataEb) : !fir.ref>>>
+// CHECK: %[[B:.*]]:2 = hlfir.declare %[[B_ADDR]] {data_attr = #cuf.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QMdataEb"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>)
+// CHECK: _FortranAAllocatableSetBounds
+// CHECK: %[[B_BOX:.*]] = fir.convert %[[B]]#1 : (!fir.ref>>>) -> !fir.ref>
+// CHECK: fir.call @_FortranACUFAllocatableAllocate(%[[B_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref>, i1, !fir.box, !fir.ref, i32) -> i32
+
 } // end of module

From ce9209f50e33fa0bd81de0a53723adde65290c68 Mon Sep 17 00:00:00 2001
From: Mircea Trofin Date: Wed, 18 Sep 2024 21:08:34 -0700
Subject: [PATCH 148/321] [ctx_prof] Fix `ProfileAnnotator::allTakenPathsExit`
 (#109183)

Added tests to the validator and fixed issues stemming from the previous
behavior of skipping over BBs with single successors, which was incorrect.
The added tests now catch that case by triggering the expected assertions.
---
 .../Instrumentation/PGOCtxProfFlattening.cpp  | 29 ++++---
 .../CtxProfAnalysis/flatten-check-path.ll     | 85 +++++++++++++++++++
 2 files changed, 104 insertions(+), 10 deletions(-)
 create mode 100644 llvm/test/Analysis/CtxProfAnalysis/flatten-check-path.ll

diff --git a/llvm/lib/Transforms/Instrumentation/PGOCtxProfFlattening.cpp b/llvm/lib/Transforms/Instrumentation/PGOCtxProfFlattening.cpp
index e76689e2f5f0a5..91f950e2ba4c3e 100644
--- a/llvm/lib/Transforms/Instrumentation/PGOCtxProfFlattening.cpp
+++ b/llvm/lib/Transforms/Instrumentation/PGOCtxProfFlattening.cpp
@@ -233,28 +233,37 @@ class ProfileAnnotator final {
     std::deque Worklist;
     DenseSet Visited;
     Worklist.push_back(&F.getEntryBlock());
-    Visited.insert(&F.getEntryBlock());
+    bool HitExit = false;
     while (!Worklist.empty()) {
       const auto *BB = Worklist.front();
       Worklist.pop_front();
-      if (succ_size(BB) <= 1)
+      if (!Visited.insert(BB).second)
         continue;
+      if (succ_size(BB) == 0) {
+        if (isa(BB->getTerminator()))
+          return false;
+        HitExit = true;
+        continue;
+      }
+      if (succ_size(BB) == 1) {
+        llvm::append_range(Worklist, successors(BB));
+        continue;
+      }
       const auto &BBInfo = getBBInfo(*BB);
-      bool Inserted = false;
+      bool HasAWayOut = false;
       for (auto I = 0U; I < BB->getTerminator()->getNumSuccessors(); ++I) {
         const auto *Succ = BB->getTerminator()->getSuccessor(I);
         if (!shouldExcludeEdge(*BB, *Succ)) {
-          if (BBInfo.getEdgeCount(I) > 0)
-            if (Visited.insert(Succ).second) {
-              Worklist.push_back(Succ);
-              Inserted = true;
-            }
+          if (BBInfo.getEdgeCount(I) > 0) {
+            HasAWayOut = true;
+            Worklist.push_back(Succ);
+          }
         }
       }
-      if (!Inserted)
+      if (!HasAWayOut)
         return false;
     }
-    return true;
+    return HitExit;
   }

 public:
diff --git a/llvm/test/Analysis/CtxProfAnalysis/flatten-check-path.ll b/llvm/test/Analysis/CtxProfAnalysis/flatten-check-path.ll
new file mode 100644
index 00000000000000..42eaa67a983087
--- /dev/null
+++ b/llvm/test/Analysis/CtxProfAnalysis/flatten-check-path.ll
@@ -0,0 +1,85 @@
+; REQUIRES: asserts && x86_64-linux
+; Check that the profile annotator works: we hit an exit and non-zero paths to
+; already visited blocks count as taken (i.e. the flow continues through them).
+; +; RUN: split-file %s %t +; RUN: llvm-ctxprof-util fromJSON --input=%t/profile_ok.json --output=%t/profile_ok.ctxprofdata +; RUN: llvm-ctxprof-util fromJSON --input=%t/profile_pump.json --output=%t/profile_pump.ctxprofdata +; RUN: llvm-ctxprof-util fromJSON --input=%t/profile_unreachable.json --output=%t/profile_unreachable.ctxprofdata +; +; RUN: opt -passes=ctx-prof-flatten %t/example_ok.ll -use-ctx-profile=%t/profile_ok.ctxprofdata -S -o - | FileCheck %s +; RUN: not --crash opt -passes=ctx-prof-flatten %t/message_pump.ll -use-ctx-profile=%t/profile_pump.ctxprofdata -S 2>&1 | FileCheck %s --check-prefix=ASSERTION +; RUN: not --crash opt -passes=ctx-prof-flatten %t/unreachable.ll -use-ctx-profile=%t/profile_unreachable.ctxprofdata -S 2>&1 | FileCheck %s --check-prefix=ASSERTION + +; CHECK: br i1 %x, label %b1, label %exit, !prof ![[PROF1:[0-9]+]] +; CHECK: br i1 %y, label %blk, label %exit, !prof ![[PROF2:[0-9]+]] +; CHECK: ![[PROF1]] = !{!"branch_weights", i32 1, i32 1} +; CHECK: ![[PROF2]] = !{!"branch_weights", i32 0, i32 1} +; ASSERTION: Assertion `allTakenPathsExit() + +; b1->exit is the only way out from b1, but the exit block would have been +; already visited from blk. That should not result in an assertion, though. +;--- example_ok.ll +define void @foo(i32 %t) !guid !0 { +entry: + call void @llvm.instrprof.increment(ptr @foo, i64 42, i32 42, i32 0) + br label %blk +blk: + call void @llvm.instrprof.increment(ptr @foo, i64 42, i32 42, i32 1) + %x = icmp eq i32 %t, 0 + br i1 %x, label %b1, label %exit +b1: + call void @llvm.instrprof.increment(ptr @foo, i64 42, i32 42, i32 2) + %y = icmp eq i32 %t, 0 + br i1 %y, label %blk, label %exit +exit: + call void @llvm.instrprof.increment(ptr @foo, i64 42, i32 42, i32 3) + ret void +} +!0 = !{i64 1234} + +;--- profile_ok.json +[{"Guid":1234, "Counters":[2, 2, 1, 2]}] + +;--- message_pump.ll +; This is a message pump: the loop never exits. This should result in an +; assertion because we can't reach an exit BB + +define void @foo(i32 %t) !guid !0 { +entry: + call void @llvm.instrprof.increment(ptr @foo, i64 42, i32 42, i32 0) + br label %blk +blk: + call void @llvm.instrprof.increment(ptr @foo, i64 42, i32 42, i32 1) + %x = icmp eq i32 %t, 0 + br i1 %x, label %blk, label %exit +exit: + call void @llvm.instrprof.increment(ptr @foo, i64 42, i32 42, i32 2) + ret void +} +!0 = !{i64 1234} + +;--- profile_pump.json +[{"Guid":1234, "Counters":[2, 10, 0]}] + +;--- unreachable.ll +; An unreachable block is reached, that's an error +define void @foo(i32 %t) !guid !0 { +entry: + call void @llvm.instrprof.increment(ptr @foo, i64 42, i32 42, i32 0) + br label %blk +blk: + call void @llvm.instrprof.increment(ptr @foo, i64 42, i32 42, i32 1) + %x = icmp eq i32 %t, 0 + br i1 %x, label %b1, label %exit +b1: + call void @llvm.instrprof.increment(ptr @foo, i64 42, i32 42, i32 2) + unreachable +exit: + call void @llvm.instrprof.increment(ptr @foo, i64 42, i32 42, i32 3) + ret void +} +!0 = !{i64 1234} + +;--- profile_unreachable.json +[{"Guid":1234, "Counters":[2, 1, 1, 2]}] \ No newline at end of file From ee5709b3b4bfd6e8e7fed8195e14e78ef10c9d74 Mon Sep 17 00:00:00 2001 From: Mircea Trofin Date: Wed, 18 Sep 2024 21:13:48 -0700 Subject: [PATCH 149/321] [nfc][ctx_prof] Don't try finding callsite annotation for un-instrumentable callsites (#109184) Reinforcing properties ensured at instrumentation time. 
--- llvm/lib/Analysis/CtxProfAnalysis.cpp | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Analysis/CtxProfAnalysis.cpp b/llvm/lib/Analysis/CtxProfAnalysis.cpp index c29709b613410e..3df72983862d98 100644 --- a/llvm/lib/Analysis/CtxProfAnalysis.cpp +++ b/llvm/lib/Analysis/CtxProfAnalysis.cpp @@ -234,16 +234,23 @@ PreservedAnalyses CtxProfAnalysisPrinterPass::run(Module &M, } InstrProfCallsite *CtxProfAnalysis::getCallsiteInstrumentation(CallBase &CB) { - for (auto *Prev = CB.getPrevNode(); Prev; Prev = Prev->getPrevNode()) + if (!InstrProfCallsite::canInstrumentCallsite(CB)) + return nullptr; + for (auto *Prev = CB.getPrevNode(); Prev; Prev = Prev->getPrevNode()) { if (auto *IPC = dyn_cast(Prev)) return IPC; + assert(!isa(Prev) && + "didn't expect to find another call, that's not the callsite " + "instrumentation, before an instrumentable callsite"); + } return nullptr; } InstrProfIncrementInst *CtxProfAnalysis::getBBInstrumentation(BasicBlock &BB) { for (auto &I : BB) if (auto *Incr = dyn_cast(&I)) - return Incr; + if (!isa(&I)) + return Incr; return nullptr; } From 12d94850cd183cadf37f1f278e5795e84a95e894 Mon Sep 17 00:00:00 2001 From: Mircea Trofin Date: Wed, 18 Sep 2024 21:19:28 -0700 Subject: [PATCH 150/321] [ctx_prof] Avoid `llvm::append_range` to fix some build bots Example: https://lab.llvm.org/buildbot/#/builders/169/builds/3381 The CI allowed the `llvm::append_range` instantiation, but on the other hand it's quite unnecessary here. --- llvm/lib/Transforms/Instrumentation/PGOCtxProfFlattening.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Instrumentation/PGOCtxProfFlattening.cpp b/llvm/lib/Transforms/Instrumentation/PGOCtxProfFlattening.cpp index 91f950e2ba4c3e..4bb505a4a4f03e 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOCtxProfFlattening.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOCtxProfFlattening.cpp @@ -246,7 +246,7 @@ class ProfileAnnotator final { continue; } if (succ_size(BB) == 1) { - llvm::append_range(Worklist, successors(BB)); + Worklist.push_back(BB->getUniqueSuccessor()); continue; } const auto &BBInfo = getBBInfo(*BB); From 80f6b42a26ec7594e6b016c5dde5d57db6c9dfb1 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 18 Sep 2024 21:34:05 -0700 Subject: [PATCH 151/321] [MachinePipeliner] Fix incorrect use of getPressureSets. (#109179) The code was passing a physical register directly to getPressureSets which expects a register unit. Fix this by looping over the register units and calling getPressureSets for each of them. Found while trying to add a RegisterUnit class to stop storing register units in `Register`. 0 is a valid register unit but not a valid Register. 
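In sketch form (mirroring the corrected loop in the diff below rather than
introducing any new API), the fix wraps each pressure-set query in a walk
over the register's units:

  // Sketch: a physical register may cover several register units, and
  // getRegUnitPressureSets()/getPressureSets() expect a unit, not the
  // register itself.
  for (MCRegUnit Unit : TRI->regunits(Reg)) {
    const int *Sets = TRI->getRegUnitPressureSets(Unit);
    for (; *Sets != -1; ++Sets)
      dbgs() << TRI->getRegPressureSetName(*Sets) << ", ";
  }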
--- llvm/lib/CodeGen/MachinePipeliner.cpp | 29 +++++++++++++++++------------
 1 file changed, 17 insertions(+), 12 deletions(-)

diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp
index 34eaf211c17a30..cd8333931bb5f9 100644
--- a/llvm/lib/CodeGen/MachinePipeliner.cpp
+++ b/llvm/lib/CodeGen/MachinePipeliner.cpp
@@ -1344,9 +1344,11 @@ class HighRegisterPressureDetector {
     LLVM_DEBUG({
       for (auto Reg : FixedRegs) {
         dbgs() << printReg(Reg, TRI, 0, &MRI) << ": [";
-        const int *Sets = TRI->getRegUnitPressureSets(Reg);
-        for (; *Sets != -1; Sets++) {
-          dbgs() << TRI->getRegPressureSetName(*Sets) << ", ";
+        for (MCRegUnit Unit : TRI->regunits(Reg)) {
+          const int *Sets = TRI->getRegUnitPressureSets(Unit);
+          for (; *Sets != -1; Sets++) {
+            dbgs() << TRI->getRegPressureSetName(*Sets) << ", ";
+          }
         }
         dbgs() << "]\n";
       }
@@ -1355,15 +1357,18 @@ class HighRegisterPressureDetector {
     for (auto Reg : FixedRegs) {
       LLVM_DEBUG(dbgs() << "fixed register: " << printReg(Reg, TRI, 0, &MRI)
                         << "\n");
-      auto PSetIter = MRI.getPressureSets(Reg);
-      unsigned Weight = PSetIter.getWeight();
-      for (; PSetIter.isValid(); ++PSetIter) {
-        unsigned &Limit = PressureSetLimit[*PSetIter];
-        assert(Limit >= Weight &&
-               "register pressure limit must be greater than or equal weight");
-        Limit -= Weight;
-        LLVM_DEBUG(dbgs() << "PSet=" << *PSetIter << " Limit=" << Limit
-                          << " (decreased by " << Weight << ")\n");
+      for (MCRegUnit Unit : TRI->regunits(Reg)) {
+        auto PSetIter = MRI.getPressureSets(Unit);
+        unsigned Weight = PSetIter.getWeight();
+        for (; PSetIter.isValid(); ++PSetIter) {
+          unsigned &Limit = PressureSetLimit[*PSetIter];
+          assert(
+              Limit >= Weight &&
+              "register pressure limit must be greater than or equal weight");
+          Limit -= Weight;
+          LLVM_DEBUG(dbgs() << "PSet=" << *PSetIter << " Limit=" << Limit
+                            << " (decreased by " << Weight << ")\n");
+        }
       }
     }
   }

From 0f06f707ec9c670ed6e8f245d045462fbc14224b Mon Sep 17 00:00:00 2001
From: Rahul Joshi Date: Wed, 18 Sep 2024 21:42:52 -0700
Subject: [PATCH 152/321] [NFC] Cleanup RegisterInfoEmitter code (#109199)

Change variable name `o` to `OS` to match definition, and `ClName` to
`ClassName` for better clarity.

Cache the RegBank reference in the class and do not pass class members
around to functions.
---
 llvm/utils/TableGen/RegisterInfoEmitter.cpp | 89 +++++++++------------
 1 file changed, 38 insertions(+), 51 deletions(-)

diff --git a/llvm/utils/TableGen/RegisterInfoEmitter.cpp b/llvm/utils/TableGen/RegisterInfoEmitter.cpp
index 63e70698d7cd6f..a7f3977300b302 100644
--- a/llvm/utils/TableGen/RegisterInfoEmitter.cpp
+++ b/llvm/utils/TableGen/RegisterInfoEmitter.cpp
@@ -56,54 +56,49 @@ static cl::opt

 namespace {

 class RegisterInfoEmitter {
-  CodeGenTarget Target;
   RecordKeeper &Records;
+  const CodeGenTarget Target;
+  CodeGenRegBank &RegBank;

 public:
-  RegisterInfoEmitter(RecordKeeper &R) : Target(R), Records(R) {
-    CodeGenRegBank &RegBank = Target.getRegBank();
+  RegisterInfoEmitter(RecordKeeper &R)
+      : Records(R), Target(R), RegBank(Target.getRegBank()) {
     RegBank.computeDerivedInfo();
   }

   // runEnums - Print out enum values for all of the registers.
-  void runEnums(raw_ostream &o, CodeGenTarget &Target, CodeGenRegBank &Bank);
+  void runEnums(raw_ostream &OS);

   // runMCDesc - Print out MC register descriptions.
-  void runMCDesc(raw_ostream &o, CodeGenTarget &Target, CodeGenRegBank &Bank);
+  void runMCDesc(raw_ostream &OS);

   // runTargetHeader - Emit a header fragment for the register info emitter.
- void runTargetHeader(raw_ostream &o, CodeGenTarget &Target, - CodeGenRegBank &Bank); + void runTargetHeader(raw_ostream &OS); // runTargetDesc - Output the target register and register file descriptions. - void runTargetDesc(raw_ostream &o, CodeGenTarget &Target, - CodeGenRegBank &Bank); + void runTargetDesc(raw_ostream &OS); // run - Output the register file description. - void run(raw_ostream &o); + void run(raw_ostream &OS); void debugDump(raw_ostream &OS); private: - void EmitRegMapping(raw_ostream &o, const std::deque &Regs, + void EmitRegMapping(raw_ostream &OS, const std::deque &Regs, bool isCtor); - void EmitRegMappingTables(raw_ostream &o, + void EmitRegMappingTables(raw_ostream &OS, const std::deque &Regs, bool isCtor); - void EmitRegUnitPressure(raw_ostream &OS, const CodeGenRegBank &RegBank, - const std::string &ClassName); - void emitComposeSubRegIndices(raw_ostream &OS, CodeGenRegBank &RegBank, - const std::string &ClassName); - void emitComposeSubRegIndexLaneMask(raw_ostream &OS, CodeGenRegBank &RegBank, - const std::string &ClassName); + void EmitRegUnitPressure(raw_ostream &OS, StringRef ClassName); + void emitComposeSubRegIndices(raw_ostream &OS, StringRef ClassName); + void emitComposeSubRegIndexLaneMask(raw_ostream &OS, StringRef ClassName); }; } // end anonymous namespace // runEnums - Print out enum values for all of the registers. -void RegisterInfoEmitter::runEnums(raw_ostream &OS, CodeGenTarget &Target, - CodeGenRegBank &Bank) { - const auto &Registers = Bank.getRegisters(); +void RegisterInfoEmitter::runEnums(raw_ostream &OS) { + const auto &Registers = RegBank.getRegisters(); // Register enums are stored as uint16_t in the tables. Make sure we'll fit. assert(Registers.size() <= 0xffff && "Too many regs to fit in tables"); @@ -134,7 +129,7 @@ void RegisterInfoEmitter::runEnums(raw_ostream &OS, CodeGenTarget &Target, if (!Namespace.empty()) OS << "} // end namespace " << Namespace << "\n"; - const auto &RegisterClasses = Bank.getRegClasses(); + const auto &RegisterClasses = RegBank.getRegClasses(); if (!RegisterClasses.empty()) { // RegisterClass enums are stored as uint16_t in the tables. 
@@ -168,7 +163,7 @@ void RegisterInfoEmitter::runEnums(raw_ostream &OS, CodeGenTarget &Target, OS << "} // end namespace " << Namespace << "\n\n"; } - auto &SubRegIndices = Bank.getSubRegIndices(); + auto &SubRegIndices = RegBank.getSubRegIndices(); if (!SubRegIndices.empty()) { OS << "\n// Subregister indices\n\n"; std::string Namespace = SubRegIndices.front().getNamespace(); @@ -187,9 +182,9 @@ void RegisterInfoEmitter::runEnums(raw_ostream &OS, CodeGenTarget &Target, if (!Namespace.empty()) OS << "namespace " << Namespace << " {\n"; OS << "enum RegisterPressureSets {\n"; - unsigned NumSets = Bank.getNumRegPressureSets(); + unsigned NumSets = RegBank.getNumRegPressureSets(); for (unsigned i = 0; i < NumSets; ++i) { - const RegUnitSet &RegUnits = Bank.getRegSetAt(i); + const RegUnitSet &RegUnits = RegBank.getRegSetAt(i); OS << " " << RegUnits.Name << " = " << i << ",\n"; } OS << "};\n"; @@ -204,8 +199,7 @@ void RegisterInfoEmitter::runEnums(raw_ostream &OS, CodeGenTarget &Target, static void printInt(raw_ostream &OS, int Val) { OS << Val; } void RegisterInfoEmitter::EmitRegUnitPressure(raw_ostream &OS, - const CodeGenRegBank &RegBank, - const std::string &ClassName) { + StringRef ClassName) { unsigned NumRCs = RegBank.getRegClasses().size(); unsigned NumSets = RegBank.getNumRegPressureSets(); @@ -683,10 +677,9 @@ static bool combine(const CodeGenSubRegIndex *Idx, } void RegisterInfoEmitter::emitComposeSubRegIndices(raw_ostream &OS, - CodeGenRegBank &RegBank, - const std::string &ClName) { + StringRef ClassName) { const auto &SubRegIndices = RegBank.getSubRegIndices(); - OS << "unsigned " << ClName + OS << "unsigned " << ClassName << "::composeSubRegIndicesImpl(unsigned IdxA, unsigned IdxB) const {\n"; // Many sub-register indexes are composition-compatible, meaning that @@ -751,8 +744,8 @@ void RegisterInfoEmitter::emitComposeSubRegIndices(raw_ostream &OS, OS << "}\n\n"; } -void RegisterInfoEmitter::emitComposeSubRegIndexLaneMask( - raw_ostream &OS, CodeGenRegBank &RegBank, const std::string &ClName) { +void RegisterInfoEmitter::emitComposeSubRegIndexLaneMask(raw_ostream &OS, + StringRef ClassName) { // See the comments in computeSubRegLaneMasks() for our goal here. const auto &SubRegIndices = RegBank.getSubRegIndices(); @@ -815,7 +808,7 @@ void RegisterInfoEmitter::emitComposeSubRegIndexLaneMask( } OS << " };\n\n"; - OS << "LaneBitmask " << ClName + OS << "LaneBitmask " << ClassName << "::composeSubRegIndexLaneMaskImpl(unsigned IdxA, LaneBitmask LaneMask)" " const {\n" " --IdxA; assert(IdxA < " @@ -836,7 +829,7 @@ void RegisterInfoEmitter::emitComposeSubRegIndexLaneMask( " return Result;\n" "}\n\n"; - OS << "LaneBitmask " << ClName + OS << "LaneBitmask " << ClassName << "::reverseComposeSubRegIndexLaneMaskImpl(unsigned IdxA, " " LaneBitmask LaneMask) const {\n" " LaneMask &= getSubRegIndexLaneMask(IdxA);\n" @@ -861,8 +854,7 @@ void RegisterInfoEmitter::emitComposeSubRegIndexLaneMask( // // runMCDesc - Print out MC register descriptions. 
// -void RegisterInfoEmitter::runMCDesc(raw_ostream &OS, CodeGenTarget &Target, - CodeGenRegBank &RegBank) { +void RegisterInfoEmitter::runMCDesc(raw_ostream &OS) { emitSourceFileHeader("MC Register Information", OS); OS << "\n#ifdef GET_REGINFO_MC_DESC\n"; @@ -1025,7 +1017,7 @@ void RegisterInfoEmitter::runMCDesc(raw_ostream &OS, CodeGenTarget &Target, << " const uint8_t " << Name << "Bits[] = {\n "; BitVectorEmitter BVE; for (const Record *Reg : Order) { - BVE.add(Target.getRegBank().getReg(Reg)->EnumValue); + BVE.add(RegBank.getReg(Reg)->EnumValue); } BVE.print(OS); OS << "\n };\n\n"; @@ -1100,9 +1092,7 @@ void RegisterInfoEmitter::runMCDesc(raw_ostream &OS, CodeGenTarget &Target, OS << "#endif // GET_REGINFO_MC_DESC\n\n"; } -void RegisterInfoEmitter::runTargetHeader(raw_ostream &OS, - CodeGenTarget &Target, - CodeGenRegBank &RegBank) { +void RegisterInfoEmitter::runTargetHeader(raw_ostream &OS) { emitSourceFileHeader("Register Information Header Fragment", OS); OS << "\n#ifdef GET_REGINFO_HEADER\n"; @@ -1187,8 +1177,7 @@ void RegisterInfoEmitter::runTargetHeader(raw_ostream &OS, // // runTargetDesc - Output the target register and register file descriptions. // -void RegisterInfoEmitter::runTargetDesc(raw_ostream &OS, CodeGenTarget &Target, - CodeGenRegBank &RegBank) { +void RegisterInfoEmitter::runTargetDesc(raw_ostream &OS) { emitSourceFileHeader("Target Register and Register Classes Information", OS); OS << "\n#ifdef GET_REGINFO_TARGET_DESC\n"; @@ -1491,8 +1480,8 @@ void RegisterInfoEmitter::runTargetDesc(raw_ostream &OS, CodeGenTarget &Target, std::distance(SubRegIndices.begin(), SubRegIndices.end()); if (!SubRegIndices.empty()) { - emitComposeSubRegIndices(OS, RegBank, ClassName); - emitComposeSubRegIndexLaneMask(OS, RegBank, ClassName); + emitComposeSubRegIndices(OS, ClassName); + emitComposeSubRegIndexLaneMask(OS, ClassName); } if (!SubRegIndices.empty()) { @@ -1574,7 +1563,7 @@ void RegisterInfoEmitter::runTargetDesc(raw_ostream &OS, CodeGenTarget &Target, << " return TV ? 
getRegClass(TV - 1) : nullptr;\n}\n\n"; } - EmitRegUnitPressure(OS, RegBank, ClassName); + EmitRegUnitPressure(OS, ClassName); // Emit register base class mapper if (!RegisterClasses.empty()) { @@ -1816,25 +1805,23 @@ void RegisterInfoEmitter::runTargetDesc(raw_ostream &OS, CodeGenTarget &Target, } void RegisterInfoEmitter::run(raw_ostream &OS) { - CodeGenRegBank &RegBank = Target.getRegBank(); Records.startTimer("Print enums"); - runEnums(OS, Target, RegBank); + runEnums(OS); Records.startTimer("Print MC registers"); - runMCDesc(OS, Target, RegBank); + runMCDesc(OS); Records.startTimer("Print header fragment"); - runTargetHeader(OS, Target, RegBank); + runTargetHeader(OS); Records.startTimer("Print target registers"); - runTargetDesc(OS, Target, RegBank); + runTargetDesc(OS); if (RegisterInfoDebug) debugDump(errs()); } void RegisterInfoEmitter::debugDump(raw_ostream &OS) { - CodeGenRegBank &RegBank = Target.getRegBank(); const CodeGenHwModes &CGH = Target.getHwModes(); unsigned NumModes = CGH.getNumModeIds(); auto getModeName = [CGH](unsigned M) -> StringRef { From 5e1a54b298d108be9c2face5ca0e6664c2dafc93 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Wed, 18 Sep 2024 21:45:32 -0700 Subject: [PATCH 153/321] [flang][cuda][NFC] Add more descriptor inquiry tests for data transfer (#108094) Make sure there is no data transfer generated when a device variable is used in these intrinsic functions. --- flang/test/Lower/CUDA/cuda-data-transfer.cuf | 21 +++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/flang/test/Lower/CUDA/cuda-data-transfer.cuf b/flang/test/Lower/CUDA/cuda-data-transfer.cuf index 11f1f33d7cb587..2f76b5e78800ad 100644 --- a/flang/test/Lower/CUDA/cuda-data-transfer.cuf +++ b/flang/test/Lower/CUDA/cuda-data-transfer.cuf @@ -354,12 +354,23 @@ end subroutine ! CHECK: cuf.kernel<<<*, *>>> ! CHECK-NOT: cuf.data_transfer -subroutine sub18() +subroutine sub18(o) + integer, device, optional, allocatable :: o(:) integer, device, allocatable :: a(:) - integer :: isz - - isz = size(a) + integer, device, pointer :: p(:) + integer :: b + integer :: s(1) + logical :: l + + b = size(a) + b = lbound(a, dim=1) + b = ubound(a, dim=1) + s = shape(a) + l = allocated(a) + l = associated(p) + b = kind(a) + l = present(o) end subroutine -! CHECK-LABEL: func.func @_QPsub18() +! CHECK-LABEL: func.func @_QPsub18 ! 
CHECK-NOT: cuf.data_transfer

From 4194e8dea52a23949f81a611cbd91148404714cc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?=
 =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?=
 =?UTF-8?q?=E3=83=B3=29?=
Date: Wed, 18 Sep 2024 22:14:30 -0700
Subject: [PATCH 154/321] [flang][cuda][NFC] Fix grammar in
 CanCUDASymbolHasSave function name (#109234)

---
 flang/include/flang/Evaluate/tools.h | 2 +-
 flang/lib/Evaluate/tools.cpp         | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/flang/include/flang/Evaluate/tools.h b/flang/include/flang/Evaluate/tools.h
index a0487e399d936c..d2887b69cc6de1 100644
--- a/flang/include/flang/Evaluate/tools.h
+++ b/flang/include/flang/Evaluate/tools.h
@@ -1259,7 +1259,7 @@ bool CheckForCoindexedObject(parser::ContextualMessages &,
     const std::optional &, const std::string &procName,
     const std::string &argName);

-inline bool CanCUDASymbolHasSave(const Symbol &sym) {
+inline bool CanCUDASymbolHaveSaveAttr(const Symbol &sym) {
   if (const auto *details =
           sym.GetUltimate().detailsIf()) {
     if (details->cudaDataAttr() &&
diff --git a/flang/lib/Evaluate/tools.cpp b/flang/lib/Evaluate/tools.cpp
index 400f27aef98da6..c2545a87099426 100644
--- a/flang/lib/Evaluate/tools.cpp
+++ b/flang/lib/Evaluate/tools.cpp
@@ -1700,7 +1700,7 @@ bool IsSaved(const Symbol &original) {
       (features.IsEnabled(
            common::LanguageFeature::SaveBigMainProgramVariables) &&
           symbol.size() > 32)) &&
-      Fortran::evaluate::CanCUDASymbolHasSave(symbol)) {
+      Fortran::evaluate::CanCUDASymbolHaveSaveAttr(symbol)) {
     // With SaveBigMainProgramVariables, keeping all unsaved main program
     // variables of 32 bytes or less on the stack allows keeping numerical and
     // logical scalars, small scalar characters or derived, small arrays, and

From 7603e854295baeabeaa2c7ffcf187c9c53def822 Mon Sep 17 00:00:00 2001
From: Rahul Joshi Date: Wed, 18 Sep 2024 22:26:48 -0700
Subject: [PATCH 155/321] [LLVM][TableGen] Change PseudoLoweringEmitter to use
 const RecordKeeper (#109194)

Change PseudoLoweringEmitter to use const RecordKeeper.

This is part of an effort to have better const correctness in TableGen
backends:

https://discourse.llvm.org/t/psa-planned-changes-to-tablegen-getallderiveddefinitions-api-potential-downstream-breakages/81089
---
 llvm/utils/TableGen/PseudoLoweringEmitter.cpp | 56 +++++++++----------
 1 file changed, 26 insertions(+), 30 deletions(-)

diff --git a/llvm/utils/TableGen/PseudoLoweringEmitter.cpp b/llvm/utils/TableGen/PseudoLoweringEmitter.cpp
index 7c3abefd96f5d5..9e09bdae76fd41 100644
--- a/llvm/utils/TableGen/PseudoLoweringEmitter.cpp
+++ b/llvm/utils/TableGen/PseudoLoweringEmitter.cpp
@@ -29,7 +29,7 @@ class PseudoLoweringEmitter {
     union {
       unsigned Operand; // Operand number mapped to.
       uint64_t Imm;     // Integer immedate value.
-      Record *Reg;      // Physical register.
+      const Record *Reg; // Physical register.
     } Data;
   };
   struct PseudoExpansion {
@@ -42,24 +42,24 @@ class PseudoLoweringEmitter {
         : Source(s), Dest(d), OperandMap(m) {}
   };

-  RecordKeeper &Records;
+  const RecordKeeper &Records;

   // It's overkill to have an instance of the full CodeGenTarget object,
   // but it loads everything on demand, not in the constructor, so it's
   // lightweight in performance, so it works out OK.
- CodeGenTarget Target; + const CodeGenTarget Target; SmallVector Expansions; - unsigned addDagOperandMapping(Record *Rec, DagInit *Dag, - CodeGenInstruction &Insn, + unsigned addDagOperandMapping(const Record *Rec, const DagInit *Dag, + const CodeGenInstruction &Insn, IndexedMap &OperandMap, unsigned BaseIdx); - void evaluateExpansion(Record *Pseudo); + void evaluateExpansion(const Record *Pseudo); void emitLoweringEmitter(raw_ostream &o); public: - PseudoLoweringEmitter(RecordKeeper &R) : Records(R), Target(R) {} + PseudoLoweringEmitter(const RecordKeeper &R) : Records(R), Target(R) {} /// run - Output the pseudo-lowerings. void run(raw_ostream &o); @@ -69,13 +69,12 @@ class PseudoLoweringEmitter { // FIXME: This pass currently can only expand a pseudo to a single instruction. // The pseudo expansion really should take a list of dags, not just // a single dag, so we can do fancier things. - unsigned PseudoLoweringEmitter::addDagOperandMapping( - Record *Rec, DagInit *Dag, CodeGenInstruction &Insn, + const Record *Rec, const DagInit *Dag, const CodeGenInstruction &Insn, IndexedMap &OperandMap, unsigned BaseIdx) { unsigned OpsAdded = 0; for (unsigned i = 0, e = Dag->getNumArgs(); i != e; ++i) { - if (DefInit *DI = dyn_cast(Dag->getArg(i))) { + if (const DefInit *DI = dyn_cast(Dag->getArg(i))) { // Physical register reference. Explicit check for the special case // "zero_reg" definition. if (DI->getDef()->isSubClassOf("Register") || @@ -105,17 +104,15 @@ unsigned PseudoLoweringEmitter::addDagOperandMapping( for (unsigned I = 0, E = Insn.Operands[i].MINumOperands; I != E; ++I) OperandMap[BaseIdx + i + I].Kind = OpData::Operand; OpsAdded += Insn.Operands[i].MINumOperands; - } else if (IntInit *II = dyn_cast(Dag->getArg(i))) { + } else if (const IntInit *II = dyn_cast(Dag->getArg(i))) { OperandMap[BaseIdx + i].Kind = OpData::Imm; OperandMap[BaseIdx + i].Data.Imm = II->getValue(); ++OpsAdded; - } else if (auto *BI = dyn_cast(Dag->getArg(i))) { - auto *II = - cast(BI->convertInitializerTo(IntRecTy::get(Records))); + } else if (const auto *BI = dyn_cast(Dag->getArg(i))) { OperandMap[BaseIdx + i].Kind = OpData::Imm; - OperandMap[BaseIdx + i].Data.Imm = II->getValue(); + OperandMap[BaseIdx + i].Data.Imm = *BI->convertInitializerToInt(); ++OpsAdded; - } else if (DagInit *SubDag = dyn_cast(Dag->getArg(i))) { + } else if (const DagInit *SubDag = dyn_cast(Dag->getArg(i))) { // Just add the operands recursively. This is almost certainly // a constant value for a complex operand (> 1 MI operand). unsigned NewOps = @@ -129,23 +126,23 @@ unsigned PseudoLoweringEmitter::addDagOperandMapping( return OpsAdded; } -void PseudoLoweringEmitter::evaluateExpansion(Record *Rec) { +void PseudoLoweringEmitter::evaluateExpansion(const Record *Rec) { LLVM_DEBUG(dbgs() << "Pseudo definition: " << Rec->getName() << "\n"); // Validate that the result pattern has the corrent number and types // of arguments for the instruction it references. 
- DagInit *Dag = Rec->getValueAsDag("ResultInst"); + const DagInit *Dag = Rec->getValueAsDag("ResultInst"); assert(Dag && "Missing result instruction in pseudo expansion!"); LLVM_DEBUG(dbgs() << " Result: " << *Dag << "\n"); - DefInit *OpDef = dyn_cast(Dag->getOperator()); + const DefInit *OpDef = dyn_cast(Dag->getOperator()); if (!OpDef) { PrintError(Rec, "In pseudo instruction '" + Rec->getName() + "', result operator is not a record"); PrintFatalNote(Rec->getValue("ResultInst"), "Result was assigned at the following location:"); } - Record *Operator = OpDef->getDef(); + const Record *Operator = OpDef->getDef(); if (!Operator->isSubClassOf("Instruction")) { PrintError(Rec, "In pseudo instruction '" + Rec->getName() + "', result operator '" + Operator->getName() + @@ -173,8 +170,8 @@ void PseudoLoweringEmitter::evaluateExpansion(Record *Rec) { } unsigned NumMIOperands = 0; - for (unsigned i = 0, e = Insn.Operands.size(); i != e; ++i) - NumMIOperands += Insn.Operands[i].MINumOperands; + for (const auto &Op : Insn.Operands) + NumMIOperands += Op.MINumOperands; IndexedMap OperandMap; OperandMap.grow(NumMIOperands); @@ -192,8 +189,8 @@ void PseudoLoweringEmitter::evaluateExpansion(Record *Rec) { // the lowering emitter. CodeGenInstruction SourceInsn(Rec); StringMap SourceOperands; - for (unsigned i = 0, e = SourceInsn.Operands.size(); i != e; ++i) - SourceOperands[SourceInsn.Operands[i].Name] = i; + for (const auto &[Idx, SrcOp] : enumerate(SourceInsn.Operands)) + SourceOperands[SrcOp.Name] = Idx; LLVM_DEBUG(dbgs() << " Operand mapping:\n"); for (unsigned i = 0, e = Insn.Operands.size(); i != e; ++i) { @@ -265,7 +262,7 @@ void PseudoLoweringEmitter::emitLoweringEmitter(raw_ostream &o) { << Expansion.OperandMap[MIOpNo + i].Data.Imm << "));\n"; break; case OpData::Reg: { - Record *Reg = Expansion.OperandMap[MIOpNo + i].Data.Reg; + const Record *Reg = Expansion.OperandMap[MIOpNo + i].Data.Reg; o << " Inst.addOperand(MCOperand::createReg("; // "zero_reg" is special. if (Reg->getName() == "zero_reg") @@ -297,19 +294,18 @@ void PseudoLoweringEmitter::emitLoweringEmitter(raw_ostream &o) { o << "\n}\n\n"; } -void PseudoLoweringEmitter::run(raw_ostream &o) { +void PseudoLoweringEmitter::run(raw_ostream &OS) { StringRef Classes[] = {"PseudoInstExpansion", "Instruction"}; - std::vector Insts = Records.getAllDerivedDefinitions(Classes); // Process the pseudo expansion definitions, validating them as we do so. Records.startTimer("Process definitions"); - for (unsigned i = 0, e = Insts.size(); i != e; ++i) - evaluateExpansion(Insts[i]); + for (const Record *Inst : Records.getAllDerivedDefinitions(Classes)) + evaluateExpansion(Inst); // Generate expansion code to lower the pseudo to an MCInst of the real // instruction. Records.startTimer("Emit expansion code"); - emitLoweringEmitter(o); + emitLoweringEmitter(OS); } static TableGen::Emitter::OptClass From 23123aa4ec9c6d8ce406df36006e6729b6cd044e Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Wed, 18 Sep 2024 22:27:26 -0700 Subject: [PATCH 156/321] [LLVM][TableGen] Change InstrInfoEmitter to use const RecordKeeper (#109189) Change InstrInfoEmitter to use const RecordKeeper. 
This is part of an effort to have better const correctness in TableGen
backends:

https://discourse.llvm.org/t/psa-planned-changes-to-tablegen-getallderiveddefinitions-api-potential-downstream-breakages/81089
---
 .../TableGen/Common/CodeGenInstruction.cpp    |  6 +--
 .../TableGen/Common/CodeGenInstruction.h      |  2 +-
 llvm/utils/TableGen/Common/CodeGenTarget.cpp  |  3 +-
 llvm/utils/TableGen/Common/CodeGenTarget.h    |  2 +-
 .../Common/GlobalISel/GlobalISelMatchTable.h  |  4 +-
 llvm/utils/TableGen/GlobalISelEmitter.cpp     |  2 +-
 llvm/utils/TableGen/InstrInfoEmitter.cpp      | 47 ++++++++++---------
 7 files changed, 34 insertions(+), 32 deletions(-)

diff --git a/llvm/utils/TableGen/Common/CodeGenInstruction.cpp b/llvm/utils/TableGen/Common/CodeGenInstruction.cpp
index 8d698fa9aa36d0..452b084aa6f7d5 100644
--- a/llvm/utils/TableGen/Common/CodeGenInstruction.cpp
+++ b/llvm/utils/TableGen/Common/CodeGenInstruction.cpp
@@ -485,8 +485,8 @@ CodeGenInstruction::CodeGenInstruction(const Record *R)
   isCodeGenOnly = R->getValueAsBit("isCodeGenOnly");
   isPseudo = R->getValueAsBit("isPseudo");
   isMeta = R->getValueAsBit("isMeta");
-  ImplicitDefs = R->getValueAsListOfDefs("Defs");
-  ImplicitUses = R->getValueAsListOfDefs("Uses");
+  ImplicitDefs = R->getValueAsListOfConstDefs("Defs");
+  ImplicitUses = R->getValueAsListOfConstDefs("Uses");

   // This flag is only inferred from the pattern.
   hasChain = false;
@@ -523,7 +523,7 @@ MVT::SimpleValueType CodeGenInstruction::HasOneImplicitDefWithKnownVT(
     return MVT::Other;

   // Check to see if the first implicit def has a resolvable type.
-  Record *FirstImplicitDef = ImplicitDefs[0];
+  const Record *FirstImplicitDef = ImplicitDefs[0];
   assert(FirstImplicitDef->isSubClassOf("Register"));
   const std::vector &RegVTs =
       TargetInfo.getRegisterVTs(FirstImplicitDef);
diff --git a/llvm/utils/TableGen/Common/CodeGenInstruction.h b/llvm/utils/TableGen/Common/CodeGenInstruction.h
index 3d4360fcfda706..18294b157fedb1 100644
--- a/llvm/utils/TableGen/Common/CodeGenInstruction.h
+++ b/llvm/utils/TableGen/Common/CodeGenInstruction.h
@@ -235,7 +235,7 @@ class CodeGenInstruction {

   /// ImplicitDefs/ImplicitUses - These are lists of registers that are
   /// implicitly defined and used by the instruction.
-  std::vector ImplicitDefs, ImplicitUses;
+  std::vector ImplicitDefs, ImplicitUses;

   // Various boolean values we track for the instruction.
   bool isPreISelOpcode : 1;
diff --git a/llvm/utils/TableGen/Common/CodeGenTarget.cpp b/llvm/utils/TableGen/Common/CodeGenTarget.cpp
index 69d2c7006e61da..065d1010ff9aec 100644
--- a/llvm/utils/TableGen/Common/CodeGenTarget.cpp
+++ b/llvm/utils/TableGen/Common/CodeGenTarget.cpp
@@ -234,7 +234,8 @@ CodeGenTarget::getRegisterClass(const Record *R) const {
   return *getRegBank().getRegClass(R);
 }

-std::vector CodeGenTarget::getRegisterVTs(Record *R) const {
+std::vector
+CodeGenTarget::getRegisterVTs(const Record *R) const {
   const CodeGenRegister *Reg = getRegBank().getReg(R);
   std::vector Result;
   for (const auto &RC : getRegBank().getRegClasses()) {
diff --git a/llvm/utils/TableGen/Common/CodeGenTarget.h b/llvm/utils/TableGen/Common/CodeGenTarget.h
index 225bdd97128f85..41497c8d8e0d15 100644
--- a/llvm/utils/TableGen/Common/CodeGenTarget.h
+++ b/llvm/utils/TableGen/Common/CodeGenTarget.h
@@ -144,7 +144,7 @@ class CodeGenTarget {

   /// getRegisterVTs - Find the union of all possible SimpleValueTypes for the
   /// specified physical register.
- std::vector getRegisterVTs(Record *R) const; + std::vector getRegisterVTs(const Record *R) const; ArrayRef getLegalValueTypes() const { if (LegalValueTypes.empty()) diff --git a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.h b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.h index 94f26d85488af6..80fb3dc9fa1203 100644 --- a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.h +++ b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.h @@ -2337,7 +2337,7 @@ class BuildMIAction : public MatchAction { const CodeGenInstruction *I; InstructionMatcher *Matched; std::vector> OperandRenderers; - SmallPtrSet DeadImplicitDefs; + SmallPtrSet DeadImplicitDefs; std::vector CopiedFlags; std::vector SetFlags; @@ -2365,7 +2365,7 @@ class BuildMIAction : public MatchAction { void chooseInsnToMutate(RuleMatcher &Rule); - void setDeadImplicitDef(Record *R) { DeadImplicitDefs.insert(R); } + void setDeadImplicitDef(const Record *R) { DeadImplicitDefs.insert(R); } template Kind &addRenderer(Args &&...args) { OperandRenderers.emplace_back( diff --git a/llvm/utils/TableGen/GlobalISelEmitter.cpp b/llvm/utils/TableGen/GlobalISelEmitter.cpp index d82f1c369533e0..41a2db1d0bc38d 100644 --- a/llvm/utils/TableGen/GlobalISelEmitter.cpp +++ b/llvm/utils/TableGen/GlobalISelEmitter.cpp @@ -2368,7 +2368,7 @@ void GlobalISelEmitter::emitRunCustomAction(raw_ostream &OS) { } void GlobalISelEmitter::postProcessRule(RuleMatcher &M) { - SmallPtrSet UsedRegs; + SmallPtrSet UsedRegs; // TODO: deal with subregs? for (auto &A : M.actions()) { diff --git a/llvm/utils/TableGen/InstrInfoEmitter.cpp b/llvm/utils/TableGen/InstrInfoEmitter.cpp index 5830cdae709629..cc5ef49385bb86 100644 --- a/llvm/utils/TableGen/InstrInfoEmitter.cpp +++ b/llvm/utils/TableGen/InstrInfoEmitter.cpp @@ -48,12 +48,12 @@ static cl::opt ExpandMIOperandInfo( namespace { class InstrInfoEmitter { - RecordKeeper &Records; - CodeGenDAGPatterns CDP; + const RecordKeeper &Records; + const CodeGenDAGPatterns CDP; const CodeGenSchedModels &SchedModels; public: - InstrInfoEmitter(RecordKeeper &R) + InstrInfoEmitter(const RecordKeeper &R) : Records(R), CDP(R), SchedModels(CDP.getTargetInfo().getSchedModels()) {} // run - Output the instruction set description. @@ -88,8 +88,8 @@ class InstrInfoEmitter { /// Write verifyInstructionPredicates methods. void emitFeatureVerifier(raw_ostream &OS, const CodeGenTarget &Target); void emitRecord(const CodeGenInstruction &Inst, unsigned Num, - Record *InstrInfo, - std::map, unsigned> &EL, + const Record *InstrInfo, + std::map, unsigned> &EL, const OperandInfoMapTy &OperandInfo, raw_ostream &OS); void emitOperandTypeMappings( raw_ostream &OS, const CodeGenTarget &Target, @@ -136,7 +136,7 @@ InstrInfoEmitter::GetOperandInfo(const CodeGenInstruction &Inst) { // registers in their multi-operand operands. It may also be an anonymous // operand, which has a single operand, but no declared class for the // operand. - DagInit *MIOI = Op.MIOperandInfo; + const DagInit *MIOI = Op.MIOperandInfo; if (!MIOI || MIOI->getNumArgs() == 0) { // Single, anonymous, operand. 
@@ -356,10 +356,11 @@ void InstrInfoEmitter::emitOperandTypeMappings( ArrayRef NumberedInstructions) { StringRef Namespace = Target.getInstNamespace(); - std::vector Operands = Records.getAllDerivedDefinitions("Operand"); - std::vector RegisterOperands = + ArrayRef Operands = + Records.getAllDerivedDefinitions("Operand"); + ArrayRef RegisterOperands = Records.getAllDerivedDefinitions("RegisterOperand"); - std::vector RegisterClasses = + ArrayRef RegisterClasses = Records.getAllDerivedDefinitions("RegisterClass"); OS << "#ifdef GET_INSTRINFO_OPERAND_TYPES_ENUM\n"; @@ -370,9 +371,9 @@ void InstrInfoEmitter::emitOperandTypeMappings( OS << "enum OperandType {\n"; unsigned EnumVal = 0; - for (const std::vector *RecordsToAdd : - {&Operands, &RegisterOperands, &RegisterClasses}) { - for (const Record *Op : *RecordsToAdd) { + for (ArrayRef RecordsToAdd : + {Operands, RegisterOperands, RegisterClasses}) { + for (const Record *Op : RecordsToAdd) { if (!Op->isAnonymous()) OS << " " << Op->getName() << " = " << EnumVal << ",\n"; ++EnumVal; @@ -764,8 +765,8 @@ void InstrInfoEmitter::emitFeatureVerifier(raw_ostream &OS, } } - llvm::sort(FeatureBitsets, [&](const std::vector &A, - const std::vector &B) { + llvm::sort(FeatureBitsets, [&](ArrayRef A, + ArrayRef B) { if (A.size() < B.size()) return true; if (A.size() > B.size()) @@ -928,9 +929,9 @@ void InstrInfoEmitter::run(raw_ostream &OS) { emitSourceFileHeader("Target Instruction Enum Values and Descriptors", OS); emitEnums(OS); - CodeGenTarget &Target = CDP.getTargetInfo(); + const CodeGenTarget &Target = CDP.getTargetInfo(); const std::string &TargetName = std::string(Target.getName()); - Record *InstrInfo = Target.getInstructionSet(); + const Record *InstrInfo = Target.getInstructionSet(); // Collect all of the operand info records. Records.startTimer("Collect operand info"); @@ -941,11 +942,11 @@ void InstrInfoEmitter::run(raw_ostream &OS) { // Collect all of the instruction's implicit uses and defs. Records.startTimer("Collect uses/defs"); - std::map, unsigned> EmittedLists; - std::vector> ImplicitLists; + std::map, unsigned> EmittedLists; + std::vector> ImplicitLists; unsigned ImplicitListSize = 0; for (const CodeGenInstruction *II : Target.getInstructionsByEnumValue()) { - std::vector ImplicitOps = II->ImplicitUses; + std::vector ImplicitOps = II->ImplicitUses; llvm::append_range(ImplicitOps, II->ImplicitDefs); if (EmittedLists.insert({ImplicitOps, ImplicitListSize}).second) { ImplicitLists.push_back(ImplicitOps); @@ -1175,8 +1176,8 @@ void InstrInfoEmitter::run(raw_ostream &OS) { } void InstrInfoEmitter::emitRecord( - const CodeGenInstruction &Inst, unsigned Num, Record *InstrInfo, - std::map, unsigned> &EmittedLists, + const CodeGenInstruction &Inst, unsigned Num, const Record *InstrInfo, + std::map, unsigned> &EmittedLists, const OperandInfoMapTy &OperandInfoMap, raw_ostream &OS) { int MinOperands = 0; if (!Inst.Operands.empty()) @@ -1195,11 +1196,11 @@ void InstrInfoEmitter::emitRecord( << Inst.TheDef->getValueAsInt("Size") << ",\t" << SchedModels.getSchedClassIdx(Inst) << ",\t"; - CodeGenTarget &Target = CDP.getTargetInfo(); + const CodeGenTarget &Target = CDP.getTargetInfo(); // Emit the implicit use/def list... 
OS << Inst.ImplicitUses.size() << ",\t" << Inst.ImplicitDefs.size()
      << ",\t";
-  std::vector ImplicitOps = Inst.ImplicitUses;
+  std::vector ImplicitOps = Inst.ImplicitUses;
   llvm::append_range(ImplicitOps, Inst.ImplicitDefs);
   OS << Target.getName() << "ImpOpBase + " << EmittedLists[ImplicitOps]
      << ",\t";

From 7281e0cb3bbcce396aab8b3ea0967d7a17cd287a Mon Sep 17 00:00:00 2001
From: Brendan Shanks Date: Wed, 18 Sep 2024 22:57:01 -0700
Subject: [PATCH 157/321] [lldb] [debugserver] Use "full" x86_64 GPR state when
 available. (#108663)

macOS 10.15 added a "full" x86_64 GPR thread state flavor, equivalent to the
normal one but with DS, ES, SS, and GSbase added.

This flavor can only be used with processes that install a custom LDT
(functionality that was also added in 10.15 and is used by apps like Wine to
execute 32-bit code).

Along with allowing DS, ES, SS, and GSbase to be viewed/modified, using the
full flavor is necessary when debugging a thread executing 32-bit code. If
thread_set_state() is used with the regular thread state flavor, the kernel
resets CS to the 64-bit code segment (see
[set_thread_state64()](https://github.com/apple-oss-distributions/xnu/blob/94d3b452840153a99b38a3a9659680b2a006908e/osfmk/i386/pcb.c#L723),
which makes debugging impossible.

There's no way to detect whether the full flavor is available, so debugserver
tries to use it first and falls back to the regular one if it's not
available.

A downside is that this patch exposes the DS, ES, SS, and GSbase registers
for all x86_64 processes, even though they are not populated unless the full
thread state is available. I'm not sure if there's a way to tell LLDB that a
register is unavailable. The classic GDB `g` command
[allows returning `x`](https://sourceware.org/gdb/current/onlinedocs/gdb.html/Packets.html#Packets)
to denote unavailable registers, but it seems like the debug server uses
newer commands like `jThreadsInfo` and I'm not sure if those have the same
support.
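In outline, and as a hedged sketch built only from the constants this patch
defines (not the debugserver code itself), the probe-and-fall-back reads:

  // Sketch: ask for the full flavor first; if the kernel rejects it (no
  // custom LDT installed), retry with the regular 64-bit flavor.
  mach_msg_type_number_t count = e_regSetWordSizeGPRFull;
  kern_return_t kr = thread_get_state(thread, __x86_64_THREAD_FULL_STATE,
                                      (thread_state_t)&gpr, &count);
  if (kr != KERN_SUCCESS) {
    count = e_regSetWordSizeGPR;
    kr = thread_get_state(thread, __x86_64_THREAD_STATE,
                          (thread_state_t)&gpr, &count);
  }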
Fixes #57591 (also filed as Apple FB11464104) @jasonmolenda --- .../MacOSX/x86_64/DNBArchImplX86_64.cpp | 67 +++++++++++++++---- .../source/MacOSX/x86_64/DNBArchImplX86_64.h | 4 +- .../MacOSX/x86_64/MachRegisterStatesX86_64.h | 5 ++ 3 files changed, 61 insertions(+), 15 deletions(-) diff --git a/lldb/tools/debugserver/source/MacOSX/x86_64/DNBArchImplX86_64.cpp b/lldb/tools/debugserver/source/MacOSX/x86_64/DNBArchImplX86_64.cpp index 5a62e2a8d12e2c..3b3f1f02a2851f 100644 --- a/lldb/tools/debugserver/source/MacOSX/x86_64/DNBArchImplX86_64.cpp +++ b/lldb/tools/debugserver/source/MacOSX/x86_64/DNBArchImplX86_64.cpp @@ -182,22 +182,39 @@ kern_return_t DNBArchImplX86_64::GetGPRState(bool force) { m_state.context.gpr.__gs = ('g' << 8) + 's'; m_state.SetError(e_regSetGPR, Read, 0); #else - mach_msg_type_number_t count = e_regSetWordSizeGPR; + mach_msg_type_number_t count = e_regSetWordSizeGPRFull; + int flavor = __x86_64_THREAD_FULL_STATE; m_state.SetError( e_regSetGPR, Read, - ::thread_get_state(m_thread->MachPortNumber(), __x86_64_THREAD_STATE, + ::thread_get_state(m_thread->MachPortNumber(), flavor, (thread_state_t)&m_state.context.gpr, &count)); + + if (!m_state.GetError(e_regSetGPR, Read)) { + m_state.hasFullGPRState = true; + } else { + m_state.hasFullGPRState = false; + count = e_regSetWordSizeGPR; + flavor = __x86_64_THREAD_STATE; + m_state.SetError( + e_regSetGPR, Read, + ::thread_get_state(m_thread->MachPortNumber(), flavor, + (thread_state_t)&m_state.context.gpr, &count)); + } DNBLogThreadedIf( LOG_THREAD, - "::thread_get_state (0x%4.4x, %u, &gpr, %u) => 0x%8.8x" + "::thread_get_state (0x%4.4x, %u (%s), &gpr, %u) => 0x%8.8x" "\n\trax = %16.16llx rbx = %16.16llx rcx = %16.16llx rdx = %16.16llx" "\n\trdi = %16.16llx rsi = %16.16llx rbp = %16.16llx rsp = %16.16llx" "\n\t r8 = %16.16llx r9 = %16.16llx r10 = %16.16llx r11 = %16.16llx" "\n\tr12 = %16.16llx r13 = %16.16llx r14 = %16.16llx r15 = %16.16llx" "\n\trip = %16.16llx" - "\n\tflg = %16.16llx cs = %16.16llx fs = %16.16llx gs = %16.16llx", - m_thread->MachPortNumber(), x86_THREAD_STATE64, - x86_THREAD_STATE64_COUNT, m_state.GetError(e_regSetGPR, Read), + "\n\tflg = %16.16llx cs = %16.16llx fs = %16.16llx gs = %16.16llx" + "\n\t ds = %16.16llx es = %16.16llx ss = %16.16llx gsB = %16.16llx", + m_thread->MachPortNumber(), flavor, + m_state.hasFullGPRState ? "full" : "non-full", + m_state.hasFullGPRState ? e_regSetWordSizeGPRFull + : e_regSetWordSizeGPR, + m_state.GetError(e_regSetGPR, Read), m_state.context.gpr.__rax, m_state.context.gpr.__rbx, m_state.context.gpr.__rcx, m_state.context.gpr.__rdx, m_state.context.gpr.__rdi, m_state.context.gpr.__rsi, @@ -208,7 +225,9 @@ kern_return_t DNBArchImplX86_64::GetGPRState(bool force) { m_state.context.gpr.__r14, m_state.context.gpr.__r15, m_state.context.gpr.__rip, m_state.context.gpr.__rflags, m_state.context.gpr.__cs, m_state.context.gpr.__fs, - m_state.context.gpr.__gs); + m_state.context.gpr.__gs, m_state.context.gpr.__ds, + m_state.context.gpr.__es, m_state.context.gpr.__ss, + m_state.context.gpr.__gsbase ); // DNBLogThreadedIf (LOG_THREAD, "thread_get_state(0x%4.4x, %u, &gpr, %u) // => 0x%8.8x" @@ -459,21 +478,26 @@ kern_return_t DNBArchImplX86_64::SetGPRState() { "(SetGPRState() for stop_count = %u)", m_thread->MachPortNumber(), kret, m_thread->Process()->StopCount()); + mach_msg_type_number_t count = + m_state.hasFullGPRState ? e_regSetWordSizeGPRFull : e_regSetWordSizeGPR; + int flavor = m_state.hasFullGPRState ? 
__x86_64_THREAD_FULL_STATE + : __x86_64_THREAD_STATE; m_state.SetError(e_regSetGPR, Write, - ::thread_set_state(m_thread->MachPortNumber(), - __x86_64_THREAD_STATE, + ::thread_set_state(m_thread->MachPortNumber(), flavor, (thread_state_t)&m_state.context.gpr, - e_regSetWordSizeGPR)); + count)); DNBLogThreadedIf( LOG_THREAD, - "::thread_set_state (0x%4.4x, %u, &gpr, %u) => 0x%8.8x" + "::thread_set_state (0x%4.4x, %u (%s), &gpr, %u) => 0x%8.8x" "\n\trax = %16.16llx rbx = %16.16llx rcx = %16.16llx rdx = %16.16llx" "\n\trdi = %16.16llx rsi = %16.16llx rbp = %16.16llx rsp = %16.16llx" "\n\t r8 = %16.16llx r9 = %16.16llx r10 = %16.16llx r11 = %16.16llx" "\n\tr12 = %16.16llx r13 = %16.16llx r14 = %16.16llx r15 = %16.16llx" "\n\trip = %16.16llx" - "\n\tflg = %16.16llx cs = %16.16llx fs = %16.16llx gs = %16.16llx", - m_thread->MachPortNumber(), __x86_64_THREAD_STATE, e_regSetWordSizeGPR, + "\n\tflg = %16.16llx cs = %16.16llx fs = %16.16llx gs = %16.16llx" + "\n\t ds = %16.16llx es = %16.16llx ss = %16.16llx gsB = %16.16llx", + m_thread->MachPortNumber(), flavor, + m_state.hasFullGPRState ? "full" : "non-full", count, m_state.GetError(e_regSetGPR, Write), m_state.context.gpr.__rax, m_state.context.gpr.__rbx, m_state.context.gpr.__rcx, m_state.context.gpr.__rdx, m_state.context.gpr.__rdi, @@ -484,7 +508,9 @@ kern_return_t DNBArchImplX86_64::SetGPRState() { m_state.context.gpr.__r13, m_state.context.gpr.__r14, m_state.context.gpr.__r15, m_state.context.gpr.__rip, m_state.context.gpr.__rflags, m_state.context.gpr.__cs, - m_state.context.gpr.__fs, m_state.context.gpr.__gs); + m_state.context.gpr.__fs, m_state.context.gpr.__gs, + m_state.context.gpr.__ds, m_state.context.gpr.__es, + m_state.context.gpr.__ss, m_state.context.gpr.__gsbase); return m_state.GetError(e_regSetGPR, Write); } @@ -1157,6 +1183,10 @@ enum { gpr_cs, gpr_fs, gpr_gs, + gpr_ds, + gpr_es, + gpr_ss, + gpr_gsbase, gpr_eax, gpr_ebx, gpr_ecx, @@ -1543,6 +1573,7 @@ enum debugserver_regnums { debugserver_k5 = 123, debugserver_k6 = 124, debugserver_k7 = 125, + debugserver_gsbase = 126, }; #define GPR_OFFSET(reg) (offsetof(DNBArchImplX86_64::GPR, __##reg)) @@ -1690,6 +1721,10 @@ const DNBRegisterInfo DNBArchImplX86_64::g_gpr_registers[] = { DEFINE_GPR_ALT2(cs, NULL), DEFINE_GPR_ALT2(fs, NULL), DEFINE_GPR_ALT2(gs, NULL), + DEFINE_GPR_ALT2(ds, NULL), + DEFINE_GPR_ALT2(es, NULL), + DEFINE_GPR_ALT2(ss, NULL), + DEFINE_GPR_ALT2(gsbase, NULL), DEFINE_GPR_PSEUDO_32(eax, rax), DEFINE_GPR_PSEUDO_32(ebx, rbx), DEFINE_GPR_PSEUDO_32(ecx, rcx), @@ -2313,6 +2348,8 @@ bool DNBArchImplX86_64::GetRegisterValue(uint32_t set, uint32_t reg, value->info = *regInfo; switch (set) { case e_regSetGPR: + if (reg > gpr_gs && !m_state.hasFullGPRState) + return false; if (reg < k_num_gpr_registers) { value->value.uint64 = ((uint64_t *)(&m_state.context.gpr))[reg]; return true; @@ -2524,6 +2561,8 @@ bool DNBArchImplX86_64::SetRegisterValue(uint32_t set, uint32_t reg, if (regInfo) { switch (set) { case e_regSetGPR: + if (reg > gpr_gs && !m_state.hasFullGPRState) + return false; if (reg < k_num_gpr_registers) { ((uint64_t *)(&m_state.context.gpr))[reg] = value->value.uint64; success = true; diff --git a/lldb/tools/debugserver/source/MacOSX/x86_64/DNBArchImplX86_64.h b/lldb/tools/debugserver/source/MacOSX/x86_64/DNBArchImplX86_64.h index 96da02a4c9ff9f..7fffd60b2064e0 100644 --- a/lldb/tools/debugserver/source/MacOSX/x86_64/DNBArchImplX86_64.h +++ b/lldb/tools/debugserver/source/MacOSX/x86_64/DNBArchImplX86_64.h @@ -103,7 +103,8 @@ class DNBArchImplX86_64 : public 
DNBArchProtocol { }; enum RegisterSetWordSize { - e_regSetWordSizeGPR = sizeof(GPR) / sizeof(int), + e_regSetWordSizeGPR = (sizeof(GPR) - 32) / sizeof(int), + e_regSetWordSizeGPRFull = sizeof(GPR) / sizeof(int), e_regSetWordSizeFPU = sizeof(FPU) / sizeof(int), e_regSetWordSizeEXC = sizeof(EXC) / sizeof(int), e_regSetWordSizeAVX = sizeof(AVX) / sizeof(int), @@ -130,6 +131,7 @@ class DNBArchImplX86_64 : public DNBArchProtocol { kern_return_t fpu_errs[2]; // Read/Write errors kern_return_t exc_errs[2]; // Read/Write errors kern_return_t dbg_errs[2]; // Read/Write errors + bool hasFullGPRState; State() { uint32_t i; diff --git a/lldb/tools/debugserver/source/MacOSX/x86_64/MachRegisterStatesX86_64.h b/lldb/tools/debugserver/source/MacOSX/x86_64/MachRegisterStatesX86_64.h index b566accd397285..743c665b691067 100644 --- a/lldb/tools/debugserver/source/MacOSX/x86_64/MachRegisterStatesX86_64.h +++ b/lldb/tools/debugserver/source/MacOSX/x86_64/MachRegisterStatesX86_64.h @@ -22,6 +22,7 @@ #define __x86_64_DEBUG_STATE 11 #define __x86_64_AVX_STATE 17 #define __x86_64_AVX512F_STATE 20 +#define __x86_64_THREAD_FULL_STATE 23 typedef struct { uint64_t __rax; @@ -45,6 +46,10 @@ typedef struct { uint64_t __cs; uint64_t __fs; uint64_t __gs; + uint64_t __ds; + uint64_t __es; + uint64_t __ss; + uint64_t __gsbase; } __x86_64_thread_state_t; typedef struct {
From e82f0838ae88ad69515ebec234765e3e2607bebf Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Wed, 18 Sep 2024 23:06:12 -0700 Subject: [PATCH 158/321] [ELF] --icf: don't fold a section without relocations and a section with relocations for SHT_CREL Similar to commit 686cff17cc310884e48ae963bf7507f96950cc90 for SHT_REL (#57693); CREL had not been exercised with ICF before. This also avoids a pitfall where using eqClass[0] as a "decoded" marker could interfere with ICF. --- lld/ELF/ICF.cpp | 4 ++-- lld/ELF/InputSection.cpp | 6 +++--- lld/ELF/InputSection.h | 4 ++++ lld/test/ELF/icf10.s | 3 +++ 4 files changed, 12 insertions(+), 5 deletions(-)
diff --git a/lld/ELF/ICF.cpp b/lld/ELF/ICF.cpp index 14e0afc6029e9f..952e4dfe982c80 100644 --- a/lld/ELF/ICF.cpp +++ b/lld/ELF/ICF.cpp @@ -324,7 +324,7 @@ bool ICF<ELFT>::equalsConstant(const InputSection *a, const InputSection *b) { const RelsOrRelas<ELFT> ra = a->template relsOrRelas<ELFT>(); const RelsOrRelas<ELFT> rb = b->template relsOrRelas<ELFT>(); - if (ra.areRelocsCrel()) + if (ra.areRelocsCrel() || rb.areRelocsCrel()) return constantEq(a, ra.crels, b, rb.crels); return ra.areRelocsRel() || rb.areRelocsRel() ? constantEq(a, ra.rels, b, rb.rels) : constantEq(a, ra.relas, b, rb.relas); @@ -376,7 +376,7 @@ template <class ELFT> bool ICF<ELFT>::equalsVariable(const InputSection *a, const InputSection *b) { const RelsOrRelas<ELFT> ra = a->template relsOrRelas<ELFT>(); const RelsOrRelas<ELFT> rb = b->template relsOrRelas<ELFT>(); - if (ra.areRelocsCrel()) + if (ra.areRelocsCrel() || rb.areRelocsCrel()) return variableEq(a, ra.crels, b, rb.crels); return ra.areRelocsRel() || rb.areRelocsRel() ? variableEq(a, ra.rels, b, rb.rels) : variableEq(a, ra.relas, b, rb.relas);
diff --git a/lld/ELF/InputSection.cpp b/lld/ELF/InputSection.cpp index 9601e6b3250cc0..363afe36c2449e 100644 --- a/lld/ELF/InputSection.cpp +++ b/lld/ELF/InputSection.cpp @@ -148,12 +148,12 @@ RelsOrRelas<ELFT> InputSectionBase::relsOrRelas(bool supportsCrel) const { InputSectionBase *const &relSec = f->getSections()[relSecIdx]; // Otherwise, allocate a buffer to hold the decoded RELA relocations. When // called for the first time, relSec is null (without --emit-relocs) or an - // InputSection with zero eqClass[0]. - if (!relSec || !cast<InputSection>(relSec)->eqClass[0]) { + // InputSection with false decodedCrel. + if (!relSec || !cast<InputSection>(relSec)->decodedCrel) { auto *sec = makeThreadLocal<InputSection>(*f, shdr, name); f->cacheDecodedCrel(relSecIdx, sec); sec->type = SHT_RELA; - sec->eqClass[0] = SHT_RELA; + sec->decodedCrel = true; RelocsCrel<ELFT::Is64Bits> entries(sec->content_); sec->size = entries.size() * sizeof(typename ELFT::Rela);
diff --git a/lld/ELF/InputSection.h b/lld/ELF/InputSection.h index f7672bbf553902..53dbaf0784b910 100644 --- a/lld/ELF/InputSection.h +++ b/lld/ELF/InputSection.h @@ -176,6 +176,10 @@ class InputSectionBase : public SectionBase { mutable bool compressed = false; + // Whether this section is SHT_CREL and has been decoded to RELA by + // relsOrRelas. + bool decodedCrel = false; + // Whether the section needs to be padded with a NOP filler due to // deleteFallThruJmpInsn. bool nopFiller = false; diff --git a/lld/test/ELF/icf10.s b/lld/test/ELF/icf10.s index 3c18c431c3b9da..ff926d0e16b103 100644 --- a/lld/test/ELF/icf10.s +++ b/lld/test/ELF/icf10.s @@ -5,6 +5,9 @@ # RUN: llvm-mc -filetype=obj -triple=x86_64-pc-freebsd %s -o %t.o # RUN: ld.lld --icf=all %t.o -o /dev/null --print-icf-sections 2>&1 | FileCheck %s +# RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o %t.o --crel +# RUN: ld.lld --icf=all %t.o -o /dev/null --print-icf-sections 2>&1 | FileCheck %s + # Checks that ICF does not merge 2 sections the offset of # the relocations of which differ.
From 90330e993d74b90325d936c9ec923c82623b20db Mon Sep 17 00:00:00 2001 From: Fraser Cormack Date: Thu, 19 Sep 2024 07:12:32 +0100 Subject: [PATCH 159/321] [NVPTX] Set v2i16 SETCC to Expand (#108969) Note that this refers to the return type of SETCC. This operation is not legal in PTX, but it was assumed to be because v2i16 is declared a legal type. We were already expanding v4i8 SETCC. The DAGCombiner would in certain circumstances try to fold an extension of an illegal v2i1 SETCC (because v2i1 is illegal) into a "legal" v2i16 SETCC, which we wouldn't have patterns for. --- llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 2 +- llvm/test/CodeGen/NVPTX/sext-setcc.ll | 72 +++++++++++++++++++++ 2 files changed, 73 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/NVPTX/sext-setcc.ll diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 31a5e937adae96..26888342210918 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -725,7 +725,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, // Other arithmetic and logic ops are unsupported.
setOperationAction({ISD::SDIV, ISD::UDIV, ISD::SRA, ISD::SRL, ISD::MULHS, ISD::MULHU, ISD::FP_TO_SINT, ISD::FP_TO_UINT, - ISD::SINT_TO_FP, ISD::UINT_TO_FP}, + ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::SETCC}, MVT::v2i16, Expand); setOperationAction(ISD::ADDC, MVT::i32, Legal); diff --git a/llvm/test/CodeGen/NVPTX/sext-setcc.ll b/llvm/test/CodeGen/NVPTX/sext-setcc.ll new file mode 100644 index 00000000000000..f471d47077cf0d --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/sext-setcc.ll @@ -0,0 +1,72 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 | FileCheck %s +; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %} + +define <2 x i16> @sext_setcc_v2i1_to_v2i16(ptr %p) { +; CHECK-LABEL: sext_setcc_v2i1_to_v2i16( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<3>; +; CHECK-NEXT: .reg .b16 %rs<5>; +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: ld.param.u64 %rd1, [sext_setcc_v2i1_to_v2i16_param_0]; +; CHECK-NEXT: ld.u32 %r1, [%rd1]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: setp.eq.s16 %p1, %rs1, 0; +; CHECK-NEXT: setp.eq.s16 %p2, %rs2, 0; +; CHECK-NEXT: selp.s16 %rs3, -1, 0, %p2; +; CHECK-NEXT: selp.s16 %rs4, -1, 0, %p1; +; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3}; +; CHECK-NEXT: st.param.b32 [func_retval0+0], %r2; +; CHECK-NEXT: ret; +entry: + %v = load <2 x i16>, ptr %p, align 4 + %cmp = icmp eq <2 x i16> %v, zeroinitializer + %sext = sext <2 x i1> %cmp to <2 x i16> + ret <2 x i16> %sext +} + +define <4 x i8> @sext_setcc_v4i1_to_v4i8(ptr %p) { +; CHECK-LABEL: sext_setcc_v4i1_to_v4i8( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<5>; +; CHECK-NEXT: .reg .b16 %rs<9>; +; CHECK-NEXT: .reg .b32 %r<14>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: ld.param.u64 %rd1, [sext_setcc_v4i1_to_v4i8_param_0]; +; CHECK-NEXT: ld.u32 %r1, [%rd1]; +; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8; +; CHECK-NEXT: cvt.u16.u32 %rs1, %r2; +; CHECK-NEXT: and.b16 %rs2, %rs1, 255; +; CHECK-NEXT: setp.eq.s16 %p1, %rs2, 0; +; CHECK-NEXT: bfe.u32 %r3, %r1, 16, 8; +; CHECK-NEXT: cvt.u16.u32 %rs3, %r3; +; CHECK-NEXT: and.b16 %rs4, %rs3, 255; +; CHECK-NEXT: setp.eq.s16 %p2, %rs4, 0; +; CHECK-NEXT: bfe.u32 %r4, %r1, 8, 8; +; CHECK-NEXT: cvt.u16.u32 %rs5, %r4; +; CHECK-NEXT: and.b16 %rs6, %rs5, 255; +; CHECK-NEXT: setp.eq.s16 %p3, %rs6, 0; +; CHECK-NEXT: bfe.u32 %r5, %r1, 0, 8; +; CHECK-NEXT: cvt.u16.u32 %rs7, %r5; +; CHECK-NEXT: and.b16 %rs8, %rs7, 255; +; CHECK-NEXT: setp.eq.s16 %p4, %rs8, 0; +; CHECK-NEXT: selp.s32 %r6, -1, 0, %p4; +; CHECK-NEXT: selp.s32 %r7, -1, 0, %p3; +; CHECK-NEXT: bfi.b32 %r8, %r7, %r6, 8, 8; +; CHECK-NEXT: selp.s32 %r9, -1, 0, %p2; +; CHECK-NEXT: bfi.b32 %r10, %r9, %r8, 16, 8; +; CHECK-NEXT: selp.s32 %r11, -1, 0, %p1; +; CHECK-NEXT: bfi.b32 %r12, %r11, %r10, 24, 8; +; CHECK-NEXT: st.param.b32 [func_retval0+0], %r12; +; CHECK-NEXT: ret; +entry: + %v = load <4 x i8>, ptr %p, align 4 + %cmp = icmp eq <4 x i8> %v, zeroinitializer + %sext = sext <4 x i1> %cmp to <4 x i8> + ret <4 x i8> %sext +} From 77af9d10237fef194eb275f33a11daea88e304a4 Mon Sep 17 00:00:00 2001 From: Him188 Date: Thu, 19 Sep 2024 07:18:14 +0100 Subject: [PATCH 160/321] [AArch64][GlobalISel] Implement selectVaStartAAPCS (#106979) This commit adds the missing support for varargs in the instruction selection pass for AAPCS. 
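For illustration, a minimal C-level sketch of the va_list record this lowering fills in (field meanings follow the block comment in the diff below; the names sketch_va_start, gr_size, and vr_size are assumptions for this sketch, not LLVM source -- gr_size/vr_size stand for the byte sizes of the GP and FP/SIMD register save areas spilled by the prologue):

/* Illustrative sketch only, not part of the committed patch. */
typedef struct {
  void *stack;  /* next stacked (memory) argument */
  void *gr_top; /* one past the end of the saved GP registers */
  void *vr_top; /* one past the end of the saved FP/SIMD registers */
  int gr_offs;  /* negative byte offset from gr_top to the next GP arg */
  int vr_offs;  /* negative byte offset from vr_top to the next FP/SIMD arg */
} aapcs64_va_list;

static void sketch_va_start(aapcs64_va_list *ap, void *stack_args,
                            char *gr_area, int gr_size,
                            char *vr_area, int vr_size) {
  ap->stack = stack_args;         /* the three pointer stores (PushAddress) */
  ap->gr_top = gr_area + gr_size;
  ap->vr_top = vr_area + vr_size;
  ap->gr_offs = -gr_size;         /* the two 32-bit stores (PushIntConstant) */
  ap->vr_offs = -vr_size;
}

The selector below emits exactly these five stores in MIR: three frame-index addresses followed by two negative 32-bit constants.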
Previously we only implemented this for Darwin. The implementation follows the AAPCS and mirrors SelectionDAG's LowerAAPCS_VASTART. It resolves all VA_START fallbacks in RAJAperf, llvm-test-suite, and SPEC CPU2017; these benchmarks now compile and pass without falling back to SelectionDAG because of varargs. --------- Co-authored-by: Madhur Amilkanthwar --- .../GISel/AArch64InstructionSelector.cpp | 101 ++++- .../CodeGen/AArch64/GlobalISel/vararg.mir | 56 +++ llvm/test/CodeGen/AArch64/vararg.ll | 384 ++++++++++++++++++ 3 files changed, 540 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/AArch64/GlobalISel/vararg.mir create mode 100644 llvm/test/CodeGen/AArch64/vararg.ll
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp index 18361cf3685642..df0c09d32c074a 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp @@ -1994,7 +1994,106 @@ bool AArch64InstructionSelector::selectVaStartAAPCS( MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const { - return false; + + if (STI.isCallingConvWin64(MF.getFunction().getCallingConv(), + MF.getFunction().isVarArg())) + return false; + + // The layout of the va_list struct is specified in the AArch64 Procedure Call + // Standard, section 10.1.5. + + const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); + const unsigned PtrSize = STI.isTargetILP32() ? 4 : 8; + const auto *PtrRegClass = + STI.isTargetILP32() ? &AArch64::GPR32RegClass : &AArch64::GPR64RegClass; + + const MCInstrDesc &MCIDAddAddr = + TII.get(STI.isTargetILP32() ? AArch64::ADDWri : AArch64::ADDXri); + const MCInstrDesc &MCIDStoreAddr = + TII.get(STI.isTargetILP32() ? AArch64::STRWui : AArch64::STRXui); + + /* + * typedef struct va_list { + * void * stack; // next stack param + * void * gr_top; // end of GP arg reg save area + * void * vr_top; // end of FP/SIMD arg reg save area + * int gr_offs; // offset from gr_top to next GP register arg + * int vr_offs; // offset from vr_top to next FP/SIMD register arg + * } va_list; + */ + const auto VAList = I.getOperand(0).getReg(); + + // Our current offset in bytes from the va_list struct (VAList). + unsigned OffsetBytes = 0; + + // Helper function to store (FrameIndex + Imm) to VAList at offset OffsetBytes + // and increment OffsetBytes by PtrSize.
+ const auto PushAddress = [&](const int FrameIndex, const int64_t Imm) { + const Register Top = MRI.createVirtualRegister(PtrRegClass); + auto MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), MCIDAddAddr) + .addDef(Top) + .addFrameIndex(FrameIndex) + .addImm(Imm) + .addImm(0); + constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); + + const auto *MMO = *I.memoperands_begin(); + MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), MCIDStoreAddr) + .addUse(Top) + .addUse(VAList) + .addImm(OffsetBytes / PtrSize) + .addMemOperand(MF.getMachineMemOperand( + MMO->getPointerInfo().getWithOffset(OffsetBytes), + MachineMemOperand::MOStore, PtrSize, MMO->getBaseAlign())); + constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); + + OffsetBytes += PtrSize; + }; + + // void* stack at offset 0 + PushAddress(FuncInfo->getVarArgsStackIndex(), 0); + + // void* gr_top at offset 8 (4 on ILP32) + const unsigned GPRSize = FuncInfo->getVarArgsGPRSize(); + PushAddress(FuncInfo->getVarArgsGPRIndex(), GPRSize); + + // void* vr_top at offset 16 (8 on ILP32) + const unsigned FPRSize = FuncInfo->getVarArgsFPRSize(); + PushAddress(FuncInfo->getVarArgsFPRIndex(), FPRSize); + + // Helper function to store a 4-byte integer constant to VAList at offset + // OffsetBytes, and increment OffsetBytes by 4. + const auto PushIntConstant = [&](const int32_t Value) { + constexpr int IntSize = 4; + const Register Temp = MRI.createVirtualRegister(&AArch64::GPR32RegClass); + auto MIB = + BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::MOVi32imm)) + .addDef(Temp) + .addImm(Value); + constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); + + const auto *MMO = *I.memoperands_begin(); + MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::STRWui)) + .addUse(Temp) + .addUse(VAList) + .addImm(OffsetBytes / IntSize) + .addMemOperand(MF.getMachineMemOperand( + MMO->getPointerInfo().getWithOffset(OffsetBytes), + MachineMemOperand::MOStore, IntSize, MMO->getBaseAlign())); + constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); + OffsetBytes += IntSize; + }; + + // int gr_offs at offset 24 (12 on ILP32) + PushIntConstant(-static_cast<int32_t>(GPRSize)); + + // int vr_offs at offset 28 (16 on ILP32) + PushIntConstant(-static_cast<int32_t>(FPRSize)); + + assert(OffsetBytes == (STI.isTargetILP32() ? 20 : 32) && "Unexpected offset"); + + I.eraseFromParent(); + return true; } bool AArch64InstructionSelector::selectVaStartDarwin(
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/vararg.mir b/llvm/test/CodeGen/AArch64/GlobalISel/vararg.mir new file mode 100644 index 00000000000000..437a9e6cb89ac3 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/vararg.mir @@ -0,0 +1,56 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -O0 -mtriple=aarch64-unknown-linux -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=CHECK + +--- | + target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32" + target triple = "aarch64-unknown-linux" + + %struct.__va_list = type { ptr, ptr, ptr, i32, i32 } + + define i32 @va_start(ptr %a, ...) { + entry: + %ap = alloca %struct.__va_list, align 8 + call void @llvm.lifetime.start.p0(i64 32, ptr %ap) + call void @llvm.va_start.p0(ptr %ap) + %vr_offs_p = getelementptr inbounds i8, ptr %ap, i64 28 + %vr_offs = load i32, ptr %vr_offs_p, align 4 + ret i32 %vr_offs + } +...
+--- +name: va_start +alignment: 16 +legalized: true +regBankSelected: true +tracksRegLiveness: true +fixedStack: + - { id: 0, size: 4, alignment: 16 } +stack: + - { id: 0, size: 56, alignment: 8 } + - { id: 1, size: 128, alignment: 16 } + - { id: 2, name: ap, size: 32, alignment: 8 } +body: | + bb.0.entry: + ; CHECK-LABEL: name: va_start + ; CHECK: LIFETIME_START %stack.2.ap + ; CHECK-NEXT: [[ADDXri:%[0-9]+]]:gpr64sp = ADDXri %stack.2.ap, 0, 0 + ; CHECK-NEXT: [[ADDXri1:%[0-9]+]]:gpr64common = ADDXri %stack.0, 0, 0 + ; CHECK-NEXT: STRXui [[ADDXri1]], [[ADDXri]], 0 :: (store (s64) into %ir.ap) + ; CHECK-NEXT: [[ADDXri2:%[0-9]+]]:gpr64common = ADDXri %stack.0, 0, 0 + ; CHECK-NEXT: STRXui [[ADDXri2]], [[ADDXri]], 1 :: (store (s64) into %ir.ap + 8) + ; CHECK-NEXT: [[ADDXri3:%[0-9]+]]:gpr64common = ADDXri %stack.0, 0, 0 + ; CHECK-NEXT: STRXui [[ADDXri3]], [[ADDXri]], 2 :: (store (s64) into %ir.ap + 16) + ; CHECK-NEXT: [[MOVi32imm:%[0-9]+]]:gpr32 = MOVi32imm 0 + ; CHECK-NEXT: STRWui [[MOVi32imm]], [[ADDXri]], 6 :: (store (s32) into %ir.ap + 24, align 8) + ; CHECK-NEXT: [[MOVi32imm1:%[0-9]+]]:gpr32 = MOVi32imm 0 + ; CHECK-NEXT: STRWui [[MOVi32imm1]], [[ADDXri]], 7 :: (store (s32) into %ir.ap + 28, basealign 8) + ; CHECK-NEXT: [[LDRWui:%[0-9]+]]:gpr32 = LDRWui %stack.2.ap, 7 :: (dereferenceable load (s32) from %ir.vr_offs_p) + ; CHECK-NEXT: $w0 = COPY [[LDRWui]] + LIFETIME_START %stack.2.ap + %0:gpr(p0) = G_FRAME_INDEX %stack.2.ap + G_VASTART %0(p0) :: (store (s256) into %ir.ap, align 8) + %1:gpr(s64) = G_CONSTANT i64 28 + %2:gpr(p0) = G_PTR_ADD %0, %1(s64) + %3:gpr(s32) = G_LOAD %2(p0) :: (dereferenceable load (s32) from %ir.vr_offs_p) + $w0 = COPY %3(s32) +... diff --git a/llvm/test/CodeGen/AArch64/vararg.ll b/llvm/test/CodeGen/AArch64/vararg.ll new file mode 100644 index 00000000000000..291eee2ddf706d --- /dev/null +++ b/llvm/test/CodeGen/AArch64/vararg.ll @@ -0,0 +1,384 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-unknown-linux-gnu -O0 -global-isel=0 -o - < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SD +; RUN: llc -mtriple=aarch64-unknown-linux-gnu -O0 -global-isel=1 -global-isel-abort=1 -o - < %s | FileCheck %s --check-prefixes=CHECK,CHECK-GI + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32" + +%struct.__va_list = type { ptr, ptr, ptr, i32, i32 } + +declare void @llvm.va_start(ptr) nounwind +declare void @llvm.va_end(ptr) nounwind +declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) +declare void @llvm.va_start.p0(ptr) +declare void @llvm.va_end.p0(ptr) +declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) + +define i64 @vararg(...) 
#0 { +; CHECK-SD-LABEL: vararg: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub sp, sp, #224 +; CHECK-SD-NEXT: .cfi_def_cfa_offset 224 +; CHECK-SD-NEXT: stp x29, x30, [sp, #208] // 16-byte Folded Spill +; CHECK-SD-NEXT: add x29, sp, #208 +; CHECK-SD-NEXT: .cfi_def_cfa w29, 16 +; CHECK-SD-NEXT: .cfi_offset w30, -8 +; CHECK-SD-NEXT: .cfi_offset w29, -16 +; CHECK-SD-NEXT: str q7, [sp, #112] +; CHECK-SD-NEXT: str q6, [sp, #96] +; CHECK-SD-NEXT: str q5, [sp, #80] +; CHECK-SD-NEXT: str q4, [sp, #64] +; CHECK-SD-NEXT: str q3, [sp, #48] +; CHECK-SD-NEXT: str q2, [sp, #32] +; CHECK-SD-NEXT: str q1, [sp, #16] +; CHECK-SD-NEXT: str q0, [sp] +; CHECK-SD-NEXT: stur x7, [x29, #-16] +; CHECK-SD-NEXT: stur x6, [x29, #-24] +; CHECK-SD-NEXT: stur x5, [x29, #-32] +; CHECK-SD-NEXT: stur x4, [x29, #-40] +; CHECK-SD-NEXT: stur x3, [x29, #-48] +; CHECK-SD-NEXT: stur x2, [x29, #-56] +; CHECK-SD-NEXT: stur x1, [x29, #-64] +; CHECK-SD-NEXT: stur x0, [x29, #-72] +; CHECK-SD-NEXT: mov w8, #-128 // =0xffffff80 +; CHECK-SD-NEXT: str w8, [x29, #20] +; CHECK-SD-NEXT: mov w8, #-64 // =0xffffffc0 +; CHECK-SD-NEXT: str w8, [x29, #16] +; CHECK-SD-NEXT: add x8, x29, #16 +; CHECK-SD-NEXT: stur x8, [x29, #-8] +; CHECK-SD-NEXT: mov x8, sp +; CHECK-SD-NEXT: add x8, x8, #128 +; CHECK-SD-NEXT: str x8, [x29, #8] +; CHECK-SD-NEXT: sub x8, x29, #72 +; CHECK-SD-NEXT: add x8, x8, #64 +; CHECK-SD-NEXT: str x8, [x29] +; CHECK-SD-NEXT: mov w8, #1 // =0x1 +; CHECK-SD-NEXT: mov w0, w8 +; CHECK-SD-NEXT: .cfi_def_cfa wsp, 224 +; CHECK-SD-NEXT: ldp x29, x30, [sp, #208] // 16-byte Folded Reload +; CHECK-SD-NEXT: add sp, sp, #224 +; CHECK-SD-NEXT: .cfi_def_cfa_offset 0 +; CHECK-SD-NEXT: .cfi_restore w30 +; CHECK-SD-NEXT: .cfi_restore w29 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: vararg: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub sp, sp, #224 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 224 +; CHECK-GI-NEXT: stp x29, x30, [sp, #208] // 16-byte Folded Spill +; CHECK-GI-NEXT: add x29, sp, #208 +; CHECK-GI-NEXT: .cfi_def_cfa w29, 16 +; CHECK-GI-NEXT: .cfi_offset w30, -8 +; CHECK-GI-NEXT: .cfi_offset w29, -16 +; CHECK-GI-NEXT: stur x0, [x29, #-64] +; CHECK-GI-NEXT: stur x1, [x29, #-56] +; CHECK-GI-NEXT: stur x2, [x29, #-48] +; CHECK-GI-NEXT: stur x3, [x29, #-40] +; CHECK-GI-NEXT: stur x4, [x29, #-32] +; CHECK-GI-NEXT: stur x5, [x29, #-24] +; CHECK-GI-NEXT: stur x6, [x29, #-16] +; CHECK-GI-NEXT: stur x7, [x29, #-8] +; CHECK-GI-NEXT: str q0, [sp, #16] +; CHECK-GI-NEXT: str q1, [sp, #32] +; CHECK-GI-NEXT: str q2, [sp, #48] +; CHECK-GI-NEXT: str q3, [sp, #64] +; CHECK-GI-NEXT: str q4, [sp, #80] +; CHECK-GI-NEXT: str q5, [sp, #96] +; CHECK-GI-NEXT: str q6, [sp, #112] +; CHECK-GI-NEXT: str q7, [sp, #128] +; CHECK-GI-NEXT: add x9, sp, #8 +; CHECK-GI-NEXT: add x8, x29, #16 +; CHECK-GI-NEXT: str x8, [x9] +; CHECK-GI-NEXT: add x8, x29, #0 +; CHECK-GI-NEXT: str x8, [x9, #8] +; CHECK-GI-NEXT: add x8, sp, #144 +; CHECK-GI-NEXT: str x8, [x9, #16] +; CHECK-GI-NEXT: mov w8, #-64 // =0xffffffc0 +; CHECK-GI-NEXT: str w8, [x9, #24] +; CHECK-GI-NEXT: mov w8, #-128 // =0xffffff80 +; CHECK-GI-NEXT: str w8, [x9, #28] +; CHECK-GI-NEXT: mov w8, #1 // =0x1 +; CHECK-GI-NEXT: mov w0, w8 +; CHECK-GI-NEXT: .cfi_def_cfa wsp, 224 +; CHECK-GI-NEXT: ldp x29, x30, [sp, #208] // 16-byte Folded Reload +; CHECK-GI-NEXT: add sp, sp, #224 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 0 +; CHECK-GI-NEXT: .cfi_restore w30 +; CHECK-GI-NEXT: .cfi_restore w29 +; CHECK-GI-NEXT: ret +entry: + %g = alloca ptr, align 4 + call void @llvm.va_start(ptr %g) + ret i64 1 +} + +define i64 
@vararg_many_gpr(i64 %a1, i64 %a2, i64 %a3, i64 %a4, i64 %a5, i64 %a6, i64 %a7, ...) #0 { +; CHECK-SD-LABEL: vararg_many_gpr: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub sp, sp, #160 +; CHECK-SD-NEXT: .cfi_def_cfa_offset 160 +; CHECK-SD-NEXT: stp x29, x30, [sp, #144] // 16-byte Folded Spill +; CHECK-SD-NEXT: add x29, sp, #144 +; CHECK-SD-NEXT: .cfi_def_cfa w29, 16 +; CHECK-SD-NEXT: .cfi_offset w30, -8 +; CHECK-SD-NEXT: .cfi_offset w29, -16 +; CHECK-SD-NEXT: str q7, [sp, #112] +; CHECK-SD-NEXT: str q6, [sp, #96] +; CHECK-SD-NEXT: str q5, [sp, #80] +; CHECK-SD-NEXT: str q4, [sp, #64] +; CHECK-SD-NEXT: str q3, [sp, #48] +; CHECK-SD-NEXT: str q2, [sp, #32] +; CHECK-SD-NEXT: str q1, [sp, #16] +; CHECK-SD-NEXT: str q0, [sp] +; CHECK-SD-NEXT: stur x7, [x29, #-16] +; CHECK-SD-NEXT: mov w8, #-128 // =0xffffff80 +; CHECK-SD-NEXT: str w8, [x29, #20] +; CHECK-SD-NEXT: mov w8, #-8 // =0xfffffff8 +; CHECK-SD-NEXT: str w8, [x29, #16] +; CHECK-SD-NEXT: add x8, x29, #16 +; CHECK-SD-NEXT: stur x8, [x29, #-8] +; CHECK-SD-NEXT: mov x8, sp +; CHECK-SD-NEXT: add x8, x8, #128 +; CHECK-SD-NEXT: str x8, [x29, #8] +; CHECK-SD-NEXT: sub x8, x29, #16 +; CHECK-SD-NEXT: add x8, x8, #8 +; CHECK-SD-NEXT: str x8, [x29] +; CHECK-SD-NEXT: mov w8, #1 // =0x1 +; CHECK-SD-NEXT: mov w0, w8 +; CHECK-SD-NEXT: .cfi_def_cfa wsp, 160 +; CHECK-SD-NEXT: ldp x29, x30, [sp, #144] // 16-byte Folded Reload +; CHECK-SD-NEXT: add sp, sp, #160 +; CHECK-SD-NEXT: .cfi_def_cfa_offset 0 +; CHECK-SD-NEXT: .cfi_restore w30 +; CHECK-SD-NEXT: .cfi_restore w29 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: vararg_many_gpr: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub sp, sp, #176 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 176 +; CHECK-GI-NEXT: stp x29, x30, [sp, #160] // 16-byte Folded Spill +; CHECK-GI-NEXT: add x29, sp, #160 +; CHECK-GI-NEXT: .cfi_def_cfa w29, 16 +; CHECK-GI-NEXT: .cfi_offset w30, -8 +; CHECK-GI-NEXT: .cfi_offset w29, -16 +; CHECK-GI-NEXT: stur x7, [x29, #-8] +; CHECK-GI-NEXT: str q0, [sp, #16] +; CHECK-GI-NEXT: str q1, [sp, #32] +; CHECK-GI-NEXT: str q2, [sp, #48] +; CHECK-GI-NEXT: str q3, [sp, #64] +; CHECK-GI-NEXT: str q4, [sp, #80] +; CHECK-GI-NEXT: str q5, [sp, #96] +; CHECK-GI-NEXT: str q6, [sp, #112] +; CHECK-GI-NEXT: str q7, [sp, #128] +; CHECK-GI-NEXT: add x9, sp, #8 +; CHECK-GI-NEXT: add x8, x29, #16 +; CHECK-GI-NEXT: str x8, [x9] +; CHECK-GI-NEXT: add x8, x29, #0 +; CHECK-GI-NEXT: str x8, [x9, #8] +; CHECK-GI-NEXT: add x8, sp, #144 +; CHECK-GI-NEXT: str x8, [x9, #16] +; CHECK-GI-NEXT: mov w8, #-8 // =0xfffffff8 +; CHECK-GI-NEXT: str w8, [x9, #24] +; CHECK-GI-NEXT: mov w8, #-128 // =0xffffff80 +; CHECK-GI-NEXT: str w8, [x9, #28] +; CHECK-GI-NEXT: mov w8, #1 // =0x1 +; CHECK-GI-NEXT: mov w0, w8 +; CHECK-GI-NEXT: .cfi_def_cfa wsp, 176 +; CHECK-GI-NEXT: ldp x29, x30, [sp, #160] // 16-byte Folded Reload +; CHECK-GI-NEXT: add sp, sp, #176 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 0 +; CHECK-GI-NEXT: .cfi_restore w30 +; CHECK-GI-NEXT: .cfi_restore w29 +; CHECK-GI-NEXT: ret +entry: + %g = alloca ptr, align 4 + call void @llvm.va_start(ptr %g) + ret i64 1 +} + +define i64 @vararg_many_float(float %a1, float %a2, float %a3, float %a4, float %a5, float %a6, float %a7, ...) 
#0 { +; CHECK-SD-LABEL: vararg_many_float: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub sp, sp, #112 +; CHECK-SD-NEXT: .cfi_def_cfa_offset 112 +; CHECK-SD-NEXT: stp x29, x30, [sp, #96] // 16-byte Folded Spill +; CHECK-SD-NEXT: add x29, sp, #96 +; CHECK-SD-NEXT: .cfi_def_cfa w29, 16 +; CHECK-SD-NEXT: .cfi_offset w30, -8 +; CHECK-SD-NEXT: .cfi_offset w29, -16 +; CHECK-SD-NEXT: str q7, [sp] +; CHECK-SD-NEXT: str x7, [sp, #80] +; CHECK-SD-NEXT: str x6, [sp, #72] +; CHECK-SD-NEXT: str x5, [sp, #64] +; CHECK-SD-NEXT: str x4, [sp, #56] +; CHECK-SD-NEXT: str x3, [sp, #48] +; CHECK-SD-NEXT: str x2, [sp, #40] +; CHECK-SD-NEXT: str x1, [sp, #32] +; CHECK-SD-NEXT: str x0, [sp, #24] +; CHECK-SD-NEXT: mov w8, #-16 // =0xfffffff0 +; CHECK-SD-NEXT: str w8, [x29, #20] +; CHECK-SD-NEXT: mov w8, #-64 // =0xffffffc0 +; CHECK-SD-NEXT: str w8, [x29, #16] +; CHECK-SD-NEXT: add x8, x29, #16 +; CHECK-SD-NEXT: stur x8, [x29, #-8] +; CHECK-SD-NEXT: mov x8, sp +; CHECK-SD-NEXT: add x8, x8, #16 +; CHECK-SD-NEXT: str x8, [x29, #8] +; CHECK-SD-NEXT: add x8, sp, #24 +; CHECK-SD-NEXT: add x8, x8, #64 +; CHECK-SD-NEXT: str x8, [x29] +; CHECK-SD-NEXT: mov w8, #1 // =0x1 +; CHECK-SD-NEXT: mov w0, w8 +; CHECK-SD-NEXT: .cfi_def_cfa wsp, 112 +; CHECK-SD-NEXT: ldp x29, x30, [sp, #96] // 16-byte Folded Reload +; CHECK-SD-NEXT: add sp, sp, #112 +; CHECK-SD-NEXT: .cfi_def_cfa_offset 0 +; CHECK-SD-NEXT: .cfi_restore w30 +; CHECK-SD-NEXT: .cfi_restore w29 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: vararg_many_float: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub sp, sp, #112 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 112 +; CHECK-GI-NEXT: stp x29, x30, [sp, #96] // 16-byte Folded Spill +; CHECK-GI-NEXT: add x29, sp, #96 +; CHECK-GI-NEXT: .cfi_def_cfa w29, 16 +; CHECK-GI-NEXT: .cfi_offset w30, -8 +; CHECK-GI-NEXT: .cfi_offset w29, -16 +; CHECK-GI-NEXT: str x0, [sp, #32] +; CHECK-GI-NEXT: str x1, [sp, #40] +; CHECK-GI-NEXT: str x2, [sp, #48] +; CHECK-GI-NEXT: str x3, [sp, #56] +; CHECK-GI-NEXT: str x4, [sp, #64] +; CHECK-GI-NEXT: str x5, [sp, #72] +; CHECK-GI-NEXT: str x6, [sp, #80] +; CHECK-GI-NEXT: str x7, [sp, #88] +; CHECK-GI-NEXT: str q7, [sp, #16] +; CHECK-GI-NEXT: add x9, sp, #8 +; CHECK-GI-NEXT: add x8, x29, #16 +; CHECK-GI-NEXT: str x8, [x9] +; CHECK-GI-NEXT: add x8, sp, #96 +; CHECK-GI-NEXT: str x8, [x9, #8] +; CHECK-GI-NEXT: add x8, sp, #32 +; CHECK-GI-NEXT: str x8, [x9, #16] +; CHECK-GI-NEXT: mov w8, #-64 // =0xffffffc0 +; CHECK-GI-NEXT: str w8, [x9, #24] +; CHECK-GI-NEXT: mov w8, #-16 // =0xfffffff0 +; CHECK-GI-NEXT: str w8, [x9, #28] +; CHECK-GI-NEXT: mov w8, #1 // =0x1 +; CHECK-GI-NEXT: mov w0, w8 +; CHECK-GI-NEXT: .cfi_def_cfa wsp, 112 +; CHECK-GI-NEXT: ldp x29, x30, [sp, #96] // 16-byte Folded Reload +; CHECK-GI-NEXT: add sp, sp, #112 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 0 +; CHECK-GI-NEXT: .cfi_restore w30 +; CHECK-GI-NEXT: .cfi_restore w29 +; CHECK-GI-NEXT: ret +entry: + %g = alloca ptr, align 4 + call void @llvm.va_start(ptr %g) + ret i64 1 +} + +define i64 @gpr1_fpr1(i32 %i, float %f, ...) 
#0 { +; CHECK-SD-LABEL: gpr1_fpr1: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub sp, sp, #192 +; CHECK-SD-NEXT: .cfi_def_cfa_offset 192 +; CHECK-SD-NEXT: stp x29, x30, [sp, #176] // 16-byte Folded Spill +; CHECK-SD-NEXT: add x29, sp, #176 +; CHECK-SD-NEXT: .cfi_def_cfa w29, 16 +; CHECK-SD-NEXT: .cfi_offset w30, -8 +; CHECK-SD-NEXT: .cfi_offset w29, -16 +; CHECK-SD-NEXT: str q7, [sp, #96] +; CHECK-SD-NEXT: str q6, [sp, #80] +; CHECK-SD-NEXT: str q5, [sp, #64] +; CHECK-SD-NEXT: str q4, [sp, #48] +; CHECK-SD-NEXT: str q3, [sp, #32] +; CHECK-SD-NEXT: str q2, [sp, #16] +; CHECK-SD-NEXT: str q1, [sp] +; CHECK-SD-NEXT: stur x7, [x29, #-16] +; CHECK-SD-NEXT: stur x6, [x29, #-24] +; CHECK-SD-NEXT: stur x5, [x29, #-32] +; CHECK-SD-NEXT: stur x4, [x29, #-40] +; CHECK-SD-NEXT: stur x3, [x29, #-48] +; CHECK-SD-NEXT: stur x2, [x29, #-56] +; CHECK-SD-NEXT: stur x1, [x29, #-64] +; CHECK-SD-NEXT: mov w8, #-112 // =0xffffff90 +; CHECK-SD-NEXT: str w8, [x29, #20] +; CHECK-SD-NEXT: mov w8, #-56 // =0xffffffc8 +; CHECK-SD-NEXT: str w8, [x29, #16] +; CHECK-SD-NEXT: add x8, x29, #16 +; CHECK-SD-NEXT: stur x8, [x29, #-8] +; CHECK-SD-NEXT: mov x8, sp +; CHECK-SD-NEXT: add x8, x8, #112 +; CHECK-SD-NEXT: str x8, [x29, #8] +; CHECK-SD-NEXT: sub x8, x29, #64 +; CHECK-SD-NEXT: add x8, x8, #56 +; CHECK-SD-NEXT: str x8, [x29] +; CHECK-SD-NEXT: mov w8, #1 // =0x1 +; CHECK-SD-NEXT: mov w0, w8 +; CHECK-SD-NEXT: .cfi_def_cfa wsp, 192 +; CHECK-SD-NEXT: ldp x29, x30, [sp, #176] // 16-byte Folded Reload +; CHECK-SD-NEXT: add sp, sp, #192 +; CHECK-SD-NEXT: .cfi_def_cfa_offset 0 +; CHECK-SD-NEXT: .cfi_restore w30 +; CHECK-SD-NEXT: .cfi_restore w29 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: gpr1_fpr1: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub sp, sp, #208 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 208 +; CHECK-GI-NEXT: stp x29, x30, [sp, #192] // 16-byte Folded Spill +; CHECK-GI-NEXT: add x29, sp, #192 +; CHECK-GI-NEXT: .cfi_def_cfa w29, 16 +; CHECK-GI-NEXT: .cfi_offset w30, -8 +; CHECK-GI-NEXT: .cfi_offset w29, -16 +; CHECK-GI-NEXT: stur x1, [x29, #-56] +; CHECK-GI-NEXT: stur x2, [x29, #-48] +; CHECK-GI-NEXT: stur x3, [x29, #-40] +; CHECK-GI-NEXT: stur x4, [x29, #-32] +; CHECK-GI-NEXT: stur x5, [x29, #-24] +; CHECK-GI-NEXT: stur x6, [x29, #-16] +; CHECK-GI-NEXT: stur x7, [x29, #-8] +; CHECK-GI-NEXT: str q1, [sp, #16] +; CHECK-GI-NEXT: str q2, [sp, #32] +; CHECK-GI-NEXT: str q3, [sp, #48] +; CHECK-GI-NEXT: str q4, [sp, #64] +; CHECK-GI-NEXT: str q5, [sp, #80] +; CHECK-GI-NEXT: str q6, [sp, #96] +; CHECK-GI-NEXT: str q7, [sp, #112] +; CHECK-GI-NEXT: add x9, sp, #8 +; CHECK-GI-NEXT: add x8, x29, #16 +; CHECK-GI-NEXT: str x8, [x9] +; CHECK-GI-NEXT: add x8, x29, #0 +; CHECK-GI-NEXT: str x8, [x9, #8] +; CHECK-GI-NEXT: add x8, sp, #128 +; CHECK-GI-NEXT: str x8, [x9, #16] +; CHECK-GI-NEXT: mov w8, #-56 // =0xffffffc8 +; CHECK-GI-NEXT: str w8, [x9, #24] +; CHECK-GI-NEXT: mov w8, #-112 // =0xffffff90 +; CHECK-GI-NEXT: str w8, [x9, #28] +; CHECK-GI-NEXT: mov w8, #1 // =0x1 +; CHECK-GI-NEXT: mov w0, w8 +; CHECK-GI-NEXT: .cfi_def_cfa wsp, 208 +; CHECK-GI-NEXT: ldp x29, x30, [sp, #192] // 16-byte Folded Reload +; CHECK-GI-NEXT: add sp, sp, #208 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 0 +; CHECK-GI-NEXT: .cfi_restore w30 +; CHECK-GI-NEXT: .cfi_restore w29 +; CHECK-GI-NEXT: ret +entry: + %g = alloca ptr, align 4 + call void @llvm.va_start(ptr %g) + ret i64 1 +} + +; To make the outputs more readable +attributes #0 = { uwtable "frame-pointer"="all" } + +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; CHECK: {{.*}} From 758444ca3e7163a1504eeced3383af861d01d761 Mon Sep 17 00:00:00 2001 From: Pierre van Houtryve Date: Thu, 19 Sep 2024 09:00:21 +0200 Subject: [PATCH 161/321] [AMDGPU] Promote uniform ops to I32 in DAGISel (#106383) Promote uniform binops, selects and setcc between 2 and 16 bits to 32 bits in DAGISel Solves #64591 --- llvm/include/llvm/CodeGen/TargetLowering.h | 2 +- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 19 +- .../CodeGen/SelectionDAG/TargetLowering.cpp | 10 +- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 35 +- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 2 +- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 127 +- llvm/lib/Target/AMDGPU/SIISelLowering.h | 2 +- llvm/lib/Target/X86/X86ISelLowering.cpp | 3 +- llvm/lib/Target/X86/X86ISelLowering.h | 2 +- .../AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll | 854 ++- ...amdgpu-codegenprepare-fold-binop-select.ll | 7 +- .../amdgpu-codegenprepare-i16-to-i32.ll | 4 +- .../amdgpu-simplify-libcall-pow-codegen.ll | 652 +- .../CodeGen/AMDGPU/amdgpu.private-memory.ll | 2 +- .../branch-folding-implicit-def-subreg.ll | 4 +- .../AMDGPU/bug-sdag-emitcopyfromreg.ll | 64 +- .../CodeGen/AMDGPU/calling-conventions.ll | 1658 ++-- .../CodeGen/AMDGPU/cgp-bitfield-extract.ll | 4 +- llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll | 2 +- llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll | 19 +- llvm/test/CodeGen/AMDGPU/cttz.ll | 5 +- llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll | 28 +- llvm/test/CodeGen/AMDGPU/dagcombine-select.ll | 5 +- .../CodeGen/AMDGPU/extract_vector_dynelt.ll | 452 +- .../CodeGen/AMDGPU/extract_vector_elt-i8.ll | 20 +- llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll | 101 +- llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll | 532 -- .../AMDGPU/gfx-callable-argument-types.ll | 33 +- llvm/test/CodeGen/AMDGPU/idiv-licm.ll | 450 +- llvm/test/CodeGen/AMDGPU/imm16.ll | 18 +- .../CodeGen/AMDGPU/insert-delay-alu-bug.ll | 106 +- .../CodeGen/AMDGPU/insert_vector_dynelt.ll | 1903 +++-- llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll | 160 +- .../CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll | 40 +- llvm/test/CodeGen/AMDGPU/load-constant-i1.ll | 6735 ++++++++--------- llvm/test/CodeGen/AMDGPU/load-constant-i8.ll | 4942 ++++++------ llvm/test/CodeGen/AMDGPU/load-global-i8.ll | 15 +- llvm/test/CodeGen/AMDGPU/load-local-i8.ll | 15 +- .../AMDGPU/lower-lds-struct-aa-memcpy.ll | 4 +- llvm/test/CodeGen/AMDGPU/min.ll | 150 +- llvm/test/CodeGen/AMDGPU/mul.ll | 51 +- llvm/test/CodeGen/AMDGPU/permute_i8.ll | 73 +- llvm/test/CodeGen/AMDGPU/preload-kernargs.ll | 227 +- llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll | 35 +- llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll | 180 +- llvm/test/CodeGen/AMDGPU/select-i1.ll | 13 +- llvm/test/CodeGen/AMDGPU/select-vectors.ll | 5 +- llvm/test/CodeGen/AMDGPU/setcc-opt.ll | 17 +- llvm/test/CodeGen/AMDGPU/sign_extend.ll | 19 +- llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll | 22 +- llvm/test/CodeGen/AMDGPU/srem.ll | 34 +- llvm/test/CodeGen/AMDGPU/trunc-combine.ll | 4 +- llvm/test/CodeGen/AMDGPU/trunc-store.ll | 136 +- llvm/test/CodeGen/AMDGPU/uaddo.ll | 12 +- llvm/test/CodeGen/AMDGPU/usubo.ll | 12 +- .../CodeGen/AMDGPU/vector-alloca-bitcast.ll | 3 +- .../AMDGPU/vgpr-spill-placement-issue61083.ll | 6 +- llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll | 44 +- 58 files changed, 9357 insertions(+), 10722 deletions(-) diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 802510dd0e4fa0..5888eaa6fbdb52 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ 
b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -3306,7 +3306,7 @@ class TargetLoweringBase { /// Return true if it's profitable to narrow operations of type SrcVT to /// DestVT. e.g. on x86, it's profitable to narrow from i32 to i8 but not from /// i32 to i16. - virtual bool isNarrowingProfitable(EVT SrcVT, EVT DestVT) const { + virtual bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const { return false; } diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 37fcd09d4f5626..b36a1245f83962 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -7132,7 +7132,7 @@ SDValue DAGCombiner::visitAND(SDNode *N) { if (N1C->getAPIntValue().countLeadingZeros() >= (BitWidth - SrcBitWidth) && TLI.isTruncateFree(VT, SrcVT) && TLI.isZExtFree(SrcVT, VT) && TLI.isTypeDesirableForOp(ISD::AND, SrcVT) && - TLI.isNarrowingProfitable(VT, SrcVT)) + TLI.isNarrowingProfitable(N, VT, SrcVT)) return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, DAG.getNode(ISD::AND, DL, SrcVT, N0Op0, DAG.getZExtOrTrunc(N1, DL, SrcVT))); @@ -14704,7 +14704,7 @@ SDValue DAGCombiner::reduceLoadWidth(SDNode *N) { // ShLeftAmt will indicate how much a narrowed load should be shifted left. unsigned ShLeftAmt = 0; if (ShAmt == 0 && N0.getOpcode() == ISD::SHL && N0.hasOneUse() && - ExtVT == VT && TLI.isNarrowingProfitable(N0.getValueType(), VT)) { + ExtVT == VT && TLI.isNarrowingProfitable(N, N0.getValueType(), VT)) { if (ConstantSDNode *N01 = dyn_cast<ConstantSDNode>(N0.getOperand(1))) { ShLeftAmt = N01->getZExtValue(); N0 = N0.getOperand(0); @@ -15264,9 +15264,11 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) { } // trunc (select c, a, b) -> select c, (trunc a), (trunc b) - if (N0.getOpcode() == ISD::SELECT && N0.hasOneUse()) { - if ((!LegalOperations || TLI.isOperationLegal(ISD::SELECT, SrcVT)) && - TLI.isTruncateFree(SrcVT, VT)) { + if (N0.getOpcode() == ISD::SELECT && N0.hasOneUse() && + TLI.isTruncateFree(SrcVT, VT)) { + if (!LegalOperations || + (TLI.isOperationLegal(ISD::SELECT, SrcVT) && + TLI.isNarrowingProfitable(N0.getNode(), SrcVT, VT))) { SDLoc SL(N0); SDValue Cond = N0.getOperand(0); SDValue TruncOp0 = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(1)); @@ -20207,10 +20209,9 @@ SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) { EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), NewBW); // The narrowing should be profitable, the load/store operation should be // legal (or custom) and the store size should be equal to the NewVT width.
- while (NewBW < BitWidth && - (NewVT.getStoreSizeInBits() != NewBW || - !TLI.isOperationLegalOrCustom(Opc, NewVT) || - !TLI.isNarrowingProfitable(VT, NewVT))) { + while (NewBW < BitWidth && (NewVT.getStoreSizeInBits() != NewBW || + !TLI.isOperationLegalOrCustom(Opc, NewVT) || + !TLI.isNarrowingProfitable(N, VT, NewVT))) { NewBW = NextPowerOf2(NewBW); NewVT = EVT::getIntegerVT(*DAG.getContext(), NewBW); } diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 95937886280685..a293c2391c3283 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -1841,7 +1841,7 @@ bool TargetLowering::SimplifyDemandedBits( for (unsigned SmallVTBits = llvm::bit_ceil(DemandedSize); SmallVTBits < BitWidth; SmallVTBits = NextPowerOf2(SmallVTBits)) { EVT SmallVT = EVT::getIntegerVT(*TLO.DAG.getContext(), SmallVTBits); - if (isNarrowingProfitable(VT, SmallVT) && + if (isNarrowingProfitable(Op.getNode(), VT, SmallVT) && isTypeDesirableForOp(ISD::SHL, SmallVT) && isTruncateFree(VT, SmallVT) && isZExtFree(SmallVT, VT) && (!TLO.LegalOperations() || isOperationLegal(ISD::SHL, SmallVT))) { @@ -1865,7 +1865,7 @@ bool TargetLowering::SimplifyDemandedBits( if ((BitWidth % 2) == 0 && !VT.isVector() && ShAmt < HalfWidth && DemandedBits.countLeadingOnes() >= HalfWidth) { EVT HalfVT = EVT::getIntegerVT(*TLO.DAG.getContext(), HalfWidth); - if (isNarrowingProfitable(VT, HalfVT) && + if (isNarrowingProfitable(Op.getNode(), VT, HalfVT) && isTypeDesirableForOp(ISD::SHL, HalfVT) && isTruncateFree(VT, HalfVT) && isZExtFree(HalfVT, VT) && (!TLO.LegalOperations() || isOperationLegal(ISD::SHL, HalfVT))) { @@ -1984,7 +1984,7 @@ bool TargetLowering::SimplifyDemandedBits( if ((BitWidth % 2) == 0 && !VT.isVector()) { APInt HiBits = APInt::getHighBitsSet(BitWidth, BitWidth / 2); EVT HalfVT = EVT::getIntegerVT(*TLO.DAG.getContext(), BitWidth / 2); - if (isNarrowingProfitable(VT, HalfVT) && + if (isNarrowingProfitable(Op.getNode(), VT, HalfVT) && isTypeDesirableForOp(ISD::SRL, HalfVT) && isTruncateFree(VT, HalfVT) && isZExtFree(HalfVT, VT) && (!TLO.LegalOperations() || isOperationLegal(ISD::SRL, HalfVT)) && @@ -4762,9 +4762,11 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, case ISD::SETULT: case ISD::SETULE: { EVT newVT = N0.getOperand(0).getValueType(); + // FIXME: Should use isNarrowingProfitable. 
if (DCI.isBeforeLegalizeOps() || (isOperationLegal(ISD::SETCC, newVT) && - isCondCodeLegal(Cond, newVT.getSimpleVT()))) { + isCondCodeLegal(Cond, newVT.getSimpleVT()) && + isTypeDesirableForOp(ISD::SETCC, newVT))) { EVT NewSetCCVT = getSetCCResultType(Layout, *DAG.getContext(), newVT); SDValue NewConst = DAG.getConstant(C1.trunc(InSize), dl, newVT); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 81852f6a130584..fad51ce8285e01 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -1022,14 +1022,45 @@ bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const { return Src == MVT::i32 && Dest == MVT::i64; } -bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const { +bool AMDGPUTargetLowering::isNarrowingProfitable(SDNode *N, EVT SrcVT, + EVT DestVT) const { + switch (N->getOpcode()) { + case ISD::ADD: + case ISD::SUB: + case ISD::SHL: + case ISD::SRL: + case ISD::SRA: + case ISD::AND: + case ISD::OR: + case ISD::XOR: + case ISD::MUL: + case ISD::SETCC: + case ISD::SELECT: + if (Subtarget->has16BitInsts() && + (DestVT.isVector() ? !Subtarget->hasVOP3PInsts() : true)) { + // Don't narrow back down to i16 if promoted to i32 already. + if (!N->isDivergent() && DestVT.isInteger() && + DestVT.getScalarSizeInBits() > 1 && + DestVT.getScalarSizeInBits() <= 16 && + SrcVT.getScalarSizeInBits() > 16) { + return false; + } + } + return true; + default: + break; + } + // There aren't really 64-bit registers, but pairs of 32-bit ones and only a // limited number of native 64-bit operations. Shrinking an operation to fit // in a single 32-bit register should always be helpful. As currently used, // this is much less general than the name suggests, and is only used in // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is // not profitable, and may actually be harmful. 
- return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32; + if (isa<LoadSDNode>(N)) + return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32; + + return true; } bool AMDGPUTargetLowering::isDesirableToCommuteWithShift( diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index 18b5c388f32932..5c2abd334276c1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -201,7 +201,7 @@ class AMDGPUTargetLowering : public TargetLowering { NegatibleCost &Cost, unsigned Depth) const override; - bool isNarrowingProfitable(EVT SrcVT, EVT DestVT) const override; + bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const override; bool isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const override;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index b197f38d054fc0..a9754ba357893f 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -894,6 +894,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, ISD::UADDO_CARRY, ISD::SUB, ISD::USUBO_CARRY, + ISD::MUL, ISD::FADD, ISD::FSUB, ISD::FDIV, @@ -909,9 +910,17 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, ISD::UMIN, ISD::UMAX, ISD::SETCC, + ISD::SELECT, + ISD::SMIN, + ISD::SMAX, + ISD::UMIN, + ISD::UMAX, ISD::AND, ISD::OR, ISD::XOR, + ISD::SHL, + ISD::SRL, + ISD::SRA, ISD::FSHR, ISD::SINT_TO_FP, ISD::UINT_TO_FP, @@ -1942,13 +1951,6 @@ bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const { switch (Op) { case ISD::LOAD: case ISD::STORE: - - // These operations are done with 32-bit instructions anyway. - case ISD::AND: - case ISD::OR: - case ISD::XOR: - case ISD::SELECT: - // TODO: Extensions? return true; default: return false; @@ -6731,6 +6733,93 @@ SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(0), TruncExp); } +static unsigned getExtOpcodeForPromotedOp(SDValue Op) { + switch (Op->getOpcode()) { + case ISD::SRA: + case ISD::SMIN: + case ISD::SMAX: + return ISD::SIGN_EXTEND; + case ISD::SRL: + case ISD::UMIN: + case ISD::UMAX: + return ISD::ZERO_EXTEND; + case ISD::ADD: + case ISD::SUB: + case ISD::AND: + case ISD::OR: + case ISD::XOR: + case ISD::SHL: + case ISD::SELECT: + case ISD::MUL: + // operation result won't be influenced by garbage high bits. + // TODO: are all of those cases correct, and are there more? + return ISD::ANY_EXTEND; + case ISD::SETCC: { + ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); + return ISD::isSignedIntSetCC(CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; + } + default: + llvm_unreachable("unexpected opcode!"); + } +} + +SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op, + DAGCombinerInfo &DCI) const { + const unsigned Opc = Op.getOpcode(); + assert(Opc == ISD::ADD || Opc == ISD::SUB || Opc == ISD::SHL || + Opc == ISD::SRL || Opc == ISD::SRA || Opc == ISD::AND || + Opc == ISD::OR || Opc == ISD::XOR || Opc == ISD::MUL || + Opc == ISD::SETCC || Opc == ISD::SELECT || Opc == ISD::SMIN || + Opc == ISD::SMAX || Opc == ISD::UMIN || Opc == ISD::UMAX); + + EVT OpTy = (Opc != ISD::SETCC) ? Op.getValueType() : Op->getOperand(0).getValueType(); + auto ExtTy = OpTy.changeElementType(MVT::i32); + + if (DCI.isBeforeLegalizeOps() || + isNarrowingProfitable(Op.getNode(), ExtTy, OpTy)) + return SDValue(); + + auto &DAG = DCI.DAG; + + SDLoc DL(Op); + SDValue LHS; + SDValue RHS; + if (Opc == ISD::SELECT) { + LHS = Op->getOperand(1); + RHS = Op->getOperand(2); + } else { + LHS = Op->getOperand(0); + RHS = Op->getOperand(1); + } + + const unsigned ExtOp = getExtOpcodeForPromotedOp(Op); + LHS = DAG.getNode(ExtOp, DL, ExtTy, {LHS}); + + // Special case: for shifts, the RHS always needs a zext. + if (Op.getOpcode() == ISD::SRA || Op.getOpcode() == ISD::SRL || + Op.getOpcode() == ISD::SRA) + RHS = DAG.getNode(ISD::ZERO_EXTEND, DL, ExtTy, {RHS}); + else + RHS = DAG.getNode(ExtOp, DL, ExtTy, {RHS}); + + // setcc always return i1/i1 vec so no need to truncate after. + if (Opc == ISD::SETCC) { + ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); + return DAG.getSetCC(DL, Op.getValueType(), LHS, RHS, CC); + } + + // For other ops, we extend the operation's return type as well so we need to + // truncate back to the original type. + SDValue NewVal; + if (Opc == ISD::SELECT) + NewVal = DAG.getNode(ISD::SELECT, DL, ExtTy, {Op->getOperand(0), LHS, RHS}); + else + NewVal = DAG.getNode(Opc, DL, ExtTy, {LHS, RHS}); + + return DAG.getZExtOrTrunc(NewVal, DL, OpTy); +} + // Custom lowering for vector multiplications and s_mul_u64. SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); @@ -14623,8 +14712,32 @@ SDValue SITargetLowering::performClampCombine(SDNode *N, SDValue SITargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { + switch (N->getOpcode()) { + case ISD::ADD: + case ISD::SUB: + case ISD::SHL: + case ISD::SRL: + case ISD::SRA: + case ISD::AND: + case ISD::OR: + case ISD::XOR: + case ISD::MUL: + case ISD::SETCC: + case ISD::SELECT: + case ISD::SMIN: + case ISD::SMAX: + case ISD::UMIN: + case ISD::UMAX: + if (auto Res = promoteUniformOpToI32(SDValue(N, 0), DCI)) + return Res; + break; + default: + break; + } + if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None) return SDValue(); + switch (N->getOpcode()) { case ISD::ADD: return performAddCombine(N, DCI);
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index bcdb4204c6b629..6c3edf37945e24 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -147,6 +147,7 @@ class SITargetLowering final : public AMDGPUTargetLowering { SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFMINNUM_FMAXNUM(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const; + SDValue promoteUniformOpToI32(SDValue Op, DAGCombinerInfo &DCI) const; SDValue lowerMUL(SDValue Op, SelectionDAG &DAG) const; SDValue lowerXMULO(SDValue Op, SelectionDAG &DAG) const; SDValue lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const; @@ -462,7 +463,6 @@ class SITargetLowering final : public AMDGPUTargetLowering { SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const; SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const; SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; - void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const override; diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 68563f556ecfb4..c2bce6f01ef8f4 100644 ---
a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -34834,7 +34834,8 @@ bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, return false; } -bool X86TargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const { +bool X86TargetLowering::isNarrowingProfitable(SDNode *N, EVT SrcVT, + EVT DestVT) const { // i16 instructions are longer (0x66 prefix) and potentially slower. return !(SrcVT == MVT::i32 && DestVT == MVT::i16); } diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index ae7da8efb5f91a..0ab42f032c3ea6 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -1451,7 +1451,7 @@ namespace llvm { /// Return true if it's profitable to narrow operations of type SrcVT to /// DestVT. e.g. on x86, it's profitable to narrow from i32 to i8 but not /// from i32 to i16. - bool isNarrowingProfitable(EVT SrcVT, EVT DestVT) const override; + bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const override; bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, EVT VT) const override; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll index 4caf83774bbba2..53f6c9543c3e3f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll @@ -291,16 +291,16 @@ define amdgpu_ps double @s_div_fmas_f64(double inreg %a, double inreg %b, double define amdgpu_kernel void @test_div_fmas_f32(ptr addrspace(1) %out, [8 x i32], float %a, [8 x i32], float %b, [8 x i32], float %c, [8 x i32], i1 %d) { ; GFX7-LABEL: test_div_fmas_f32: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dword s2, s[0:1], 0xa -; GFX7-NEXT: s_load_dword s3, s[0:1], 0x13 -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x1c -; GFX7-NEXT: s_load_dword s5, s[0:1], 0x25 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX7-NEXT: s_load_dword s4, s[2:3], 0xa +; GFX7-NEXT: s_load_dword s5, s[2:3], 0x13 +; GFX7-NEXT: s_load_dword s6, s[2:3], 0x1c +; GFX7-NEXT: s_load_dword s7, s[2:3], 0x25 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_and_b32 s2, 1, s5 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: s_and_b32 s2, 1, s7 ; GFX7-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 @@ -311,19 +311,20 @@ define amdgpu_kernel void @test_div_fmas_f32(ptr addrspace(1) %out, [8 x i32], f ; ; GFX8-LABEL: test_div_fmas_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x28 -; GFX8-NEXT: s_load_dword s3, s[0:1], 0x4c -; GFX8-NEXT: s_load_dword s4, s[0:1], 0x70 -; GFX8-NEXT: s_load_dword s5, s[0:1], 0x94 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[2:3], 0x28 +; GFX8-NEXT: s_load_dword s1, s[2:3], 0x4c +; GFX8-NEXT: s_load_dword s4, s[2:3], 0x70 +; GFX8-NEXT: s_load_dword s5, s[2:3], 0x94 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: s_and_b32 s2, 1, s5 -; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 -; GFX8-NEXT: s_nop 3 +; GFX8-NEXT: s_and_b32 s0, 1, s5 +; GFX8-NEXT: 
v_cmp_ne_u32_e64 vcc, 0, s0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX8-NEXT: s_nop 2 ; GFX8-NEXT: v_div_fmas_f32 v2, v0, v1, v2 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -332,52 +333,52 @@ define amdgpu_kernel void @test_div_fmas_f32(ptr addrspace(1) %out, [8 x i32], f ; GFX10_W32-LABEL: test_div_fmas_f32: ; GFX10_W32: ; %bb.0: ; GFX10_W32-NEXT: s_clause 0x4 -; GFX10_W32-NEXT: s_load_dword s4, s[0:1], 0x94 -; GFX10_W32-NEXT: s_load_dword s5, s[0:1], 0x4c -; GFX10_W32-NEXT: s_load_dword s6, s[0:1], 0x70 -; GFX10_W32-NEXT: s_load_dword s7, s[0:1], 0x28 -; GFX10_W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX10_W32-NEXT: s_load_dword s4, s[2:3], 0x94 +; GFX10_W32-NEXT: s_load_dword s5, s[2:3], 0x4c +; GFX10_W32-NEXT: s_load_dword s6, s[2:3], 0x70 +; GFX10_W32-NEXT: s_load_dword s7, s[2:3], 0x28 +; GFX10_W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W32-NEXT: s_and_b32 s0, 1, s4 +; GFX10_W32-NEXT: s_and_b32 s2, 1, s4 ; GFX10_W32-NEXT: v_mov_b32_e32 v0, s5 -; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 +; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2 ; GFX10_W32-NEXT: v_mov_b32_e32 v1, s6 ; GFX10_W32-NEXT: v_div_fmas_f32 v0, s7, v0, v1 ; GFX10_W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10_W32-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10_W32-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10_W32-NEXT: s_endpgm ; ; GFX10_W64-LABEL: test_div_fmas_f32: ; GFX10_W64: ; %bb.0: ; GFX10_W64-NEXT: s_clause 0x4 -; GFX10_W64-NEXT: s_load_dword s4, s[0:1], 0x94 -; GFX10_W64-NEXT: s_load_dword s5, s[0:1], 0x4c -; GFX10_W64-NEXT: s_load_dword s6, s[0:1], 0x70 -; GFX10_W64-NEXT: s_load_dword s7, s[0:1], 0x28 -; GFX10_W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX10_W64-NEXT: s_load_dword s4, s[2:3], 0x94 +; GFX10_W64-NEXT: s_load_dword s5, s[2:3], 0x4c +; GFX10_W64-NEXT: s_load_dword s6, s[2:3], 0x70 +; GFX10_W64-NEXT: s_load_dword s7, s[2:3], 0x28 +; GFX10_W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W64-NEXT: s_and_b32 s0, 1, s4 +; GFX10_W64-NEXT: s_and_b32 s2, 1, s4 ; GFX10_W64-NEXT: v_mov_b32_e32 v0, s5 -; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 +; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 ; GFX10_W64-NEXT: v_mov_b32_e32 v1, s6 ; GFX10_W64-NEXT: v_div_fmas_f32 v0, s7, v0, v1 ; GFX10_W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX10_W64-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10_W64-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10_W64-NEXT: s_endpgm ; ; GFX11_W32-LABEL: test_div_fmas_f32: ; GFX11_W32: ; %bb.0: ; GFX11_W32-NEXT: s_clause 0x4 -; GFX11_W32-NEXT: s_load_b32 s2, s[0:1], 0x94 -; GFX11_W32-NEXT: s_load_b32 s3, s[0:1], 0x4c -; GFX11_W32-NEXT: s_load_b32 s4, s[0:1], 0x70 -; GFX11_W32-NEXT: s_load_b32 s5, s[0:1], 0x28 -; GFX11_W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11_W32-NEXT: s_load_b32 s4, s[2:3], 0x94 +; GFX11_W32-NEXT: s_load_b32 s5, s[2:3], 0x4c +; GFX11_W32-NEXT: s_load_b32 s6, s[2:3], 0x70 +; GFX11_W32-NEXT: s_load_b32 s7, s[2:3], 0x28 +; GFX11_W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11_W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11_W32-NEXT: s_and_b32 s2, 1, s2 -; GFX11_W32-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s4 +; GFX11_W32-NEXT: s_and_b32 s2, 1, s4 +; GFX11_W32-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 ; GFX11_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2 -; GFX11_W32-NEXT: v_div_fmas_f32 v0, s5, v0, v1 +; GFX11_W32-NEXT: v_div_fmas_f32 v0, s7, 
v0, v1 ; GFX11_W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX11_W32-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11_W32-NEXT: s_nop 0 @@ -387,17 +388,17 @@ define amdgpu_kernel void @test_div_fmas_f32(ptr addrspace(1) %out, [8 x i32], f ; GFX11_W64-LABEL: test_div_fmas_f32: ; GFX11_W64: ; %bb.0: ; GFX11_W64-NEXT: s_clause 0x4 -; GFX11_W64-NEXT: s_load_b32 s2, s[0:1], 0x94 -; GFX11_W64-NEXT: s_load_b32 s3, s[0:1], 0x4c -; GFX11_W64-NEXT: s_load_b32 s4, s[0:1], 0x70 -; GFX11_W64-NEXT: s_load_b32 s5, s[0:1], 0x28 -; GFX11_W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11_W64-NEXT: s_load_b32 s4, s[2:3], 0x94 +; GFX11_W64-NEXT: s_load_b32 s5, s[2:3], 0x4c +; GFX11_W64-NEXT: s_load_b32 s6, s[2:3], 0x70 +; GFX11_W64-NEXT: s_load_b32 s7, s[2:3], 0x28 +; GFX11_W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11_W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11_W64-NEXT: s_and_b32 s2, 1, s2 -; GFX11_W64-NEXT: v_mov_b32_e32 v0, s3 +; GFX11_W64-NEXT: s_and_b32 s2, 1, s4 +; GFX11_W64-NEXT: v_mov_b32_e32 v0, s5 ; GFX11_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 -; GFX11_W64-NEXT: v_mov_b32_e32 v1, s4 -; GFX11_W64-NEXT: v_div_fmas_f32 v0, s5, v0, v1 +; GFX11_W64-NEXT: v_mov_b32_e32 v1, s6 +; GFX11_W64-NEXT: v_div_fmas_f32 v0, s7, v0, v1 ; GFX11_W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX11_W64-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11_W64-NEXT: s_nop 0 @@ -411,35 +412,36 @@ define amdgpu_kernel void @test_div_fmas_f32(ptr addrspace(1) %out, [8 x i32], f define amdgpu_kernel void @test_div_fmas_f32_inline_imm_0(ptr addrspace(1) %out, [8 x i32], float %a, [8 x i32], float %b, [8 x i32], float %c, [8 x i32], i1 %d) { ; GFX7-LABEL: test_div_fmas_f32_inline_imm_0: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dword s2, s[0:1], 0x13 -; GFX7-NEXT: s_load_dword s3, s[0:1], 0x1c -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x25 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX7-NEXT: s_load_dword s4, s[2:3], 0x13 +; GFX7-NEXT: s_load_dword s5, s[2:3], 0x1c +; GFX7-NEXT: s_load_dword s6, s[2:3], 0x25 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: s_and_b32 s2, 1, s4 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_and_b32 s2, 1, s6 ; GFX7-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 ; GFX7-NEXT: s_mov_b32 s2, -1 -; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_nop 1 +; GFX7-NEXT: s_nop 2 ; GFX7-NEXT: v_div_fmas_f32 v0, 1.0, v0, v1 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: test_div_fmas_f32_inline_imm_0: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x4c -; GFX8-NEXT: s_load_dword s3, s[0:1], 0x70 -; GFX8-NEXT: s_load_dword s4, s[0:1], 0x94 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[2:3], 0x4c +; GFX8-NEXT: s_load_dword s1, s[2:3], 0x70 +; GFX8-NEXT: s_load_dword s4, s[2:3], 0x94 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: s_and_b32 s2, 1, s4 -; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 -; GFX8-NEXT: s_nop 3 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_and_b32 s0, 1, s4 +; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX8-NEXT: s_nop 2 ; GFX8-NEXT: v_div_fmas_f32 v2, 1.0, v0, v1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ 
-448,48 +450,48 @@ define amdgpu_kernel void @test_div_fmas_f32_inline_imm_0(ptr addrspace(1) %out, ; GFX10_W32-LABEL: test_div_fmas_f32_inline_imm_0: ; GFX10_W32: ; %bb.0: ; GFX10_W32-NEXT: s_clause 0x3 -; GFX10_W32-NEXT: s_load_dword s4, s[0:1], 0x94 -; GFX10_W32-NEXT: s_load_dword s5, s[0:1], 0x70 -; GFX10_W32-NEXT: s_load_dword s6, s[0:1], 0x4c -; GFX10_W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX10_W32-NEXT: s_load_dword s4, s[2:3], 0x94 +; GFX10_W32-NEXT: s_load_dword s5, s[2:3], 0x70 +; GFX10_W32-NEXT: s_load_dword s6, s[2:3], 0x4c +; GFX10_W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX10_W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W32-NEXT: s_and_b32 s0, 1, s4 +; GFX10_W32-NEXT: s_and_b32 s2, 1, s4 ; GFX10_W32-NEXT: v_mov_b32_e32 v0, s5 -; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 +; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2 ; GFX10_W32-NEXT: v_div_fmas_f32 v0, 1.0, s6, v0 -; GFX10_W32-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10_W32-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10_W32-NEXT: s_endpgm ; ; GFX10_W64-LABEL: test_div_fmas_f32_inline_imm_0: ; GFX10_W64: ; %bb.0: ; GFX10_W64-NEXT: s_clause 0x3 -; GFX10_W64-NEXT: s_load_dword s4, s[0:1], 0x94 -; GFX10_W64-NEXT: s_load_dword s5, s[0:1], 0x70 -; GFX10_W64-NEXT: s_load_dword s6, s[0:1], 0x4c -; GFX10_W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX10_W64-NEXT: s_load_dword s4, s[2:3], 0x94 +; GFX10_W64-NEXT: s_load_dword s5, s[2:3], 0x70 +; GFX10_W64-NEXT: s_load_dword s6, s[2:3], 0x4c +; GFX10_W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX10_W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W64-NEXT: s_and_b32 s0, 1, s4 +; GFX10_W64-NEXT: s_and_b32 s2, 1, s4 ; GFX10_W64-NEXT: v_mov_b32_e32 v0, s5 -; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 +; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 ; GFX10_W64-NEXT: v_div_fmas_f32 v0, 1.0, s6, v0 -; GFX10_W64-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10_W64-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10_W64-NEXT: s_endpgm ; ; GFX11_W32-LABEL: test_div_fmas_f32_inline_imm_0: ; GFX11_W32: ; %bb.0: ; GFX11_W32-NEXT: s_clause 0x3 -; GFX11_W32-NEXT: s_load_b32 s2, s[0:1], 0x94 -; GFX11_W32-NEXT: s_load_b32 s3, s[0:1], 0x70 -; GFX11_W32-NEXT: s_load_b32 s4, s[0:1], 0x4c -; GFX11_W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11_W32-NEXT: s_load_b32 s4, s[2:3], 0x94 +; GFX11_W32-NEXT: s_load_b32 s5, s[2:3], 0x70 +; GFX11_W32-NEXT: s_load_b32 s6, s[2:3], 0x4c +; GFX11_W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11_W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX11_W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11_W32-NEXT: s_and_b32 s2, 1, s2 -; GFX11_W32-NEXT: v_mov_b32_e32 v0, s3 +; GFX11_W32-NEXT: s_and_b32 s2, 1, s4 +; GFX11_W32-NEXT: v_mov_b32_e32 v0, s5 ; GFX11_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2 -; GFX11_W32-NEXT: v_div_fmas_f32 v0, 1.0, s4, v0 +; GFX11_W32-NEXT: v_div_fmas_f32 v0, 1.0, s6, v0 ; GFX11_W32-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11_W32-NEXT: s_nop 0 ; GFX11_W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -498,16 +500,16 @@ define amdgpu_kernel void @test_div_fmas_f32_inline_imm_0(ptr addrspace(1) %out, ; GFX11_W64-LABEL: test_div_fmas_f32_inline_imm_0: ; GFX11_W64: ; %bb.0: ; GFX11_W64-NEXT: s_clause 0x3 -; GFX11_W64-NEXT: s_load_b32 s2, s[0:1], 0x94 -; GFX11_W64-NEXT: s_load_b32 s3, s[0:1], 0x70 -; GFX11_W64-NEXT: s_load_b32 s4, s[0:1], 0x4c -; GFX11_W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11_W64-NEXT: s_load_b32 s4, s[2:3], 0x94 +; GFX11_W64-NEXT: s_load_b32 s5, s[2:3], 0x70 
+; GFX11_W64-NEXT: s_load_b32 s6, s[2:3], 0x4c +; GFX11_W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11_W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX11_W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11_W64-NEXT: s_and_b32 s2, 1, s2 -; GFX11_W64-NEXT: v_mov_b32_e32 v0, s3 +; GFX11_W64-NEXT: s_and_b32 s2, 1, s4 +; GFX11_W64-NEXT: v_mov_b32_e32 v0, s5 ; GFX11_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 -; GFX11_W64-NEXT: v_div_fmas_f32 v0, 1.0, s4, v0 +; GFX11_W64-NEXT: v_div_fmas_f32 v0, 1.0, s6, v0 ; GFX11_W64-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11_W64-NEXT: s_nop 0 ; GFX11_W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -520,35 +522,36 @@ define amdgpu_kernel void @test_div_fmas_f32_inline_imm_0(ptr addrspace(1) %out, define amdgpu_kernel void @test_div_fmas_f32_inline_imm_1(ptr addrspace(1) %out, float %a, float %b, float %c, [8 x i32], i1 %d) { ; GFX7-LABEL: test_div_fmas_f32_inline_imm_1: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dword s2, s[0:1], 0x2 -; GFX7-NEXT: s_load_dword s3, s[0:1], 0x4 -; GFX7-NEXT: s_load_dword s4, s[0:1], 0xd -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX7-NEXT: s_load_dword s4, s[2:3], 0x2 +; GFX7-NEXT: s_load_dword s5, s[2:3], 0x4 +; GFX7-NEXT: s_load_dword s6, s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: s_and_b32 s2, 1, s4 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_and_b32 s2, 1, s6 ; GFX7-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 ; GFX7-NEXT: s_mov_b32 s2, -1 -; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_nop 1 +; GFX7-NEXT: s_nop 2 ; GFX7-NEXT: v_div_fmas_f32 v0, v0, 1.0, v1 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: test_div_fmas_f32_inline_imm_1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x8 -; GFX8-NEXT: s_load_dword s3, s[0:1], 0x10 -; GFX8-NEXT: s_load_dword s4, s[0:1], 0x34 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX8-NEXT: s_load_dword s1, s[2:3], 0x10 +; GFX8-NEXT: s_load_dword s4, s[2:3], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: s_and_b32 s2, 1, s4 -; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 -; GFX8-NEXT: s_nop 3 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_and_b32 s0, 1, s4 +; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX8-NEXT: s_nop 2 ; GFX8-NEXT: v_div_fmas_f32 v2, v0, 1.0, v1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -557,48 +560,48 @@ define amdgpu_kernel void @test_div_fmas_f32_inline_imm_1(ptr addrspace(1) %out, ; GFX10_W32-LABEL: test_div_fmas_f32_inline_imm_1: ; GFX10_W32: ; %bb.0: ; GFX10_W32-NEXT: s_clause 0x3 -; GFX10_W32-NEXT: s_load_dword s4, s[0:1], 0x34 -; GFX10_W32-NEXT: s_load_dword s5, s[0:1], 0x10 -; GFX10_W32-NEXT: s_load_dword s6, s[0:1], 0x8 -; GFX10_W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX10_W32-NEXT: s_load_dword s4, s[2:3], 0x34 +; GFX10_W32-NEXT: s_load_dword s5, s[2:3], 0x10 +; GFX10_W32-NEXT: s_load_dword s6, s[2:3], 0x8 +; GFX10_W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX10_W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W32-NEXT: s_and_b32 s0, 1, s4 +; GFX10_W32-NEXT: s_and_b32 s2, 1, s4 ; 
GFX10_W32-NEXT: v_mov_b32_e32 v0, s5 -; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 +; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2 ; GFX10_W32-NEXT: v_div_fmas_f32 v0, s6, 1.0, v0 -; GFX10_W32-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10_W32-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10_W32-NEXT: s_endpgm ; ; GFX10_W64-LABEL: test_div_fmas_f32_inline_imm_1: ; GFX10_W64: ; %bb.0: ; GFX10_W64-NEXT: s_clause 0x3 -; GFX10_W64-NEXT: s_load_dword s4, s[0:1], 0x34 -; GFX10_W64-NEXT: s_load_dword s5, s[0:1], 0x10 -; GFX10_W64-NEXT: s_load_dword s6, s[0:1], 0x8 -; GFX10_W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX10_W64-NEXT: s_load_dword s4, s[2:3], 0x34 +; GFX10_W64-NEXT: s_load_dword s5, s[2:3], 0x10 +; GFX10_W64-NEXT: s_load_dword s6, s[2:3], 0x8 +; GFX10_W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX10_W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W64-NEXT: s_and_b32 s0, 1, s4 +; GFX10_W64-NEXT: s_and_b32 s2, 1, s4 ; GFX10_W64-NEXT: v_mov_b32_e32 v0, s5 -; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 +; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 ; GFX10_W64-NEXT: v_div_fmas_f32 v0, s6, 1.0, v0 -; GFX10_W64-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10_W64-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10_W64-NEXT: s_endpgm ; ; GFX11_W32-LABEL: test_div_fmas_f32_inline_imm_1: ; GFX11_W32: ; %bb.0: ; GFX11_W32-NEXT: s_clause 0x3 -; GFX11_W32-NEXT: s_load_b32 s2, s[0:1], 0x34 -; GFX11_W32-NEXT: s_load_b32 s3, s[0:1], 0x10 -; GFX11_W32-NEXT: s_load_b32 s4, s[0:1], 0x8 -; GFX11_W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11_W32-NEXT: s_load_b32 s4, s[2:3], 0x34 +; GFX11_W32-NEXT: s_load_b32 s5, s[2:3], 0x10 +; GFX11_W32-NEXT: s_load_b32 s6, s[2:3], 0x8 +; GFX11_W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11_W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX11_W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11_W32-NEXT: s_and_b32 s2, 1, s2 -; GFX11_W32-NEXT: v_mov_b32_e32 v0, s3 +; GFX11_W32-NEXT: s_and_b32 s2, 1, s4 +; GFX11_W32-NEXT: v_mov_b32_e32 v0, s5 ; GFX11_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2 -; GFX11_W32-NEXT: v_div_fmas_f32 v0, s4, 1.0, v0 +; GFX11_W32-NEXT: v_div_fmas_f32 v0, s6, 1.0, v0 ; GFX11_W32-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11_W32-NEXT: s_nop 0 ; GFX11_W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -607,16 +610,16 @@ define amdgpu_kernel void @test_div_fmas_f32_inline_imm_1(ptr addrspace(1) %out, ; GFX11_W64-LABEL: test_div_fmas_f32_inline_imm_1: ; GFX11_W64: ; %bb.0: ; GFX11_W64-NEXT: s_clause 0x3 -; GFX11_W64-NEXT: s_load_b32 s2, s[0:1], 0x34 -; GFX11_W64-NEXT: s_load_b32 s3, s[0:1], 0x10 -; GFX11_W64-NEXT: s_load_b32 s4, s[0:1], 0x8 -; GFX11_W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11_W64-NEXT: s_load_b32 s4, s[2:3], 0x34 +; GFX11_W64-NEXT: s_load_b32 s5, s[2:3], 0x10 +; GFX11_W64-NEXT: s_load_b32 s6, s[2:3], 0x8 +; GFX11_W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11_W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX11_W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11_W64-NEXT: s_and_b32 s2, 1, s2 -; GFX11_W64-NEXT: v_mov_b32_e32 v0, s3 +; GFX11_W64-NEXT: s_and_b32 s2, 1, s4 +; GFX11_W64-NEXT: v_mov_b32_e32 v0, s5 ; GFX11_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 -; GFX11_W64-NEXT: v_div_fmas_f32 v0, s4, 1.0, v0 +; GFX11_W64-NEXT: v_div_fmas_f32 v0, s6, 1.0, v0 ; GFX11_W64-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11_W64-NEXT: s_nop 0 ; GFX11_W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -629,35 +632,36 @@ define amdgpu_kernel void @test_div_fmas_f32_inline_imm_1(ptr addrspace(1) %out, define amdgpu_kernel void 
@test_div_fmas_f32_inline_imm_2(ptr addrspace(1) %out, [8 x i32], float %a, [8 x i32], float %b, [8 x i32], float %c, [8 x i32], i1 %d) { ; GFX7-LABEL: test_div_fmas_f32_inline_imm_2: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dword s2, s[0:1], 0xa -; GFX7-NEXT: s_load_dword s3, s[0:1], 0x13 -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x25 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX7-NEXT: s_load_dword s4, s[2:3], 0xa +; GFX7-NEXT: s_load_dword s5, s[2:3], 0x13 +; GFX7-NEXT: s_load_dword s6, s[2:3], 0x25 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: s_and_b32 s2, 1, s4 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_and_b32 s2, 1, s6 ; GFX7-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 ; GFX7-NEXT: s_mov_b32 s2, -1 -; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_nop 1 +; GFX7-NEXT: s_nop 2 ; GFX7-NEXT: v_div_fmas_f32 v0, v0, v1, 1.0 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: test_div_fmas_f32_inline_imm_2: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x28 -; GFX8-NEXT: s_load_dword s3, s[0:1], 0x4c -; GFX8-NEXT: s_load_dword s4, s[0:1], 0x94 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[2:3], 0x28 +; GFX8-NEXT: s_load_dword s1, s[2:3], 0x4c +; GFX8-NEXT: s_load_dword s4, s[2:3], 0x94 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: s_and_b32 s2, 1, s4 -; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 -; GFX8-NEXT: s_nop 3 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_and_b32 s0, 1, s4 +; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX8-NEXT: s_nop 2 ; GFX8-NEXT: v_div_fmas_f32 v2, v0, v1, 1.0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -666,48 +670,48 @@ define amdgpu_kernel void @test_div_fmas_f32_inline_imm_2(ptr addrspace(1) %out, ; GFX10_W32-LABEL: test_div_fmas_f32_inline_imm_2: ; GFX10_W32: ; %bb.0: ; GFX10_W32-NEXT: s_clause 0x3 -; GFX10_W32-NEXT: s_load_dword s4, s[0:1], 0x94 -; GFX10_W32-NEXT: s_load_dword s5, s[0:1], 0x4c -; GFX10_W32-NEXT: s_load_dword s6, s[0:1], 0x28 -; GFX10_W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX10_W32-NEXT: s_load_dword s4, s[2:3], 0x94 +; GFX10_W32-NEXT: s_load_dword s5, s[2:3], 0x4c +; GFX10_W32-NEXT: s_load_dword s6, s[2:3], 0x28 +; GFX10_W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX10_W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W32-NEXT: s_and_b32 s0, 1, s4 +; GFX10_W32-NEXT: s_and_b32 s2, 1, s4 ; GFX10_W32-NEXT: v_mov_b32_e32 v0, s5 -; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 +; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2 ; GFX10_W32-NEXT: v_div_fmas_f32 v0, s6, v0, 1.0 -; GFX10_W32-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10_W32-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10_W32-NEXT: s_endpgm ; ; GFX10_W64-LABEL: test_div_fmas_f32_inline_imm_2: ; GFX10_W64: ; %bb.0: ; GFX10_W64-NEXT: s_clause 0x3 -; GFX10_W64-NEXT: s_load_dword s4, s[0:1], 0x94 -; GFX10_W64-NEXT: s_load_dword s5, s[0:1], 0x4c -; GFX10_W64-NEXT: s_load_dword s6, s[0:1], 0x28 -; GFX10_W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX10_W64-NEXT: s_load_dword s4, s[2:3], 0x94 +; GFX10_W64-NEXT: 
s_load_dword s5, s[2:3], 0x4c +; GFX10_W64-NEXT: s_load_dword s6, s[2:3], 0x28 +; GFX10_W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX10_W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W64-NEXT: s_and_b32 s0, 1, s4 +; GFX10_W64-NEXT: s_and_b32 s2, 1, s4 ; GFX10_W64-NEXT: v_mov_b32_e32 v0, s5 -; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 +; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 ; GFX10_W64-NEXT: v_div_fmas_f32 v0, s6, v0, 1.0 -; GFX10_W64-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10_W64-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10_W64-NEXT: s_endpgm ; ; GFX11_W32-LABEL: test_div_fmas_f32_inline_imm_2: ; GFX11_W32: ; %bb.0: ; GFX11_W32-NEXT: s_clause 0x3 -; GFX11_W32-NEXT: s_load_b32 s2, s[0:1], 0x94 -; GFX11_W32-NEXT: s_load_b32 s3, s[0:1], 0x4c -; GFX11_W32-NEXT: s_load_b32 s4, s[0:1], 0x28 -; GFX11_W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11_W32-NEXT: s_load_b32 s4, s[2:3], 0x94 +; GFX11_W32-NEXT: s_load_b32 s5, s[2:3], 0x4c +; GFX11_W32-NEXT: s_load_b32 s6, s[2:3], 0x28 +; GFX11_W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11_W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX11_W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11_W32-NEXT: s_and_b32 s2, 1, s2 -; GFX11_W32-NEXT: v_mov_b32_e32 v0, s3 +; GFX11_W32-NEXT: s_and_b32 s2, 1, s4 +; GFX11_W32-NEXT: v_mov_b32_e32 v0, s5 ; GFX11_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2 -; GFX11_W32-NEXT: v_div_fmas_f32 v0, s4, v0, 1.0 +; GFX11_W32-NEXT: v_div_fmas_f32 v0, s6, v0, 1.0 ; GFX11_W32-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11_W32-NEXT: s_nop 0 ; GFX11_W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -716,16 +720,16 @@ define amdgpu_kernel void @test_div_fmas_f32_inline_imm_2(ptr addrspace(1) %out, ; GFX11_W64-LABEL: test_div_fmas_f32_inline_imm_2: ; GFX11_W64: ; %bb.0: ; GFX11_W64-NEXT: s_clause 0x3 -; GFX11_W64-NEXT: s_load_b32 s2, s[0:1], 0x94 -; GFX11_W64-NEXT: s_load_b32 s3, s[0:1], 0x4c -; GFX11_W64-NEXT: s_load_b32 s4, s[0:1], 0x28 -; GFX11_W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11_W64-NEXT: s_load_b32 s4, s[2:3], 0x94 +; GFX11_W64-NEXT: s_load_b32 s5, s[2:3], 0x4c +; GFX11_W64-NEXT: s_load_b32 s6, s[2:3], 0x28 +; GFX11_W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11_W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX11_W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11_W64-NEXT: s_and_b32 s2, 1, s2 -; GFX11_W64-NEXT: v_mov_b32_e32 v0, s3 +; GFX11_W64-NEXT: s_and_b32 s2, 1, s4 +; GFX11_W64-NEXT: v_mov_b32_e32 v0, s5 ; GFX11_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 -; GFX11_W64-NEXT: v_div_fmas_f32 v0, s4, v0, 1.0 +; GFX11_W64-NEXT: v_div_fmas_f32 v0, s6, v0, 1.0 ; GFX11_W64-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11_W64-NEXT: s_nop 0 ; GFX11_W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -738,8 +742,8 @@ define amdgpu_kernel void @test_div_fmas_f32_inline_imm_2(ptr addrspace(1) %out, define amdgpu_kernel void @test_div_fmas_f64(ptr addrspace(1) %out, double %a, double %b, double %c, i1 %d) { ; GFX7-LABEL: test_div_fmas_f64: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x0 -; GFX7-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX7-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 +; GFX7-NEXT: s_load_dword s0, s[2:3], 0x8 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s8 @@ -758,8 +762,8 @@ define amdgpu_kernel void @test_div_fmas_f64(ptr addrspace(1) %out, double %a, d ; ; GFX8-LABEL: test_div_fmas_f64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x0 -; GFX8-NEXT: s_load_dword s0, s[0:1], 0x20 +; GFX8-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x20 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: v_mov_b32_e32 v2, s8 @@ -779,10 +783,10 @@ define amdgpu_kernel void @test_div_fmas_f64(ptr addrspace(1) %out, double %a, d ; GFX10_W32-LABEL: test_div_fmas_f64: ; GFX10_W32: ; %bb.0: ; GFX10_W32-NEXT: s_clause 0x1 -; GFX10_W32-NEXT: s_load_dword s2, s[0:1], 0x20 -; GFX10_W32-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x0 +; GFX10_W32-NEXT: s_load_dword s0, s[2:3], 0x20 +; GFX10_W32-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W32-NEXT: s_and_b32 s0, 1, s2 +; GFX10_W32-NEXT: s_and_b32 s0, 1, s0 ; GFX10_W32-NEXT: v_mov_b32_e32 v0, s8 ; GFX10_W32-NEXT: v_mov_b32_e32 v2, s10 ; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 @@ -796,10 +800,10 @@ define amdgpu_kernel void @test_div_fmas_f64(ptr addrspace(1) %out, double %a, d ; GFX10_W64-LABEL: test_div_fmas_f64: ; GFX10_W64: ; %bb.0: ; GFX10_W64-NEXT: s_clause 0x1 -; GFX10_W64-NEXT: s_load_dword s2, s[0:1], 0x20 -; GFX10_W64-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x0 +; GFX10_W64-NEXT: s_load_dword s0, s[2:3], 0x20 +; GFX10_W64-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W64-NEXT: s_and_b32 s0, 1, s2 +; GFX10_W64-NEXT: s_and_b32 s0, 1, s0 ; GFX10_W64-NEXT: v_mov_b32_e32 v0, s8 ; GFX10_W64-NEXT: v_mov_b32_e32 v2, s10 ; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 @@ -813,8 +817,8 @@ define amdgpu_kernel void @test_div_fmas_f64(ptr addrspace(1) %out, double %a, d ; GFX11_W32-LABEL: test_div_fmas_f64: ; GFX11_W32: ; %bb.0: ; GFX11_W32-NEXT: s_clause 0x1 -; GFX11_W32-NEXT: s_load_b32 s8, s[0:1], 0x20 -; GFX11_W32-NEXT: s_load_b256 s[0:7], s[0:1], 0x0 +; GFX11_W32-NEXT: s_load_b32 s8, s[2:3], 0x20 +; GFX11_W32-NEXT: s_load_b256 s[0:7], s[2:3], 0x0 ; GFX11_W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11_W32-NEXT: s_and_b32 s8, 1, s8 ; GFX11_W32-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -830,8 +834,8 @@ define amdgpu_kernel void @test_div_fmas_f64(ptr addrspace(1) %out, double %a, d ; GFX11_W64-LABEL: test_div_fmas_f64: ; GFX11_W64: ; %bb.0: ; GFX11_W64-NEXT: s_clause 0x1 -; GFX11_W64-NEXT: s_load_b32 s8, s[0:1], 0x20 -; GFX11_W64-NEXT: s_load_b256 s[0:7], s[0:1], 0x0 +; GFX11_W64-NEXT: s_load_b32 s8, s[2:3], 0x20 +; GFX11_W64-NEXT: s_load_b256 s[0:7], s[2:3], 0x0 ; GFX11_W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11_W64-NEXT: s_and_b32 s8, 1, s8 ; GFX11_W64-NEXT: v_mov_b32_e32 v0, s4 @@ -853,8 +857,8 @@ define amdgpu_kernel void @test_div_fmas_f64(ptr addrspace(1) %out, double %a, d define amdgpu_kernel void @test_div_fmas_f32_cond_to_vcc(ptr addrspace(1) %out, float %a, float %b, float %c, i32 %i) { ; GFX7-LABEL: test_div_fmas_f32_cond_to_vcc: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_cmp_eq_u32 s7, 0 @@ -872,18 +876,19 @@ define amdgpu_kernel void @test_div_fmas_f32_cond_to_vcc(ptr addrspace(1) %out, ; ; GFX8-LABEL: test_div_fmas_f32_cond_to_vcc: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_cmp_eq_u32 s7, 0 -; GFX8-NEXT: s_cselect_b32 s2, 1, 0 -; GFX8-NEXT: s_and_b32 s2, 1, s2 +; GFX8-NEXT: s_cselect_b32 s0, 1, 0 +; GFX8-NEXT: 
s_and_b32 s0, 1, s0 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_mov_b32_e32 v2, s6 -; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 -; GFX8-NEXT: s_nop 3 +; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX8-NEXT: s_nop 2 ; GFX8-NEXT: v_div_fmas_f32 v2, v0, v1, v2 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -892,42 +897,42 @@ define amdgpu_kernel void @test_div_fmas_f32_cond_to_vcc(ptr addrspace(1) %out, ; GFX10_W32-LABEL: test_div_fmas_f32_cond_to_vcc: ; GFX10_W32: ; %bb.0: ; GFX10_W32-NEXT: s_clause 0x1 -; GFX10_W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8 -; GFX10_W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX10_W32-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x8 +; GFX10_W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10_W32-NEXT: s_cmp_eq_u32 s7, 0 ; GFX10_W32-NEXT: v_mov_b32_e32 v0, s5 -; GFX10_W32-NEXT: s_cselect_b32 s0, 1, 0 +; GFX10_W32-NEXT: s_cselect_b32 s2, 1, 0 ; GFX10_W32-NEXT: v_mov_b32_e32 v1, s6 -; GFX10_W32-NEXT: s_and_b32 s0, 1, s0 -; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 +; GFX10_W32-NEXT: s_and_b32 s2, 1, s2 +; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2 ; GFX10_W32-NEXT: v_div_fmas_f32 v0, s4, v0, v1 ; GFX10_W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10_W32-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10_W32-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10_W32-NEXT: s_endpgm ; ; GFX10_W64-LABEL: test_div_fmas_f32_cond_to_vcc: ; GFX10_W64: ; %bb.0: ; GFX10_W64-NEXT: s_clause 0x1 -; GFX10_W64-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8 -; GFX10_W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX10_W64-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x8 +; GFX10_W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10_W64-NEXT: s_cmp_eq_u32 s7, 0 ; GFX10_W64-NEXT: v_mov_b32_e32 v0, s5 -; GFX10_W64-NEXT: s_cselect_b32 s0, 1, 0 +; GFX10_W64-NEXT: s_cselect_b32 s2, 1, 0 ; GFX10_W64-NEXT: v_mov_b32_e32 v1, s6 -; GFX10_W64-NEXT: s_and_b32 s0, 1, s0 -; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 +; GFX10_W64-NEXT: s_and_b32 s2, 1, s2 +; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 ; GFX10_W64-NEXT: v_div_fmas_f32 v0, s4, v0, v1 ; GFX10_W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX10_W64-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10_W64-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10_W64-NEXT: s_endpgm ; ; GFX11_W32-LABEL: test_div_fmas_f32_cond_to_vcc: ; GFX11_W32: ; %bb.0: ; GFX11_W32-NEXT: s_clause 0x1 -; GFX11_W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x8 -; GFX11_W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11_W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x8 +; GFX11_W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11_W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11_W32-NEXT: s_cmp_eq_u32 s7, 0 ; GFX11_W32-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 @@ -944,8 +949,8 @@ define amdgpu_kernel void @test_div_fmas_f32_cond_to_vcc(ptr addrspace(1) %out, ; GFX11_W64-LABEL: test_div_fmas_f32_cond_to_vcc: ; GFX11_W64: ; %bb.0: ; GFX11_W64-NEXT: s_clause 0x1 -; GFX11_W64-NEXT: s_load_b128 s[4:7], s[0:1], 0x8 -; GFX11_W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11_W64-NEXT: s_load_b128 s[4:7], s[2:3], 0x8 +; GFX11_W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11_W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11_W64-NEXT: s_cmp_eq_u32 s7, 0 ; GFX11_W64-NEXT: v_mov_b32_e32 v0, s5 @@ -968,15 +973,15 @@ define amdgpu_kernel void @test_div_fmas_f32_cond_to_vcc(ptr 
addrspace(1) %out, define amdgpu_kernel void @test_div_fmas_f32_imm_false_cond_to_vcc(ptr addrspace(1) %out, [8 x i32], float %a, [8 x i32], float %b, [8 x i32], float %c) { ; GFX7-LABEL: test_div_fmas_f32_imm_false_cond_to_vcc: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dword s2, s[0:1], 0xa -; GFX7-NEXT: s_load_dword s3, s[0:1], 0x13 -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x1c -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX7-NEXT: s_load_dword s4, s[2:3], 0xa +; GFX7-NEXT: s_load_dword s5, s[2:3], 0x13 +; GFX7-NEXT: s_load_dword s6, s[2:3], 0x1c +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX7-NEXT: s_mov_b64 vcc, 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-NEXT: v_div_fmas_f32 v0, v0, v1, v2 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 @@ -985,16 +990,17 @@ define amdgpu_kernel void @test_div_fmas_f32_imm_false_cond_to_vcc(ptr addrspace ; ; GFX8-LABEL: test_div_fmas_f32_imm_false_cond_to_vcc: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x28 -; GFX8-NEXT: s_load_dword s3, s[0:1], 0x4c -; GFX8-NEXT: s_load_dword s4, s[0:1], 0x70 +; GFX8-NEXT: s_load_dword s0, s[2:3], 0x28 +; GFX8-NEXT: s_load_dword s1, s[2:3], 0x4c +; GFX8-NEXT: s_load_dword s4, s[2:3], 0x70 ; GFX8-NEXT: s_mov_b64 vcc, 0 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_div_fmas_f32 v2, v0, v1, v2 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -1003,46 +1009,46 @@ define amdgpu_kernel void @test_div_fmas_f32_imm_false_cond_to_vcc(ptr addrspace ; GFX10_W32-LABEL: test_div_fmas_f32_imm_false_cond_to_vcc: ; GFX10_W32: ; %bb.0: ; GFX10_W32-NEXT: s_clause 0x3 -; GFX10_W32-NEXT: s_load_dword s4, s[0:1], 0x4c -; GFX10_W32-NEXT: s_load_dword s5, s[0:1], 0x70 -; GFX10_W32-NEXT: s_load_dword s6, s[0:1], 0x28 -; GFX10_W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX10_W32-NEXT: s_load_dword s4, s[2:3], 0x4c +; GFX10_W32-NEXT: s_load_dword s5, s[2:3], 0x70 +; GFX10_W32-NEXT: s_load_dword s6, s[2:3], 0x28 +; GFX10_W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX10_W32-NEXT: s_mov_b32 vcc_lo, 0 ; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10_W32-NEXT: v_mov_b32_e32 v0, s4 ; GFX10_W32-NEXT: v_mov_b32_e32 v1, s5 ; GFX10_W32-NEXT: v_div_fmas_f32 v0, s6, v0, v1 ; GFX10_W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10_W32-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10_W32-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10_W32-NEXT: s_endpgm ; ; GFX10_W64-LABEL: test_div_fmas_f32_imm_false_cond_to_vcc: ; GFX10_W64: ; %bb.0: ; GFX10_W64-NEXT: s_clause 0x3 -; GFX10_W64-NEXT: s_load_dword s4, s[0:1], 0x4c -; GFX10_W64-NEXT: s_load_dword s5, s[0:1], 0x70 -; GFX10_W64-NEXT: s_load_dword s6, s[0:1], 0x28 -; GFX10_W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX10_W64-NEXT: s_load_dword s4, s[2:3], 0x4c +; GFX10_W64-NEXT: s_load_dword s5, s[2:3], 0x70 +; GFX10_W64-NEXT: s_load_dword s6, s[2:3], 0x28 +; GFX10_W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX10_W64-NEXT: s_mov_b64 vcc, 0 ; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10_W64-NEXT: 
v_mov_b32_e32 v0, s4 ; GFX10_W64-NEXT: v_mov_b32_e32 v1, s5 ; GFX10_W64-NEXT: v_div_fmas_f32 v0, s6, v0, v1 ; GFX10_W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX10_W64-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10_W64-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10_W64-NEXT: s_endpgm ; ; GFX11_W32-LABEL: test_div_fmas_f32_imm_false_cond_to_vcc: ; GFX11_W32: ; %bb.0: ; GFX11_W32-NEXT: s_clause 0x3 -; GFX11_W32-NEXT: s_load_b32 s2, s[0:1], 0x4c -; GFX11_W32-NEXT: s_load_b32 s3, s[0:1], 0x70 -; GFX11_W32-NEXT: s_load_b32 s4, s[0:1], 0x28 -; GFX11_W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11_W32-NEXT: s_load_b32 s4, s[2:3], 0x4c +; GFX11_W32-NEXT: s_load_b32 s5, s[2:3], 0x70 +; GFX11_W32-NEXT: s_load_b32 s6, s[2:3], 0x28 +; GFX11_W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11_W32-NEXT: s_mov_b32 vcc_lo, 0 ; GFX11_W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11_W32-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11_W32-NEXT: v_div_fmas_f32 v0, s4, v0, v1 +; GFX11_W32-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11_W32-NEXT: v_div_fmas_f32 v0, s6, v0, v1 ; GFX11_W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX11_W32-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11_W32-NEXT: s_nop 0 @@ -1052,15 +1058,15 @@ define amdgpu_kernel void @test_div_fmas_f32_imm_false_cond_to_vcc(ptr addrspace ; GFX11_W64-LABEL: test_div_fmas_f32_imm_false_cond_to_vcc: ; GFX11_W64: ; %bb.0: ; GFX11_W64-NEXT: s_clause 0x3 -; GFX11_W64-NEXT: s_load_b32 s2, s[0:1], 0x4c -; GFX11_W64-NEXT: s_load_b32 s3, s[0:1], 0x70 -; GFX11_W64-NEXT: s_load_b32 s4, s[0:1], 0x28 -; GFX11_W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11_W64-NEXT: s_load_b32 s4, s[2:3], 0x4c +; GFX11_W64-NEXT: s_load_b32 s5, s[2:3], 0x70 +; GFX11_W64-NEXT: s_load_b32 s6, s[2:3], 0x28 +; GFX11_W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11_W64-NEXT: s_mov_b64 vcc, 0 ; GFX11_W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11_W64-NEXT: v_mov_b32_e32 v0, s2 -; GFX11_W64-NEXT: v_mov_b32_e32 v1, s3 -; GFX11_W64-NEXT: v_div_fmas_f32 v0, s4, v0, v1 +; GFX11_W64-NEXT: v_mov_b32_e32 v0, s4 +; GFX11_W64-NEXT: v_mov_b32_e32 v1, s5 +; GFX11_W64-NEXT: v_div_fmas_f32 v0, s6, v0, v1 ; GFX11_W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX11_W64-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11_W64-NEXT: s_nop 0 @@ -1074,15 +1080,15 @@ define amdgpu_kernel void @test_div_fmas_f32_imm_false_cond_to_vcc(ptr addrspace define amdgpu_kernel void @test_div_fmas_f32_imm_true_cond_to_vcc(ptr addrspace(1) %out, [8 x i32], float %a, [8 x i32], float %b, [8 x i32], float %c) { ; GFX7-LABEL: test_div_fmas_f32_imm_true_cond_to_vcc: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dword s2, s[0:1], 0xa -; GFX7-NEXT: s_load_dword s3, s[0:1], 0x13 -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x1c -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX7-NEXT: s_load_dword s4, s[2:3], 0xa +; GFX7-NEXT: s_load_dword s5, s[2:3], 0x13 +; GFX7-NEXT: s_load_dword s6, s[2:3], 0x1c +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX7-NEXT: s_mov_b64 vcc, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-NEXT: v_div_fmas_f32 v0, v0, v1, v2 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 @@ -1091,16 +1097,17 @@ define amdgpu_kernel void @test_div_fmas_f32_imm_true_cond_to_vcc(ptr addrspace( ; ; GFX8-LABEL: test_div_fmas_f32_imm_true_cond_to_vcc: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x28 -; GFX8-NEXT: 
s_load_dword s3, s[0:1], 0x4c -; GFX8-NEXT: s_load_dword s4, s[0:1], 0x70 +; GFX8-NEXT: s_load_dword s0, s[2:3], 0x28 +; GFX8-NEXT: s_load_dword s1, s[2:3], 0x4c +; GFX8-NEXT: s_load_dword s4, s[2:3], 0x70 ; GFX8-NEXT: s_mov_b64 vcc, -1 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_div_fmas_f32 v2, v0, v1, v2 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -1109,46 +1116,46 @@ define amdgpu_kernel void @test_div_fmas_f32_imm_true_cond_to_vcc(ptr addrspace( ; GFX10_W32-LABEL: test_div_fmas_f32_imm_true_cond_to_vcc: ; GFX10_W32: ; %bb.0: ; GFX10_W32-NEXT: s_clause 0x3 -; GFX10_W32-NEXT: s_load_dword s4, s[0:1], 0x4c -; GFX10_W32-NEXT: s_load_dword s5, s[0:1], 0x70 -; GFX10_W32-NEXT: s_load_dword s6, s[0:1], 0x28 -; GFX10_W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX10_W32-NEXT: s_load_dword s4, s[2:3], 0x4c +; GFX10_W32-NEXT: s_load_dword s5, s[2:3], 0x70 +; GFX10_W32-NEXT: s_load_dword s6, s[2:3], 0x28 +; GFX10_W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX10_W32-NEXT: s_mov_b32 vcc_lo, -1 ; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10_W32-NEXT: v_mov_b32_e32 v0, s4 ; GFX10_W32-NEXT: v_mov_b32_e32 v1, s5 ; GFX10_W32-NEXT: v_div_fmas_f32 v0, s6, v0, v1 ; GFX10_W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10_W32-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10_W32-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10_W32-NEXT: s_endpgm ; ; GFX10_W64-LABEL: test_div_fmas_f32_imm_true_cond_to_vcc: ; GFX10_W64: ; %bb.0: ; GFX10_W64-NEXT: s_clause 0x3 -; GFX10_W64-NEXT: s_load_dword s4, s[0:1], 0x4c -; GFX10_W64-NEXT: s_load_dword s5, s[0:1], 0x70 -; GFX10_W64-NEXT: s_load_dword s6, s[0:1], 0x28 -; GFX10_W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX10_W64-NEXT: s_load_dword s4, s[2:3], 0x4c +; GFX10_W64-NEXT: s_load_dword s5, s[2:3], 0x70 +; GFX10_W64-NEXT: s_load_dword s6, s[2:3], 0x28 +; GFX10_W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX10_W64-NEXT: s_mov_b64 vcc, -1 ; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10_W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX10_W64-NEXT: v_mov_b32_e32 v1, s5 ; GFX10_W64-NEXT: v_div_fmas_f32 v0, s6, v0, v1 ; GFX10_W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX10_W64-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10_W64-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10_W64-NEXT: s_endpgm ; ; GFX11_W32-LABEL: test_div_fmas_f32_imm_true_cond_to_vcc: ; GFX11_W32: ; %bb.0: ; GFX11_W32-NEXT: s_clause 0x3 -; GFX11_W32-NEXT: s_load_b32 s2, s[0:1], 0x4c -; GFX11_W32-NEXT: s_load_b32 s3, s[0:1], 0x70 -; GFX11_W32-NEXT: s_load_b32 s4, s[0:1], 0x28 -; GFX11_W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11_W32-NEXT: s_load_b32 s4, s[2:3], 0x4c +; GFX11_W32-NEXT: s_load_b32 s5, s[2:3], 0x70 +; GFX11_W32-NEXT: s_load_b32 s6, s[2:3], 0x28 +; GFX11_W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11_W32-NEXT: s_mov_b32 vcc_lo, -1 ; GFX11_W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11_W32-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11_W32-NEXT: v_div_fmas_f32 v0, s4, v0, v1 +; GFX11_W32-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11_W32-NEXT: v_div_fmas_f32 v0, s6, v0, v1 ; GFX11_W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX11_W32-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11_W32-NEXT: s_nop 0 @@ -1158,15 +1165,15 @@ 
define amdgpu_kernel void @test_div_fmas_f32_imm_true_cond_to_vcc(ptr addrspace( ; GFX11_W64-LABEL: test_div_fmas_f32_imm_true_cond_to_vcc: ; GFX11_W64: ; %bb.0: ; GFX11_W64-NEXT: s_clause 0x3 -; GFX11_W64-NEXT: s_load_b32 s2, s[0:1], 0x4c -; GFX11_W64-NEXT: s_load_b32 s3, s[0:1], 0x70 -; GFX11_W64-NEXT: s_load_b32 s4, s[0:1], 0x28 -; GFX11_W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11_W64-NEXT: s_load_b32 s4, s[2:3], 0x4c +; GFX11_W64-NEXT: s_load_b32 s5, s[2:3], 0x70 +; GFX11_W64-NEXT: s_load_b32 s6, s[2:3], 0x28 +; GFX11_W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11_W64-NEXT: s_mov_b64 vcc, -1 ; GFX11_W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11_W64-NEXT: v_mov_b32_e32 v0, s2 -; GFX11_W64-NEXT: v_mov_b32_e32 v1, s3 -; GFX11_W64-NEXT: v_div_fmas_f32 v0, s4, v0, v1 +; GFX11_W64-NEXT: v_mov_b32_e32 v0, s4 +; GFX11_W64-NEXT: v_mov_b32_e32 v1, s5 +; GFX11_W64-NEXT: v_div_fmas_f32 v0, s6, v0, v1 ; GFX11_W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX11_W64-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11_W64-NEXT: s_nop 0 @@ -1180,36 +1187,36 @@ define amdgpu_kernel void @test_div_fmas_f32_imm_true_cond_to_vcc(ptr addrspace( define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(ptr addrspace(1) %out, ptr addrspace(1) %in, [8 x i32], i32 %d) { ; GFX7-LABEL: test_div_fmas_f32_logical_cond_to_vcc: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 -; GFX7-NEXT: s_load_dword s8, s[0:1], 0xc -; GFX7-NEXT: s_mov_b32 s2, 0 +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0 +; GFX7-NEXT: s_load_dword s0, s[2:3], 0xc +; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v2, 0 -; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[0:1], s[6:7] -; GFX7-NEXT: buffer_load_dword v3, v[1:2], s[0:3], 0 addr64 glc +; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] +; GFX7-NEXT: buffer_load_dword v3, v[1:2], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_load_dword v4, v[1:2], s[0:3], 0 addr64 offset:4 glc +; GFX7-NEXT: buffer_load_dword v4, v[1:2], s[4:7], 0 addr64 offset:4 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_load_dword v1, v[1:2], s[0:3], 0 addr64 offset:8 glc +; GFX7-NEXT: buffer_load_dword v1, v[1:2], s[4:7], 0 addr64 offset:8 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: s_cmp_lg_u32 s8, 0 +; GFX7-NEXT: s_cmp_lg_u32 s0, 0 ; GFX7-NEXT: s_cselect_b32 s0, 1, 0 ; GFX7-NEXT: s_and_b32 s0, 1, s0 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 -; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_and_b64 vcc, vcc, s[0:1] -; GFX7-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX7-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX7-NEXT: v_div_fmas_f32 v0, v3, v4, v1 -; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:8 +; GFX7-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:8 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: test_div_fmas_f32_logical_cond_to_vcc: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x30 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; GFX8-NEXT: s_load_dword s2, s[2:3], 0x30 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s6 @@ -1243,9 +1250,10 @@ define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(ptr addrspace(1 ; ; GFX10_W32-LABEL: test_div_fmas_f32_logical_cond_to_vcc: ; GFX10_W32: ; %bb.0: -; GFX10_W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 +; 
GFX10_W32-NEXT: s_clause 0x1 +; GFX10_W32-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; GFX10_W32-NEXT: s_load_dword s0, s[2:3], 0x30 ; GFX10_W32-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; GFX10_W32-NEXT: s_load_dword s0, s[0:1], 0x30 ; GFX10_W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10_W32-NEXT: global_load_dword v2, v1, s[6:7] glc dlc @@ -1254,8 +1262,8 @@ define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(ptr addrspace(1 ; GFX10_W32-NEXT: s_waitcnt vmcnt(0) ; GFX10_W32-NEXT: global_load_dword v4, v1, s[6:7] offset:8 glc dlc ; GFX10_W32-NEXT: s_waitcnt vmcnt(0) -; GFX10_W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX10_W32-NEXT: s_cmp_lg_u32 s0, 0 +; GFX10_W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX10_W32-NEXT: s_cselect_b32 s0, 1, 0 ; GFX10_W32-NEXT: s_and_b32 s0, 1, s0 ; GFX10_W32-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 @@ -1266,9 +1274,10 @@ define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(ptr addrspace(1 ; ; GFX10_W64-LABEL: test_div_fmas_f32_logical_cond_to_vcc: ; GFX10_W64: ; %bb.0: -; GFX10_W64-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 +; GFX10_W64-NEXT: s_clause 0x1 +; GFX10_W64-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; GFX10_W64-NEXT: s_load_dword s0, s[2:3], 0x30 ; GFX10_W64-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; GFX10_W64-NEXT: s_load_dword s0, s[0:1], 0x30 ; GFX10_W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10_W64-NEXT: global_load_dword v2, v1, s[6:7] glc dlc @@ -1277,8 +1286,8 @@ define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(ptr addrspace(1 ; GFX10_W64-NEXT: s_waitcnt vmcnt(0) ; GFX10_W64-NEXT: global_load_dword v4, v1, s[6:7] offset:8 glc dlc ; GFX10_W64-NEXT: s_waitcnt vmcnt(0) -; GFX10_W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX10_W64-NEXT: s_cmp_lg_u32 s0, 0 +; GFX10_W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX10_W64-NEXT: s_cselect_b32 s0, 1, 0 ; GFX10_W64-NEXT: s_and_b32 s0, 1, s0 ; GFX10_W64-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 @@ -1289,9 +1298,11 @@ define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(ptr addrspace(1 ; ; GFX11_W32-LABEL: test_div_fmas_f32_logical_cond_to_vcc: ; GFX11_W32: ; %bb.0: -; GFX11_W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11_W32-NEXT: s_clause 0x1 +; GFX11_W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX11_W32-NEXT: s_load_b32 s0, s[2:3], 0x30 +; GFX11_W32-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11_W32-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; GFX11_W32-NEXT: s_load_b32 s0, s[0:1], 0x30 ; GFX11_W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX11_W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11_W32-NEXT: global_load_b32 v2, v1, s[6:7] glc dlc @@ -1314,9 +1325,11 @@ define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(ptr addrspace(1 ; ; GFX11_W64-LABEL: test_div_fmas_f32_logical_cond_to_vcc: ; GFX11_W64: ; %bb.0: -; GFX11_W64-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11_W64-NEXT: s_clause 0x1 +; GFX11_W64-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX11_W64-NEXT: s_load_b32 s0, s[2:3], 0x30 +; GFX11_W64-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11_W64-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; GFX11_W64-NEXT: s_load_b32 s0, s[0:1], 0x30 ; GFX11_W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX11_W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11_W64-NEXT: global_load_b32 v2, v1, s[6:7] glc dlc @@ -1358,68 +1371,73 @@ define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(ptr addrspace(1 define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [8 x i32], ptr addrspace(1) %in, [8 x i32], ptr addrspace(1) %dummy) { ; GFX7-LABEL: 
test_div_fmas_f32_i1_phi_vcc: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xa +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xa ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v2, 0 ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: buffer_load_dwordx3 v[1:3], v[1:2], s[4:7], 0 addr64 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v0 +; GFX7-NEXT: s_mov_b64 vcc, 0 +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[0:1] ; GFX7-NEXT: s_cbranch_execz .LBB13_2 ; GFX7-NEXT: ; %bb.1: ; %bb -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x14 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x14 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_cmp_lg_u32 s4, 0 -; GFX7-NEXT: s_cselect_b32 s6, 1, 0 +; GFX7-NEXT: s_cmp_lg_u32 s0, 0 +; GFX7-NEXT: s_cselect_b32 s0, 1, 0 +; GFX7-NEXT: s_and_b32 s0, 1, s0 +; GFX7-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 +; GFX7-NEXT: s_andn2_b64 s[8:9], 0, exec +; GFX7-NEXT: s_and_b64 s[0:1], exec, s[0:1] +; GFX7-NEXT: s_or_b64 vcc, s[8:9], s[0:1] ; GFX7-NEXT: .LBB13_2: ; %exit -; GFX7-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX7-NEXT: s_and_b32 s0, 1, s6 -; GFX7-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: s_nop 1 ; GFX7-NEXT: v_div_fmas_f32 v0, v1, v2, v3 +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:8 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: test_div_fmas_f32_i1_phi_vcc: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x28 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x28 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 2, v0 -; GFX8-NEXT: s_mov_b32 s4, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s2 -; GFX8-NEXT: v_mov_b32_e32 v2, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 ; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc ; GFX8-NEXT: flat_load_dwordx3 v[1:3], v[1:2] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v0 +; GFX8-NEXT: s_mov_b64 vcc, 0 +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[0:1] ; GFX8-NEXT: s_cbranch_execz .LBB13_2 ; GFX8-NEXT: ; %bb.1: ; %bb -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x50 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x50 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_cmp_lg_u32 s4, 0 -; GFX8-NEXT: s_cselect_b32 s4, 1, 0 +; GFX8-NEXT: s_cmp_lg_u32 s0, 0 +; GFX8-NEXT: s_cselect_b32 s0, 1, 0 +; GFX8-NEXT: s_and_b32 s0, 1, s0 +; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 +; GFX8-NEXT: s_andn2_b64 s[6:7], 0, exec +; GFX8-NEXT: s_and_b64 s[0:1], exec, s[0:1] +; GFX8-NEXT: s_or_b64 vcc, s[6:7], s[0:1] ; GFX8-NEXT: .LBB13_2: ; %exit -; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_div_fmas_f32 v2, v1, v2, v3 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 
0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s0, s0, 8 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: s_and_b32 s2, 1, s4 -; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: s_nop 2 -; GFX8-NEXT: v_div_fmas_f32 v2, v1, v2, v3 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -1427,27 +1445,29 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [ ; ; GFX10_W32-LABEL: test_div_fmas_f32_i1_phi_vcc: ; GFX10_W32: ; %bb.0: ; %entry -; GFX10_W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x28 +; GFX10_W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x28 ; GFX10_W32-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; GFX10_W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX10_W32-NEXT: s_mov_b32 vcc_lo, 0 ; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W32-NEXT: global_load_dwordx3 v[1:3], v1, s[2:3] -; GFX10_W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10_W32-NEXT: s_mov_b32 s2, 0 -; GFX10_W32-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX10_W32-NEXT: global_load_dwordx3 v[1:3], v1, s[0:1] +; GFX10_W32-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX10_W32-NEXT: s_and_saveexec_b32 s1, s0 ; GFX10_W32-NEXT: s_cbranch_execz .LBB13_2 ; GFX10_W32-NEXT: ; %bb.1: ; %bb -; GFX10_W32-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x50 +; GFX10_W32-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x50 ; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W32-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10_W32-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W32-NEXT: s_cmp_lg_u32 s2, 0 -; GFX10_W32-NEXT: s_cselect_b32 s2, 1, 0 +; GFX10_W32-NEXT: s_cmp_lg_u32 s0, 0 +; GFX10_W32-NEXT: s_cselect_b32 s0, 1, 0 +; GFX10_W32-NEXT: s_andn2_b32 s4, 0, exec_lo +; GFX10_W32-NEXT: s_and_b32 s0, 1, s0 +; GFX10_W32-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 +; GFX10_W32-NEXT: s_and_b32 s0, exec_lo, s0 +; GFX10_W32-NEXT: s_or_b32 vcc_lo, s4, s0 ; GFX10_W32-NEXT: .LBB13_2: ; %exit -; GFX10_W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX10_W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX10_W32-NEXT: s_and_b32 s2, 1, s2 -; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2 +; GFX10_W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10_W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX10_W32-NEXT: s_waitcnt vmcnt(0) ; GFX10_W32-NEXT: v_div_fmas_f32 v0, v1, v2, v3 ; GFX10_W32-NEXT: v_mov_b32_e32 v1, 0 @@ -1457,27 +1477,29 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [ ; ; GFX10_W64-LABEL: test_div_fmas_f32_i1_phi_vcc: ; GFX10_W64: ; %bb.0: ; %entry -; GFX10_W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x28 +; GFX10_W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x28 ; GFX10_W64-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; GFX10_W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10_W64-NEXT: s_mov_b32 s4, 0 +; GFX10_W64-NEXT: s_mov_b64 vcc, 0 ; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W64-NEXT: global_load_dwordx3 v[1:3], v1, s[2:3] -; GFX10_W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10_W64-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX10_W64-NEXT: global_load_dwordx3 v[1:3], v1, s[0:1] +; GFX10_W64-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v0 +; GFX10_W64-NEXT: s_and_saveexec_b64 s[4:5], s[0:1] ; GFX10_W64-NEXT: s_cbranch_execz .LBB13_2 ; GFX10_W64-NEXT: ; %bb.1: ; %bb -; GFX10_W64-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x50 +; GFX10_W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x50 ; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W64-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX10_W64-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX10_W64-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX10_W64-NEXT: s_cmp_lg_u32 s4, 0 -; GFX10_W64-NEXT: s_cselect_b32 s4, 1, 0 +; GFX10_W64-NEXT: s_cmp_lg_u32 s0, 0 +; GFX10_W64-NEXT: s_cselect_b32 s0, 1, 0 +; GFX10_W64-NEXT: s_andn2_b64 s[6:7], 0, exec +; GFX10_W64-NEXT: s_and_b32 s0, 1, s0 +; GFX10_W64-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 +; GFX10_W64-NEXT: s_and_b64 s[0:1], exec, s[0:1] +; GFX10_W64-NEXT: s_or_b64 vcc, s[6:7], s[0:1] ; GFX10_W64-NEXT: .LBB13_2: ; %exit -; GFX10_W64-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX10_W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX10_W64-NEXT: s_and_b32 s2, 1, s4 -; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 +; GFX10_W64-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX10_W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX10_W64-NEXT: s_waitcnt vmcnt(0) ; GFX10_W64-NEXT: v_div_fmas_f32 v0, v1, v2, v3 ; GFX10_W64-NEXT: v_mov_b32_e32 v1, 0 @@ -1487,28 +1509,32 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [ ; ; GFX11_W32-LABEL: test_div_fmas_f32_i1_phi_vcc: ; GFX11_W32: ; %bb.0: ; %entry -; GFX11_W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x28 -; GFX11_W32-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX11_W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x28 +; GFX11_W32-NEXT: v_and_b32_e32 v3, 0x3ff, v0 +; GFX11_W32-NEXT: s_mov_b32 vcc_lo, 0 +; GFX11_W32-NEXT: v_lshlrev_b32_e32 v0, 2, v3 ; GFX11_W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11_W32-NEXT: global_load_b96 v[1:3], v1, s[2:3] -; GFX11_W32-NEXT: s_mov_b32 s2, 0 -; GFX11_W32-NEXT: s_mov_b32 s3, exec_lo -; GFX11_W32-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11_W32-NEXT: global_load_b96 v[0:2], v0, s[0:1] +; GFX11_W32-NEXT: s_mov_b32 s1, exec_lo +; GFX11_W32-NEXT: v_cmpx_eq_u32_e32 0, v3 ; GFX11_W32-NEXT: s_cbranch_execz .LBB13_2 ; GFX11_W32-NEXT: ; %bb.1: ; %bb -; GFX11_W32-NEXT: s_load_b64 s[4:5], s[0:1], 0x50 +; GFX11_W32-NEXT: s_load_b64 s[4:5], s[2:3], 0x50 ; GFX11_W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11_W32-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11_W32-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11_W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11_W32-NEXT: s_cmp_lg_u32 s2, 0 -; GFX11_W32-NEXT: s_cselect_b32 s2, 1, 0 +; GFX11_W32-NEXT: s_cmp_lg_u32 s0, 0 +; GFX11_W32-NEXT: s_cselect_b32 s0, 1, 0 +; GFX11_W32-NEXT: s_and_not1_b32 s4, 0, exec_lo +; GFX11_W32-NEXT: s_and_b32 s0, 1, s0 +; GFX11_W32-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 +; GFX11_W32-NEXT: s_and_b32 s0, exec_lo, s0 +; GFX11_W32-NEXT: s_or_b32 vcc_lo, s4, s0 ; GFX11_W32-NEXT: .LBB13_2: ; %exit -; GFX11_W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX11_W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11_W32-NEXT: s_and_b32 s2, 1, s2 -; GFX11_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2 +; GFX11_W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11_W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11_W32-NEXT: s_waitcnt vmcnt(0) -; GFX11_W32-NEXT: v_div_fmas_f32 v0, v1, v2, v3 +; GFX11_W32-NEXT: v_div_fmas_f32 v0, v0, v1, v2 ; GFX11_W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX11_W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11_W32-NEXT: global_store_b32 v1, v0, s[0:1] offset:8 @@ -1518,28 +1544,32 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [ ; ; GFX11_W64-LABEL: test_div_fmas_f32_i1_phi_vcc: ; GFX11_W64: ; %bb.0: ; %entry -; GFX11_W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x28 -; GFX11_W64-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; GFX11_W64-NEXT: s_mov_b32 s4, 0 +; GFX11_W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x28 +; GFX11_W64-NEXT: v_and_b32_e32 v3, 0x3ff, v0 +; GFX11_W64-NEXT: s_mov_b64 vcc, 0 +; GFX11_W64-NEXT: s_mov_b64 s[4:5], exec +; GFX11_W64-NEXT: v_lshlrev_b32_e32 v0, 2, v3 ; 
GFX11_W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11_W64-NEXT: global_load_b96 v[1:3], v1, s[2:3] -; GFX11_W64-NEXT: s_mov_b64 s[2:3], exec -; GFX11_W64-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11_W64-NEXT: global_load_b96 v[0:2], v0, s[0:1] +; GFX11_W64-NEXT: v_cmpx_eq_u32_e32 0, v3 ; GFX11_W64-NEXT: s_cbranch_execz .LBB13_2 ; GFX11_W64-NEXT: ; %bb.1: ; %bb -; GFX11_W64-NEXT: s_load_b64 s[4:5], s[0:1], 0x50 +; GFX11_W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x50 ; GFX11_W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11_W64-NEXT: s_load_b32 s4, s[4:5], 0x0 +; GFX11_W64-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11_W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11_W64-NEXT: s_cmp_lg_u32 s4, 0 -; GFX11_W64-NEXT: s_cselect_b32 s4, 1, 0 +; GFX11_W64-NEXT: s_cmp_lg_u32 s0, 0 +; GFX11_W64-NEXT: s_cselect_b32 s0, 1, 0 +; GFX11_W64-NEXT: s_and_not1_b64 s[6:7], 0, exec +; GFX11_W64-NEXT: s_and_b32 s0, 1, s0 +; GFX11_W64-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 +; GFX11_W64-NEXT: s_and_b64 s[0:1], exec, s[0:1] +; GFX11_W64-NEXT: s_or_b64 vcc, s[6:7], s[0:1] ; GFX11_W64-NEXT: .LBB13_2: ; %exit -; GFX11_W64-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX11_W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11_W64-NEXT: s_and_b32 s2, 1, s4 -; GFX11_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 +; GFX11_W64-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX11_W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11_W64-NEXT: s_waitcnt vmcnt(0) -; GFX11_W64-NEXT: v_div_fmas_f32 v0, v1, v2, v3 +; GFX11_W64-NEXT: v_div_fmas_f32 v0, v0, v1, v2 ; GFX11_W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX11_W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11_W64-NEXT: global_store_b32 v1, v0, s[0:1] offset:8 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll index bf72cccd912cee..de318e7ae31a5b 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll @@ -393,12 +393,11 @@ define amdgpu_kernel void @select_add_lhs_const_i16(i1 %cond) { ; GCN-LABEL: select_add_lhs_const_i16: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dword s0, s[6:7], 0x0 -; GCN-NEXT: v_mov_b32_e32 v0, 0x83 -; GCN-NEXT: v_mov_b32_e32 v1, 0x80 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_bitcmp1_b32 s0, 0 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN-NEXT: s_movk_i32 s0, 0x80 +; GCN-NEXT: s_cselect_b32 s0, s0, 0x83 +; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: flat_store_short v[0:1], v0 ; GCN-NEXT: s_endpgm %select = select i1 %cond, i16 5, i16 8 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-i16-to-i32.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-i16-to-i32.ll index 210356d1313501..b8585120afa45f 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-i16-to-i32.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-i16-to-i32.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -mtriple=amdgcn-- -amdgpu-codegenprepare %s | FileCheck -check-prefix=SI %s -; RUN: opt -S -mtriple=amdgcn-- -mcpu=tonga -amdgpu-codegenprepare %s | FileCheck -check-prefix=VI %s +; RUN: opt -S -amdgpu-codegenprepare-widen-16-bit-ops -mtriple=amdgcn-- -amdgpu-codegenprepare %s | FileCheck -check-prefix=SI %s +; RUN: opt -S -amdgpu-codegenprepare-widen-16-bit-ops -mtriple=amdgcn-- -mcpu=tonga -amdgpu-codegenprepare %s | FileCheck -check-prefix=VI %s define amdgpu_kernel void @add_i3(i3 %a, i3 %b) { ; SI-LABEL: @add_i3( diff --git 
a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll index 0025d23b108038..32b2fa238cbac4 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll @@ -18,189 +18,33 @@ declare hidden half @_Z4pownDhi(half, i32) ; -------------------------------------------------------------------- define half @test_pow_fast_f16(half %x, half %y) { -; CHECK-LABEL: test_pow_fast_f16: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_getpc_b64 s[16:17] -; CHECK-NEXT: s_add_u32 s16, s16, _Z3powDhDh@rel32@lo+4 -; CHECK-NEXT: s_addc_u32 s17, s17, _Z3powDhDh@rel32@hi+12 -; CHECK-NEXT: s_setpc_b64 s[16:17] %pow = tail call fast half @_Z3powDhDh(half %x, half %y) ret half %pow } define float @test_pow_fast_f32(float %x, float %y) { -; CHECK-LABEL: test_pow_fast_f32: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_getpc_b64 s[16:17] -; CHECK-NEXT: s_add_u32 s16, s16, _Z3powff@rel32@lo+4 -; CHECK-NEXT: s_addc_u32 s17, s17, _Z3powff@rel32@hi+12 -; CHECK-NEXT: s_setpc_b64 s[16:17] %pow = tail call fast float @_Z3powff(float %x, float %y) ret float %pow } define double @test_pow_fast_f64(double %x, double %y) { -; CHECK-LABEL: test_pow_fast_f64: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_getpc_b64 s[16:17] -; CHECK-NEXT: s_add_u32 s16, s16, _Z3powdd@rel32@lo+4 -; CHECK-NEXT: s_addc_u32 s17, s17, _Z3powdd@rel32@hi+12 -; CHECK-NEXT: s_setpc_b64 s[16:17] %pow = tail call fast double @_Z3powdd(double %x, double %y) ret double %pow } define half @test_pow_fast_f16__integral_y(half %x, i32 %y.i) { -; CHECK-LABEL: test_pow_fast_f16__integral_y: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_cvt_f32_i32_e32 v1, v1 -; CHECK-NEXT: v_log_f16_e64 v3, |v0| -; CHECK-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CHECK-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CHECK-NEXT: v_cvt_i32_f32_e32 v1, v1 -; CHECK-NEXT: v_cvt_f32_i32_e32 v2, v1 -; CHECK-NEXT: v_lshlrev_b16_e32 v1, 15, v1 -; CHECK-NEXT: v_and_b32_e32 v0, v1, v0 -; CHECK-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CHECK-NEXT: v_mul_f16_e32 v2, v3, v2 -; CHECK-NEXT: v_exp_f16_e32 v2, v2 -; CHECK-NEXT: v_or_b32_e32 v0, v0, v2 -; CHECK-NEXT: s_setpc_b64 s[30:31] %y = sitofp i32 %y.i to half %pow = tail call fast half @_Z3powDhDh(half %x, half %y) ret half %pow } define float @test_pow_fast_f32__integral_y(float %x, i32 %y.i) { -; CHECK-LABEL: test_pow_fast_f32__integral_y: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_cvt_f32_i32_e32 v1, v1 -; CHECK-NEXT: s_mov_b32 s4, 0x800000 -; CHECK-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 -; CHECK-NEXT: v_mov_b32_e32 v3, 0x4f800000 -; CHECK-NEXT: v_cvt_i32_f32_e32 v1, v1 -; CHECK-NEXT: v_cndmask_b32_e32 v3, 1.0, v3, vcc -; CHECK-NEXT: v_mul_f32_e64 v3, |v0|, v3 -; CHECK-NEXT: v_log_f32_e32 v3, v3 -; CHECK-NEXT: v_cvt_f32_i32_e32 v4, v1 -; CHECK-NEXT: v_mov_b32_e32 v2, 0x42000000 -; CHECK-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; CHECK-NEXT: v_sub_f32_e32 v2, v3, v2 -; CHECK-NEXT: v_mul_f32_e32 v3, v2, v4 -; CHECK-NEXT: s_mov_b32 s4, 0xc2fc0000 -; CHECK-NEXT: v_mov_b32_e32 v5, 0x42800000 -; CHECK-NEXT: v_cmp_gt_f32_e32 vcc, s4, v3 -; CHECK-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc -; CHECK-NEXT: v_fma_f32 v2, v2, v4, v3 -; CHECK-NEXT: v_exp_f32_e32 v2, v2 -; CHECK-NEXT: v_mov_b32_e32 v3, 0x1f800000 
-; CHECK-NEXT: v_cndmask_b32_e32 v3, 1.0, v3, vcc -; CHECK-NEXT: v_lshlrev_b32_e32 v1, 31, v1 -; CHECK-NEXT: v_mul_f32_e32 v2, v2, v3 -; CHECK-NEXT: v_and_or_b32 v0, v1, v0, v2 -; CHECK-NEXT: s_setpc_b64 s[30:31] %y = sitofp i32 %y.i to float %pow = tail call fast float @_Z3powff(float %x, float %y) ret float %pow } define double @test_pow_fast_f64__integral_y(double %x, i32 %y.i) { -; CHECK-LABEL: test_pow_fast_f64__integral_y: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s16, s33 -; CHECK-NEXT: s_mov_b32 s33, s32 -; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1 -; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; CHECK-NEXT: s_mov_b64 exec, s[18:19] -; CHECK-NEXT: v_writelane_b32 v43, s16, 14 -; CHECK-NEXT: v_writelane_b32 v43, s30, 0 -; CHECK-NEXT: v_writelane_b32 v43, s31, 1 -; CHECK-NEXT: v_writelane_b32 v43, s34, 2 -; CHECK-NEXT: v_writelane_b32 v43, s35, 3 -; CHECK-NEXT: v_writelane_b32 v43, s36, 4 -; CHECK-NEXT: v_writelane_b32 v43, s37, 5 -; CHECK-NEXT: v_writelane_b32 v43, s38, 6 -; CHECK-NEXT: v_writelane_b32 v43, s39, 7 -; CHECK-NEXT: s_addk_i32 s32, 0x800 -; CHECK-NEXT: v_writelane_b32 v43, s40, 8 -; CHECK-NEXT: v_writelane_b32 v43, s41, 9 -; CHECK-NEXT: s_mov_b64 s[40:41], s[4:5] -; CHECK-NEXT: s_getpc_b64 s[4:5] -; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12 -; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; CHECK-NEXT: v_writelane_b32 v43, s42, 10 -; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill -; CHECK-NEXT: v_writelane_b32 v43, s43, 11 -; CHECK-NEXT: v_mov_b32_e32 v42, v1 -; CHECK-NEXT: v_writelane_b32 v43, s44, 12 -; CHECK-NEXT: v_and_b32_e32 v1, 0x7fffffff, v42 -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] -; CHECK-NEXT: v_writelane_b32 v43, s45, 13 -; CHECK-NEXT: v_mov_b32_e32 v40, v31 -; CHECK-NEXT: s_mov_b64 s[34:35], s[6:7] -; CHECK-NEXT: v_mov_b32_e32 v41, v2 -; CHECK-NEXT: s_mov_b32 s42, s15 -; CHECK-NEXT: s_mov_b32 s43, s14 -; CHECK-NEXT: s_mov_b32 s44, s13 -; CHECK-NEXT: s_mov_b32 s45, s12 -; CHECK-NEXT: s_mov_b64 s[36:37], s[10:11] -; CHECK-NEXT: s_mov_b64 s[38:39], s[8:9] -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] -; CHECK-NEXT: v_cvt_f64_i32_e32 v[2:3], v41 -; CHECK-NEXT: s_getpc_b64 s[4:5] -; CHECK-NEXT: s_add_u32 s4, s4, _Z4exp2d@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s5, s5, _Z4exp2d@gotpcrel32@hi+12 -; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] -; CHECK-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] -; CHECK-NEXT: s_mov_b64 s[6:7], s[34:35] -; CHECK-NEXT: s_mov_b64 s[8:9], s[38:39] -; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] -; CHECK-NEXT: s_mov_b32 s12, s45 -; CHECK-NEXT: s_mov_b32 s13, s44 -; CHECK-NEXT: s_mov_b32 s14, s43 -; CHECK-NEXT: s_mov_b32 s15, s42 -; CHECK-NEXT: v_mov_b32_e32 v31, v40 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] -; CHECK-NEXT: v_lshlrev_b32_e32 v2, 31, v41 -; CHECK-NEXT: v_and_b32_e32 v2, v2, v42 -; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; 
CHECK-NEXT: v_or_b32_e32 v1, v2, v1 -; CHECK-NEXT: v_readlane_b32 s45, v43, 13 -; CHECK-NEXT: v_readlane_b32 s44, v43, 12 -; CHECK-NEXT: v_readlane_b32 s43, v43, 11 -; CHECK-NEXT: v_readlane_b32 s42, v43, 10 -; CHECK-NEXT: v_readlane_b32 s41, v43, 9 -; CHECK-NEXT: v_readlane_b32 s40, v43, 8 -; CHECK-NEXT: v_readlane_b32 s39, v43, 7 -; CHECK-NEXT: v_readlane_b32 s38, v43, 6 -; CHECK-NEXT: v_readlane_b32 s37, v43, 5 -; CHECK-NEXT: v_readlane_b32 s36, v43, 4 -; CHECK-NEXT: v_readlane_b32 s35, v43, 3 -; CHECK-NEXT: v_readlane_b32 s34, v43, 2 -; CHECK-NEXT: v_readlane_b32 s31, v43, 1 -; CHECK-NEXT: v_readlane_b32 s30, v43, 0 -; CHECK-NEXT: v_readlane_b32 s4, v43, 14 -; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 -; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; CHECK-NEXT: s_mov_b64 exec, s[6:7] -; CHECK-NEXT: s_addk_i32 s32, 0xf800 -; CHECK-NEXT: s_mov_b32 s33, s4 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: s_setpc_b64 s[30:31] %y = sitofp i32 %y.i to double %pow = tail call fast double @_Z3powdd(double %x, double %y) ret double %pow @@ -211,132 +55,16 @@ define double @test_pow_fast_f64__integral_y(double %x, i32 %y.i) { ; -------------------------------------------------------------------- define half @test_powr_fast_f16(half %x, half %y) { -; CHECK-LABEL: test_powr_fast_f16: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_log_f16_e32 v0, v0 -; CHECK-NEXT: v_mul_f16_e32 v0, v1, v0 -; CHECK-NEXT: v_exp_f16_e32 v0, v0 -; CHECK-NEXT: s_setpc_b64 s[30:31] %powr = tail call fast half @_Z4powrDhDh(half %x, half %y) ret half %powr } define float @test_powr_fast_f32(float %x, float %y) { -; CHECK-LABEL: test_powr_fast_f32: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s4, 0x800000 -; CHECK-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; CHECK-NEXT: v_mov_b32_e32 v3, 0x4f800000 -; CHECK-NEXT: v_cndmask_b32_e32 v3, 1.0, v3, vcc -; CHECK-NEXT: v_mul_f32_e32 v0, v0, v3 -; CHECK-NEXT: v_log_f32_e32 v0, v0 -; CHECK-NEXT: v_mov_b32_e32 v2, 0x42000000 -; CHECK-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; CHECK-NEXT: s_mov_b32 s4, 0xc2fc0000 -; CHECK-NEXT: v_sub_f32_e32 v0, v0, v2 -; CHECK-NEXT: v_mul_f32_e32 v2, v1, v0 -; CHECK-NEXT: v_mov_b32_e32 v3, 0x42800000 -; CHECK-NEXT: v_cmp_gt_f32_e32 vcc, s4, v2 -; CHECK-NEXT: v_cndmask_b32_e32 v2, 0, v3, vcc -; CHECK-NEXT: v_fma_f32 v0, v1, v0, v2 -; CHECK-NEXT: v_exp_f32_e32 v0, v0 -; CHECK-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; CHECK-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; CHECK-NEXT: v_mul_f32_e32 v0, v0, v1 -; CHECK-NEXT: s_setpc_b64 s[30:31] %powr = tail call fast float @_Z4powrff(float %x, float %y) ret float %powr } define double @test_powr_fast_f64(double %x, double %y) { -; CHECK-LABEL: test_powr_fast_f64: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s16, s33 -; CHECK-NEXT: s_mov_b32 s33, s32 -; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1 -; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; CHECK-NEXT: s_mov_b64 exec, s[18:19] -; CHECK-NEXT: v_writelane_b32 v43, s16, 14 -; CHECK-NEXT: v_writelane_b32 v43, s30, 0 -; CHECK-NEXT: v_writelane_b32 v43, s31, 1 -; CHECK-NEXT: v_writelane_b32 v43, s34, 2 -; CHECK-NEXT: v_writelane_b32 v43, s35, 3 -; CHECK-NEXT: v_writelane_b32 v43, s36, 4 -; CHECK-NEXT: v_writelane_b32 v43, s37, 5 -; CHECK-NEXT: v_writelane_b32 v43, s38, 6 -; CHECK-NEXT: v_writelane_b32 v43, s39, 7 -; CHECK-NEXT: s_addk_i32 s32, 
0x800 -; CHECK-NEXT: v_writelane_b32 v43, s40, 8 -; CHECK-NEXT: v_writelane_b32 v43, s41, 9 -; CHECK-NEXT: s_mov_b64 s[40:41], s[4:5] -; CHECK-NEXT: s_getpc_b64 s[4:5] -; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12 -; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; CHECK-NEXT: v_writelane_b32 v43, s42, 10 -; CHECK-NEXT: v_writelane_b32 v43, s43, 11 -; CHECK-NEXT: v_writelane_b32 v43, s44, 12 -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] -; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill -; CHECK-NEXT: v_writelane_b32 v43, s45, 13 -; CHECK-NEXT: v_mov_b32_e32 v42, v31 -; CHECK-NEXT: s_mov_b64 s[34:35], s[6:7] -; CHECK-NEXT: v_mov_b32_e32 v41, v3 -; CHECK-NEXT: v_mov_b32_e32 v40, v2 -; CHECK-NEXT: s_mov_b32 s42, s15 -; CHECK-NEXT: s_mov_b32 s43, s14 -; CHECK-NEXT: s_mov_b32 s44, s13 -; CHECK-NEXT: s_mov_b32 s45, s12 -; CHECK-NEXT: s_mov_b64 s[36:37], s[10:11] -; CHECK-NEXT: s_mov_b64 s[38:39], s[8:9] -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] -; CHECK-NEXT: v_mul_f64 v[0:1], v[40:41], v[0:1] -; CHECK-NEXT: s_getpc_b64 s[4:5] -; CHECK-NEXT: s_add_u32 s4, s4, _Z4exp2d@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s5, s5, _Z4exp2d@gotpcrel32@hi+12 -; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] -; CHECK-NEXT: s_mov_b64 s[6:7], s[34:35] -; CHECK-NEXT: s_mov_b64 s[8:9], s[38:39] -; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] -; CHECK-NEXT: s_mov_b32 s12, s45 -; CHECK-NEXT: s_mov_b32 s13, s44 -; CHECK-NEXT: s_mov_b32 s14, s43 -; CHECK-NEXT: s_mov_b32 s15, s42 -; CHECK-NEXT: v_mov_b32_e32 v31, v42 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] -; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; CHECK-NEXT: v_readlane_b32 s45, v43, 13 -; CHECK-NEXT: v_readlane_b32 s44, v43, 12 -; CHECK-NEXT: v_readlane_b32 s43, v43, 11 -; CHECK-NEXT: v_readlane_b32 s42, v43, 10 -; CHECK-NEXT: v_readlane_b32 s41, v43, 9 -; CHECK-NEXT: v_readlane_b32 s40, v43, 8 -; CHECK-NEXT: v_readlane_b32 s39, v43, 7 -; CHECK-NEXT: v_readlane_b32 s38, v43, 6 -; CHECK-NEXT: v_readlane_b32 s37, v43, 5 -; CHECK-NEXT: v_readlane_b32 s36, v43, 4 -; CHECK-NEXT: v_readlane_b32 s35, v43, 3 -; CHECK-NEXT: v_readlane_b32 s34, v43, 2 -; CHECK-NEXT: v_readlane_b32 s31, v43, 1 -; CHECK-NEXT: v_readlane_b32 s30, v43, 0 -; CHECK-NEXT: v_readlane_b32 s4, v43, 14 -; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 -; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; CHECK-NEXT: s_mov_b64 exec, s[6:7] -; CHECK-NEXT: s_addk_i32 s32, 0xf800 -; CHECK-NEXT: s_mov_b32 s33, s4 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: s_setpc_b64 s[30:31] %powr = tail call fast double @_Z4powrdd(double %x, double %y) ret double %powr } @@ -346,429 +74,51 @@ define double @test_powr_fast_f64(double %x, double %y) { ; -------------------------------------------------------------------- define half @test_pown_fast_f16(half %x, i32 %y) { -; CHECK-LABEL: test_pown_fast_f16: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: 
v_cvt_f32_i32_e32 v2, v1 -; CHECK-NEXT: v_log_f16_e64 v3, |v0| -; CHECK-NEXT: v_lshlrev_b16_e32 v1, 15, v1 -; CHECK-NEXT: v_and_b32_e32 v0, v1, v0 -; CHECK-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CHECK-NEXT: v_mul_f16_e32 v2, v3, v2 -; CHECK-NEXT: v_exp_f16_e32 v2, v2 -; CHECK-NEXT: v_or_b32_e32 v0, v0, v2 -; CHECK-NEXT: s_setpc_b64 s[30:31] %call = tail call fast half @_Z4pownDhi(half %x, i32 %y) ret half %call } define float @test_pown_fast_f32(float %x, i32 %y) { -; CHECK-LABEL: test_pown_fast_f32: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s4, 0x800000 -; CHECK-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 -; CHECK-NEXT: v_mov_b32_e32 v3, 0x4f800000 -; CHECK-NEXT: v_cndmask_b32_e32 v3, 1.0, v3, vcc -; CHECK-NEXT: v_mul_f32_e64 v3, |v0|, v3 -; CHECK-NEXT: v_log_f32_e32 v3, v3 -; CHECK-NEXT: v_cvt_f32_i32_e32 v4, v1 -; CHECK-NEXT: v_mov_b32_e32 v2, 0x42000000 -; CHECK-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; CHECK-NEXT: v_sub_f32_e32 v2, v3, v2 -; CHECK-NEXT: v_mul_f32_e32 v3, v2, v4 -; CHECK-NEXT: s_mov_b32 s4, 0xc2fc0000 -; CHECK-NEXT: v_mov_b32_e32 v5, 0x42800000 -; CHECK-NEXT: v_cmp_gt_f32_e32 vcc, s4, v3 -; CHECK-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc -; CHECK-NEXT: v_fma_f32 v2, v2, v4, v3 -; CHECK-NEXT: v_exp_f32_e32 v2, v2 -; CHECK-NEXT: v_mov_b32_e32 v3, 0x1f800000 -; CHECK-NEXT: v_cndmask_b32_e32 v3, 1.0, v3, vcc -; CHECK-NEXT: v_lshlrev_b32_e32 v1, 31, v1 -; CHECK-NEXT: v_mul_f32_e32 v2, v2, v3 -; CHECK-NEXT: v_and_or_b32 v0, v1, v0, v2 -; CHECK-NEXT: s_setpc_b64 s[30:31] %call = tail call fast float @_Z4pownfi(float %x, i32 %y) ret float %call } define double @test_pown_fast_f64(double %x, i32 %y) { -; CHECK-LABEL: test_pown_fast_f64: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s16, s33 -; CHECK-NEXT: s_mov_b32 s33, s32 -; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1 -; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; CHECK-NEXT: s_mov_b64 exec, s[18:19] -; CHECK-NEXT: v_writelane_b32 v43, s16, 14 -; CHECK-NEXT: v_writelane_b32 v43, s30, 0 -; CHECK-NEXT: v_writelane_b32 v43, s31, 1 -; CHECK-NEXT: v_writelane_b32 v43, s34, 2 -; CHECK-NEXT: v_writelane_b32 v43, s35, 3 -; CHECK-NEXT: v_writelane_b32 v43, s36, 4 -; CHECK-NEXT: v_writelane_b32 v43, s37, 5 -; CHECK-NEXT: v_writelane_b32 v43, s38, 6 -; CHECK-NEXT: v_writelane_b32 v43, s39, 7 -; CHECK-NEXT: s_addk_i32 s32, 0x800 -; CHECK-NEXT: v_writelane_b32 v43, s40, 8 -; CHECK-NEXT: v_writelane_b32 v43, s41, 9 -; CHECK-NEXT: s_mov_b64 s[40:41], s[4:5] -; CHECK-NEXT: s_getpc_b64 s[4:5] -; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12 -; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; CHECK-NEXT: v_writelane_b32 v43, s42, 10 -; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill -; CHECK-NEXT: v_writelane_b32 v43, s43, 11 -; CHECK-NEXT: v_mov_b32_e32 v42, v1 -; CHECK-NEXT: v_writelane_b32 v43, s44, 12 -; CHECK-NEXT: v_and_b32_e32 v1, 0x7fffffff, v42 -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] -; CHECK-NEXT: v_writelane_b32 v43, s45, 13 -; CHECK-NEXT: v_mov_b32_e32 v40, v31 -; CHECK-NEXT: s_mov_b64 s[34:35], s[6:7] -; CHECK-NEXT: v_mov_b32_e32 v41, v2 -; CHECK-NEXT: s_mov_b32 s42, s15 -; CHECK-NEXT: s_mov_b32 s43, s14 -; CHECK-NEXT: 
s_mov_b32 s44, s13 -; CHECK-NEXT: s_mov_b32 s45, s12 -; CHECK-NEXT: s_mov_b64 s[36:37], s[10:11] -; CHECK-NEXT: s_mov_b64 s[38:39], s[8:9] -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] -; CHECK-NEXT: v_cvt_f64_i32_e32 v[2:3], v41 -; CHECK-NEXT: s_getpc_b64 s[4:5] -; CHECK-NEXT: s_add_u32 s4, s4, _Z4exp2d@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s5, s5, _Z4exp2d@gotpcrel32@hi+12 -; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] -; CHECK-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] -; CHECK-NEXT: s_mov_b64 s[6:7], s[34:35] -; CHECK-NEXT: s_mov_b64 s[8:9], s[38:39] -; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] -; CHECK-NEXT: s_mov_b32 s12, s45 -; CHECK-NEXT: s_mov_b32 s13, s44 -; CHECK-NEXT: s_mov_b32 s14, s43 -; CHECK-NEXT: s_mov_b32 s15, s42 -; CHECK-NEXT: v_mov_b32_e32 v31, v40 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] -; CHECK-NEXT: v_lshlrev_b32_e32 v2, 31, v41 -; CHECK-NEXT: v_and_b32_e32 v2, v2, v42 -; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; CHECK-NEXT: v_or_b32_e32 v1, v2, v1 -; CHECK-NEXT: v_readlane_b32 s45, v43, 13 -; CHECK-NEXT: v_readlane_b32 s44, v43, 12 -; CHECK-NEXT: v_readlane_b32 s43, v43, 11 -; CHECK-NEXT: v_readlane_b32 s42, v43, 10 -; CHECK-NEXT: v_readlane_b32 s41, v43, 9 -; CHECK-NEXT: v_readlane_b32 s40, v43, 8 -; CHECK-NEXT: v_readlane_b32 s39, v43, 7 -; CHECK-NEXT: v_readlane_b32 s38, v43, 6 -; CHECK-NEXT: v_readlane_b32 s37, v43, 5 -; CHECK-NEXT: v_readlane_b32 s36, v43, 4 -; CHECK-NEXT: v_readlane_b32 s35, v43, 3 -; CHECK-NEXT: v_readlane_b32 s34, v43, 2 -; CHECK-NEXT: v_readlane_b32 s31, v43, 1 -; CHECK-NEXT: v_readlane_b32 s30, v43, 0 -; CHECK-NEXT: v_readlane_b32 s4, v43, 14 -; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 -; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; CHECK-NEXT: s_mov_b64 exec, s[6:7] -; CHECK-NEXT: s_addk_i32 s32, 0xf800 -; CHECK-NEXT: s_mov_b32 s33, s4 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: s_setpc_b64 s[30:31] %call = tail call fast double @_Z4powndi(double %x, i32 %y) ret double %call } define half @test_pown_fast_f16_known_even(half %x, i32 %y.arg) { -; CHECK-LABEL: test_pown_fast_f16_known_even: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshlrev_b32_e32 v1, 1, v1 -; CHECK-NEXT: v_cvt_f32_i32_e32 v1, v1 -; CHECK-NEXT: v_log_f16_e64 v0, |v0| -; CHECK-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CHECK-NEXT: v_mul_f16_e32 v0, v0, v1 -; CHECK-NEXT: v_exp_f16_e32 v0, v0 -; CHECK-NEXT: s_setpc_b64 s[30:31] %y = shl i32 %y.arg, 1 %call = tail call fast half @_Z4pownDhi(half %x, i32 %y) ret half %call } define float @test_pown_fast_f32_known_even(float %x, i32 %y.arg) { -; CHECK-LABEL: test_pown_fast_f32_known_even: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s4, 0x800000 -; CHECK-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 -; CHECK-NEXT: v_mov_b32_e32 v3, 0x4f800000 -; CHECK-NEXT: v_cndmask_b32_e32 v3, 1.0, v3, vcc -; CHECK-NEXT: v_mul_f32_e64 v0, |v0|, v3 -; CHECK-NEXT: v_lshlrev_b32_e32 v1, 1, v1 -; CHECK-NEXT: v_log_f32_e32 v0, v0 -; CHECK-NEXT: v_cvt_f32_i32_e32 v1, v1 -; CHECK-NEXT: v_mov_b32_e32 v2, 0x42000000 -; CHECK-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; CHECK-NEXT: v_sub_f32_e32 
v0, v0, v2 -; CHECK-NEXT: v_mul_f32_e32 v2, v0, v1 -; CHECK-NEXT: s_mov_b32 s4, 0xc2fc0000 -; CHECK-NEXT: v_mov_b32_e32 v3, 0x42800000 -; CHECK-NEXT: v_cmp_gt_f32_e32 vcc, s4, v2 -; CHECK-NEXT: v_cndmask_b32_e32 v2, 0, v3, vcc -; CHECK-NEXT: v_fma_f32 v0, v0, v1, v2 -; CHECK-NEXT: v_exp_f32_e32 v0, v0 -; CHECK-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; CHECK-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; CHECK-NEXT: v_mul_f32_e32 v0, v0, v1 -; CHECK-NEXT: s_setpc_b64 s[30:31] %y = shl i32 %y.arg, 1 %call = tail call fast float @_Z4pownfi(float %x, i32 %y) ret float %call } define double @test_pown_fast_f64_known_even(double %x, i32 %y.arg) { -; CHECK-LABEL: test_pown_fast_f64_known_even: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s16, s33 -; CHECK-NEXT: s_mov_b32 s33, s32 -; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1 -; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; CHECK-NEXT: s_mov_b64 exec, s[18:19] -; CHECK-NEXT: v_writelane_b32 v42, s16, 14 -; CHECK-NEXT: v_writelane_b32 v42, s30, 0 -; CHECK-NEXT: v_writelane_b32 v42, s31, 1 -; CHECK-NEXT: v_writelane_b32 v42, s34, 2 -; CHECK-NEXT: v_writelane_b32 v42, s35, 3 -; CHECK-NEXT: v_writelane_b32 v42, s36, 4 -; CHECK-NEXT: v_writelane_b32 v42, s37, 5 -; CHECK-NEXT: v_writelane_b32 v42, s38, 6 -; CHECK-NEXT: v_writelane_b32 v42, s39, 7 -; CHECK-NEXT: s_addk_i32 s32, 0x400 -; CHECK-NEXT: v_writelane_b32 v42, s40, 8 -; CHECK-NEXT: v_writelane_b32 v42, s41, 9 -; CHECK-NEXT: s_mov_b64 s[40:41], s[4:5] -; CHECK-NEXT: s_getpc_b64 s[4:5] -; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12 -; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; CHECK-NEXT: v_writelane_b32 v42, s42, 10 -; CHECK-NEXT: v_writelane_b32 v42, s43, 11 -; CHECK-NEXT: v_writelane_b32 v42, s44, 12 -; CHECK-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] -; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill -; CHECK-NEXT: v_writelane_b32 v42, s45, 13 -; CHECK-NEXT: v_mov_b32_e32 v40, v31 -; CHECK-NEXT: s_mov_b64 s[34:35], s[6:7] -; CHECK-NEXT: s_mov_b32 s42, s15 -; CHECK-NEXT: s_mov_b32 s43, s14 -; CHECK-NEXT: s_mov_b32 s44, s13 -; CHECK-NEXT: s_mov_b32 s45, s12 -; CHECK-NEXT: s_mov_b64 s[36:37], s[10:11] -; CHECK-NEXT: s_mov_b64 s[38:39], s[8:9] -; CHECK-NEXT: v_lshlrev_b32_e32 v41, 1, v2 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] -; CHECK-NEXT: v_cvt_f64_i32_e32 v[2:3], v41 -; CHECK-NEXT: s_getpc_b64 s[4:5] -; CHECK-NEXT: s_add_u32 s4, s4, _Z4exp2d@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s5, s5, _Z4exp2d@gotpcrel32@hi+12 -; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] -; CHECK-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] -; CHECK-NEXT: s_mov_b64 s[6:7], s[34:35] -; CHECK-NEXT: s_mov_b64 s[8:9], s[38:39] -; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] -; CHECK-NEXT: s_mov_b32 s12, s45 -; CHECK-NEXT: s_mov_b32 s13, s44 -; CHECK-NEXT: s_mov_b32 s14, s43 -; CHECK-NEXT: s_mov_b32 s15, s42 -; CHECK-NEXT: v_mov_b32_e32 v31, v40 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] -; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; CHECK-NEXT: v_readlane_b32 s45, v42, 13 
-; CHECK-NEXT: v_readlane_b32 s44, v42, 12 -; CHECK-NEXT: v_readlane_b32 s43, v42, 11 -; CHECK-NEXT: v_readlane_b32 s42, v42, 10 -; CHECK-NEXT: v_readlane_b32 s41, v42, 9 -; CHECK-NEXT: v_readlane_b32 s40, v42, 8 -; CHECK-NEXT: v_readlane_b32 s39, v42, 7 -; CHECK-NEXT: v_readlane_b32 s38, v42, 6 -; CHECK-NEXT: v_readlane_b32 s37, v42, 5 -; CHECK-NEXT: v_readlane_b32 s36, v42, 4 -; CHECK-NEXT: v_readlane_b32 s35, v42, 3 -; CHECK-NEXT: v_readlane_b32 s34, v42, 2 -; CHECK-NEXT: v_readlane_b32 s31, v42, 1 -; CHECK-NEXT: v_readlane_b32 s30, v42, 0 -; CHECK-NEXT: v_readlane_b32 s4, v42, 14 -; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 -; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; CHECK-NEXT: s_mov_b64 exec, s[6:7] -; CHECK-NEXT: s_addk_i32 s32, 0xfc00 -; CHECK-NEXT: s_mov_b32 s33, s4 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: s_setpc_b64 s[30:31] %y = shl i32 %y.arg, 1 %call = tail call fast double @_Z4powndi(double %x, i32 %y) ret double %call } define half @test_pown_fast_f16_known_odd(half %x, i32 %y.arg) { -; CHECK-LABEL: test_pown_fast_f16_known_odd: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_or_b32_e32 v1, 1, v1 -; CHECK-NEXT: v_cvt_f32_i32_e32 v1, v1 -; CHECK-NEXT: v_log_f16_e64 v2, |v0| -; CHECK-NEXT: s_movk_i32 s4, 0x7fff -; CHECK-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CHECK-NEXT: v_mul_f16_e32 v1, v2, v1 -; CHECK-NEXT: v_exp_f16_e32 v1, v1 -; CHECK-NEXT: v_bfi_b32 v0, s4, v1, v0 -; CHECK-NEXT: s_setpc_b64 s[30:31] %y = or i32 %y.arg, 1 %call = tail call fast half @_Z4pownDhi(half %x, i32 %y) ret half %call } define float @test_pown_fast_f32_known_odd(float %x, i32 %y.arg) { -; CHECK-LABEL: test_pown_fast_f32_known_odd: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s4, 0x800000 -; CHECK-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 -; CHECK-NEXT: v_mov_b32_e32 v3, 0x4f800000 -; CHECK-NEXT: v_cndmask_b32_e32 v3, 1.0, v3, vcc -; CHECK-NEXT: v_mul_f32_e64 v3, |v0|, v3 -; CHECK-NEXT: v_or_b32_e32 v1, 1, v1 -; CHECK-NEXT: v_log_f32_e32 v3, v3 -; CHECK-NEXT: v_cvt_f32_i32_e32 v1, v1 -; CHECK-NEXT: v_mov_b32_e32 v2, 0x42000000 -; CHECK-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; CHECK-NEXT: v_sub_f32_e32 v2, v3, v2 -; CHECK-NEXT: v_mul_f32_e32 v3, v2, v1 -; CHECK-NEXT: s_mov_b32 s4, 0xc2fc0000 -; CHECK-NEXT: v_mov_b32_e32 v4, 0x42800000 -; CHECK-NEXT: v_cmp_gt_f32_e32 vcc, s4, v3 -; CHECK-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc -; CHECK-NEXT: v_fma_f32 v1, v2, v1, v3 -; CHECK-NEXT: v_exp_f32_e32 v1, v1 -; CHECK-NEXT: v_mov_b32_e32 v2, 0x1f800000 -; CHECK-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc -; CHECK-NEXT: s_brev_b32 s4, -2 -; CHECK-NEXT: v_mul_f32_e32 v1, v1, v2 -; CHECK-NEXT: v_bfi_b32 v0, s4, v1, v0 -; CHECK-NEXT: s_setpc_b64 s[30:31] %y = or i32 %y.arg, 1 %call = tail call fast float @_Z4pownfi(float %x, i32 %y) ret float %call } define double @test_pown_fast_f64_known_odd(double %x, i32 %y.arg) { -; CHECK-LABEL: test_pown_fast_f64_known_odd: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s16, s33 -; CHECK-NEXT: s_mov_b32 s33, s32 -; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1 -; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; CHECK-NEXT: s_mov_b64 exec, s[18:19] -; CHECK-NEXT: v_writelane_b32 v43, s16, 14 -; CHECK-NEXT: v_writelane_b32 v43, s30, 0 -; CHECK-NEXT: v_writelane_b32 v43, s31, 1 -; CHECK-NEXT: v_writelane_b32 v43, s34, 2 -; CHECK-NEXT: v_writelane_b32 v43, 
s35, 3 -; CHECK-NEXT: v_writelane_b32 v43, s36, 4 -; CHECK-NEXT: v_writelane_b32 v43, s37, 5 -; CHECK-NEXT: v_writelane_b32 v43, s38, 6 -; CHECK-NEXT: v_writelane_b32 v43, s39, 7 -; CHECK-NEXT: s_addk_i32 s32, 0x800 -; CHECK-NEXT: v_writelane_b32 v43, s40, 8 -; CHECK-NEXT: v_writelane_b32 v43, s41, 9 -; CHECK-NEXT: s_mov_b64 s[40:41], s[4:5] -; CHECK-NEXT: s_getpc_b64 s[4:5] -; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12 -; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; CHECK-NEXT: v_writelane_b32 v43, s42, 10 -; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill -; CHECK-NEXT: v_writelane_b32 v43, s43, 11 -; CHECK-NEXT: v_mov_b32_e32 v41, v1 -; CHECK-NEXT: v_writelane_b32 v43, s44, 12 -; CHECK-NEXT: v_and_b32_e32 v1, 0x7fffffff, v41 -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] -; CHECK-NEXT: v_writelane_b32 v43, s45, 13 -; CHECK-NEXT: v_mov_b32_e32 v40, v31 -; CHECK-NEXT: s_mov_b64 s[34:35], s[6:7] -; CHECK-NEXT: s_mov_b32 s42, s15 -; CHECK-NEXT: s_mov_b32 s43, s14 -; CHECK-NEXT: s_mov_b32 s44, s13 -; CHECK-NEXT: s_mov_b32 s45, s12 -; CHECK-NEXT: s_mov_b64 s[36:37], s[10:11] -; CHECK-NEXT: s_mov_b64 s[38:39], s[8:9] -; CHECK-NEXT: v_or_b32_e32 v42, 1, v2 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] -; CHECK-NEXT: v_cvt_f64_i32_e32 v[2:3], v42 -; CHECK-NEXT: s_getpc_b64 s[4:5] -; CHECK-NEXT: s_add_u32 s4, s4, _Z4exp2d@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s5, s5, _Z4exp2d@gotpcrel32@hi+12 -; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] -; CHECK-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] -; CHECK-NEXT: s_mov_b64 s[6:7], s[34:35] -; CHECK-NEXT: s_mov_b64 s[8:9], s[38:39] -; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] -; CHECK-NEXT: s_mov_b32 s12, s45 -; CHECK-NEXT: s_mov_b32 s13, s44 -; CHECK-NEXT: s_mov_b32 s14, s43 -; CHECK-NEXT: s_mov_b32 s15, s42 -; CHECK-NEXT: v_mov_b32_e32 v31, v40 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] -; CHECK-NEXT: v_and_b32_e32 v2, 0x80000000, v41 -; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; CHECK-NEXT: v_or_b32_e32 v1, v2, v1 -; CHECK-NEXT: v_readlane_b32 s45, v43, 13 -; CHECK-NEXT: v_readlane_b32 s44, v43, 12 -; CHECK-NEXT: v_readlane_b32 s43, v43, 11 -; CHECK-NEXT: v_readlane_b32 s42, v43, 10 -; CHECK-NEXT: v_readlane_b32 s41, v43, 9 -; CHECK-NEXT: v_readlane_b32 s40, v43, 8 -; CHECK-NEXT: v_readlane_b32 s39, v43, 7 -; CHECK-NEXT: v_readlane_b32 s38, v43, 6 -; CHECK-NEXT: v_readlane_b32 s37, v43, 5 -; CHECK-NEXT: v_readlane_b32 s36, v43, 4 -; CHECK-NEXT: v_readlane_b32 s35, v43, 3 -; CHECK-NEXT: v_readlane_b32 s34, v43, 2 -; CHECK-NEXT: v_readlane_b32 s31, v43, 1 -; CHECK-NEXT: v_readlane_b32 s30, v43, 0 -; CHECK-NEXT: v_readlane_b32 s4, v43, 14 -; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 -; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; CHECK-NEXT: s_mov_b64 exec, s[6:7] -; CHECK-NEXT: s_addk_i32 s32, 0xf800 -; CHECK-NEXT: s_mov_b32 s33, s4 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: s_setpc_b64 s[30:31] %y = or i32 
%y.arg, 1 %call = tail call fast double @_Z4powndi(double %x, i32 %y) ret double %call @@ -776,3 +126,5 @@ define double @test_pown_fast_f64_known_odd(double %x, i32 %y.arg) { !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdhsa_code_object_version", i32 500} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; CHECK: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll b/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll index 9ec8e425a3f55c..5889af70a8f092 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll @@ -236,7 +236,7 @@ entry: ; R600-VECT: MOVA_INT ; SI-PROMOTE-VECT-DAG: s_lshl_b32 -; SI-PROMOTE-VECT-DAG: v_lshrrev +; SI-PROMOTE-VECT-DAG: s_lshr_b32 ; SI-ALLOCA-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 ; encoding: [0x00,0x00,0x60,0xe0 ; SI-ALLOCA-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:1 ; encoding: [0x01,0x00,0x60,0xe0 diff --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll index 04d72691a088ab..86254329923971 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll @@ -951,11 +951,11 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr3 = V_OR_B32_e32 killed $vgpr11, killed $vgpr19, implicit $exec ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr3, killed $vgpr2, implicit $exec ; GFX90A-NEXT: renamable $vgpr3 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_sdwa 0, killed $vgpr53, 0, $vgpr3, 0, 0, 6, implicit $exec + ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U32_sdwa 0, killed $vgpr53, 0, $vgpr3, 0, 0, 6, implicit $exec ; GFX90A-NEXT: renamable $vgpr2 = V_CNDMASK_B32_e64 0, 0, 0, killed $vgpr2, killed $vcc, implicit $exec ; GFX90A-NEXT: renamable $vgpr10 = V_OR_B32_e32 killed $vgpr52, killed $vgpr13, implicit $exec ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr10, killed $vgpr2, implicit $exec - ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_sdwa 0, killed $vgpr17, 0, $vgpr3, 0, 0, 6, implicit $exec + ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U32_sdwa 0, killed $vgpr17, 0, $vgpr3, 0, 0, 6, implicit $exec ; GFX90A-NEXT: renamable $vgpr2 = V_CNDMASK_B32_e64 0, 0, 0, killed $vgpr2, killed $vcc, implicit $exec ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr2, killed $vgpr15, implicit $exec ; GFX90A-NEXT: DS_WRITE2_B32_gfx9 killed renamable $vgpr3, killed renamable $vgpr2, renamable $vgpr3, 0, 1, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, align 4, addrspace 3) diff --git a/llvm/test/CodeGen/AMDGPU/bug-sdag-emitcopyfromreg.ll b/llvm/test/CodeGen/AMDGPU/bug-sdag-emitcopyfromreg.ll index 82808cd3092270..07816f1ed6a650 100644 --- a/llvm/test/CodeGen/AMDGPU/bug-sdag-emitcopyfromreg.ll +++ b/llvm/test/CodeGen/AMDGPU/bug-sdag-emitcopyfromreg.ll @@ -1,5 +1,4 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck %s -check-prefix=ISA ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -stop-before=si-fix-sgpr-copies < %s | FileCheck %s -check-prefix=MIR @@ -41,67 +40,6 @@ define void @f(i32 %arg, ptr %ptr) { ; ISA-NEXT: flat_store_dword v[1:2], v7 
; ISA-NEXT: s_waitcnt lgkmcnt(0) ; ISA-NEXT: s_setpc_b64 s[30:31] - ; MIR-LABEL: name: f - ; MIR: bb.0.bb: - ; MIR-NEXT: successors: %bb.1(0x80000000) - ; MIR-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; MIR-NEXT: {{ $}} - ; MIR-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; MIR-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; MIR-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; MIR-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; MIR-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] - ; MIR-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; MIR-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed [[S_MOV_B64_]], 0, 0 :: (invariant load (s64) from `ptr addrspace(4) null`, align 4294967296, addrspace 4) - ; MIR-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1 - ; MIR-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0 - ; MIR-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1 - ; MIR-NEXT: [[S_LSHR_B32_:%[0-9]+]]:sreg_32 = S_LSHR_B32 [[COPY4]], [[S_MOV_B32_]], implicit-def dead $scc - ; MIR-NEXT: [[S_LSHR_B32_1:%[0-9]+]]:sreg_32 = S_LSHR_B32 [[S_MOV_B32_]], [[COPY5]], implicit-def dead $scc - ; MIR-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; MIR-NEXT: S_CMP_LG_U32 [[COPY5]], [[S_MOV_B32_1]], implicit-def $scc - ; MIR-NEXT: [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec = COPY $scc - ; MIR-NEXT: $scc = COPY [[COPY6]] - ; MIR-NEXT: [[S_CSELECT_B32_:%[0-9]+]]:sreg_32 = S_CSELECT_B32 killed [[S_LSHR_B32_]], [[S_MOV_B32_1]], implicit $scc - ; MIR-NEXT: [[V_CVT_F32_I32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_I32_e64 killed [[S_CSELECT_B32_]], 0, 0, implicit $mode, implicit $exec - ; MIR-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY [[V_CVT_F32_I32_e64_]] - ; MIR-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sgpr_32 = S_MOV_B32 1065353216 - ; MIR-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sgpr_32 = S_MOV_B32 0 - ; MIR-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY killed [[S_MOV_B32_2]] - ; MIR-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[S_MOV_B32_3]], 0, [[COPY8]], [[COPY6]], implicit $exec - ; MIR-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY [[V_CNDMASK_B32_e64_]] - ; MIR-NEXT: $scc = COPY [[COPY6]] - ; MIR-NEXT: [[S_CSELECT_B32_1:%[0-9]+]]:sreg_32 = S_CSELECT_B32 killed [[S_LSHR_B32_1]], [[S_MOV_B32_1]], implicit $scc - ; MIR-NEXT: [[V_CVT_F32_UBYTE0_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_UBYTE0_e64 killed [[S_CSELECT_B32_1]], 0, 0, implicit $exec - ; MIR-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY [[V_CVT_F32_UBYTE0_e64_]] - ; MIR-NEXT: $scc = COPY [[COPY6]] - ; MIR-NEXT: [[S_CSELECT_B32_2:%[0-9]+]]:sreg_32 = S_CSELECT_B32 [[COPY4]], [[S_MOV_B32_1]], implicit $scc - ; MIR-NEXT: [[V_CVT_F32_I32_e64_1:%[0-9]+]]:vgpr_32 = V_CVT_F32_I32_e64 killed [[S_CSELECT_B32_2]], 0, 0, implicit $mode, implicit $exec - ; MIR-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY [[V_CVT_F32_I32_e64_1]] - ; MIR-NEXT: [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_32 = V_CMP_LT_I32_e64 [[COPY2]], [[S_MOV_B32_]], implicit $exec - ; MIR-NEXT: [[COPY12:%[0-9]+]]:vreg_1 = COPY [[V_CMP_LT_I32_e64_]] - ; MIR-NEXT: {{ $}} - ; MIR-NEXT: bb.1.bb14: - ; MIR-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000) - ; MIR-NEXT: {{ $}} - ; MIR-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_1]], %bb.0, %7, %bb.1 - ; MIR-NEXT: [[PHI1:%[0-9]+]]:sgpr_32 = PHI [[S_MOV_B32_3]], %bb.0, %8, %bb.1 - ; MIR-NEXT: [[COPY13:%[0-9]+]]:sreg_32 = COPY [[COPY12]] - ; MIR-NEXT: [[SI_IF_BREAK:%[0-9]+]]:sreg_32 = SI_IF_BREAK [[COPY13]], [[PHI]], implicit-def dead $scc - ; MIR-NEXT: 
[[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[PHI1]], 0, [[COPY9]], 0, 0, implicit $mode, implicit $exec - ; MIR-NEXT: [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[V_ADD_F32_e64_]], 2, [[COPY7]], 0, 0, implicit $mode, implicit $exec - ; MIR-NEXT: [[V_ADD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[V_ADD_F32_e64_1]], 0, [[COPY10]], 0, 0, implicit $mode, implicit $exec - ; MIR-NEXT: [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[V_ADD_F32_e64_2]], 0, [[COPY11]], 0, 0, implicit $mode, implicit $exec - ; MIR-NEXT: [[COPY14:%[0-9]+]]:sgpr_32 = COPY [[V_ADD_F32_e64_3]] - ; MIR-NEXT: SI_LOOP [[SI_IF_BREAK]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec - ; MIR-NEXT: S_BRANCH %bb.2 - ; MIR-NEXT: {{ $}} - ; MIR-NEXT: bb.2.bb21: - ; MIR-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[PHI1]], %bb.1 - ; MIR-NEXT: [[PHI3:%[0-9]+]]:sreg_32 = PHI [[SI_IF_BREAK]], %bb.1 - ; MIR-NEXT: SI_END_CF [[PHI3]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec - ; MIR-NEXT: FLAT_STORE_DWORD [[COPY3]], [[PHI2]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %ir.ptr) - ; MIR-NEXT: SI_RETURN bb: %i = load <2 x i32>, ptr addrspace(4) null, align 4294967296 %i1 = extractelement <2 x i32> %i, i64 1 @@ -134,3 +72,5 @@ bb21: } declare float @llvm.fabs.f32(float) +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; MIR: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll index 231d3d97c8f4f3..29770738f83d57 100644 --- a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll +++ b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll @@ -1069,31 +1069,32 @@ define amdgpu_kernel void @amd_kernel_v2i8(<2 x i8> %arg0) { ; VI-LABEL: amd_kernel_v2i8: ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dword s0, s[2:3], 0x24 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: s_add_i32 s0, s0, s0 -; VI-NEXT: v_add_u32_sdwa v2, vcc, v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 -; VI-NEXT: v_mov_b32_e32 v3, s0 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 0 -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_bfe_u32 s1, s0, 0x80008 +; VI-NEXT: s_add_i32 s0, s0, s0 +; VI-NEXT: s_add_i32 s1, s1, s1 +; VI-NEXT: s_and_b32 s0, s0, 0xff +; VI-NEXT: s_lshl_b32 s1, s1, 8 +; VI-NEXT: s_or_b32 s0, s0, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: amd_kernel_v2i8: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b16 v0, 8, s0 -; GFX11-NEXT: v_add_nc_u16 v1, s0, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_nc_u16 v0, v0, v0 -; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_lshlrev_b16 v3, 8, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_bfe_u32 s1, s0, 0x80008 +; GFX11-NEXT: s_add_i32 s0, s0, s0 +; GFX11-NEXT: s_add_i32 s1, s1, s1 +; GFX11-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 ; GFX11-NEXT: global_store_b16 v[0:1], v2, off ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1136,51 +1137,52 @@ define amdgpu_kernel void @amd_kernel_v4i8(<4 x i8> %arg0) { ; VI-LABEL: amd_kernel_v4i8: ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dword s0, s[2:3], 0x24 +; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s1, s0, 24 ; VI-NEXT: s_lshr_b32 s2, s0, 16 +; VI-NEXT: s_lshr_b32 s1, s0, 24 ; VI-NEXT: s_add_i32 s2, s2, s2 +; VI-NEXT: s_bfe_u32 s3, s0, 0x80008 ; VI-NEXT: s_add_i32 s1, s1, s1 -; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_and_b32 s2, s2, 0xff +; VI-NEXT: s_add_i32 s3, s3, s3 ; VI-NEXT: s_add_i32 s0, s0, s0 -; VI-NEXT: v_lshlrev_b16_e64 v1, 8, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_add_u32_sdwa v0, vcc, v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 -; VI-NEXT: v_or_b32_sdwa v2, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v1, s0 -; VI-NEXT: v_or_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v0, 0 -; VI-NEXT: v_mov_b32_e32 v1, 0 -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_lshl_b32 s1, s1, 24 +; VI-NEXT: s_lshl_b32 s2, s2, 16 +; VI-NEXT: s_or_b32 s1, s1, s2 +; VI-NEXT: s_and_b32 s0, s0, 0xff +; VI-NEXT: s_lshl_b32 s2, s3, 8 +; VI-NEXT: s_or_b32 s0, s0, s2 +; VI-NEXT: s_and_b32 s0, s0, 0xffff +; VI-NEXT: s_or_b32 s0, s0, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: amd_kernel_v4i8: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b16 v0, 8, s0 ; GFX11-NEXT: s_lshr_b32 s1, s0, 16 ; GFX11-NEXT: s_lshr_b32 s2, s0, 24 -; GFX11-NEXT: v_add_nc_u16 v1, s0, s0 -; GFX11-NEXT: v_add_nc_u16 v2, s2, s2 -; GFX11-NEXT: v_add_nc_u16 v0, v0, v0 -; GFX11-NEXT: v_add_nc_u16 v3, s1, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-NEXT: v_lshlrev_b16 v2, 8, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshlrev_b16 v0, 8, v0 -; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX11-NEXT: v_or_b32_e32 v1, v3, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v0 -; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_lshlrev_b32 v3, 16, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX11-NEXT: s_add_i32 s3, s0, s0 +; GFX11-NEXT: s_bfe_u32 s0, s0, 0x80008 +; GFX11-NEXT: s_add_i32 s2, s2, s2 +; GFX11-NEXT: s_add_i32 s0, s0, s0 +; GFX11-NEXT: s_add_i32 s1, s1, s1 +; GFX11-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-NEXT: s_lshl_b32 s0, s0, 8 +; GFX11-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-NEXT: s_or_b32 s0, s3, s0 +; GFX11-NEXT: s_or_b32 s1, s1, s2 +; GFX11-NEXT: s_and_b32 s0, s0, 
0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1219,45 +1221,44 @@ define amdgpu_kernel void @amd_kernel_v3i8(<3 x i8> %arg0) { ; VI-LABEL: amd_kernel_v3i8: ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dword s0, s[2:3], 0x24 -; VI-NEXT: v_mov_b32_e32 v2, 0 -; VI-NEXT: v_mov_b32_e32 v3, 0 +; VI-NEXT: v_mov_b32_e32 v0, 2 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s1, s0, 16 -; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_bfe_u32 s2, s0, 0x80008 ; VI-NEXT: s_add_i32 s0, s0, s0 -; VI-NEXT: v_add_u32_sdwa v0, vcc, v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 -; VI-NEXT: v_mov_b32_e32 v1, s0 ; VI-NEXT: s_add_i32 s1, s1, s1 -; VI-NEXT: v_or_b32_sdwa v4, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v0, 2 +; VI-NEXT: s_add_i32 s2, s2, s2 +; VI-NEXT: s_and_b32 s0, s0, 0xff +; VI-NEXT: s_lshl_b32 s2, s2, 8 +; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: s_or_b32 s0, s0, s2 +; VI-NEXT: flat_store_byte v[0:1], v2 +; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 0 -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: flat_store_byte v[0:1], v5 -; VI-NEXT: flat_store_short v[2:3], v4 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: amd_kernel_v3i8: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: v_mov_b32_e32 v3, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b16 v0, 8, s0 -; GFX11-NEXT: v_add_nc_u16 v1, s0, s0 -; GFX11-NEXT: s_lshr_b32 s0, s0, 16 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_nc_u16 v6, s0, s0 -; GFX11-NEXT: v_add_nc_u16 v0, v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v1 -; GFX11-NEXT: v_lshlrev_b16 v5, 8, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, 2 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_bfe_u32 s2, s0, 0x80008 +; GFX11-NEXT: s_lshr_b32 s1, s0, 16 +; GFX11-NEXT: s_add_i32 s0, s0, s0 +; GFX11-NEXT: s_add_i32 s2, s2, s2 +; GFX11-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-NEXT: s_add_i32 s1, s1, s1 +; GFX11-NEXT: s_or_b32 s0, s0, s2 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v4, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v5, s0 +; GFX11-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_store_b8 v[0:1], v6, off -; GFX11-NEXT: global_store_b16 v[2:3], v4, off +; GFX11-NEXT: global_store_b8 v[0:1], v4, off +; GFX11-NEXT: global_store_b16 v[2:3], v5, off ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -1304,60 +1305,63 @@ define amdgpu_kernel void @amd_kernel_v5i8(<5 x i8> %arg0) { ; VI-LABEL: amd_kernel_v5i8: ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: v_mov_b32_e32 v0, 4 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s2, s0, 24 ; VI-NEXT: 
s_lshr_b32 s3, s0, 16 +; VI-NEXT: s_lshr_b32 s2, s0, 24 ; VI-NEXT: s_add_i32 s3, s3, s3 +; VI-NEXT: s_bfe_u32 s4, s0, 0x80008 ; VI-NEXT: s_add_i32 s2, s2, s2 -; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_and_b32 s3, s3, 0xff +; VI-NEXT: s_add_i32 s4, s4, s4 ; VI-NEXT: s_add_i32 s0, s0, s0 -; VI-NEXT: v_lshlrev_b16_e64 v1, 8, s2 -; VI-NEXT: v_mov_b32_e32 v2, s3 -; VI-NEXT: v_add_u32_sdwa v0, vcc, v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_lshl_b32 s2, s2, 24 +; VI-NEXT: s_lshl_b32 s3, s3, 16 ; VI-NEXT: s_and_b32 s1, s1, 0xff -; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s2, s2, s3 +; VI-NEXT: s_and_b32 s0, s0, 0xff +; VI-NEXT: s_lshl_b32 s3, s4, 8 ; VI-NEXT: s_add_i32 s1, s1, s1 -; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v0, 4 +; VI-NEXT: s_or_b32 s0, s0, s3 +; VI-NEXT: s_and_b32 s0, s0, 0xffff +; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: s_or_b32 s0, s0, s2 +; VI-NEXT: flat_store_byte v[0:1], v2 +; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 0 -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v2, 0 -; VI-NEXT: v_mov_b32_e32 v3, 0 -; VI-NEXT: flat_store_byte v[0:1], v5 -; VI-NEXT: flat_store_dword v[2:3], v4 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: amd_kernel_v5i8: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 4 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b16 v0, 8, s0 ; GFX11-NEXT: s_lshr_b32 s2, s0, 16 ; GFX11-NEXT: s_lshr_b32 s3, s0, 24 -; GFX11-NEXT: v_add_nc_u16 v1, s0, s0 -; GFX11-NEXT: v_add_nc_u16 v2, s3, s3 -; GFX11-NEXT: v_add_nc_u16 v0, v0, v0 -; GFX11-NEXT: v_add_nc_u16 v3, s2, s2 -; GFX11-NEXT: v_add_nc_u16 v6, s1, s1 -; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-NEXT: v_lshlrev_b16 v2, 8, v2 -; GFX11-NEXT: v_lshlrev_b16 v0, 8, v0 -; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX11-NEXT: v_or_b32_e32 v1, v3, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v4, 0xffff, v0 -; GFX11-NEXT: v_dual_mov_b32 v0, 4 :: v_dual_lshlrev_b32 v5, 16, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX11-NEXT: s_add_i32 s4, s0, s0 +; GFX11-NEXT: s_bfe_u32 s0, s0, 0x80008 +; GFX11-NEXT: s_add_i32 s3, s3, s3 +; GFX11-NEXT: s_add_i32 s0, s0, s0 +; GFX11-NEXT: s_add_i32 s2, s2, s2 +; GFX11-NEXT: s_and_b32 s4, s4, 0xff +; GFX11-NEXT: s_lshl_b32 s0, s0, 8 +; GFX11-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-NEXT: s_or_b32 s0, s4, s0 +; GFX11-NEXT: s_or_b32 s2, s2, s3 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-NEXT: s_add_i32 s1, s1, s1 +; GFX11-NEXT: s_or_b32 s0, s0, s2 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v4, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v5, s0 +; GFX11-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_store_b8 
v[0:1], v6, off -; GFX11-NEXT: global_store_b32 v[2:3], v4, off +; GFX11-NEXT: global_store_b8 v[0:1], v4, off +; GFX11-NEXT: global_store_b32 v[2:3], v5, off ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -1416,35 +1420,43 @@ define amdgpu_kernel void @amd_kernel_v8i8(<8 x i8> %arg0) { ; VI-LABEL: amd_kernel_v8i8: ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: v_mov_b32_e32 v2, 0 +; VI-NEXT: v_mov_b32_e32 v3, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s2, s1, 24 ; VI-NEXT: s_lshr_b32 s3, s1, 16 +; VI-NEXT: s_lshr_b32 s2, s1, 24 ; VI-NEXT: s_add_i32 s3, s3, s3 +; VI-NEXT: s_bfe_u32 s6, s1, 0x80008 ; VI-NEXT: s_add_i32 s2, s2, s2 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_and_b32 s3, s3, 0xff +; VI-NEXT: s_add_i32 s6, s6, s6 ; VI-NEXT: s_add_i32 s1, s1, s1 -; VI-NEXT: v_lshlrev_b16_e64 v2, 8, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: v_add_u32_sdwa v1, vcc, v1, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: s_lshr_b32 s4, s0, 24 +; VI-NEXT: s_lshl_b32 s2, s2, 24 +; VI-NEXT: s_lshl_b32 s3, s3, 16 ; VI-NEXT: s_lshr_b32 s5, s0, 16 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: s_add_i32 s0, s0, s0 -; VI-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s2, s2, s3 +; VI-NEXT: s_and_b32 s1, s1, 0xff +; VI-NEXT: s_lshl_b32 s3, s6, 8 +; VI-NEXT: s_lshr_b32 s4, s0, 24 ; VI-NEXT: s_add_i32 s5, s5, s5 +; VI-NEXT: s_or_b32 s1, s1, s3 +; VI-NEXT: s_bfe_u32 s7, s0, 0x80008 ; VI-NEXT: s_add_i32 s4, s4, s4 -; VI-NEXT: v_add_u32_sdwa v0, vcc, v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e64 v2, 8, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_or_b32_sdwa v4, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v2, 0 -; VI-NEXT: v_mov_b32_e32 v3, 0 -; VI-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s1, s1, 0xffff +; VI-NEXT: s_and_b32 s3, s5, 0xff +; VI-NEXT: s_add_i32 s7, s7, s7 +; VI-NEXT: s_add_i32 s0, s0, s0 +; VI-NEXT: s_or_b32 s1, s1, s2 +; VI-NEXT: s_lshl_b32 s2, s4, 24 +; VI-NEXT: s_lshl_b32 s3, s3, 16 +; VI-NEXT: s_or_b32 s2, s2, s3 +; VI-NEXT: s_and_b32 s0, s0, 0xff +; VI-NEXT: s_lshl_b32 s3, s7, 8 +; VI-NEXT: s_or_b32 s0, s0, s3 +; VI-NEXT: s_and_b32 s0, s0, 0xffff +; VI-NEXT: s_or_b32 s0, s0, s2 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; @@ -1452,45 +1464,42 @@ define amdgpu_kernel void @amd_kernel_v8i8(<8 x i8> %arg0) { ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b16 v0, 8, s0 -; GFX11-NEXT: v_lshrrev_b16 v1, 8, s1 ; GFX11-NEXT: s_lshr_b32 s2, s0, 16 ; GFX11-NEXT: s_lshr_b32 s3, s0, 24 ; GFX11-NEXT: s_lshr_b32 s4, s1, 16 ; GFX11-NEXT: s_lshr_b32 s5, s1, 24 -; GFX11-NEXT: v_add_nc_u16 v2, s1, s1 -; GFX11-NEXT: v_add_nc_u16 v3, s0, s0 -; GFX11-NEXT: 
v_add_nc_u16 v4, s5, s5 -; GFX11-NEXT: v_add_nc_u16 v5, s4, s4 -; GFX11-NEXT: v_add_nc_u16 v6, s3, s3 -; GFX11-NEXT: v_add_nc_u16 v7, s2, s2 -; GFX11-NEXT: v_add_nc_u16 v1, v1, v1 -; GFX11-NEXT: v_add_nc_u16 v0, v0, v0 -; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-NEXT: v_lshlrev_b16 v4, 8, v4 -; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GFX11-NEXT: v_lshlrev_b16 v6, 8, v6 -; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 -; GFX11-NEXT: v_lshlrev_b16 v0, 8, v0 -; GFX11-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_or_b32_e32 v5, v7, v6 -; GFX11-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_and_b32 v5, 0xffff, v0 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: v_or_b32_e32 v3, v3, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_or_b32_e32 v2, v5, v4 -; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX11-NEXT: s_bfe_u32 s6, s0, 0x80008 +; GFX11-NEXT: s_bfe_u32 s7, s1, 0x80008 +; GFX11-NEXT: s_add_i32 s1, s1, s1 +; GFX11-NEXT: s_add_i32 s0, s0, s0 +; GFX11-NEXT: s_add_i32 s7, s7, s7 +; GFX11-NEXT: s_add_i32 s5, s5, s5 +; GFX11-NEXT: s_add_i32 s4, s4, s4 +; GFX11-NEXT: s_add_i32 s6, s6, s6 +; GFX11-NEXT: s_add_i32 s3, s3, s3 +; GFX11-NEXT: s_add_i32 s2, s2, s2 +; GFX11-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-NEXT: s_and_b32 s4, s4, 0xff +; GFX11-NEXT: s_lshl_b32 s6, s6, 8 +; GFX11-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-NEXT: s_or_b32 s1, s1, s7 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_or_b32 s0, s0, s6 +; GFX11-NEXT: s_or_b32 s2, s2, s3 +; GFX11-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-NEXT: s_lshl_b32 s3, s4, 16 +; GFX11-NEXT: s_or_b32 s0, s0, s2 +; GFX11-NEXT: s_or_b32 s1, s1, s3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, 0 +; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -1583,61 +1592,77 @@ define amdgpu_kernel void @amd_kernel_v16i8(<16 x i8> %arg0) { ; VI-LABEL: amd_kernel_v16i8: ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: v_mov_b32_e32 v4, 0 +; VI-NEXT: v_mov_b32_e32 v5, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s4, s3, 24 ; VI-NEXT: s_lshr_b32 s5, s3, 16 +; VI-NEXT: s_lshr_b32 s4, s3, 24 ; VI-NEXT: s_add_i32 s5, s5, s5 +; VI-NEXT: s_bfe_u32 s12, s3, 0x80008 ; VI-NEXT: s_add_i32 s4, s4, s4 -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: s_and_b32 s5, s5, 0xff +; VI-NEXT: s_add_i32 s12, s12, s12 ; VI-NEXT: s_add_i32 s3, s3, s3 -; VI-NEXT: v_lshlrev_b16_e64 v4, 8, s4 -; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: s_lshr_b32 s6, s2, 24 +; VI-NEXT: 
s_lshl_b32 s4, s4, 24 +; VI-NEXT: s_lshl_b32 s5, s5, 16 ; VI-NEXT: s_lshr_b32 s7, s2, 16 -; VI-NEXT: v_add_u32_sdwa v3, vcc, v3, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 -; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v5, s3 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s3, s3, 0xff +; VI-NEXT: s_lshl_b32 s5, s12, 8 +; VI-NEXT: s_lshr_b32 s6, s2, 24 ; VI-NEXT: s_add_i32 s7, s7, s7 +; VI-NEXT: s_or_b32 s3, s3, s5 +; VI-NEXT: s_bfe_u32 s13, s2, 0x80008 ; VI-NEXT: s_add_i32 s6, s6, s6 -; VI-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: s_and_b32 s3, s3, 0xffff +; VI-NEXT: s_and_b32 s5, s7, 0xff +; VI-NEXT: s_add_i32 s13, s13, s13 ; VI-NEXT: s_add_i32 s2, s2, s2 -; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e64 v4, 8, s6 -; VI-NEXT: v_mov_b32_e32 v5, s7 -; VI-NEXT: s_lshr_b32 s8, s1, 24 +; VI-NEXT: s_or_b32 s3, s3, s4 +; VI-NEXT: s_lshl_b32 s4, s6, 24 +; VI-NEXT: s_lshl_b32 s5, s5, 16 ; VI-NEXT: s_lshr_b32 s9, s1, 16 -; VI-NEXT: v_add_u32_sdwa v2, vcc, v2, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 -; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v5, s2 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s2, s2, 0xff +; VI-NEXT: s_lshl_b32 s5, s13, 8 +; VI-NEXT: s_lshr_b32 s8, s1, 24 ; VI-NEXT: s_add_i32 s9, s9, s9 +; VI-NEXT: s_or_b32 s2, s2, s5 +; VI-NEXT: s_bfe_u32 s14, s1, 0x80008 ; VI-NEXT: s_add_i32 s8, s8, s8 -; VI-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_and_b32 s2, s2, 0xffff +; VI-NEXT: s_and_b32 s5, s9, 0xff +; VI-NEXT: s_add_i32 s14, s14, s14 ; VI-NEXT: s_add_i32 s1, s1, s1 -; VI-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e64 v4, 8, s8 -; VI-NEXT: v_mov_b32_e32 v5, s9 -; VI-NEXT: s_lshr_b32 s10, s0, 24 +; VI-NEXT: s_or_b32 s2, s2, s4 +; VI-NEXT: s_lshl_b32 s4, s8, 24 +; VI-NEXT: s_lshl_b32 s5, s5, 16 ; VI-NEXT: s_lshr_b32 s11, s0, 16 -; VI-NEXT: v_add_u32_sdwa v1, vcc, v1, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 -; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s1, s1, 0xff +; VI-NEXT: s_lshl_b32 s5, s14, 8 +; VI-NEXT: s_lshr_b32 s10, s0, 24 ; VI-NEXT: s_add_i32 s11, s11, s11 +; VI-NEXT: s_or_b32 s1, s1, s5 +; VI-NEXT: s_bfe_u32 s15, s0, 0x80008 ; VI-NEXT: s_add_i32 s10, s10, s10 -; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_and_b32 s1, s1, 0xffff +; VI-NEXT: s_and_b32 s5, s11, 0xff +; VI-NEXT: s_add_i32 s15, s15, s15 ; VI-NEXT: s_add_i32 s0, s0, s0 -; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e64 v4, 8, s10 -; VI-NEXT: v_mov_b32_e32 v5, s11 -; VI-NEXT: v_add_u32_sdwa v0, vcc, v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 -; VI-NEXT: v_or_b32_sdwa v6, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v4, 0 -; VI-NEXT: v_mov_b32_e32 v5, 0 -; VI-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s1, s1, s4 +; VI-NEXT: s_lshl_b32 s4, s10, 24 +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s0, s0, 0xff +; VI-NEXT: s_lshl_b32 s5, s15, 8 +; VI-NEXT: s_or_b32 s0, s0, s5 +; VI-NEXT: s_and_b32 s0, s0, 0xffff +; VI-NEXT: s_or_b32 s0, s0, s4 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm ; @@ -1645,72 +1670,73 @@ define amdgpu_kernel void @amd_kernel_v16i8(<16 x i8> %arg0) { ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshr_b32 s6, s1, 16 +; GFX11-NEXT: s_lshr_b32 s7, s1, 24 +; GFX11-NEXT: s_lshr_b32 s8, s2, 16 +; GFX11-NEXT: s_lshr_b32 s9, s2, 24 ; GFX11-NEXT: s_lshr_b32 s10, s3, 16 ; GFX11-NEXT: s_lshr_b32 s11, s3, 24 -; GFX11-NEXT: v_lshrrev_b16 v2, 8, s2 -; GFX11-NEXT: v_lshrrev_b16 v3, 8, s3 -; GFX11-NEXT: v_add_nc_u16 v7, s11, s11 -; GFX11-NEXT: v_add_nc_u16 v8, s10, s10 -; GFX11-NEXT: v_add_nc_u16 v4, s3, s3 -; GFX11-NEXT: v_add_nc_u16 v5, s2, s2 -; GFX11-NEXT: v_add_nc_u16 v3, v3, v3 -; GFX11-NEXT: v_add_nc_u16 v2, v2, v2 -; GFX11-NEXT: v_lshlrev_b16 v7, 8, v7 -; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GFX11-NEXT: s_lshr_b32 s7, s1, 24 -; GFX11-NEXT: v_lshrrev_b16 v1, 8, s1 -; GFX11-NEXT: v_lshrrev_b16 v0, 8, s0 -; GFX11-NEXT: v_add_nc_u16 v11, s7, s7 -; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GFX11-NEXT: v_lshlrev_b16 v3, 8, v3 -; GFX11-NEXT: v_lshlrev_b16 v2, 8, v2 -; GFX11-NEXT: v_or_b32_e32 v7, v8, v7 -; GFX11-NEXT: s_lshr_b32 s6, s1, 16 ; GFX11-NEXT: s_lshr_b32 s4, s0, 16 ; GFX11-NEXT: s_lshr_b32 s5, s0, 24 -; GFX11-NEXT: s_lshr_b32 s8, s2, 16 -; GFX11-NEXT: s_lshr_b32 s9, s2, 24 -; GFX11-NEXT: v_add_nc_u16 v6, s1, s1 -; GFX11-NEXT: v_add_nc_u16 v12, s6, s6 -; GFX11-NEXT: v_add_nc_u16 v1, v1, v1 -; GFX11-NEXT: v_add_nc_u16 v9, s9, s9 -; GFX11-NEXT: v_add_nc_u16 v10, s8, s8 -; GFX11-NEXT: v_or_b32_e32 v3, v4, v3 -; GFX11-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX11-NEXT: v_lshlrev_b16 v4, 8, v11 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v7 -; GFX11-NEXT: v_add_nc_u16 v7, s0, s0 -; GFX11-NEXT: v_add_nc_u16 v0, v0, v0 -; GFX11-NEXT: v_add_nc_u16 v8, s5, s5 -; GFX11-NEXT: v_add_nc_u16 v11, s4, s4 -; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 -; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GFX11-NEXT: v_lshlrev_b16 v9, 8, v9 -; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GFX11-NEXT: v_lshlrev_b16 v0, 8, v0 -; GFX11-NEXT: v_lshlrev_b16 v8, 8, v8 -; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GFX11-NEXT: v_or_b32_e32 v1, v6, v1 -; GFX11-NEXT: v_or_b32_e32 v4, v12, v4 -; GFX11-NEXT: v_or_b32_e32 v9, v10, v9 -; GFX11-NEXT: v_or_b32_e32 v0, v7, v0 -; GFX11-NEXT: v_or_b32_e32 v6, v11, v8 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v9 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; 
GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX11-NEXT: v_or_b32_e32 v3, v3, v5 -; GFX11-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, 0 -; GFX11-NEXT: v_or_b32_e32 v2, v2, v7 -; GFX11-NEXT: v_mov_b32_e32 v5, 0 -; GFX11-NEXT: v_or_b32_e32 v0, v0, v6 +; GFX11-NEXT: s_bfe_u32 s12, s0, 0x80008 +; GFX11-NEXT: s_bfe_u32 s13, s1, 0x80008 +; GFX11-NEXT: s_bfe_u32 s14, s2, 0x80008 +; GFX11-NEXT: s_bfe_u32 s15, s3, 0x80008 +; GFX11-NEXT: s_add_i32 s11, s11, s11 +; GFX11-NEXT: s_add_i32 s10, s10, s10 +; GFX11-NEXT: s_add_i32 s9, s9, s9 +; GFX11-NEXT: s_add_i32 s8, s8, s8 +; GFX11-NEXT: s_add_i32 s7, s7, s7 +; GFX11-NEXT: s_add_i32 s6, s6, s6 +; GFX11-NEXT: s_add_i32 s3, s3, s3 +; GFX11-NEXT: s_add_i32 s2, s2, s2 +; GFX11-NEXT: s_add_i32 s15, s15, s15 +; GFX11-NEXT: s_add_i32 s14, s14, s14 +; GFX11-NEXT: s_lshl_b32 s11, s11, 8 +; GFX11-NEXT: s_and_b32 s10, s10, 0xff +; GFX11-NEXT: s_lshl_b32 s9, s9, 8 +; GFX11-NEXT: s_and_b32 s8, s8, 0xff +; GFX11-NEXT: s_add_i32 s1, s1, s1 +; GFX11-NEXT: s_add_i32 s13, s13, s13 +; GFX11-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-NEXT: s_and_b32 s6, s6, 0xff +; GFX11-NEXT: s_add_i32 s0, s0, s0 +; GFX11-NEXT: s_add_i32 s12, s12, s12 +; GFX11-NEXT: s_add_i32 s5, s5, s5 +; GFX11-NEXT: s_add_i32 s4, s4, s4 +; GFX11-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s15, s15, 8 +; GFX11-NEXT: s_or_b32 s10, s10, s11 +; GFX11-NEXT: s_lshl_b32 s11, s14, 8 +; GFX11-NEXT: s_or_b32 s8, s8, s9 +; GFX11-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-NEXT: s_lshl_b32 s9, s13, 8 +; GFX11-NEXT: s_or_b32 s6, s6, s7 +; GFX11-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s12, 8 +; GFX11-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-NEXT: s_and_b32 s4, s4, 0xff +; GFX11-NEXT: s_or_b32 s3, s3, s15 +; GFX11-NEXT: s_or_b32 s2, s2, s11 +; GFX11-NEXT: s_or_b32 s1, s1, s9 +; GFX11-NEXT: s_or_b32 s0, s0, s7 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_and_b32 s3, s3, 0xffff +; GFX11-NEXT: s_lshl_b32 s10, s10, 16 +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-NEXT: s_lshl_b32 s5, s6, 16 +; GFX11-NEXT: s_or_b32 s3, s3, s10 +; GFX11-NEXT: s_or_b32 s2, s2, s8 +; GFX11-NEXT: s_or_b32 s0, s0, s4 +; GFX11-NEXT: s_or_b32 s1, s1, s5 +; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, 0 ; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1875,258 +1901,292 @@ define amdgpu_kernel void @amd_kernel_v32i8(<32 x i8> %arg0) { ; VI-LABEL: amd_kernel_v32i8: ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 -; VI-NEXT: v_mov_b32_e32 v10, 0 -; VI-NEXT: v_mov_b32_e32 v11, 0 +; VI-NEXT: v_mov_b32_e32 v4, 16 +; VI-NEXT: v_mov_b32_e32 v5, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s8, s3, 24 ; VI-NEXT: s_lshr_b32 s9, s3, 16 +; VI-NEXT: s_lshr_b32 s8, s3, 24 ; VI-NEXT: s_add_i32 s9, s9, s9 +; VI-NEXT: s_bfe_u32 s24, s3, 0x80008 ; VI-NEXT: s_add_i32 s8, s8, s8 -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: s_and_b32 s9, s9, 0xff +; VI-NEXT: s_add_i32 s24, s24, s24 ; VI-NEXT: s_add_i32 s3, s3, s3 -; VI-NEXT: v_lshlrev_b16_e64 v8, 8, s8 -; VI-NEXT: v_mov_b32_e32 v9, s9 -; VI-NEXT: s_lshr_b32 s10, s2, 24 +; VI-NEXT: s_lshl_b32 s8, s8, 24 +; VI-NEXT: s_lshl_b32 s9, s9, 16 ; 
VI-NEXT: s_lshr_b32 s11, s2, 16 -; VI-NEXT: v_add_u32_sdwa v3, vcc, v3, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 -; VI-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v9, s3 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_and_b32 s3, s3, 0xff +; VI-NEXT: s_lshl_b32 s9, s24, 8 +; VI-NEXT: s_lshr_b32 s10, s2, 24 ; VI-NEXT: s_add_i32 s11, s11, s11 +; VI-NEXT: s_or_b32 s3, s3, s9 +; VI-NEXT: s_bfe_u32 s25, s2, 0x80008 ; VI-NEXT: s_add_i32 s10, s10, s10 -; VI-NEXT: v_or_b32_sdwa v3, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: s_and_b32 s3, s3, 0xffff +; VI-NEXT: s_and_b32 s9, s11, 0xff +; VI-NEXT: s_add_i32 s25, s25, s25 ; VI-NEXT: s_add_i32 s2, s2, s2 -; VI-NEXT: v_or_b32_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e64 v8, 8, s10 -; VI-NEXT: v_mov_b32_e32 v9, s11 -; VI-NEXT: s_lshr_b32 s12, s1, 24 +; VI-NEXT: s_or_b32 s3, s3, s8 +; VI-NEXT: s_lshl_b32 s8, s10, 24 +; VI-NEXT: s_lshl_b32 s9, s9, 16 ; VI-NEXT: s_lshr_b32 s13, s1, 16 -; VI-NEXT: v_add_u32_sdwa v2, vcc, v2, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 -; VI-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v9, s2 -; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_and_b32 s2, s2, 0xff +; VI-NEXT: s_lshl_b32 s9, s25, 8 +; VI-NEXT: s_lshr_b32 s12, s1, 24 ; VI-NEXT: s_add_i32 s13, s13, s13 +; VI-NEXT: s_or_b32 s2, s2, s9 +; VI-NEXT: s_bfe_u32 s26, s1, 0x80008 ; VI-NEXT: s_add_i32 s12, s12, s12 -; VI-NEXT: v_or_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_sdwa v4, vcc, v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 -; VI-NEXT: v_mov_b32_e32 v0, s5 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_and_b32 s2, s2, 0xffff +; VI-NEXT: s_and_b32 s9, s13, 0xff +; VI-NEXT: s_add_i32 s26, s26, s26 ; VI-NEXT: s_add_i32 s1, s1, s1 -; VI-NEXT: v_or_b32_sdwa v2, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e64 v8, 8, s12 -; VI-NEXT: v_mov_b32_e32 v9, s13 -; VI-NEXT: s_lshr_b32 s14, s0, 24 +; VI-NEXT: s_or_b32 s2, s2, s8 +; VI-NEXT: s_lshl_b32 s8, s12, 24 +; VI-NEXT: s_lshl_b32 s9, s9, 16 ; VI-NEXT: s_lshr_b32 s15, s0, 16 -; VI-NEXT: v_add_u32_sdwa v5, vcc, v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_add_u32_sdwa v1, vcc, v1, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 -; VI-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v9, s1 -; VI-NEXT: v_add_u32_sdwa v6, vcc, v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 -; VI-NEXT: v_mov_b32_e32 v0, s7 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_and_b32 s1, s1, 0xff +; VI-NEXT: s_lshl_b32 s9, s26, 8 +; VI-NEXT: s_lshr_b32 s14, s0, 24 ; VI-NEXT: s_add_i32 s15, s15, s15 +; VI-NEXT: s_or_b32 s1, s1, s9 +; VI-NEXT: s_bfe_u32 s27, s0, 0x80008 ; VI-NEXT: s_add_i32 s14, s14, s14 -; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_sdwa v7, vcc, v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 -; 
VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_and_b32 s1, s1, 0xffff +; VI-NEXT: s_and_b32 s9, s15, 0xff +; VI-NEXT: s_add_i32 s27, s27, s27 ; VI-NEXT: s_add_i32 s0, s0, s0 -; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e64 v8, 8, s14 -; VI-NEXT: v_mov_b32_e32 v9, s15 -; VI-NEXT: s_lshr_b32 s16, s7, 24 +; VI-NEXT: s_or_b32 s1, s1, s8 +; VI-NEXT: s_lshl_b32 s8, s14, 24 +; VI-NEXT: s_lshl_b32 s9, s9, 16 ; VI-NEXT: s_lshr_b32 s17, s7, 16 -; VI-NEXT: v_add_u32_sdwa v0, vcc, v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 -; VI-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v9, s0 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_and_b32 s0, s0, 0xff +; VI-NEXT: s_lshl_b32 s9, s27, 8 +; VI-NEXT: s_lshr_b32 s16, s7, 24 ; VI-NEXT: s_add_i32 s17, s17, s17 +; VI-NEXT: s_or_b32 s0, s0, s9 +; VI-NEXT: s_bfe_u32 s28, s7, 0x80008 ; VI-NEXT: s_add_i32 s16, s16, s16 -; VI-NEXT: v_or_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s0, s0, 0xffff +; VI-NEXT: s_and_b32 s9, s17, 0xff +; VI-NEXT: s_add_i32 s28, s28, s28 ; VI-NEXT: s_add_i32 s7, s7, s7 -; VI-NEXT: v_or_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e64 v8, 8, s16 -; VI-NEXT: v_mov_b32_e32 v9, s17 -; VI-NEXT: s_lshr_b32 s18, s6, 24 +; VI-NEXT: s_or_b32 s0, s0, s8 +; VI-NEXT: s_lshl_b32 s8, s16, 24 +; VI-NEXT: s_lshl_b32 s9, s9, 16 ; VI-NEXT: s_lshr_b32 s19, s6, 16 -; VI-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v9, s7 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_and_b32 s7, s7, 0xff +; VI-NEXT: s_lshl_b32 s9, s28, 8 +; VI-NEXT: s_lshr_b32 s18, s6, 24 ; VI-NEXT: s_add_i32 s19, s19, s19 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_bfe_u32 s29, s6, 0x80008 ; VI-NEXT: s_add_i32 s18, s18, s18 -; VI-NEXT: v_or_b32_sdwa v7, v9, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s9, s19, 0xff +; VI-NEXT: s_add_i32 s29, s29, s29 ; VI-NEXT: s_add_i32 s6, s6, s6 -; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e64 v8, 8, s18 -; VI-NEXT: v_mov_b32_e32 v9, s19 -; VI-NEXT: s_lshr_b32 s20, s5, 24 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_lshl_b32 s8, s18, 24 +; VI-NEXT: s_lshl_b32 s9, s9, 16 ; VI-NEXT: s_lshr_b32 s21, s5, 16 -; VI-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v9, s6 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_and_b32 s6, s6, 0xff +; VI-NEXT: s_lshl_b32 s9, s29, 8 +; VI-NEXT: s_lshr_b32 s20, s5, 24 ; VI-NEXT: s_add_i32 s21, s21, s21 +; VI-NEXT: s_or_b32 s6, s6, s9 +; VI-NEXT: s_bfe_u32 s30, s5, 0x80008 ; VI-NEXT: s_add_i32 s20, s20, s20 -; VI-NEXT: v_or_b32_sdwa v6, v9, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_and_b32 s9, s21, 0xff +; VI-NEXT: s_add_i32 s30, s30, s30 ; VI-NEXT: s_add_i32 s5, s5, s5 -; VI-NEXT: v_or_b32_sdwa v6, v6, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e64 v8, 8, s20 -; VI-NEXT: v_mov_b32_e32 v9, s21 -; VI-NEXT: s_lshr_b32 s22, s4, 24 +; VI-NEXT: s_or_b32 s6, s6, 
s8 +; VI-NEXT: s_lshl_b32 s8, s20, 24 +; VI-NEXT: s_lshl_b32 s9, s9, 16 ; VI-NEXT: s_lshr_b32 s23, s4, 16 -; VI-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v9, s5 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_and_b32 s5, s5, 0xff +; VI-NEXT: s_lshl_b32 s9, s30, 8 +; VI-NEXT: s_lshr_b32 s22, s4, 24 ; VI-NEXT: s_add_i32 s23, s23, s23 +; VI-NEXT: s_or_b32 s5, s5, s9 +; VI-NEXT: s_bfe_u32 s31, s4, 0x80008 ; VI-NEXT: s_add_i32 s22, s22, s22 -; VI-NEXT: v_or_b32_sdwa v5, v9, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_and_b32 s9, s23, 0xff +; VI-NEXT: s_add_i32 s31, s31, s31 ; VI-NEXT: s_add_i32 s4, s4, s4 -; VI-NEXT: v_or_b32_sdwa v5, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e64 v8, 8, s22 -; VI-NEXT: v_mov_b32_e32 v9, s23 -; VI-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v9, s4 -; VI-NEXT: v_or_b32_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v8, 16 -; VI-NEXT: v_mov_b32_e32 v9, 0 -; VI-NEXT: flat_store_dwordx4 v[8:9], v[4:7] -; VI-NEXT: flat_store_dwordx4 v[10:11], v[0:3] +; VI-NEXT: s_or_b32 s5, s5, s8 +; VI-NEXT: s_lshl_b32 s8, s22, 24 +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_and_b32 s4, s4, 0xff +; VI-NEXT: s_lshl_b32 s9, s31, 8 +; VI-NEXT: s_or_b32 s4, s4, s9 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s8 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: v_mov_b32_e32 v4, 0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v5, 0 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: amd_kernel_v32i8: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b16 v3, 8, s2 -; GFX11-NEXT: v_lshrrev_b16 v7, 8, s3 -; GFX11-NEXT: s_lshr_b32 s21, s3, 16 -; GFX11-NEXT: s_lshr_b32 s22, s3, 24 -; GFX11-NEXT: v_add_nc_u16 v8, s3, s3 -; GFX11-NEXT: v_add_nc_u16 v9, s2, s2 -; GFX11-NEXT: v_add_nc_u16 v7, v7, v7 -; GFX11-NEXT: v_add_nc_u16 v10, s22, s22 -; GFX11-NEXT: v_add_nc_u16 v11, s21, s21 -; GFX11-NEXT: v_add_nc_u16 v3, v3, v3 -; GFX11-NEXT: v_lshrrev_b16 v2, 8, s1 -; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GFX11-NEXT: v_lshlrev_b16 v7, 8, v7 -; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GFX11-NEXT: v_lshlrev_b16 v10, 8, v10 -; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GFX11-NEXT: v_lshlrev_b16 v3, 8, v3 -; GFX11-NEXT: s_lshr_b32 s18, s1, 16 -; GFX11-NEXT: s_lshr_b32 s19, s1, 24 -; GFX11-NEXT: s_lshr_b32 s20, s2, 24 -; GFX11-NEXT: s_lshr_b32 s2, s2, 16 -; GFX11-NEXT: v_or_b32_e32 v7, v8, v7 -; GFX11-NEXT: v_add_nc_u16 v8, s20, s20 -; GFX11-NEXT: v_or_b32_e32 v10, v11, v10 -; GFX11-NEXT: v_or_b32_e32 v3, v9, v3 -; GFX11-NEXT: v_add_nc_u16 v9, s2, s2 -; GFX11-NEXT: v_add_nc_u16 v11, s1, s1 -; GFX11-NEXT: v_add_nc_u16 v2, v2, v2 -; GFX11-NEXT: v_add_nc_u16 v12, s19, s19 -; GFX11-NEXT: v_add_nc_u16 v13, s18, s18 -; GFX11-NEXT: 
v_lshrrev_b16 v1, 8, s0 -; GFX11-NEXT: v_lshlrev_b16 v8, 8, v8 -; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GFX11-NEXT: v_lshlrev_b16 v2, 8, v2 -; GFX11-NEXT: v_lshlrev_b16 v12, 8, v12 -; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v3 -; GFX11-NEXT: v_or_b32_e32 v3, v9, v8 -; GFX11-NEXT: v_or_b32_e32 v2, v11, v2 -; GFX11-NEXT: v_add_nc_u16 v9, s0, s0 -; GFX11-NEXT: v_or_b32_e32 v8, v13, v12 -; GFX11-NEXT: v_add_nc_u16 v1, v1, v1 -; GFX11-NEXT: v_lshrrev_b16 v6, 8, s7 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GFX11-NEXT: v_lshlrev_b16 v13, 8, v1 -; GFX11-NEXT: v_lshrrev_b16 v5, 8, s6 -; GFX11-NEXT: s_lshr_b32 s14, s7, 16 -; GFX11-NEXT: s_lshr_b32 s15, s7, 24 ; GFX11-NEXT: s_lshr_b32 s16, s0, 16 ; GFX11-NEXT: s_lshr_b32 s17, s0, 24 -; GFX11-NEXT: v_or_b32_e32 v3, v7, v10 -; GFX11-NEXT: v_or_b32_e32 v2, v14, v11 -; GFX11-NEXT: v_add_nc_u16 v7, s7, s7 -; GFX11-NEXT: v_or_b32_e32 v1, v12, v8 -; GFX11-NEXT: v_or_b32_e32 v8, v9, v13 -; GFX11-NEXT: v_add_nc_u16 v9, s17, s17 -; GFX11-NEXT: v_add_nc_u16 v10, s16, s16 -; GFX11-NEXT: v_add_nc_u16 v6, v6, v6 -; GFX11-NEXT: v_add_nc_u16 v11, s15, s15 -; GFX11-NEXT: v_add_nc_u16 v12, s14, s14 -; GFX11-NEXT: v_add_nc_u16 v13, s6, s6 -; GFX11-NEXT: v_add_nc_u16 v5, v5, v5 -; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GFX11-NEXT: v_lshlrev_b16 v6, 8, v6 -; GFX11-NEXT: v_lshlrev_b16 v11, 8, v11 -; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GFX11-NEXT: v_lshlrev_b16 v9, 8, v9 -; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GFX11-NEXT: v_lshrrev_b16 v0, 8, s4 -; GFX11-NEXT: v_lshrrev_b16 v4, 8, s5 -; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GFX11-NEXT: v_lshlrev_b16 v5, 8, v5 +; GFX11-NEXT: s_lshr_b32 s20, s2, 16 +; GFX11-NEXT: s_lshr_b32 s21, s2, 24 +; GFX11-NEXT: s_lshr_b32 s14, s7, 16 +; GFX11-NEXT: s_lshr_b32 s15, s7, 24 +; GFX11-NEXT: s_bfe_u32 s27, s7, 0x80008 +; GFX11-NEXT: s_add_i32 s17, s17, s17 +; GFX11-NEXT: s_add_i32 s16, s16, s16 +; GFX11-NEXT: s_lshr_b32 s18, s1, 16 +; GFX11-NEXT: s_lshr_b32 s19, s1, 24 +; GFX11-NEXT: s_lshr_b32 s22, s3, 16 +; GFX11-NEXT: s_lshr_b32 s23, s3, 24 +; GFX11-NEXT: s_bfe_u32 s29, s1, 0x80008 +; GFX11-NEXT: s_bfe_u32 s30, s3, 0x80008 +; GFX11-NEXT: s_add_i32 s21, s21, s21 +; GFX11-NEXT: s_add_i32 s20, s20, s20 +; GFX11-NEXT: s_lshl_b32 s17, s17, 8 +; GFX11-NEXT: s_and_b32 s16, s16, 0xff +; GFX11-NEXT: s_add_i32 s7, s7, s7 +; GFX11-NEXT: s_add_i32 s27, s27, s27 +; GFX11-NEXT: s_add_i32 s15, s15, s15 +; GFX11-NEXT: s_add_i32 s14, s14, s14 +; GFX11-NEXT: s_add_i32 s3, s3, s3 +; GFX11-NEXT: s_add_i32 s30, s30, s30 +; GFX11-NEXT: s_add_i32 s23, s23, s23 +; GFX11-NEXT: s_add_i32 s22, s22, s22 +; GFX11-NEXT: s_lshl_b32 s21, s21, 8 +; GFX11-NEXT: s_and_b32 s20, s20, 0xff +; GFX11-NEXT: s_add_i32 s1, s1, s1 +; GFX11-NEXT: s_add_i32 s29, s29, s29 +; GFX11-NEXT: s_add_i32 s19, s19, s19 +; GFX11-NEXT: s_add_i32 s18, s18, s18 +; GFX11-NEXT: s_lshr_b32 s10, s5, 16 +; GFX11-NEXT: s_lshr_b32 s11, s5, 24 ; GFX11-NEXT: s_lshr_b32 s12, s6, 16 ; GFX11-NEXT: s_lshr_b32 s13, s6, 24 +; GFX11-NEXT: s_or_b32 s16, s16, s17 +; GFX11-NEXT: s_and_b32 s7, s7, 0xff +; GFX11-NEXT: s_lshl_b32 s17, s27, 8 +; GFX11-NEXT: s_lshl_b32 s15, s15, 8 +; GFX11-NEXT: s_and_b32 s14, s14, 0xff +; GFX11-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-NEXT: s_lshl_b32 
s30, s30, 8 +; GFX11-NEXT: s_lshl_b32 s23, s23, 8 +; GFX11-NEXT: s_and_b32 s22, s22, 0xff +; GFX11-NEXT: s_or_b32 s20, s20, s21 +; GFX11-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-NEXT: s_lshl_b32 s21, s29, 8 +; GFX11-NEXT: s_lshl_b32 s19, s19, 8 +; GFX11-NEXT: s_and_b32 s18, s18, 0xff ; GFX11-NEXT: s_lshr_b32 s8, s4, 16 ; GFX11-NEXT: s_lshr_b32 s9, s4, 24 -; GFX11-NEXT: s_lshr_b32 s10, s5, 16 -; GFX11-NEXT: s_lshr_b32 s11, s5, 24 -; GFX11-NEXT: v_or_b32_e32 v6, v7, v6 -; GFX11-NEXT: v_or_b32_e32 v7, v12, v11 -; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v8 -; GFX11-NEXT: v_or_b32_e32 v8, v10, v9 -; GFX11-NEXT: v_add_nc_u16 v9, s13, s13 -; GFX11-NEXT: v_add_nc_u16 v10, s12, s12 -; GFX11-NEXT: v_or_b32_e32 v5, v13, v5 -; GFX11-NEXT: v_add_nc_u16 v11, s5, s5 -; GFX11-NEXT: v_add_nc_u16 v4, v4, v4 -; GFX11-NEXT: v_add_nc_u16 v13, s11, s11 -; GFX11-NEXT: v_add_nc_u16 v14, s10, s10 -; GFX11-NEXT: v_add_nc_u16 v15, s4, s4 -; GFX11-NEXT: v_add_nc_u16 v0, v0, v0 -; GFX11-NEXT: v_add_nc_u16 v16, s9, s9 -; GFX11-NEXT: v_add_nc_u16 v17, s8, s8 -; GFX11-NEXT: v_lshlrev_b16 v9, 8, v9 -; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GFX11-NEXT: v_lshlrev_b16 v4, 8, v4 -; GFX11-NEXT: v_lshlrev_b16 v13, 8, v13 -; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GFX11-NEXT: v_lshlrev_b16 v0, 8, v0 -; GFX11-NEXT: v_lshlrev_b16 v16, 8, v16 -; GFX11-NEXT: v_and_b32_e32 v17, 0xff, v17 -; GFX11-NEXT: v_or_b32_e32 v9, v10, v9 -; GFX11-NEXT: v_or_b32_e32 v4, v11, v4 -; GFX11-NEXT: v_or_b32_e32 v10, v14, v13 -; GFX11-NEXT: v_or_b32_e32 v0, v15, v0 -; GFX11-NEXT: v_or_b32_e32 v11, v17, v16 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v8 -; GFX11-NEXT: v_or_b32_e32 v7, v6, v7 -; GFX11-NEXT: v_or_b32_e32 v6, v5, v9 -; GFX11-NEXT: v_mov_b32_e32 v8, 16 -; GFX11-NEXT: v_mov_b32_e32 v9, 0 -; GFX11-NEXT: v_or_b32_e32 v5, v4, v10 -; GFX11-NEXT: v_or_b32_e32 v4, v0, v11 -; GFX11-NEXT: v_mov_b32_e32 v10, 0 -; GFX11-NEXT: v_mov_b32_e32 v11, 0 -; GFX11-NEXT: v_or_b32_e32 v0, v12, v13 +; GFX11-NEXT: s_bfe_u32 s24, s4, 0x80008 +; GFX11-NEXT: s_bfe_u32 s25, s5, 0x80008 +; GFX11-NEXT: s_bfe_u32 s26, s6, 0x80008 +; GFX11-NEXT: s_or_b32 s7, s7, s17 +; GFX11-NEXT: s_or_b32 s14, s14, s15 +; GFX11-NEXT: s_add_i32 s13, s13, s13 +; GFX11-NEXT: s_add_i32 s12, s12, s12 +; GFX11-NEXT: s_add_i32 s11, s11, s11 +; GFX11-NEXT: s_add_i32 s10, s10, s10 +; GFX11-NEXT: s_bfe_u32 s28, s0, 0x80008 +; GFX11-NEXT: s_or_b32 s3, s3, s30 +; GFX11-NEXT: s_or_b32 s22, s22, s23 +; GFX11-NEXT: s_bfe_u32 s23, s2, 0x80008 +; GFX11-NEXT: s_or_b32 s1, s1, s21 +; GFX11-NEXT: s_or_b32 s18, s18, s19 +; GFX11-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-NEXT: s_lshl_b32 s14, s14, 16 +; GFX11-NEXT: s_add_i32 s6, s6, s6 +; GFX11-NEXT: s_add_i32 s26, s26, s26 +; GFX11-NEXT: s_lshl_b32 s13, s13, 8 +; GFX11-NEXT: s_and_b32 s12, s12, 0xff +; GFX11-NEXT: s_add_i32 s5, s5, s5 +; GFX11-NEXT: s_add_i32 s25, s25, s25 +; GFX11-NEXT: s_lshl_b32 s11, s11, 8 +; GFX11-NEXT: s_and_b32 s10, s10, 0xff +; GFX11-NEXT: s_add_i32 s4, s4, s4 +; GFX11-NEXT: s_add_i32 s24, s24, s24 +; GFX11-NEXT: s_add_i32 s9, s9, s9 +; GFX11-NEXT: s_add_i32 s8, s8, s8 +; GFX11-NEXT: s_and_b32 s3, 
s3, 0xffff +; GFX11-NEXT: s_lshl_b32 s22, s22, 16 +; GFX11-NEXT: s_add_i32 s2, s2, s2 +; GFX11-NEXT: s_add_i32 s23, s23, s23 +; GFX11-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-NEXT: s_lshl_b32 s18, s18, 16 +; GFX11-NEXT: s_add_i32 s0, s0, s0 +; GFX11-NEXT: s_add_i32 s28, s28, s28 +; GFX11-NEXT: s_or_b32 s7, s7, s14 +; GFX11-NEXT: s_and_b32 s6, s6, 0xff +; GFX11-NEXT: s_lshl_b32 s14, s26, 8 +; GFX11-NEXT: s_or_b32 s12, s12, s13 +; GFX11-NEXT: s_and_b32 s5, s5, 0xff +; GFX11-NEXT: s_lshl_b32 s13, s25, 8 +; GFX11-NEXT: s_or_b32 s10, s10, s11 +; GFX11-NEXT: s_and_b32 s4, s4, 0xff +; GFX11-NEXT: s_lshl_b32 s11, s24, 8 +; GFX11-NEXT: s_lshl_b32 s9, s9, 8 +; GFX11-NEXT: s_and_b32 s8, s8, 0xff +; GFX11-NEXT: s_or_b32 s3, s3, s22 +; GFX11-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s22, s23, 8 +; GFX11-NEXT: s_or_b32 s1, s1, s18 +; GFX11-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s18, s28, 8 +; GFX11-NEXT: s_or_b32 s6, s6, s14 +; GFX11-NEXT: s_or_b32 s5, s5, s13 +; GFX11-NEXT: s_or_b32 s4, s4, s11 +; GFX11-NEXT: s_or_b32 s8, s8, s9 +; GFX11-NEXT: s_or_b32 s2, s2, s22 +; GFX11-NEXT: s_or_b32 s0, s0, s18 +; GFX11-NEXT: s_and_b32 s6, s6, 0xffff +; GFX11-NEXT: s_lshl_b32 s12, s12, 16 +; GFX11-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-NEXT: s_lshl_b32 s9, s10, 16 +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: s_lshl_b32 s20, s20, 16 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s16, s16, 16 +; GFX11-NEXT: s_or_b32 s6, s6, s12 +; GFX11-NEXT: s_or_b32 s4, s4, s8 +; GFX11-NEXT: s_or_b32 s5, s5, s9 +; GFX11-NEXT: s_or_b32 s2, s2, s20 +; GFX11-NEXT: s_or_b32 s0, s0, s16 +; GFX11-NEXT: v_dual_mov_b32 v8, 16 :: v_dual_mov_b32 v5, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX11-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_mov_b32 v4, s0 +; GFX11-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v7, s3 +; GFX11-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v11, 0 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_store_b128 v[8:9], v[4:7], off -; GFX11-NEXT: global_store_b128 v[10:11], v[0:3], off +; GFX11-NEXT: global_store_b128 v[8:9], v[0:3], off +; GFX11-NEXT: global_store_b128 v[10:11], v[4:7], off ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -2777,57 +2837,56 @@ define amdgpu_cs void @amdgpu_cs_inreg_v8i1(<8 x i1> inreg %arg0) { ; ; VI-LABEL: amdgpu_cs_inreg_v8i1: ; VI: ; %bb.0: -; VI-NEXT: v_and_b32_e64 v1, s6, 1 -; VI-NEXT: v_lshlrev_b16_e64 v0, 3, s7 -; VI-NEXT: v_lshlrev_b16_e32 v1, 2, v1 -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: v_lshlrev_b16_e64 v1, 1, s5 -; VI-NEXT: v_and_b32_e64 v2, s4, 1 -; VI-NEXT: v_or_b32_e32 v1, v2, v1 -; VI-NEXT: v_and_b32_e32 v1, 3, v1 -; VI-NEXT: v_and_b32_e64 v2, s2, 1 -; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_lshlrev_b16_e64 v1, 3, s3 -; VI-NEXT: v_lshlrev_b16_e32 v2, 2, v2 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 -; VI-NEXT: v_lshlrev_b16_e64 v2, 1, s1 -; VI-NEXT: v_and_b32_e64 v3, s0, 1 -; VI-NEXT: v_or_b32_e32 v2, v3, v2 -; VI-NEXT: v_and_b32_e32 v2, 3, v2 -; VI-NEXT: v_or_b32_e32 v1, v2, v1 -; VI-NEXT: v_lshlrev_b16_e32 v0, 4, v0 -; VI-NEXT: v_and_b32_e32 v1, 15, v1 -; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: s_and_b32 s6, s6, 1 +; VI-NEXT: s_lshl_b32 s5, s5, 1 +; VI-NEXT: s_and_b32 s4, s4, 1 +; VI-NEXT: s_and_b32 s2, s2, 1 +; VI-NEXT: s_lshl_b32 s1, s1, 1 +; VI-NEXT: s_and_b32 s0, s0, 1 +; VI-NEXT: 
s_lshl_b32 s7, s7, 3 +; VI-NEXT: s_lshl_b32 s6, s6, 2 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_lshl_b32 s3, s3, 3 +; VI-NEXT: s_lshl_b32 s2, s2, 2 +; VI-NEXT: s_or_b32 s0, s0, s1 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s4, s4, 3 +; VI-NEXT: s_or_b32 s2, s3, s2 +; VI-NEXT: s_and_b32 s0, s0, 3 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: s_or_b32 s0, s0, s2 +; VI-NEXT: s_lshl_b32 s4, s4, 4 +; VI-NEXT: s_and_b32 s0, s0, 15 +; VI-NEXT: s_or_b32 s0, s0, s4 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: flat_store_byte v[0:1], v0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: amdgpu_cs_inreg_v8i1: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_and_b32_e64 v1, s6, 1 -; GFX11-NEXT: v_lshlrev_b16 v2, 1, s5 -; GFX11-NEXT: v_and_b32_e64 v3, s4, 1 -; GFX11-NEXT: v_and_b32_e64 v4, s2, 1 -; GFX11-NEXT: v_lshlrev_b16 v5, 1, s1 -; GFX11-NEXT: v_and_b32_e64 v6, s0, 1 -; GFX11-NEXT: v_lshlrev_b16 v0, 3, s7 -; GFX11-NEXT: v_lshlrev_b16 v1, 2, v1 -; GFX11-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX11-NEXT: v_lshlrev_b16 v3, 3, s3 -; GFX11-NEXT: v_lshlrev_b16 v4, 2, v4 -; GFX11-NEXT: v_or_b32_e32 v5, v6, v5 -; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-NEXT: v_and_b32_e32 v1, 3, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_or_b32_e32 v2, v3, v4 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX11-NEXT: v_or_b32_e32 v1, v3, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshlrev_b16 v0, 4, v0 -; GFX11-NEXT: v_and_b32_e32 v1, 15, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX11-NEXT: s_and_b32 s6, s6, 1 +; GFX11-NEXT: s_lshl_b32 s5, s5, 1 +; GFX11-NEXT: s_and_b32 s4, s4, 1 +; GFX11-NEXT: s_and_b32 s2, s2, 1 +; GFX11-NEXT: s_lshl_b32 s1, s1, 1 +; GFX11-NEXT: s_and_b32 s0, s0, 1 +; GFX11-NEXT: s_lshl_b32 s7, s7, 3 +; GFX11-NEXT: s_lshl_b32 s6, s6, 2 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_lshl_b32 s3, s3, 3 +; GFX11-NEXT: s_lshl_b32 s2, s2, 2 +; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: s_or_b32 s5, s7, s6 +; GFX11-NEXT: s_and_b32 s4, s4, 3 +; GFX11-NEXT: s_or_b32 s1, s3, s2 +; GFX11-NEXT: s_and_b32 s0, s0, 3 +; GFX11-NEXT: s_or_b32 s2, s4, s5 +; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: s_lshl_b32 s1, s2, 4 +; GFX11-NEXT: s_and_b32 s0, s0, 15 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: global_store_b8 v[0:1], v0, off ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -2892,105 +2951,104 @@ define amdgpu_cs void @amdgpu_cs_inreg_v16i1(<16 x i1> inreg %arg0) { ; ; VI-LABEL: amdgpu_cs_inreg_v16i1: ; VI: ; %bb.0: -; VI-NEXT: v_and_b32_e64 v1, s14, 1 -; VI-NEXT: v_lshlrev_b16_e64 v0, 3, s15 -; VI-NEXT: v_lshlrev_b16_e32 v1, 2, v1 -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: v_lshlrev_b16_e64 v1, 1, s13 -; VI-NEXT: v_and_b32_e64 v2, s12, 1 -; VI-NEXT: v_or_b32_e32 v1, v2, v1 -; VI-NEXT: v_and_b32_e32 v1, 3, v1 -; VI-NEXT: v_and_b32_e64 v2, s10, 1 -; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_lshlrev_b16_e64 v1, 3, s11 -; VI-NEXT: v_lshlrev_b16_e32 v2, 2, v2 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 -; VI-NEXT: v_lshlrev_b16_e64 v2, 1, s9 -; VI-NEXT: v_and_b32_e64 v3, s8, 1 -; VI-NEXT: v_or_b32_e32 v2, v3, v2 -; VI-NEXT: v_and_b32_e32 v2, 3, v2 -; VI-NEXT: v_or_b32_e32 v1, 
v2, v1 -; VI-NEXT: v_mov_b32_e32 v2, 15 -; VI-NEXT: v_lshlrev_b16_e32 v0, 12, v0 -; VI-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_and_b32_e64 v2, s6, 1 -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: v_lshlrev_b16_e64 v1, 3, s7 -; VI-NEXT: v_lshlrev_b16_e32 v2, 2, v2 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 -; VI-NEXT: v_lshlrev_b16_e64 v2, 1, s5 -; VI-NEXT: v_and_b32_e64 v3, s4, 1 -; VI-NEXT: v_or_b32_e32 v2, v3, v2 -; VI-NEXT: v_and_b32_e32 v2, 3, v2 -; VI-NEXT: v_and_b32_e64 v3, s2, 1 -; VI-NEXT: v_or_b32_e32 v1, v2, v1 -; VI-NEXT: v_lshlrev_b16_e64 v2, 3, s3 -; VI-NEXT: v_lshlrev_b16_e32 v3, 2, v3 -; VI-NEXT: v_or_b32_e32 v2, v2, v3 -; VI-NEXT: v_lshlrev_b16_e64 v3, 1, s1 -; VI-NEXT: v_and_b32_e64 v4, s0, 1 -; VI-NEXT: v_or_b32_e32 v3, v4, v3 -; VI-NEXT: v_and_b32_e32 v3, 3, v3 -; VI-NEXT: v_or_b32_e32 v2, v3, v2 -; VI-NEXT: v_lshlrev_b16_e32 v1, 4, v1 -; VI-NEXT: v_and_b32_e32 v2, 15, v2 -; VI-NEXT: v_or_b32_e32 v1, v2, v1 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s10, s10, 1 +; VI-NEXT: s_lshl_b32 s9, s9, 1 +; VI-NEXT: s_and_b32 s8, s8, 1 +; VI-NEXT: s_and_b32 s6, s6, 1 +; VI-NEXT: s_lshl_b32 s5, s5, 1 +; VI-NEXT: s_and_b32 s4, s4, 1 +; VI-NEXT: s_and_b32 s2, s2, 1 +; VI-NEXT: s_lshl_b32 s1, s1, 1 +; VI-NEXT: s_and_b32 s0, s0, 1 +; VI-NEXT: s_and_b32 s14, s14, 1 +; VI-NEXT: s_lshl_b32 s13, s13, 1 +; VI-NEXT: s_and_b32 s12, s12, 1 +; VI-NEXT: s_lshl_b32 s11, s11, 3 +; VI-NEXT: s_lshl_b32 s10, s10, 2 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_lshl_b32 s7, s7, 3 +; VI-NEXT: s_lshl_b32 s6, s6, 2 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_lshl_b32 s3, s3, 3 +; VI-NEXT: s_lshl_b32 s2, s2, 2 +; VI-NEXT: s_or_b32 s0, s0, s1 +; VI-NEXT: s_lshl_b32 s15, s15, 3 +; VI-NEXT: s_lshl_b32 s14, s14, 2 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: s_or_b32 s10, s11, s10 +; VI-NEXT: s_and_b32 s8, s8, 3 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s4, s4, 3 +; VI-NEXT: s_or_b32 s2, s3, s2 +; VI-NEXT: s_and_b32 s0, s0, 3 +; VI-NEXT: s_or_b32 s14, s15, s14 +; VI-NEXT: s_and_b32 s12, s12, 3 +; VI-NEXT: s_or_b32 s8, s8, s10 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: s_or_b32 s0, s0, s2 +; VI-NEXT: s_or_b32 s12, s12, s14 +; VI-NEXT: s_and_b32 s8, s8, 15 +; VI-NEXT: s_lshl_b32 s4, s4, 4 +; VI-NEXT: s_and_b32 s0, s0, 15 +; VI-NEXT: s_lshl_b32 s12, s12, 12 +; VI-NEXT: s_lshl_b32 s8, s8, 8 +; VI-NEXT: s_or_b32 s0, s0, s4 +; VI-NEXT: s_or_b32 s8, s12, s8 +; VI-NEXT: s_and_b32 s0, s0, 0xff +; VI-NEXT: s_or_b32 s0, s0, s8 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: flat_store_short v[0:1], v0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: amdgpu_cs_inreg_v16i1: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_and_b32_e64 v0, s10, 1 -; GFX11-NEXT: v_lshlrev_b16 v2, 1, s13 -; GFX11-NEXT: v_and_b32_e64 v3, s12, 1 -; GFX11-NEXT: v_lshlrev_b16 v5, 1, s9 -; GFX11-NEXT: v_and_b32_e64 v6, s8, 1 -; GFX11-NEXT: v_lshlrev_b16 v4, 3, s11 -; GFX11-NEXT: v_lshlrev_b16 v0, 2, v0 -; GFX11-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX11-NEXT: v_and_b32_e64 v8, s4, 1 -; GFX11-NEXT: v_or_b32_e32 v3, v6, v5 -; GFX11-NEXT: v_and_b32_e64 v5, s6, 1 -; GFX11-NEXT: v_lshlrev_b16 v6, 1, s5 -; GFX11-NEXT: v_and_b32_e64 v9, s2, 1 -; GFX11-NEXT: v_lshlrev_b16 v10, 1, s1 -; GFX11-NEXT: v_and_b32_e64 v11, s0, 1 -; GFX11-NEXT: v_and_b32_e64 v1, s14, 1 -; GFX11-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX11-NEXT: v_lshlrev_b16 v4, 3, s7 -; GFX11-NEXT: v_lshlrev_b16 v5, 2, v5 -; GFX11-NEXT: v_or_b32_e32 v6, v8, v6 -; 
GFX11-NEXT: v_lshlrev_b16 v8, 3, s3 -; GFX11-NEXT: v_lshlrev_b16 v9, 2, v9 -; GFX11-NEXT: v_or_b32_e32 v10, v11, v10 -; GFX11-NEXT: v_lshlrev_b16 v7, 3, s15 -; GFX11-NEXT: v_lshlrev_b16 v1, 2, v1 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: v_or_b32_e32 v4, v4, v5 -; GFX11-NEXT: v_and_b32_e32 v5, 3, v6 -; GFX11-NEXT: v_or_b32_e32 v6, v8, v9 -; GFX11-NEXT: v_and_b32_e32 v8, 3, v10 -; GFX11-NEXT: v_or_b32_e32 v1, v7, v1 -; GFX11-NEXT: v_and_b32_e32 v2, 3, v2 -; GFX11-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX11-NEXT: v_or_b32_e32 v3, v5, v4 -; GFX11-NEXT: v_or_b32_e32 v4, v8, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX11-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshlrev_b16 v2, 4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 15, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshlrev_b16 v1, 12, v1 -; GFX11-NEXT: v_lshlrev_b16 v0, 8, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v2 -; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX11-NEXT: s_and_b32 s10, s10, 1 +; GFX11-NEXT: s_lshl_b32 s9, s9, 1 +; GFX11-NEXT: s_and_b32 s8, s8, 1 +; GFX11-NEXT: s_and_b32 s6, s6, 1 +; GFX11-NEXT: s_lshl_b32 s5, s5, 1 +; GFX11-NEXT: s_and_b32 s4, s4, 1 +; GFX11-NEXT: s_and_b32 s2, s2, 1 +; GFX11-NEXT: s_lshl_b32 s1, s1, 1 +; GFX11-NEXT: s_and_b32 s0, s0, 1 +; GFX11-NEXT: s_and_b32 s14, s14, 1 +; GFX11-NEXT: s_lshl_b32 s13, s13, 1 +; GFX11-NEXT: s_and_b32 s12, s12, 1 +; GFX11-NEXT: s_lshl_b32 s11, s11, 3 +; GFX11-NEXT: s_lshl_b32 s10, s10, 2 +; GFX11-NEXT: s_or_b32 s8, s8, s9 +; GFX11-NEXT: s_lshl_b32 s7, s7, 3 +; GFX11-NEXT: s_lshl_b32 s6, s6, 2 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_lshl_b32 s3, s3, 3 +; GFX11-NEXT: s_lshl_b32 s2, s2, 2 +; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: s_lshl_b32 s15, s15, 3 +; GFX11-NEXT: s_lshl_b32 s14, s14, 2 +; GFX11-NEXT: s_or_b32 s12, s12, s13 +; GFX11-NEXT: s_or_b32 s9, s11, s10 +; GFX11-NEXT: s_and_b32 s8, s8, 3 +; GFX11-NEXT: s_or_b32 s5, s7, s6 +; GFX11-NEXT: s_and_b32 s4, s4, 3 +; GFX11-NEXT: s_or_b32 s1, s3, s2 +; GFX11-NEXT: s_and_b32 s0, s0, 3 +; GFX11-NEXT: s_or_b32 s13, s15, s14 +; GFX11-NEXT: s_and_b32 s12, s12, 3 +; GFX11-NEXT: s_or_b32 s8, s8, s9 +; GFX11-NEXT: s_or_b32 s2, s4, s5 +; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: s_or_b32 s10, s12, s13 +; GFX11-NEXT: s_and_b32 s8, s8, 15 +; GFX11-NEXT: s_lshl_b32 s1, s2, 4 +; GFX11-NEXT: s_and_b32 s0, s0, 15 +; GFX11-NEXT: s_lshl_b32 s9, s10, 12 +; GFX11-NEXT: s_lshl_b32 s2, s8, 8 +; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: s_or_b32 s1, s9, s2 +; GFX11-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: global_store_b16 v[0:1], v0, off ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -3103,196 +3161,200 @@ define amdgpu_cs void @amdgpu_cs_inreg_v32i1(<32 x i1> inreg %arg0) { ; ; VI-LABEL: amdgpu_cs_inreg_v32i1: ; VI: ; %bb.0: -; VI-NEXT: v_and_b32_e64 v1, s14, 1 -; VI-NEXT: v_lshlrev_b16_e64 v0, 3, s15 -; VI-NEXT: v_lshlrev_b16_e32 v1, 2, v1 -; 
VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: v_lshlrev_b16_e64 v1, 1, s13 -; VI-NEXT: v_and_b32_e64 v2, s12, 1 -; VI-NEXT: v_or_b32_e32 v1, v2, v1 -; VI-NEXT: v_and_b32_e32 v1, 3, v1 -; VI-NEXT: v_and_b32_e64 v2, s10, 1 -; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_lshlrev_b16_e64 v1, 3, s11 -; VI-NEXT: v_lshlrev_b16_e32 v2, 2, v2 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 -; VI-NEXT: v_lshlrev_b16_e64 v2, 1, s9 -; VI-NEXT: v_and_b32_e64 v3, s8, 1 -; VI-NEXT: v_or_b32_e32 v2, v3, v2 -; VI-NEXT: v_and_b32_e32 v2, 3, v2 -; VI-NEXT: v_or_b32_e32 v1, v2, v1 -; VI-NEXT: v_mov_b32_e32 v2, 15 -; VI-NEXT: v_lshlrev_b16_e32 v0, 12, v0 -; VI-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_and_b32_e64 v3, s6, 1 -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: v_lshlrev_b16_e64 v1, 3, s7 -; VI-NEXT: v_lshlrev_b16_e32 v3, 2, v3 -; VI-NEXT: v_or_b32_e32 v1, v1, v3 -; VI-NEXT: v_lshlrev_b16_e64 v3, 1, s5 -; VI-NEXT: v_and_b32_e64 v4, s4, 1 -; VI-NEXT: v_or_b32_e32 v3, v4, v3 -; VI-NEXT: v_and_b32_e32 v3, 3, v3 -; VI-NEXT: v_and_b32_e64 v4, s2, 1 -; VI-NEXT: v_or_b32_e32 v1, v3, v1 -; VI-NEXT: v_lshlrev_b16_e64 v3, 3, s3 -; VI-NEXT: v_lshlrev_b16_e32 v4, 2, v4 -; VI-NEXT: v_or_b32_e32 v3, v3, v4 -; VI-NEXT: v_lshlrev_b16_e64 v4, 1, s1 -; VI-NEXT: v_and_b32_e64 v5, s0, 1 -; VI-NEXT: v_or_b32_e32 v4, v5, v4 -; VI-NEXT: v_and_b32_e32 v4, 3, v4 -; VI-NEXT: v_or_b32_e32 v3, v4, v3 -; VI-NEXT: v_lshlrev_b16_e32 v1, 4, v1 -; VI-NEXT: v_and_b32_e32 v3, 15, v3 -; VI-NEXT: v_or_b32_e32 v1, v3, v1 -; VI-NEXT: v_and_b32_e64 v3, s30, 1 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e64 v1, 3, s31 -; VI-NEXT: v_lshlrev_b16_e32 v3, 2, v3 -; VI-NEXT: v_or_b32_e32 v1, v1, v3 -; VI-NEXT: v_lshlrev_b16_e64 v3, 1, s29 -; VI-NEXT: v_and_b32_e64 v4, s28, 1 -; VI-NEXT: v_or_b32_e32 v3, v4, v3 -; VI-NEXT: v_and_b32_e32 v3, 3, v3 -; VI-NEXT: v_and_b32_e64 v4, s26, 1 -; VI-NEXT: v_or_b32_e32 v1, v3, v1 -; VI-NEXT: v_lshlrev_b16_e64 v3, 3, s27 -; VI-NEXT: v_lshlrev_b16_e32 v4, 2, v4 -; VI-NEXT: v_or_b32_e32 v3, v3, v4 -; VI-NEXT: v_lshlrev_b16_e64 v4, 1, s25 -; VI-NEXT: v_and_b32_e64 v5, s24, 1 -; VI-NEXT: v_or_b32_e32 v4, v5, v4 -; VI-NEXT: v_and_b32_e32 v4, 3, v4 -; VI-NEXT: v_or_b32_e32 v3, v4, v3 -; VI-NEXT: v_lshlrev_b16_e32 v1, 12, v1 -; VI-NEXT: v_and_b32_sdwa v2, v3, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_and_b32_e64 v3, s22, 1 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 -; VI-NEXT: v_lshlrev_b16_e64 v2, 3, s23 -; VI-NEXT: v_lshlrev_b16_e32 v3, 2, v3 -; VI-NEXT: v_or_b32_e32 v2, v2, v3 -; VI-NEXT: v_lshlrev_b16_e64 v3, 1, s21 -; VI-NEXT: v_and_b32_e64 v4, s20, 1 -; VI-NEXT: v_or_b32_e32 v3, v4, v3 -; VI-NEXT: v_and_b32_e32 v3, 3, v3 -; VI-NEXT: v_and_b32_e64 v4, s18, 1 -; VI-NEXT: v_or_b32_e32 v2, v3, v2 -; VI-NEXT: v_lshlrev_b16_e64 v3, 3, s19 -; VI-NEXT: v_lshlrev_b16_e32 v4, 2, v4 -; VI-NEXT: v_or_b32_e32 v3, v3, v4 -; VI-NEXT: v_lshlrev_b16_e64 v4, 1, s17 -; VI-NEXT: v_and_b32_e64 v5, s16, 1 -; VI-NEXT: v_or_b32_e32 v4, v5, v4 -; VI-NEXT: v_and_b32_e32 v4, 3, v4 -; VI-NEXT: v_or_b32_e32 v3, v4, v3 -; VI-NEXT: v_lshlrev_b16_e32 v2, 4, v2 -; VI-NEXT: v_and_b32_e32 v3, 15, v3 -; VI-NEXT: v_or_b32_e32 v2, v3, v2 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s26, 
s26, 1 +; VI-NEXT: s_lshl_b32 s25, s25, 1 +; VI-NEXT: s_and_b32 s24, s24, 1 +; VI-NEXT: s_and_b32 s22, s22, 1 +; VI-NEXT: s_lshl_b32 s21, s21, 1 +; VI-NEXT: s_and_b32 s20, s20, 1 +; VI-NEXT: s_and_b32 s18, s18, 1 +; VI-NEXT: s_lshl_b32 s17, s17, 1 +; VI-NEXT: s_and_b32 s16, s16, 1 +; VI-NEXT: s_and_b32 s10, s10, 1 +; VI-NEXT: s_lshl_b32 s9, s9, 1 +; VI-NEXT: s_and_b32 s8, s8, 1 +; VI-NEXT: s_and_b32 s6, s6, 1 +; VI-NEXT: s_lshl_b32 s5, s5, 1 +; VI-NEXT: s_and_b32 s4, s4, 1 +; VI-NEXT: s_and_b32 s2, s2, 1 +; VI-NEXT: s_lshl_b32 s1, s1, 1 +; VI-NEXT: s_and_b32 s0, s0, 1 +; VI-NEXT: s_and_b32 s30, s30, 1 +; VI-NEXT: s_lshl_b32 s29, s29, 1 +; VI-NEXT: s_and_b32 s28, s28, 1 +; VI-NEXT: s_lshl_b32 s27, s27, 3 +; VI-NEXT: s_lshl_b32 s26, s26, 2 +; VI-NEXT: s_or_b32 s24, s24, s25 +; VI-NEXT: s_lshl_b32 s23, s23, 3 +; VI-NEXT: s_lshl_b32 s22, s22, 2 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_lshl_b32 s19, s19, 3 +; VI-NEXT: s_lshl_b32 s18, s18, 2 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s14, s14, 1 +; VI-NEXT: s_lshl_b32 s13, s13, 1 +; VI-NEXT: s_and_b32 s12, s12, 1 +; VI-NEXT: s_lshl_b32 s11, s11, 3 +; VI-NEXT: s_lshl_b32 s10, s10, 2 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_lshl_b32 s7, s7, 3 +; VI-NEXT: s_lshl_b32 s6, s6, 2 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_lshl_b32 s3, s3, 3 +; VI-NEXT: s_lshl_b32 s2, s2, 2 +; VI-NEXT: s_or_b32 s0, s0, s1 +; VI-NEXT: s_lshl_b32 s31, s31, 3 +; VI-NEXT: s_lshl_b32 s30, s30, 2 +; VI-NEXT: s_or_b32 s28, s28, s29 +; VI-NEXT: s_or_b32 s26, s27, s26 +; VI-NEXT: s_and_b32 s24, s24, 3 +; VI-NEXT: s_or_b32 s22, s23, s22 +; VI-NEXT: s_and_b32 s20, s20, 3 +; VI-NEXT: s_or_b32 s18, s19, s18 +; VI-NEXT: s_and_b32 s16, s16, 3 +; VI-NEXT: s_lshl_b32 s15, s15, 3 +; VI-NEXT: s_lshl_b32 s14, s14, 2 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: s_or_b32 s10, s11, s10 +; VI-NEXT: s_and_b32 s8, s8, 3 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s4, s4, 3 +; VI-NEXT: s_or_b32 s2, s3, s2 +; VI-NEXT: s_and_b32 s0, s0, 3 +; VI-NEXT: s_or_b32 s30, s31, s30 +; VI-NEXT: s_and_b32 s28, s28, 3 +; VI-NEXT: s_or_b32 s24, s24, s26 +; VI-NEXT: s_or_b32 s20, s20, s22 +; VI-NEXT: s_or_b32 s16, s16, s18 +; VI-NEXT: s_or_b32 s14, s15, s14 +; VI-NEXT: s_and_b32 s12, s12, 3 +; VI-NEXT: s_or_b32 s8, s8, s10 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: s_or_b32 s0, s0, s2 +; VI-NEXT: s_or_b32 s28, s28, s30 +; VI-NEXT: s_and_b32 s24, s24, 15 +; VI-NEXT: s_lshl_b32 s20, s20, 4 +; VI-NEXT: s_and_b32 s16, s16, 15 +; VI-NEXT: s_or_b32 s12, s12, s14 +; VI-NEXT: s_and_b32 s8, s8, 15 +; VI-NEXT: s_lshl_b32 s4, s4, 4 +; VI-NEXT: s_and_b32 s0, s0, 15 +; VI-NEXT: s_lshl_b32 s28, s28, 12 +; VI-NEXT: s_lshl_b32 s24, s24, 8 +; VI-NEXT: s_or_b32 s16, s16, s20 +; VI-NEXT: s_lshl_b32 s12, s12, 12 +; VI-NEXT: s_lshl_b32 s8, s8, 8 +; VI-NEXT: s_or_b32 s0, s0, s4 +; VI-NEXT: s_or_b32 s24, s28, s24 +; VI-NEXT: s_and_b32 s16, s16, 0xff +; VI-NEXT: s_or_b32 s8, s12, s8 +; VI-NEXT: s_and_b32 s0, s0, 0xff +; VI-NEXT: s_or_b32 s16, s16, s24 +; VI-NEXT: s_or_b32 s0, s0, s8 +; VI-NEXT: s_lshl_b32 s16, s16, 16 +; VI-NEXT: s_and_b32 s0, s0, 0xffff +; VI-NEXT: s_or_b32 s0, s0, s16 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: flat_store_dword v[0:1], v0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: amdgpu_cs_inreg_v32i1: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_and_b32_e64 v0, s14, 1 -; GFX11-NEXT: v_lshlrev_b16 v1, 1, s13 -; GFX11-NEXT: v_and_b32_e64 v2, s12, 1 -; GFX11-NEXT: v_lshlrev_b16 v3, 3, s15 -; GFX11-NEXT: v_lshlrev_b16 v4, 1, s9 -; GFX11-NEXT: v_lshlrev_b16 v0, 2, v0 -; GFX11-NEXT: 
v_and_b32_e64 v5, s8, 1 -; GFX11-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX11-NEXT: v_and_b32_e64 v2, s10, 1 -; GFX11-NEXT: v_lshlrev_b16 v6, 1, s5 -; GFX11-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX11-NEXT: v_lshlrev_b16 v3, 3, s11 -; GFX11-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX11-NEXT: v_lshlrev_b16 v2, 2, v2 -; GFX11-NEXT: v_and_b32_e64 v5, s6, 1 -; GFX11-NEXT: v_and_b32_e64 v7, s4, 1 -; GFX11-NEXT: v_lshlrev_b16 v8, 1, s1 -; GFX11-NEXT: v_and_b32_e64 v9, s0, 1 -; GFX11-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX11-NEXT: v_lshlrev_b16 v4, 3, s7 -; GFX11-NEXT: v_lshlrev_b16 v5, 2, v5 -; GFX11-NEXT: v_or_b32_e32 v6, v7, v6 -; GFX11-NEXT: v_and_b32_e64 v7, s2, 1 -; GFX11-NEXT: v_and_b32_e32 v1, 3, v1 -; GFX11-NEXT: v_or_b32_e32 v8, v9, v8 -; GFX11-NEXT: v_or_b32_e32 v4, v4, v5 -; GFX11-NEXT: v_and_b32_e32 v5, 3, v6 -; GFX11-NEXT: v_lshlrev_b16 v6, 3, s3 -; GFX11-NEXT: v_lshlrev_b16 v7, 2, v7 -; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX11-NEXT: v_or_b32_e32 v1, v3, v2 -; GFX11-NEXT: v_or_b32_e32 v2, v5, v4 -; GFX11-NEXT: v_and_b32_e32 v4, 3, v8 -; GFX11-NEXT: v_or_b32_e32 v3, v6, v7 -; GFX11-NEXT: v_lshlrev_b16 v6, 1, s29 -; GFX11-NEXT: v_and_b32_e64 v7, s28, 1 -; GFX11-NEXT: v_lshlrev_b16 v9, 1, s25 -; GFX11-NEXT: v_and_b32_e64 v10, s24, 1 -; GFX11-NEXT: v_or_b32_e32 v3, v4, v3 -; GFX11-NEXT: v_and_b32_e64 v4, s26, 1 -; GFX11-NEXT: v_lshlrev_b16 v8, 3, s27 -; GFX11-NEXT: v_or_b32_e32 v6, v7, v6 -; GFX11-NEXT: v_or_b32_e32 v7, v10, v9 -; GFX11-NEXT: v_and_b32_e64 v9, s22, 1 -; GFX11-NEXT: v_lshlrev_b16 v4, 2, v4 -; GFX11-NEXT: v_lshlrev_b16 v10, 1, s21 -; GFX11-NEXT: v_and_b32_e64 v12, s20, 1 -; GFX11-NEXT: v_and_b32_e64 v13, s18, 1 -; GFX11-NEXT: v_lshlrev_b16 v14, 1, s17 -; GFX11-NEXT: v_and_b32_e64 v15, s16, 1 -; GFX11-NEXT: v_and_b32_e64 v5, s30, 1 -; GFX11-NEXT: v_or_b32_e32 v4, v8, v4 -; GFX11-NEXT: v_lshlrev_b16 v8, 3, s23 -; GFX11-NEXT: v_lshlrev_b16 v9, 2, v9 -; GFX11-NEXT: v_or_b32_e32 v10, v12, v10 -; GFX11-NEXT: v_lshlrev_b16 v12, 3, s19 -; GFX11-NEXT: v_lshlrev_b16 v13, 2, v13 -; GFX11-NEXT: v_or_b32_e32 v14, v15, v14 -; GFX11-NEXT: v_lshlrev_b16 v11, 3, s31 -; GFX11-NEXT: v_lshlrev_b16 v5, 2, v5 -; GFX11-NEXT: v_and_b32_e32 v7, 3, v7 -; GFX11-NEXT: v_or_b32_e32 v8, v8, v9 -; GFX11-NEXT: v_and_b32_e32 v9, 3, v10 -; GFX11-NEXT: v_or_b32_e32 v10, v12, v13 -; GFX11-NEXT: v_and_b32_e32 v12, 3, v14 -; GFX11-NEXT: v_or_b32_e32 v5, v11, v5 -; GFX11-NEXT: v_and_b32_e32 v6, 3, v6 -; GFX11-NEXT: v_or_b32_e32 v4, v7, v4 -; GFX11-NEXT: v_or_b32_e32 v7, v9, v8 -; GFX11-NEXT: v_or_b32_e32 v8, v12, v10 -; GFX11-NEXT: v_and_b32_e32 v1, 15, v1 -; GFX11-NEXT: v_lshlrev_b16 v2, 4, v2 -; GFX11-NEXT: v_and_b32_e32 v3, 15, v3 -; GFX11-NEXT: v_or_b32_e32 v5, v6, v5 -; GFX11-NEXT: v_and_b32_e32 v4, 15, v4 -; GFX11-NEXT: v_lshlrev_b16 v6, 4, v7 -; GFX11-NEXT: v_and_b32_e32 v7, 15, v8 -; GFX11-NEXT: v_lshlrev_b16 v0, 12, v0 -; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 -; GFX11-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX11-NEXT: v_lshlrev_b16 v3, 12, v5 -; GFX11-NEXT: v_lshlrev_b16 v4, 8, v4 -; GFX11-NEXT: v_or_b32_e32 v5, v7, v6 -; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_or_b32_e32 v2, v3, v4 -; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX11-NEXT: v_or_b32_e32 v1, v3, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: s_and_b32 s10, s10, 1 +; GFX11-NEXT: s_lshl_b32 s9, s9, 1 +; GFX11-NEXT: s_and_b32 s8, s8, 1 +; GFX11-NEXT: s_and_b32 s14, s14, 1 +; GFX11-NEXT: s_lshl_b32 s13, s13, 1 +; GFX11-NEXT: s_and_b32 s12, s12, 1 +; GFX11-NEXT: s_lshl_b32 s11, s11, 3 +; GFX11-NEXT: s_lshl_b32 s10, s10, 2 +; GFX11-NEXT: s_or_b32 s8, s8, s9 +; GFX11-NEXT: s_and_b32 s6, s6, 1 +; GFX11-NEXT: s_lshl_b32 s5, s5, 1 +; GFX11-NEXT: s_and_b32 s4, s4, 1 +; GFX11-NEXT: s_and_b32 s2, s2, 1 +; GFX11-NEXT: s_lshl_b32 s1, s1, 1 +; GFX11-NEXT: s_and_b32 s0, s0, 1 +; GFX11-NEXT: s_lshl_b32 s15, s15, 3 +; GFX11-NEXT: s_lshl_b32 s14, s14, 2 +; GFX11-NEXT: s_or_b32 s12, s12, s13 +; GFX11-NEXT: s_or_b32 s9, s11, s10 +; GFX11-NEXT: s_and_b32 s8, s8, 3 +; GFX11-NEXT: s_lshl_b32 s7, s7, 3 +; GFX11-NEXT: s_lshl_b32 s6, s6, 2 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_lshl_b32 s3, s3, 3 +; GFX11-NEXT: s_lshl_b32 s2, s2, 2 +; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: s_or_b32 s13, s15, s14 +; GFX11-NEXT: s_and_b32 s12, s12, 3 +; GFX11-NEXT: s_or_b32 s8, s8, s9 +; GFX11-NEXT: s_or_b32 s5, s7, s6 +; GFX11-NEXT: s_and_b32 s4, s4, 3 +; GFX11-NEXT: s_or_b32 s1, s3, s2 +; GFX11-NEXT: s_and_b32 s0, s0, 3 +; GFX11-NEXT: s_or_b32 s10, s12, s13 +; GFX11-NEXT: s_and_b32 s8, s8, 15 +; GFX11-NEXT: s_or_b32 s2, s4, s5 +; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: s_lshl_b32 s9, s10, 12 +; GFX11-NEXT: s_lshl_b32 s1, s2, 4 +; GFX11-NEXT: s_and_b32 s0, s0, 15 +; GFX11-NEXT: s_lshl_b32 s2, s8, 8 +; GFX11-NEXT: s_and_b32 s3, s30, 1 +; GFX11-NEXT: s_lshl_b32 s4, s29, 1 +; GFX11-NEXT: s_and_b32 s5, s28, 1 +; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: s_or_b32 s1, s9, s2 +; GFX11-NEXT: s_lshl_b32 s2, s31, 3 +; GFX11-NEXT: s_lshl_b32 s3, s3, 2 +; GFX11-NEXT: s_or_b32 s4, s5, s4 +; GFX11-NEXT: s_and_b32 s5, s26, 1 +; GFX11-NEXT: s_lshl_b32 s6, s25, 1 +; GFX11-NEXT: s_and_b32 s7, s24, 1 +; GFX11-NEXT: s_or_b32 s2, s2, s3 +; GFX11-NEXT: s_and_b32 s3, s4, 3 +; GFX11-NEXT: s_lshl_b32 s4, s27, 3 +; GFX11-NEXT: s_lshl_b32 s5, s5, 2 +; GFX11-NEXT: s_or_b32 s6, s7, s6 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_and_b32 s5, s6, 3 +; GFX11-NEXT: s_or_b32 s2, s3, s2 +; GFX11-NEXT: s_or_b32 s3, s5, s4 +; GFX11-NEXT: s_and_b32 s5, s22, 1 +; GFX11-NEXT: s_lshl_b32 s6, s21, 1 +; GFX11-NEXT: s_and_b32 s7, s20, 1 +; GFX11-NEXT: s_lshl_b32 s4, s23, 3 +; GFX11-NEXT: s_lshl_b32 s5, s5, 2 +; GFX11-NEXT: s_or_b32 s6, s7, s6 +; GFX11-NEXT: s_and_b32 s7, s18, 1 +; GFX11-NEXT: s_lshl_b32 s8, s17, 1 +; GFX11-NEXT: s_and_b32 s9, s16, 1 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_and_b32 s5, s6, 3 +; GFX11-NEXT: s_lshl_b32 s6, s19, 3 +; GFX11-NEXT: s_lshl_b32 s7, s7, 2 +; GFX11-NEXT: s_or_b32 s8, s9, s8 +; GFX11-NEXT: s_or_b32 s6, s6, s7 +; GFX11-NEXT: s_and_b32 s7, s8, 3 +; GFX11-NEXT: s_or_b32 s4, s5, s4 +; GFX11-NEXT: s_or_b32 s5, s7, s6 +; GFX11-NEXT: s_and_b32 s3, s3, 15 +; GFX11-NEXT: s_lshl_b32 s4, s4, 4 +; GFX11-NEXT: s_and_b32 s5, s5, 15 +; GFX11-NEXT: s_lshl_b32 s2, s2, 12 +; GFX11-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-NEXT: s_or_b32 s4, s5, s4 +; GFX11-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-NEXT: s_or_b32 s2, s2, s3 +; GFX11-NEXT: s_and_b32 s3, s4, 0xff +; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: s_or_b32 s1, s3, s2 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: global_store_b32 v[0:1], v0, off ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) diff --git a/llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll b/llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll index 237e06def15763..6505e390355a8c 100644 --- a/llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll +++ b/llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll @@ -124,11 +124,11 @@ ret: ; GCN: s_cbranch_scc{{[0-1]}} ; SI: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x70004 -; VI: v_mov_b32_e32 v{{[0-9]+}}, 0x7f +; VI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x7f ; GCN: .LBB2_3: ; SI: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80004 -; VI: v_mov_b32_e32 v{{[0-9]+}}, 0xff +; VI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0xff ; GCN: buffer_store_short ; GCN: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll index 9e5dbe91504a0c..456d0ffd48e7f1 100644 --- a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll +++ b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll @@ -539,7 +539,7 @@ define amdgpu_kernel void @test_copy_v3i8_align1(ptr addrspace(1) %out, ptr addr ; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: buffer_store_byte v1, off, s[4:7], 0 offset:2 -; VI-NEXT: v_lshrrev_b16_e32 v0, 8, v0 +; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0 offset:1 ; VI-NEXT: s_endpgm %val = load <3 x i8>, ptr addrspace(1) %in, align 1 diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll index 2168e7fe1dd285..4588bee49f037f 100644 --- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll @@ -608,8 +608,9 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa ; VI-NEXT: flat_load_ubyte v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v0 +; VI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; VI-NEXT: v_ffbh_u32_e32 v1, v1 -; VI-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v2, 32, v1, vcc ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -708,15 +709,19 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no ; VI-NEXT: flat_load_ubyte v2, v[2:3] ; VI-NEXT: flat_load_ubyte v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v2 +; VI-NEXT: v_readfirstlane_b32 s2, v2 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 -; VI-NEXT: v_ffbh_u32_e32 v1, v1 -; VI-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0 -; VI-NEXT: v_cndmask_b32_e32 v2, 32, v1, vcc +; VI-NEXT: v_readfirstlane_b32 s3, v0 +; VI-NEXT: s_lshl_b32 s2, s2, 8 +; VI-NEXT: s_or_b32 s2, s2, s3 +; VI-NEXT: s_lshl_b32 s3, s2, 16 +; VI-NEXT: s_and_b32 s2, s2, 0xffff +; VI-NEXT: s_flbit_i32_b32 s3, s3 +; VI-NEXT: s_cmp_lg_u32 s2, 0 +; VI-NEXT: s_cselect_b32 s2, s3, 32 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/cttz.ll b/llvm/test/CodeGen/AMDGPU/cttz.ll index 14e6c4bcf6d8fe..ccd23a91c35733 100644 --- a/llvm/test/CodeGen/AMDGPU/cttz.ll +++ b/llvm/test/CodeGen/AMDGPU/cttz.ll @@ -1407,8 +1407,9 @@ define amdgpu_kernel void 
@v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_e32 v2, 0x10000, v0 +; VI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; VI-NEXT: v_ffbl_b32_e32 v2, v2 -; VI-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm @@ -1448,7 +1449,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_or_b32_e32 v2, 0x10000, v1 -; GFX10-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0, v1 +; GFX10-NEXT: v_cmp_ne_u32_sdwa vcc_lo, v1, v0 src0_sel:WORD_0 src1_sel:DWORD ; GFX10-NEXT: v_ffbl_b32_e32 v2, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo ; GFX10-NEXT: global_store_short v0, v1, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll index 81ed823bad2044..4c7c8bc1c027d7 100644 --- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll @@ -584,7 +584,8 @@ define amdgpu_kernel void @v_cttz_zero_undef_i8_with_select(ptr addrspace(1) noa ; VI-NEXT: flat_load_ubyte v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbl_b32_e32 v1, v0 -; VI-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0 +; VI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v2, 32, v1, vcc ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -679,11 +680,12 @@ define amdgpu_kernel void @v_cttz_zero_undef_i16_with_select(ptr addrspace(1) no ; VI-NEXT: flat_load_ubyte v2, v[2:3] ; VI-NEXT: flat_load_ubyte v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v2 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_e32 v0, v1, v0 ; VI-NEXT: v_ffbl_b32_e32 v1, v0 -; VI-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0 +; VI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v2, 32, v1, vcc ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1461,8 +1463,9 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; VI-NEXT: v_mov_b32_e32 v1, 0xff ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_e32 v2, 0x100, v0 +; VI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; VI-NEXT: v_ffbl_b32_e32 v2, v2 -; VI-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1554,17 +1557,20 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_load_ubyte v2, v[2:3] ; VI-NEXT: flat_load_ubyte v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, 0xffff ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_readfirstlane_b32 s2, v2 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_e32 v0, v2, v0 -; VI-NEXT: v_or_b32_e32 v2, 0x10000, v0 -; VI-NEXT: v_ffbl_b32_e32 v2, v2 -; VI-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0 -; VI-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc +; VI-NEXT: v_readfirstlane_b32 s3, v0 +; VI-NEXT: s_lshl_b32 s2, s2, 8 +; VI-NEXT: s_or_b32 s2, s2, s3 +; VI-NEXT: s_or_b32 s3, s2, 0x10000 +; VI-NEXT: s_and_b32 s2, s2, 0xffff +; VI-NEXT: s_ff1_i32_b32 s3, s3 +; VI-NEXT: s_cmp_lg_u32 s2, 0 +; VI-NEXT: s_cselect_b32 s2, s3, 0xffff ; 
VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-select.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-select.ll index 4b1eb7cb08e306..f4d8ec180cf916 100644 --- a/llvm/test/CodeGen/AMDGPU/dagcombine-select.ll +++ b/llvm/test/CodeGen/AMDGPU/dagcombine-select.ll @@ -126,7 +126,7 @@ define amdgpu_kernel void @sel_constants_sub_constant_sel_constants(ptr addrspac } ; GCN-LABEL: {{^}}sel_constants_sub_constant_sel_constants_i16: -; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 2, 9, +; GCN: s_cselect_b32 s{{[0-9]+}}, 9, 2 define amdgpu_kernel void @sel_constants_sub_constant_sel_constants_i16(ptr addrspace(1) %p, i1 %cond) { %sel = select i1 %cond, i16 -4, i16 3 %bo = sub i16 5, %sel @@ -135,8 +135,7 @@ define amdgpu_kernel void @sel_constants_sub_constant_sel_constants_i16(ptr addr } ; GCN-LABEL: {{^}}sel_constants_sub_constant_sel_constants_i16_neg: -; GCN: v_mov_b32_e32 [[F:v[0-9]+]], 0xfffff449 -; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, [[F]], -3, +; GCN: s_cselect_b32 s[[SGPR:[0-9]+]], s[[SGPR]], 0xf449 define amdgpu_kernel void @sel_constants_sub_constant_sel_constants_i16_neg(ptr addrspace(1) %p, i1 %cond) { %sel = select i1 %cond, i16 4, i16 3000 %bo = sub i16 1, %sel diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll index 81d6ce219b5d51..54ec7578700df8 100644 --- a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll @@ -232,28 +232,22 @@ define amdgpu_kernel void @short8_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_eq_u32 s4, 1 -; GCN-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN-NEXT: s_cselect_b32 s2, 2, 1 ; GCN-NEXT: s_cmp_lg_u32 s4, 2 -; GCN-NEXT: v_cndmask_b32_e64 v0, 1, 2, s[2:3] -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 3 ; GCN-NEXT: s_cmp_lg_u32 s4, 3 -; GCN-NEXT: v_cndmask_b32_e32 v0, 3, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 4 ; GCN-NEXT: s_cmp_lg_u32 s4, 4 -; GCN-NEXT: v_cndmask_b32_e32 v0, 4, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 5 ; GCN-NEXT: s_cmp_lg_u32 s4, 5 -; GCN-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 6 ; GCN-NEXT: s_cmp_lg_u32 s4, 6 -; GCN-NEXT: v_cndmask_b32_e32 v0, 6, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 7 ; GCN-NEXT: s_cmp_lg_u32 s4, 7 -; GCN-NEXT: v_cndmask_b32_e32 v0, 7, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v2, 8, v0, vcc +; GCN-NEXT: s_cselect_b32 s2, s2, 8 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: flat_store_short v[0:1], v2 ; GCN-NEXT: s_endpgm entry: @@ -668,52 +662,38 @@ define amdgpu_kernel void @byte16_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_eq_u32 s4, 1 -; GCN-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN-NEXT: s_cselect_b32 s2, 2, 1 ; GCN-NEXT: s_cmp_lg_u32 s4, 2 -; GCN-NEXT: v_cndmask_b32_e64 v0, 1, 2, s[2:3] -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 3 ; GCN-NEXT: s_cmp_lg_u32 s4, 3 -; GCN-NEXT: v_cndmask_b32_e32 v0, 3, v0, vcc -; GCN-NEXT: 
s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 4 ; GCN-NEXT: s_cmp_lg_u32 s4, 4 -; GCN-NEXT: v_cndmask_b32_e32 v0, 4, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 5 ; GCN-NEXT: s_cmp_lg_u32 s4, 5 -; GCN-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 6 ; GCN-NEXT: s_cmp_lg_u32 s4, 6 -; GCN-NEXT: v_cndmask_b32_e32 v0, 6, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 7 ; GCN-NEXT: s_cmp_lg_u32 s4, 7 -; GCN-NEXT: v_cndmask_b32_e32 v0, 7, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 8 ; GCN-NEXT: s_cmp_lg_u32 s4, 8 -; GCN-NEXT: v_cndmask_b32_e32 v0, 8, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 9 ; GCN-NEXT: s_cmp_lg_u32 s4, 9 -; GCN-NEXT: v_cndmask_b32_e32 v0, 9, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 10 ; GCN-NEXT: s_cmp_lg_u32 s4, 10 -; GCN-NEXT: v_cndmask_b32_e32 v0, 10, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 11 ; GCN-NEXT: s_cmp_lg_u32 s4, 11 -; GCN-NEXT: v_cndmask_b32_e32 v0, 11, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 12 ; GCN-NEXT: s_cmp_lg_u32 s4, 12 -; GCN-NEXT: v_cndmask_b32_e32 v0, 12, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 13 ; GCN-NEXT: s_cmp_lg_u32 s4, 13 -; GCN-NEXT: v_cndmask_b32_e32 v0, 13, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 14 ; GCN-NEXT: s_cmp_lg_u32 s4, 14 -; GCN-NEXT: v_cndmask_b32_e32 v0, 14, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 15 ; GCN-NEXT: s_cmp_lg_u32 s4, 15 -; GCN-NEXT: v_cndmask_b32_e32 v0, 15, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v2, 16, v0, vcc +; GCN-NEXT: s_cselect_b32 s2, s2, 16 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: flat_store_byte v[0:1], v2 ; GCN-NEXT: s_endpgm entry: @@ -751,388 +731,264 @@ define amdgpu_kernel void @bit128_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_lg_u32 s4, 1 ; GCN-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s4, 2 ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_lg_u32 s4, 2 +; GCN-NEXT: v_readfirstlane_b32 s2, v0 +; GCN-NEXT: s_cselect_b32 s2, s2, 1 ; GCN-NEXT: s_cmp_lg_u32 s4, 3 -; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 0 ; GCN-NEXT: s_cmp_lg_u32 s4, 4 -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 1 ; GCN-NEXT: s_cmp_lg_u32 s4, 5 -; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 0 ; GCN-NEXT: s_cmp_lg_u32 s4, 6 -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 1 ; GCN-NEXT: s_cmp_lg_u32 s4, 7 -; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 0 ; GCN-NEXT: s_cmp_lg_u32 s4, 8 -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 1 ; GCN-NEXT: s_cmp_lg_u32 s4, 9 -; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: 
s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 0 ; GCN-NEXT: s_cmp_lg_u32 s4, 10 -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 1 ; GCN-NEXT: s_cmp_lg_u32 s4, 11 -; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 0 ; GCN-NEXT: s_cmp_lg_u32 s4, 12 -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 1 ; GCN-NEXT: s_cmp_lg_u32 s4, 13 -; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 0 ; GCN-NEXT: s_cmp_lg_u32 s4, 14 -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 1 ; GCN-NEXT: s_cmp_lg_u32 s4, 15 -; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 0 ; GCN-NEXT: s_cmp_lg_u32 s4, 16 -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 1 ; GCN-NEXT: s_cmp_lg_u32 s4, 17 -; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 0 ; GCN-NEXT: s_cmp_lg_u32 s4, 18 -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 1 ; GCN-NEXT: s_cmp_lg_u32 s4, 19 -; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 0 ; GCN-NEXT: s_cmp_lg_u32 s4, 20 -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 1 ; GCN-NEXT: s_cmp_lg_u32 s4, 21 -; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 0 ; GCN-NEXT: s_cmp_lg_u32 s4, 22 -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 1 ; GCN-NEXT: s_cmp_lg_u32 s4, 23 -; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 0 ; GCN-NEXT: s_cmp_lg_u32 s4, 24 -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 1 ; GCN-NEXT: s_cmp_lg_u32 s4, 25 -; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 0 ; GCN-NEXT: s_cmp_lg_u32 s4, 26 -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 1 ; GCN-NEXT: s_cmp_lg_u32 s4, 27 -; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 0 ; GCN-NEXT: s_cmp_lg_u32 s4, 28 -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 1 ; GCN-NEXT: s_cmp_lg_u32 s4, 29 -; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 0 ; GCN-NEXT: s_cmp_lg_u32 s4, 30 -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 1 ; GCN-NEXT: s_cmp_lg_u32 s4, 31 -; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 0 ; GCN-NEXT: s_cmp_lg_u32 s4, 32 -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; 
GCN-NEXT: s_cselect_b32 s2, s2, 1 ; GCN-NEXT: s_cmp_lg_u32 s4, 33 -; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 0 ; GCN-NEXT: s_cmp_lg_u32 s4, 34 -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 1 ; GCN-NEXT: s_cmp_lg_u32 s4, 35 -; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 0 ; GCN-NEXT: s_cmp_lg_u32 s4, 36 -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 1 ; GCN-NEXT: s_cmp_lg_u32 s4, 37 -; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 0 ; GCN-NEXT: s_cmp_lg_u32 s4, 38 -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 1 ; GCN-NEXT: s_cmp_lg_u32 s4, 39 -; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 0 ; GCN-NEXT: s_cmp_lg_u32 s4, 40 -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 1 ; GCN-NEXT: s_cmp_lg_u32 s4, 41 -; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 0 ; GCN-NEXT: s_cmp_lg_u32 s4, 42 -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 1 ; GCN-NEXT: s_cmp_lg_u32 s4, 43 -; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 0 ; GCN-NEXT: s_cmp_lg_u32 s4, 44 -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 1 ; GCN-NEXT: s_cmp_lg_u32 s4, 45 -; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 0 ; GCN-NEXT: s_cmp_lg_u32 s4, 46 -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 1 ; GCN-NEXT: s_cmp_lg_u32 s4, 47 -; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 0 ; GCN-NEXT: s_cmp_lg_u32 s4, 48 -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 1 ; GCN-NEXT: s_cmp_lg_u32 s4, 49 -; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 0 ; GCN-NEXT: s_cmp_lg_u32 s4, 50 -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 1 ; GCN-NEXT: s_cmp_lg_u32 s4, 51 -; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 0 ; GCN-NEXT: s_cmp_lg_u32 s4, 52 -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 1 ; GCN-NEXT: s_cmp_lg_u32 s4, 53 -; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 0 ; GCN-NEXT: s_cmp_lg_u32 s4, 54 -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 1 ; GCN-NEXT: s_cmp_lg_u32 s4, 55 -; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 0 ; 
GCN-NEXT: s_cmp_lg_u32 s4, 56 -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 1 ; GCN-NEXT: s_cmp_lg_u32 s4, 57 -; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 0 ; GCN-NEXT: s_cmp_lg_u32 s4, 58 -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 1 ; GCN-NEXT: s_cmp_lg_u32 s4, 59 -; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 0 ; GCN-NEXT: s_cmp_lg_u32 s4, 60 -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 1 ; GCN-NEXT: s_cmp_lg_u32 s4, 61 -; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 0 ; GCN-NEXT: s_cmp_lg_u32 s4, 62 -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 1 ; GCN-NEXT: s_cmp_lg_u32 s4, 63 -; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 0 ; GCN-NEXT: s_cmp_lg_u32 s4, 64 -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 1 ; GCN-NEXT: s_cmpk_lg_i32 s4, 0x41 -; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 0 ; GCN-NEXT: s_cmpk_lg_i32 s4, 0x42 -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 1 ; GCN-NEXT: s_cmpk_lg_i32 s4, 0x43 -; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 0 ; GCN-NEXT: s_cmpk_lg_i32 s4, 0x44 -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 1 ; GCN-NEXT: s_cmpk_lg_i32 s4, 0x45 -; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 0 ; GCN-NEXT: s_cmpk_lg_i32 s4, 0x46 -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 1 ; GCN-NEXT: s_cmpk_lg_i32 s4, 0x47 -; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 0 ; GCN-NEXT: s_cmpk_lg_i32 s4, 0x48 -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 1 ; GCN-NEXT: s_cmpk_lg_i32 s4, 0x49 -; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 0 ; GCN-NEXT: s_cmpk_lg_i32 s4, 0x4a -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 1 ; GCN-NEXT: s_cmpk_lg_i32 s4, 0x4b -; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 0 ; GCN-NEXT: s_cmpk_lg_i32 s4, 0x4c -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 1 ; GCN-NEXT: s_cmpk_lg_i32 s4, 0x4d -; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 0 ; GCN-NEXT: s_cmpk_lg_i32 s4, 0x4e -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, 
s2, 1 ; GCN-NEXT: s_cmpk_lg_i32 s4, 0x4f -; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 0 ; GCN-NEXT: s_cmpk_lg_i32 s4, 0x50 -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 1 ; GCN-NEXT: s_cmpk_lg_i32 s4, 0x51 -; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 0 ; GCN-NEXT: s_cmpk_lg_i32 s4, 0x52 -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 1 ; GCN-NEXT: s_cmpk_lg_i32 s4, 0x53 -; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 0 ; GCN-NEXT: s_cmpk_lg_i32 s4, 0x54 -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 1 ; GCN-NEXT: s_cmpk_lg_i32 s4, 0x55 -; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 0 ; GCN-NEXT: s_cmpk_lg_i32 s4, 0x56 -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 1 ; GCN-NEXT: s_cmpk_lg_i32 s4, 0x57 -; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 0 ; GCN-NEXT: s_cmpk_lg_i32 s4, 0x58 -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 1 ; GCN-NEXT: s_cmpk_lg_i32 s4, 0x59 -; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 0 ; GCN-NEXT: s_cmpk_lg_i32 s4, 0x5a -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 1 ; GCN-NEXT: s_cmpk_lg_i32 s4, 0x5b -; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 0 ; GCN-NEXT: s_cmpk_lg_i32 s4, 0x5c -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 1 ; GCN-NEXT: s_cmpk_lg_i32 s4, 0x5d -; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 0 ; GCN-NEXT: s_cmpk_lg_i32 s4, 0x5e -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 1 ; GCN-NEXT: s_cmpk_lg_i32 s4, 0x5f -; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 0 ; GCN-NEXT: s_cmpk_lg_i32 s4, 0x60 -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 1 ; GCN-NEXT: s_cmpk_lg_i32 s4, 0x61 -; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 0 ; GCN-NEXT: s_cmpk_lg_i32 s4, 0x62 -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 1 ; GCN-NEXT: s_cmpk_lg_i32 s4, 0x63 -; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 0 ; GCN-NEXT: s_cmpk_lg_i32 s4, 0x64 -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 1 ; GCN-NEXT: s_cmpk_lg_i32 s4, 0x65 -; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 
0 +; GCN-NEXT: s_cselect_b32 s2, s2, 0 ; GCN-NEXT: s_cmpk_lg_i32 s4, 0x66 -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 1 ; GCN-NEXT: s_cmpk_lg_i32 s4, 0x67 -; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 0 ; GCN-NEXT: s_cmpk_lg_i32 s4, 0x68 -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 1 ; GCN-NEXT: s_cmpk_lg_i32 s4, 0x69 -; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 0 ; GCN-NEXT: s_cmpk_lg_i32 s4, 0x6a -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 1 ; GCN-NEXT: s_cmpk_lg_i32 s4, 0x6b -; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 0 ; GCN-NEXT: s_cmpk_lg_i32 s4, 0x6c -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 1 ; GCN-NEXT: s_cmpk_lg_i32 s4, 0x6d -; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 0 ; GCN-NEXT: s_cmpk_lg_i32 s4, 0x6e -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 1 ; GCN-NEXT: s_cmpk_lg_i32 s4, 0x6f -; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 0 ; GCN-NEXT: s_cmpk_lg_i32 s4, 0x70 -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 1 ; GCN-NEXT: s_cmpk_lg_i32 s4, 0x71 -; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 0 ; GCN-NEXT: s_cmpk_lg_i32 s4, 0x72 -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 1 ; GCN-NEXT: s_cmpk_lg_i32 s4, 0x73 -; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 0 ; GCN-NEXT: s_cmpk_lg_i32 s4, 0x74 -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 1 ; GCN-NEXT: s_cmpk_lg_i32 s4, 0x75 -; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 0 ; GCN-NEXT: s_cmpk_lg_i32 s4, 0x76 -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 1 ; GCN-NEXT: s_cmpk_lg_i32 s4, 0x77 -; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 0 ; GCN-NEXT: s_cmpk_lg_i32 s4, 0x78 -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 1 ; GCN-NEXT: s_cmpk_lg_i32 s4, 0x79 -; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 0 ; GCN-NEXT: s_cmpk_lg_i32 s4, 0x7a -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 1 ; GCN-NEXT: s_cmpk_lg_i32 s4, 0x7b -; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 0 ; GCN-NEXT: s_cmpk_lg_i32 s4, 0x7c -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; 
GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 1 ; GCN-NEXT: s_cmpk_lg_i32 s4, 0x7d -; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 0 ; GCN-NEXT: s_cmpk_lg_i32 s4, 0x7e -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 1 ; GCN-NEXT: s_cmpk_lg_i32 s4, 0x7f -; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: v_and_b32_e32 v2, 1, v0 +; GCN-NEXT: s_cselect_b32 s2, s2, 0 +; GCN-NEXT: s_and_b32 s2, s2, 1 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: flat_store_dword v[0:1], v2 ; GCN-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll index e831da036acc7c..a7df29dbf7415c 100644 --- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll @@ -55,16 +55,17 @@ define amdgpu_kernel void @extract_vector_elt_v2i8(ptr addrspace(1) %out, <2 x i ; VI-NEXT: s_load_dword s2, s[6:7], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: s_lshr_b32 s3, s2, 8 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_lshrrev_b16_e64 v3, 8, s2 -; VI-NEXT: s_add_u32 s2, s0, 1 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: flat_store_byte v[0:1], v3 +; VI-NEXT: s_add_u32 s0, s0, 1 +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: flat_store_byte v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_byte v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_endpgm @@ -371,10 +372,11 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v2i8(ptr addrspace(1) %out ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b32 s2, s2, 3 -; VI-NEXT: v_mov_b32_e32 v0, s3 -; VI-NEXT: v_lshrrev_b16_e32 v2, s2, v0 +; VI-NEXT: s_and_b32 s3, s3, 0xffff +; VI-NEXT: s_lshr_b32 s2, s3, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_byte v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll index fe672f1b3b1313..e6f9889440f0cd 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll @@ -369,11 +369,11 @@ define amdgpu_kernel void @s_test_copysign_f16_0_mag(ptr addrspace(1) %out, half ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s4, s[2:3], 0x2c ; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: v_mov_b32_e32 v0, 0xffff8000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v2, s4, v0 +; VI-NEXT: s_and_b32 s2, s4, 0x8000 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -381,10 +381,10 @@ define amdgpu_kernel void @s_test_copysign_f16_0_mag(ptr addrspace(1) %out, half ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v1, 
0xffff8000 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX9-NEXT: s_and_b32 s2, s4, 0x8000 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -393,9 +393,10 @@ define amdgpu_kernel void @s_test_copysign_f16_0_mag(ptr addrspace(1) %out, half ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e64 v1, 0xffff8000, s4 +; GFX11-NEXT: s_and_b32 s2, s4, 0x8000 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -426,12 +427,12 @@ define amdgpu_kernel void @s_test_copysign_f16_1_mag(ptr addrspace(1) %out, half ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s4, s[2:3], 0x2c ; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: v_mov_b32_e32 v0, 0xffff8000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v0, s4, v0 -; VI-NEXT: v_or_b32_e32 v2, 0x3c00, v0 +; VI-NEXT: s_and_b32 s2, s4, 0x8000 +; VI-NEXT: s_or_b32 s2, s2, 0x3c00 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -439,11 +440,11 @@ define amdgpu_kernel void @s_test_copysign_f16_1_mag(ptr addrspace(1) %out, half ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff8000 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX9-NEXT: v_or_b32_e32 v1, 0x3c00, v1 +; GFX9-NEXT: s_and_b32 s2, s4, 0x8000 +; GFX9-NEXT: s_or_b32 s2, s2, 0x3c00 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -452,12 +453,12 @@ define amdgpu_kernel void @s_test_copysign_f16_1_mag(ptr addrspace(1) %out, half ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e64 v0, 0xffff8000, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_or_b32_e32 v0, 0x3c00, v0 -; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX11-NEXT: s_and_b32 s2, s4, 0x8000 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s2, s2, 0x3c00 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -487,12 +488,12 @@ define amdgpu_kernel void @s_test_copysign_f16_10_mag(ptr addrspace(1) %out, hal ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s4, s[2:3], 0x2c ; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: v_mov_b32_e32 v0, 0xffff8000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v0, s4, v0 -; VI-NEXT: v_or_b32_e32 v2, 0x4900, v0 +; VI-NEXT: s_and_b32 s2, s4, 0x8000 +; VI-NEXT: s_or_b32 s2, s2, 0x4900 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -500,11 +501,11 @@ define amdgpu_kernel void @s_test_copysign_f16_10_mag(ptr 
addrspace(1) %out, hal ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff8000 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX9-NEXT: v_or_b32_e32 v1, 0x4900, v1 +; GFX9-NEXT: s_and_b32 s2, s4, 0x8000 +; GFX9-NEXT: s_or_b32 s2, s2, 0x4900 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -513,12 +514,12 @@ define amdgpu_kernel void @s_test_copysign_f16_10_mag(ptr addrspace(1) %out, hal ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e64 v0, 0xffff8000, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_or_b32_e32 v0, 0x4900, v0 -; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX11-NEXT: s_and_b32 s2, s4, 0x8000 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s2, s2, 0x4900 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -547,12 +548,12 @@ define amdgpu_kernel void @s_test_copysign_f16_neg1_mag(ptr addrspace(1) %out, h ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s4, s[2:3], 0x2c ; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: v_mov_b32_e32 v0, 0xffff8000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v0, s4, v0 -; VI-NEXT: v_or_b32_e32 v2, 0x3c00, v0 +; VI-NEXT: s_and_b32 s2, s4, 0x8000 +; VI-NEXT: s_or_b32 s2, s2, 0x3c00 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -560,11 +561,11 @@ define amdgpu_kernel void @s_test_copysign_f16_neg1_mag(ptr addrspace(1) %out, h ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff8000 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX9-NEXT: v_or_b32_e32 v1, 0x3c00, v1 +; GFX9-NEXT: s_and_b32 s2, s4, 0x8000 +; GFX9-NEXT: s_or_b32 s2, s2, 0x3c00 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -573,12 +574,12 @@ define amdgpu_kernel void @s_test_copysign_f16_neg1_mag(ptr addrspace(1) %out, h ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e64 v0, 0xffff8000, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_or_b32_e32 v0, 0x3c00, v0 -; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX11-NEXT: s_and_b32 s2, s4, 0x8000 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s2, s2, 0x3c00 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -608,12 +609,12 @@ define amdgpu_kernel void @s_test_copysign_f16_neg10_mag(ptr addrspace(1) %out, ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s4, s[2:3], 0x2c ; VI-NEXT: s_load_dwordx2 
s[0:1], s[2:3], 0x24 -; VI-NEXT: v_mov_b32_e32 v0, 0xffff8000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v0, s4, v0 -; VI-NEXT: v_or_b32_e32 v2, 0x4900, v0 +; VI-NEXT: s_and_b32 s2, s4, 0x8000 +; VI-NEXT: s_or_b32 s2, s2, 0x4900 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -621,11 +622,11 @@ define amdgpu_kernel void @s_test_copysign_f16_neg10_mag(ptr addrspace(1) %out, ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff8000 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX9-NEXT: v_or_b32_e32 v1, 0x4900, v1 +; GFX9-NEXT: s_and_b32 s2, s4, 0x8000 +; GFX9-NEXT: s_or_b32 s2, s2, 0x4900 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -634,12 +635,12 @@ define amdgpu_kernel void @s_test_copysign_f16_neg10_mag(ptr addrspace(1) %out, ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e64 v0, 0xffff8000, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_or_b32_e32 v0, 0x4900, v0 -; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX11-NEXT: s_and_b32 s2, s4, 0x8000 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s2, s2, 0x4900 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll b/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll index 932b10f14780b1..4f3086a9eb1f9a 100644 --- a/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll @@ -6,34 +6,6 @@ ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GISEL %s define double @v_sqrt_f64(double %x) { -; SDAG-LABEL: v_sqrt_f64: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: s_mov_b32 s4, 0 -; SDAG-NEXT: s_brev_b32 s5, 8 -; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 -; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] -; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] -; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 -; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 -; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] -; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] -; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] -; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] -; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] -; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] -; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80 -; SDAG-NEXT: v_mov_b32_e32 v5, 0x260 -; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc -; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 -; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 -; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; SDAG-NEXT: s_setpc_b64 s[30:31] -; ; GISEL-LABEL: v_sqrt_f64: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ 
-66,34 +38,6 @@ define double @v_sqrt_f64(double %x) { } define double @v_sqrt_f64_fneg(double %x) { -; SDAG-LABEL: v_sqrt_f64_fneg: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: s_mov_b32 s4, 0 -; SDAG-NEXT: s_brev_b32 s5, 9 -; SDAG-NEXT: v_cmp_lt_f64_e32 vcc, s[4:5], v[0:1] -; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SDAG-NEXT: v_ldexp_f64 v[0:1], -v[0:1], v2 -; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] -; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] -; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 -; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 -; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] -; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] -; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] -; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] -; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] -; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] -; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80 -; SDAG-NEXT: v_mov_b32_e32 v5, 0x260 -; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc -; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 -; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 -; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; SDAG-NEXT: s_setpc_b64 s[30:31] -; ; GISEL-LABEL: v_sqrt_f64_fneg: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -127,34 +71,6 @@ define double @v_sqrt_f64_fneg(double %x) { } define double @v_sqrt_f64_fabs(double %x) { -; SDAG-LABEL: v_sqrt_f64_fabs: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: s_mov_b32 s4, 0 -; SDAG-NEXT: s_brev_b32 s5, 8 -; SDAG-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SDAG-NEXT: v_ldexp_f64 v[0:1], |v[0:1]|, v2 -; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] -; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] -; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 -; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 -; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] -; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] -; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] -; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] -; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] -; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] -; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80 -; SDAG-NEXT: v_mov_b32_e32 v5, 0x260 -; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc -; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 -; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 -; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; SDAG-NEXT: s_setpc_b64 s[30:31] -; ; GISEL-LABEL: v_sqrt_f64_fabs: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -188,34 +104,6 @@ define double @v_sqrt_f64_fabs(double %x) { } define double @v_sqrt_f64_fneg_fabs(double %x) { -; SDAG-LABEL: v_sqrt_f64_fneg_fabs: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: s_mov_b32 s4, 0 -; SDAG-NEXT: s_brev_b32 s5, 9 -; SDAG-NEXT: v_cmp_gt_f64_e64 vcc, |v[0:1]|, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SDAG-NEXT: v_ldexp_f64 v[0:1], -|v[0:1]|, v2 -; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] -; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] -; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 -; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 -; SDAG-NEXT: 
v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] -; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] -; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] -; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] -; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] -; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] -; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80 -; SDAG-NEXT: v_mov_b32_e32 v5, 0x260 -; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc -; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 -; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 -; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; SDAG-NEXT: s_setpc_b64 s[30:31] -; ; GISEL-LABEL: v_sqrt_f64_fneg_fabs: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -250,34 +138,6 @@ define double @v_sqrt_f64_fneg_fabs(double %x) { } define double @v_sqrt_f64_ninf(double %x) { -; SDAG-LABEL: v_sqrt_f64_ninf: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: s_mov_b32 s4, 0 -; SDAG-NEXT: s_brev_b32 s5, 8 -; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 -; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] -; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] -; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 -; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 -; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] -; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] -; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] -; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] -; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] -; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] -; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80 -; SDAG-NEXT: v_mov_b32_e32 v5, 0x260 -; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc -; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 -; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 -; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; SDAG-NEXT: s_setpc_b64 s[30:31] -; ; GISEL-LABEL: v_sqrt_f64_ninf: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -310,34 +170,6 @@ define double @v_sqrt_f64_ninf(double %x) { } define double @v_sqrt_f64_no_infs_attribute(double %x) "no-infs-fp-math"="true" { -; SDAG-LABEL: v_sqrt_f64_no_infs_attribute: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: s_mov_b32 s4, 0 -; SDAG-NEXT: s_brev_b32 s5, 8 -; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 -; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] -; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] -; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 -; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 -; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] -; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] -; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] -; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] -; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] -; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] -; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80 -; SDAG-NEXT: v_mov_b32_e32 v5, 0x260 -; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc -; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 -; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 -; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; SDAG-NEXT: 
v_cndmask_b32_e32 v1, v3, v1, vcc -; SDAG-NEXT: s_setpc_b64 s[30:31] -; ; GISEL-LABEL: v_sqrt_f64_no_infs_attribute: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -370,34 +202,6 @@ define double @v_sqrt_f64_no_infs_attribute(double %x) "no-infs-fp-math"="true" } define double @v_sqrt_f64_nnan(double %x) { -; SDAG-LABEL: v_sqrt_f64_nnan: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: s_mov_b32 s4, 0 -; SDAG-NEXT: s_brev_b32 s5, 8 -; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 -; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] -; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] -; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 -; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 -; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] -; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] -; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] -; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] -; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] -; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] -; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80 -; SDAG-NEXT: v_mov_b32_e32 v5, 0x260 -; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc -; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 -; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 -; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; SDAG-NEXT: s_setpc_b64 s[30:31] -; ; GISEL-LABEL: v_sqrt_f64_nnan: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -706,34 +510,6 @@ define amdgpu_ps <2 x i32> @s_sqrt_f64_afn_nnan_ninf(double inreg %x) { } define double @v_sqrt_f64_nsz(double %x) { -; SDAG-LABEL: v_sqrt_f64_nsz: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: s_mov_b32 s4, 0 -; SDAG-NEXT: s_brev_b32 s5, 8 -; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 -; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] -; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] -; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 -; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 -; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] -; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] -; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] -; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] -; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] -; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] -; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80 -; SDAG-NEXT: v_mov_b32_e32 v5, 0x260 -; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc -; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 -; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 -; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; SDAG-NEXT: s_setpc_b64 s[30:31] -; ; GISEL-LABEL: v_sqrt_f64_nsz: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -766,34 +542,6 @@ define double @v_sqrt_f64_nsz(double %x) { } define double @v_sqrt_f64_nnan_ninf(double %x) { -; SDAG-LABEL: v_sqrt_f64_nnan_ninf: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: s_mov_b32 s4, 0 -; SDAG-NEXT: s_brev_b32 s5, 8 -; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 
-; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 -; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] -; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] -; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 -; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 -; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] -; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] -; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] -; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] -; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] -; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] -; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80 -; SDAG-NEXT: v_mov_b32_e32 v5, 0x260 -; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc -; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 -; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 -; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; SDAG-NEXT: s_setpc_b64 s[30:31] -; ; GISEL-LABEL: v_sqrt_f64_nnan_ninf: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -826,34 +574,6 @@ define double @v_sqrt_f64_nnan_ninf(double %x) { } define double @v_sqrt_f64_nnan_ninf_nsz(double %x) { -; SDAG-LABEL: v_sqrt_f64_nnan_ninf_nsz: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: s_mov_b32 s4, 0 -; SDAG-NEXT: s_brev_b32 s5, 8 -; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 -; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] -; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] -; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 -; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 -; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] -; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] -; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] -; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] -; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] -; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] -; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80 -; SDAG-NEXT: v_mov_b32_e32 v5, 0x260 -; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc -; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 -; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 -; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; SDAG-NEXT: s_setpc_b64 s[30:31] -; ; GISEL-LABEL: v_sqrt_f64_nnan_ninf_nsz: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -886,34 +606,6 @@ define double @v_sqrt_f64_nnan_ninf_nsz(double %x) { } define double @v_sqrt_f64_afn(double %x) { -; SDAG-LABEL: v_sqrt_f64_afn: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: s_mov_b32 s4, 0 -; SDAG-NEXT: s_brev_b32 s5, 8 -; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 -; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] -; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] -; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 -; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 -; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] -; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] -; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] -; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] -; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] -; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] -; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80 -; SDAG-NEXT: 
v_mov_b32_e32 v5, 0x260 -; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc -; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 -; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 -; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; SDAG-NEXT: s_setpc_b64 s[30:31] -; ; GISEL-LABEL: v_sqrt_f64_afn: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -946,34 +638,6 @@ define double @v_sqrt_f64_afn(double %x) { } define double @v_sqrt_f64_afn_nsz(double %x) { -; SDAG-LABEL: v_sqrt_f64_afn_nsz: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: s_mov_b32 s4, 0 -; SDAG-NEXT: s_brev_b32 s5, 8 -; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 -; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] -; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] -; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 -; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 -; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] -; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] -; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] -; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] -; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] -; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] -; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80 -; SDAG-NEXT: v_mov_b32_e32 v5, 0x260 -; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc -; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 -; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 -; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; SDAG-NEXT: s_setpc_b64 s[30:31] -; ; GISEL-LABEL: v_sqrt_f64_afn_nsz: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1106,34 +770,6 @@ define <2 x double> @v_sqrt_v2f64_afn(<2 x double> %x) { } define double @v_sqrt_f64_afn_nnan(double %x) { -; SDAG-LABEL: v_sqrt_f64_afn_nnan: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: s_mov_b32 s4, 0 -; SDAG-NEXT: s_brev_b32 s5, 8 -; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 -; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] -; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] -; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 -; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 -; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] -; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] -; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] -; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] -; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] -; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] -; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80 -; SDAG-NEXT: v_mov_b32_e32 v5, 0x260 -; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc -; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 -; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 -; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; SDAG-NEXT: s_setpc_b64 s[30:31] -; ; GISEL-LABEL: v_sqrt_f64_afn_nnan: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1166,34 +802,6 @@ define double @v_sqrt_f64_afn_nnan(double %x) { } define double @v_sqrt_f64_fabs_afn_ninf(double %x) { -; SDAG-LABEL: v_sqrt_f64_fabs_afn_ninf: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
-; SDAG-NEXT: s_mov_b32 s4, 0 -; SDAG-NEXT: s_brev_b32 s5, 8 -; SDAG-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SDAG-NEXT: v_ldexp_f64 v[0:1], |v[0:1]|, v2 -; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] -; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] -; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 -; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 -; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] -; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] -; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] -; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] -; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] -; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] -; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80 -; SDAG-NEXT: v_mov_b32_e32 v5, 0x260 -; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc -; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 -; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 -; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; SDAG-NEXT: s_setpc_b64 s[30:31] -; ; GISEL-LABEL: v_sqrt_f64_fabs_afn_ninf: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1227,34 +835,6 @@ define double @v_sqrt_f64_fabs_afn_ninf(double %x) { } define double @v_sqrt_f64_afn_nnan_ninf(double %x) { -; SDAG-LABEL: v_sqrt_f64_afn_nnan_ninf: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: s_mov_b32 s4, 0 -; SDAG-NEXT: s_brev_b32 s5, 8 -; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 -; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] -; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] -; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 -; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 -; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] -; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] -; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] -; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] -; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] -; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] -; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80 -; SDAG-NEXT: v_mov_b32_e32 v5, 0x260 -; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc -; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 -; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 -; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; SDAG-NEXT: s_setpc_b64 s[30:31] -; ; GISEL-LABEL: v_sqrt_f64_afn_nnan_ninf: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1387,34 +967,6 @@ define <2 x double> @v_sqrt_v2f64_afn_nnan_ninf(<2 x double> %x) { } define double @v_sqrt_f64_afn_nnan_ninf_nsz(double %x) { -; SDAG-LABEL: v_sqrt_f64_afn_nnan_ninf_nsz: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: s_mov_b32 s4, 0 -; SDAG-NEXT: s_brev_b32 s5, 8 -; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 -; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] -; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] -; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 -; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 -; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] -; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] -; SDAG-NEXT: v_fma_f64 v[6:7], 
-v[4:5], v[4:5], v[0:1] -; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] -; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] -; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] -; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80 -; SDAG-NEXT: v_mov_b32_e32 v5, 0x260 -; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc -; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 -; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 -; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; SDAG-NEXT: s_setpc_b64 s[30:31] -; ; GISEL-LABEL: v_sqrt_f64_afn_nnan_ninf_nsz: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1447,34 +999,6 @@ define double @v_sqrt_f64_afn_nnan_ninf_nsz(double %x) { } define double @v_sqrt_f64__approx_func_fp_math(double %x) #2 { -; SDAG-LABEL: v_sqrt_f64__approx_func_fp_math: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: s_mov_b32 s4, 0 -; SDAG-NEXT: s_brev_b32 s5, 8 -; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 -; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] -; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] -; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 -; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 -; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] -; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] -; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] -; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] -; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] -; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] -; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80 -; SDAG-NEXT: v_mov_b32_e32 v5, 0x260 -; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc -; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 -; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 -; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; SDAG-NEXT: s_setpc_b64 s[30:31] -; ; GISEL-LABEL: v_sqrt_f64__approx_func_fp_math: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1507,34 +1031,6 @@ define double @v_sqrt_f64__approx_func_fp_math(double %x) #2 { } define double @v_sqrt_f64__enough_unsafe_attrs(double %x) #3 { -; SDAG-LABEL: v_sqrt_f64__enough_unsafe_attrs: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: s_mov_b32 s4, 0 -; SDAG-NEXT: s_brev_b32 s5, 8 -; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 -; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] -; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] -; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 -; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 -; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] -; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] -; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] -; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] -; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] -; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] -; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80 -; SDAG-NEXT: v_mov_b32_e32 v5, 0x260 -; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc -; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 -; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 -; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; SDAG-NEXT: s_setpc_b64 
s[30:31] -; ; GISEL-LABEL: v_sqrt_f64__enough_unsafe_attrs: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1567,34 +1063,6 @@ define double @v_sqrt_f64__enough_unsafe_attrs(double %x) #3 { } define double @v_sqrt_f64__unsafe_attr(double %x) #4 { -; SDAG-LABEL: v_sqrt_f64__unsafe_attr: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: s_mov_b32 s4, 0 -; SDAG-NEXT: s_brev_b32 s5, 8 -; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 -; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] -; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] -; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 -; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 -; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] -; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] -; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] -; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] -; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] -; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] -; SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80 -; SDAG-NEXT: v_mov_b32_e32 v5, 0x260 -; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc -; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 -; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 -; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; SDAG-NEXT: s_setpc_b64 s[30:31] -; ; GISEL-LABEL: v_sqrt_f64__unsafe_attr: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll index 645e48f1bb1ab0..545a9af3f9a0bd 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll @@ -2990,13 +2990,14 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8() #0 { ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_load_ushort v0, v[0:1], off ; GFX9-NEXT: v_writelane_b32 v40, s34, 2 +; GFX9-NEXT: v_mov_b32_e32 v1, 8 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v2i8@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v2i8@abs32@lo ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b16_e32 v1, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 @@ -3025,10 +3026,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8() #0 { ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v2i8@abs32@lo ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: global_load_ushort v0, v[0:1], off +; GFX10-NEXT: v_mov_b32_e32 v1, 8 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b16 v1, 8, v0 +; GFX10-NEXT: v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 @@ -3060,9 +3062,10 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8() #0 { ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; 
GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b16 v1, 8, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 @@ -3090,10 +3093,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8() #0 { ; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v2i8@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: global_load_ushort v0, v[0:1], off +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 8 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: v_lshrrev_b16 v1, 8, v0 +; GFX10-SCRATCH-NEXT: v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 @@ -4152,13 +4156,14 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 { ; GFX9-NEXT: v_mov_b32_e32 v41, 0 ; GFX9-NEXT: global_load_ushort v0, v[40:41], off ; GFX9-NEXT: v_writelane_b32 v42, s34, 2 +; GFX9-NEXT: v_mov_b32_e32 v1, 8 ; GFX9-NEXT: v_writelane_b32 v42, s30, 0 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v2i8_ret@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v2i8_ret@abs32@lo ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v42, s31, 1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b16_e32 v1, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -4191,14 +4196,15 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 { ; GFX10-NEXT: v_mov_b32_e32 v40, 0 ; GFX10-NEXT: v_mov_b32_e32 v41, 0 ; GFX10-NEXT: v_writelane_b32 v42, s34, 2 +; GFX10-NEXT: v_mov_b32_e32 v1, 8 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v2i8_ret@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v2i8_ret@abs32@lo -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: global_load_ushort v0, v[40:41], off ; GFX10-NEXT: v_writelane_b32 v42, s30, 0 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v42, s31, 1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b16 v1, 8, v0 +; GFX10-NEXT: v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1 ; GFX10-NEXT: v_readlane_b32 s31, v42, 1 @@ -4239,14 +4245,16 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 { ; GFX11-NEXT: v_writelane_b32 v42, s30, 0 ; GFX11-NEXT: v_writelane_b32 v42, s31, 1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b16 v1, 8, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 ; GFX11-NEXT: 
v_and_b32_e32 v0, 0xff, v0 ; GFX11-NEXT: v_readlane_b32 s31, v42, 1 ; GFX11-NEXT: v_readlane_b32 s30, v42, 0 ; GFX11-NEXT: v_readlane_b32 s0, v42, 2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11-NEXT: global_store_b16 v[40:41], v0, off ; GFX11-NEXT: s_clause 0x1 @@ -4274,14 +4282,15 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 { ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v40, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v41, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s0, 2 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 8 ; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v2i8_ret@abs32@hi ; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v2i8_ret@abs32@lo -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: global_load_ushort v0, v[40:41], off ; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s30, 0 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s31, 1 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: v_lshrrev_b16 v1, 8, v0 +; GFX10-SCRATCH-NEXT: v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_lshlrev_b16 v1, 8, v1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v42, 1 diff --git a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll index 81210d8f5d0ca3..5eb4d9b7a2beb7 100644 --- a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll +++ b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll @@ -638,92 +638,96 @@ define amdgpu_kernel void @udiv16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX9-LABEL: udiv16_invariant_denom: ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_load_dword s0, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_movk_i32 s2, 0x400 -; GFX9-NEXT: v_mov_b32_e32 v3, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s0, s0, 0xffff ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s2, 0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GFX9-NEXT: .LBB4_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_cvt_f32_u32_sdwa v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX9-NEXT: v_add_u16_e32 v2, 1, v2 -; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s2, v2 -; GFX9-NEXT: v_mul_f32_e32 v6, v4, v1 -; GFX9-NEXT: v_trunc_f32_e32 v6, v6 -; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v6 -; GFX9-NEXT: v_mad_f32 v4, -v6, v0, v4 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, v0 -; GFX9-NEXT: v_addc_co_u32_e64 v4, s[0:1], 0, v7, s[0:1] -; GFX9-NEXT: global_store_short v5, v4, s[4:5] -; GFX9-NEXT: s_cbranch_vccz .LBB4_1 +; GFX9-NEXT: s_and_b32 s3, 0xffff, s2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s3 +; GFX9-NEXT: s_add_i32 s2, s2, 1 +; GFX9-NEXT: s_lshl_b32 s3, s3, 1 +; GFX9-NEXT: s_and_b32 s4, s2, 0xffff +; GFX9-NEXT: v_mul_f32_e32 v4, v2, v1 +; GFX9-NEXT: v_trunc_f32_e32 v4, v4 +; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v4 +; GFX9-NEXT: v_mad_f32 v2, -v4, v0, v2 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: s_cmpk_eq_i32 s4, 0x400 +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v5, vcc +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_short v3, v2, s[0:1] +; GFX9-NEXT: s_cbranch_scc0 .LBB4_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: 
udiv16_invariant_denom: ; GFX10: ; %bb.0: ; %bb ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-NEXT: v_mov_b32_e32 v3, 1 +; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_and_b32 s0, s0, 0xffff -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GFX10-NEXT: s_and_b32 s2, s4, 0xffff +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX10-NEXT: s_mov_b32 s2, 0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GFX10-NEXT: .LBB4_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_cvt_f32_u32_sdwa v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_add_nc_u16 v2, v2, 1 -; GFX10-NEXT: v_mul_f32_e32 v6, v4, v1 -; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v2 -; GFX10-NEXT: v_trunc_f32_e32 v6, v6 -; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, vcc_lo -; GFX10-NEXT: v_mad_f32 v4, -v6, v0, v4 -; GFX10-NEXT: v_cvt_u32_f32_e32 v6, v6 -; GFX10-NEXT: v_cmp_ge_f32_e64 s0, |v4|, v0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, 0, v6, s0 -; GFX10-NEXT: global_store_short v5, v4, s[4:5] -; GFX10-NEXT: s_cbranch_vccz .LBB4_1 +; GFX10-NEXT: s_and_b32 s3, 0xffff, s2 +; GFX10-NEXT: s_add_i32 s2, s2, 1 +; GFX10-NEXT: v_cvt_f32_u32_e32 v2, s3 +; GFX10-NEXT: s_lshl_b32 s3, s3, 1 +; GFX10-NEXT: v_mov_b32_e32 v4, s3 +; GFX10-NEXT: s_and_b32 s3, s2, 0xffff +; GFX10-NEXT: v_mul_f32_e32 v3, v2, v1 +; GFX10-NEXT: s_cmpk_eq_i32 s3, 0x400 +; GFX10-NEXT: v_trunc_f32_e32 v3, v3 +; GFX10-NEXT: v_mad_f32 v2, -v3, v0, v2 +; GFX10-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v2|, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, 0, v3, vcc_lo +; GFX10-NEXT: global_store_short v4, v2, s[0:1] +; GFX10-NEXT: s_cbranch_scc0 .LBB4_1 ; GFX10-NEXT: ; %bb.2: ; %bb2 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: udiv16_invariant_denom: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s0, s0, 0xffff -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GFX11-NEXT: s_and_b32 s2, s4, 0xffff +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX11-NEXT: s_mov_b32 s2, 0 ; GFX11-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB4_1: ; %bb3 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v2 -; GFX11-NEXT: v_add_nc_u16 v2, v2, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cvt_f32_u32_e32 v4, v3 -; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 1, v3 +; GFX11-NEXT: s_and_b32 s3, 0xffff, s2 +; GFX11-NEXT: s_add_i32 s2, s2, 1 +; GFX11-NEXT: v_cvt_f32_u32_e32 v2, s3 +; GFX11-NEXT: s_lshl_b32 s3, s3, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v4, s3 +; GFX11-NEXT: s_and_b32 s3, s2, 0xffff ; GFX11-NEXT: s_waitcnt_depctr 
0xfff -; GFX11-NEXT: v_mul_f32_e32 v5, v4, v1 -; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_trunc_f32_e32 v5, v5 -; GFX11-NEXT: v_fma_f32 v4, -v5, v0, v4 -; GFX11-NEXT: v_cvt_u32_f32_e32 v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cmp_ge_f32_e64 s0, |v4|, v0 -; GFX11-NEXT: v_add_co_ci_u32_e64 v4, s0, 0, v5, s0 -; GFX11-NEXT: global_store_b16 v3, v4, s[2:3] -; GFX11-NEXT: s_cbranch_vccz .LBB4_1 +; GFX11-NEXT: v_mul_f32_e32 v3, v2, v1 +; GFX11-NEXT: s_cmpk_eq_i32 s3, 0x400 +; GFX11-NEXT: v_trunc_f32_e32 v3, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_fma_f32 v2, -v3, v0, v2 +; GFX11-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX11-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v2|, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, 0, v3, vcc_lo +; GFX11-NEXT: global_store_b16 v4, v2, s[0:1] +; GFX11-NEXT: s_cbranch_scc0 .LBB4_1 ; GFX11-NEXT: ; %bb.2: ; %bb2 ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -749,31 +753,32 @@ define amdgpu_kernel void @urem16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX9-LABEL: urem16_invariant_denom: ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_load_dword s0, s[2:3], 0x2c -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_mov_b32_e32 v3, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s4, s0, 0xffff ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_movk_i32 s2, 0x400 +; GFX9-NEXT: s_mov_b32 s2, 0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GFX9-NEXT: .LBB5_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_cvt_f32_u32_sdwa v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX9-NEXT: v_mul_f32_e32 v5, v4, v1 -; GFX9-NEXT: v_trunc_f32_e32 v5, v5 -; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v5 -; GFX9-NEXT: v_mad_f32 v4, -v5, v0, v4 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v6, vcc -; GFX9-NEXT: v_mul_lo_u32 v4, v4, s4 -; GFX9-NEXT: v_sub_u32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u16_e32 v2, 1, v2 -; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s2, v2 +; GFX9-NEXT: s_and_b32 s3, 0xffff, s2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s3 +; GFX9-NEXT: s_add_i32 s2, s2, 1 +; GFX9-NEXT: s_lshl_b32 s5, s3, 1 +; GFX9-NEXT: s_and_b32 s6, s2, 0xffff +; GFX9-NEXT: v_mul_f32_e32 v3, v2, v1 +; GFX9-NEXT: v_trunc_f32_e32 v3, v3 +; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v3 +; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v4, vcc +; GFX9-NEXT: v_mul_lo_u32 v2, v2, s4 +; GFX9-NEXT: s_cmpk_eq_i32 s6, 0x400 +; GFX9-NEXT: v_sub_u32_e32 v2, s3, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_short v5, v4, s[0:1] -; GFX9-NEXT: s_cbranch_vccz .LBB5_1 +; GFX9-NEXT: global_store_short v3, v2, s[0:1] +; GFX9-NEXT: s_cbranch_scc0 .LBB5_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 ; GFX9-NEXT: s_endpgm ; @@ -782,28 +787,30 @@ define amdgpu_kernel void @urem16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX10-NEXT: s_load_dwordx2 s[0:1], 
s[2:3], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-NEXT: v_mov_b32_e32 v3, 1 +; GFX10-NEXT: s_mov_b32 s3, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_and_b32 s2, s4, 0xffff ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GFX10-NEXT: .LBB5_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_cvt_f32_u32_sdwa v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX10-NEXT: v_mul_f32_e32 v5, v4, v1 -; GFX10-NEXT: v_trunc_f32_e32 v5, v5 -; GFX10-NEXT: v_mad_f32 v4, -v5, v0, v4 -; GFX10-NEXT: v_cvt_u32_f32_e32 v5, v5 -; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v4|, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v5, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_mul_lo_u32 v4, v4, s2 -; GFX10-NEXT: v_sub_nc_u32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: v_add_nc_u16 v2, v2, 1 -; GFX10-NEXT: global_store_short v5, v4, s[0:1] -; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v2 -; GFX10-NEXT: s_cbranch_vccz .LBB5_1 +; GFX10-NEXT: s_and_b32 s4, 0xffff, s3 +; GFX10-NEXT: s_add_i32 s3, s3, 1 +; GFX10-NEXT: v_cvt_f32_u32_e32 v2, s4 +; GFX10-NEXT: s_lshl_b32 s5, s4, 1 +; GFX10-NEXT: v_mul_f32_e32 v3, v2, v1 +; GFX10-NEXT: v_trunc_f32_e32 v3, v3 +; GFX10-NEXT: v_mad_f32 v2, -v3, v0, v2 +; GFX10-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v2|, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, 0, v3, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v3, s5 +; GFX10-NEXT: v_mul_lo_u32 v2, v2, s2 +; GFX10-NEXT: v_sub_nc_u32_e32 v2, s4, v2 +; GFX10-NEXT: s_and_b32 s4, s3, 0xffff +; GFX10-NEXT: s_cmpk_eq_i32 s4, 0x400 +; GFX10-NEXT: global_store_short v3, v2, s[0:1] +; GFX10-NEXT: s_cbranch_scc0 .LBB5_1 ; GFX10-NEXT: ; %bb.2: ; %bb2 ; GFX10-NEXT: s_endpgm ; @@ -812,7 +819,7 @@ define amdgpu_kernel void @urem16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_mov_b32 s3, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s2, s4, 0xffff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -821,26 +828,28 @@ define amdgpu_kernel void @urem16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB5_1: ; %bb3 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v2 -; GFX11-NEXT: v_add_nc_u16 v2, v2, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cvt_f32_u32_e32 v4, v3 +; GFX11-NEXT: s_and_b32 s4, 0xffff, s3 +; GFX11-NEXT: s_add_i32 s3, s3, 1 +; GFX11-NEXT: v_cvt_f32_u32_e32 v2, s4 +; GFX11-NEXT: s_lshl_b32 s5, s4, 1 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v5, v4, v1 -; GFX11-NEXT: v_trunc_f32_e32 v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_fma_f32 v4, -v5, v0, v4 -; GFX11-NEXT: v_cvt_u32_f32_e32 v5, v5 -; GFX11-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v4|, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v5, vcc_lo -; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 1, v3 -; GFX11-NEXT: v_mul_lo_u32 v4, v4, s2 -; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_sub_nc_u32_e32 v3, v3, v4 -; GFX11-NEXT: global_store_b16 v5, v3, s[0:1] -; GFX11-NEXT: s_cbranch_vccz .LBB5_1 +; GFX11-NEXT: v_mul_f32_e32 v3, v2, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_trunc_f32_e32 v3, v3 +; GFX11-NEXT: v_fma_f32 v2, -v3, v0, v2 +; GFX11-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v2|, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, 0, v3, vcc_lo +; GFX11-NEXT: v_mov_b32_e32 v3, s5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_lo_u32 v2, v2, s2 +; GFX11-NEXT: v_sub_nc_u32_e32 v2, s4, v2 +; GFX11-NEXT: s_and_b32 s4, s3, 0xffff +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_cmpk_eq_i32 s4, 0x400 +; GFX11-NEXT: global_store_b16 v3, v2, s[0:1] +; GFX11-NEXT: s_cbranch_scc0 .LBB5_1 ; GFX11-NEXT: ; %bb.2: ; %bb2 ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -870,33 +879,32 @@ define amdgpu_kernel void @sdiv16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX9-NEXT: s_sext_i32_i16 s4, s0 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s4 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0 -; GFX9-NEXT: s_movk_i32 s2, 0x400 +; GFX9-NEXT: s_mov_b32 s2, 0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GFX9-NEXT: .LBB6_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_sext_i32_i16 s5, s3 -; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s5 -; GFX9-NEXT: s_xor_b32 s6, s5, s4 -; GFX9-NEXT: s_ashr_i32 s5, s6, 30 -; GFX9-NEXT: s_or_b32 s5, s5, 1 -; GFX9-NEXT: v_mul_f32_e32 v4, v3, v1 -; GFX9-NEXT: v_trunc_f32_e32 v4, v4 -; GFX9-NEXT: v_mad_f32 v3, -v4, v0, v3 -; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[6:7], |v3|, |v0| +; GFX9-NEXT: s_sext_i32_i16 s3, s2 +; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s3 +; GFX9-NEXT: s_xor_b32 s3, s3, s4 +; GFX9-NEXT: s_ashr_i32 s3, s3, 30 +; GFX9-NEXT: s_or_b32 s3, s3, 1 +; GFX9-NEXT: v_mul_f32_e32 v3, v2, v1 +; GFX9-NEXT: v_trunc_f32_e32 v3, v3 +; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2 +; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[6:7], |v2|, |v0| ; GFX9-NEXT: s_and_b64 s[6:7], s[6:7], exec -; GFX9-NEXT: v_add_u16_e64 v2, s3, 1 -; GFX9-NEXT: s_cselect_b32 s5, s5, 0 -; GFX9-NEXT: s_and_b32 s6, 0xffff, s3 -; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s2, v2 -; GFX9-NEXT: v_readfirstlane_b32 s3, v2 -; GFX9-NEXT: v_add_u32_e32 v2, s5, v4 -; GFX9-NEXT: s_lshl_b32 s5, s6, 1 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_and_b32 s5, 0xffff, s2 +; GFX9-NEXT: s_add_i32 s2, s2, 1 +; GFX9-NEXT: v_add_u32_e32 v2, s3, v3 +; GFX9-NEXT: s_lshl_b32 s3, s5, 1 +; GFX9-NEXT: s_and_b32 s5, s2, 0xffff +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: s_cmpk_eq_i32 s5, 0x400 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_short v3, v2, s[0:1] -; GFX9-NEXT: s_cbranch_vccz .LBB6_1 +; GFX9-NEXT: s_cbranch_scc0 .LBB6_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 ; GFX9-NEXT: s_endpgm ; @@ -913,26 +921,26 @@ define amdgpu_kernel void @sdiv16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX10-NEXT: .LBB6_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_sext_i32_i16 s4, s3 -; GFX10-NEXT: v_add_nc_u16 v2, s3, 1 -; GFX10-NEXT: v_cvt_f32_i32_e32 v3, s4 -; GFX10-NEXT: s_xor_b32 s5, s4, s2 -; GFX10-NEXT: s_ashr_i32 s4, s5, 30 
-; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v2 -; GFX10-NEXT: v_mul_f32_e32 v4, v3, v1 +; GFX10-NEXT: v_cvt_f32_i32_e32 v2, s4 +; GFX10-NEXT: s_xor_b32 s4, s4, s2 +; GFX10-NEXT: s_ashr_i32 s4, s4, 30 ; GFX10-NEXT: s_or_b32 s4, s4, 1 -; GFX10-NEXT: v_trunc_f32_e32 v4, v4 -; GFX10-NEXT: v_mad_f32 v3, -v4, v0, v3 -; GFX10-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GFX10-NEXT: v_cmp_ge_f32_e64 s5, |v3|, |v0| +; GFX10-NEXT: v_mul_f32_e32 v3, v2, v1 +; GFX10-NEXT: v_trunc_f32_e32 v3, v3 +; GFX10-NEXT: v_mad_f32 v2, -v3, v0, v2 +; GFX10-NEXT: v_cmp_ge_f32_e64 s5, |v2|, |v0| +; GFX10-NEXT: v_cvt_i32_f32_e32 v2, v3 ; GFX10-NEXT: s_and_b32 s5, s5, exec_lo ; GFX10-NEXT: s_cselect_b32 s4, s4, 0 ; GFX10-NEXT: s_and_b32 s5, 0xffff, s3 -; GFX10-NEXT: v_readfirstlane_b32 s3, v2 +; GFX10-NEXT: s_add_i32 s3, s3, 1 ; GFX10-NEXT: s_lshl_b32 s5, s5, 1 -; GFX10-NEXT: v_add_nc_u32_e32 v2, s4, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v2, s4, v2 ; GFX10-NEXT: v_mov_b32_e32 v3, s5 +; GFX10-NEXT: s_and_b32 s4, s3, 0xffff +; GFX10-NEXT: s_cmpk_eq_i32 s4, 0x400 ; GFX10-NEXT: global_store_short v3, v2, s[0:1] -; GFX10-NEXT: s_cbranch_vccz .LBB6_1 +; GFX10-NEXT: s_cbranch_scc0 .LBB6_1 ; GFX10-NEXT: ; %bb.2: ; %bb2 ; GFX10-NEXT: s_endpgm ; @@ -951,30 +959,32 @@ define amdgpu_kernel void @sdiv16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX11-NEXT: .LBB6_1: ; %bb3 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_sext_i32_i16 s4, s3 -; GFX11-NEXT: v_add_nc_u16 v2, s3, 1 -; GFX11-NEXT: v_cvt_f32_i32_e32 v3, s4 -; GFX11-NEXT: s_xor_b32 s5, s4, s2 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: s_ashr_i32 s4, s5, 30 -; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v2 -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v4, v3, v1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_cvt_f32_i32_e32 v2, s4 +; GFX11-NEXT: s_xor_b32 s4, s4, s2 +; GFX11-NEXT: s_ashr_i32 s4, s4, 30 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: s_or_b32 s4, s4, 1 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_f32_e32 v3, v2, v1 +; GFX11-NEXT: v_trunc_f32_e32 v3, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_trunc_f32_e32 v4, v4 -; GFX11-NEXT: v_fma_f32 v3, -v4, v0, v3 -; GFX11-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cmp_ge_f32_e64 s5, |v3|, |v0| +; GFX11-NEXT: v_fma_f32 v2, -v3, v0, v2 +; GFX11-NEXT: v_cmp_ge_f32_e64 s5, |v2|, |v0| +; GFX11-NEXT: v_cvt_i32_f32_e32 v2, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: s_and_b32 s5, s5, exec_lo ; GFX11-NEXT: s_cselect_b32 s4, s4, 0 ; GFX11-NEXT: s_and_b32 s5, 0xffff, s3 -; GFX11-NEXT: v_readfirstlane_b32 s3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, s4, v2 ; GFX11-NEXT: s_lshl_b32 s5, s5, 1 +; GFX11-NEXT: s_add_i32 s3, s3, 1 +; GFX11-NEXT: v_mov_b32_e32 v3, s5 +; GFX11-NEXT: s_and_b32 s4, s3, 0xffff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v3, s5 :: v_dual_add_nc_u32 v2, s4, v4 +; GFX11-NEXT: s_cmpk_eq_i32 s4, 0x400 ; GFX11-NEXT: global_store_b16 v3, v2, s[0:1] -; GFX11-NEXT: s_cbranch_vccz .LBB6_1 +; GFX11-NEXT: s_cbranch_scc0 .LBB6_1 ; GFX11-NEXT: ; %bb.2: ; %bb2 ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1004,36 +1014,34 @@ 
define amdgpu_kernel void @srem16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX9-NEXT: s_sext_i32_i16 s4, s0 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s4 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0 -; GFX9-NEXT: s_movk_i32 s2, 0x400 +; GFX9-NEXT: s_mov_b32 s2, 0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GFX9-NEXT: .LBB7_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_sext_i32_i16 s5, s3 -; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s5 -; GFX9-NEXT: s_xor_b32 s6, s5, s4 -; GFX9-NEXT: s_ashr_i32 s6, s6, 30 -; GFX9-NEXT: s_or_b32 s8, s6, 1 -; GFX9-NEXT: v_mul_f32_e32 v4, v3, v1 -; GFX9-NEXT: v_trunc_f32_e32 v4, v4 -; GFX9-NEXT: v_mad_f32 v3, -v4, v0, v3 -; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[6:7], |v3|, |v0| +; GFX9-NEXT: s_sext_i32_i16 s3, s2 +; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s3 +; GFX9-NEXT: s_xor_b32 s5, s3, s4 +; GFX9-NEXT: s_ashr_i32 s5, s5, 30 +; GFX9-NEXT: s_or_b32 s5, s5, 1 +; GFX9-NEXT: v_mul_f32_e32 v3, v2, v1 +; GFX9-NEXT: v_trunc_f32_e32 v3, v3 +; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2 +; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[6:7], |v2|, |v0| ; GFX9-NEXT: s_and_b64 s[6:7], s[6:7], exec -; GFX9-NEXT: v_add_u16_e64 v2, s3, 1 -; GFX9-NEXT: s_cselect_b32 s6, s8, 0 -; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s2, v2 -; GFX9-NEXT: s_and_b32 s7, 0xffff, s3 -; GFX9-NEXT: v_readfirstlane_b32 s3, v2 -; GFX9-NEXT: v_add_u32_e32 v2, s6, v4 +; GFX9-NEXT: s_cselect_b32 s5, s5, 0 +; GFX9-NEXT: v_add_u32_e32 v2, s5, v3 ; GFX9-NEXT: v_mul_lo_u32 v2, v2, s4 -; GFX9-NEXT: s_lshl_b32 s6, s7, 1 -; GFX9-NEXT: s_and_b64 vcc, exec, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s6 -; GFX9-NEXT: v_sub_u32_e32 v2, s5, v2 +; GFX9-NEXT: s_and_b32 s6, 0xffff, s2 +; GFX9-NEXT: s_add_i32 s2, s2, 1 +; GFX9-NEXT: s_lshl_b32 s5, s6, 1 +; GFX9-NEXT: s_and_b32 s6, s2, 0xffff +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: s_cmpk_eq_i32 s6, 0x400 +; GFX9-NEXT: v_sub_u32_e32 v2, s3, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_short v3, v2, s[0:1] -; GFX9-NEXT: s_cbranch_vccz .LBB7_1 +; GFX9-NEXT: s_cbranch_scc0 .LBB7_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 ; GFX9-NEXT: s_endpgm ; @@ -1050,29 +1058,28 @@ define amdgpu_kernel void @srem16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX10-NEXT: .LBB7_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_sext_i32_i16 s4, s3 -; GFX10-NEXT: v_add_nc_u16 v2, s3, 1 -; GFX10-NEXT: v_cvt_f32_i32_e32 v3, s4 +; GFX10-NEXT: v_cvt_f32_i32_e32 v2, s4 ; GFX10-NEXT: s_xor_b32 s5, s4, s2 ; GFX10-NEXT: s_ashr_i32 s5, s5, 30 -; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v2 -; GFX10-NEXT: v_mul_f32_e32 v4, v3, v1 ; GFX10-NEXT: s_or_b32 s5, s5, 1 -; GFX10-NEXT: v_trunc_f32_e32 v4, v4 -; GFX10-NEXT: v_mad_f32 v3, -v4, v0, v3 -; GFX10-NEXT: v_cmp_ge_f32_e64 s6, |v3|, |v0| -; GFX10-NEXT: v_cvt_i32_f32_e32 v3, v4 +; GFX10-NEXT: v_mul_f32_e32 v3, v2, v1 +; GFX10-NEXT: v_trunc_f32_e32 v3, v3 +; GFX10-NEXT: v_mad_f32 v2, -v3, v0, v2 +; GFX10-NEXT: v_cmp_ge_f32_e64 s6, |v2|, |v0| +; GFX10-NEXT: v_cvt_i32_f32_e32 v2, v3 ; GFX10-NEXT: s_and_b32 s6, s6, exec_lo ; GFX10-NEXT: s_cselect_b32 s5, s5, 0 -; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, vcc_lo -; GFX10-NEXT: v_add_nc_u32_e32 v3, s5, v3 +; GFX10-NEXT: v_add_nc_u32_e32 v2, s5, v2 ; GFX10-NEXT: s_and_b32 s5, 0xffff, s3 -; GFX10-NEXT: v_readfirstlane_b32 s3, v2 +; GFX10-NEXT: s_add_i32 s3, s3, 1 ; GFX10-NEXT: s_lshl_b32 s5, s5, 1 -; GFX10-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-NEXT: v_mul_lo_u32 v3, v3, s2 -; GFX10-NEXT: 
v_sub_nc_u32_e32 v3, s4, v3 -; GFX10-NEXT: global_store_short v2, v3, s[0:1] -; GFX10-NEXT: s_cbranch_vccz .LBB7_1 +; GFX10-NEXT: v_mul_lo_u32 v2, v2, s2 +; GFX10-NEXT: v_mov_b32_e32 v3, s5 +; GFX10-NEXT: v_sub_nc_u32_e32 v2, s4, v2 +; GFX10-NEXT: s_and_b32 s4, s3, 0xffff +; GFX10-NEXT: s_cmpk_eq_i32 s4, 0x400 +; GFX10-NEXT: global_store_short v3, v2, s[0:1] +; GFX10-NEXT: s_cbranch_scc0 .LBB7_1 ; GFX10-NEXT: ; %bb.2: ; %bb2 ; GFX10-NEXT: s_endpgm ; @@ -1091,35 +1098,36 @@ define amdgpu_kernel void @srem16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX11-NEXT: .LBB7_1: ; %bb3 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_sext_i32_i16 s4, s3 -; GFX11-NEXT: v_add_nc_u16 v2, s3, 1 -; GFX11-NEXT: v_cvt_f32_i32_e32 v3, s4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_cvt_f32_i32_e32 v2, s4 ; GFX11-NEXT: s_xor_b32 s5, s4, s2 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: s_ashr_i32 s5, s5, 30 -; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v2 -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v4, v3, v1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: s_or_b32 s5, s5, 1 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_f32_e32 v3, v2, v1 +; GFX11-NEXT: v_trunc_f32_e32 v3, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_trunc_f32_e32 v4, v4 -; GFX11-NEXT: v_fma_f32 v3, -v4, v0, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_ge_f32_e64 s6, |v3|, |v0| -; GFX11-NEXT: v_cvt_i32_f32_e32 v3, v4 +; GFX11-NEXT: v_fma_f32 v2, -v3, v0, v2 +; GFX11-NEXT: v_cmp_ge_f32_e64 s6, |v2|, |v0| +; GFX11-NEXT: v_cvt_i32_f32_e32 v2, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: s_and_b32 s6, s6, exec_lo ; GFX11-NEXT: s_cselect_b32 s5, s5, 0 -; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_add_nc_u32_e32 v3, s5, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_add_nc_u32_e32 v2, s5, v2 ; GFX11-NEXT: s_and_b32 s5, 0xffff, s3 -; GFX11-NEXT: v_readfirstlane_b32 s3, v2 +; GFX11-NEXT: s_add_i32 s3, s3, 1 ; GFX11-NEXT: s_lshl_b32 s5, s5, 1 -; GFX11-NEXT: v_mov_b32_e32 v2, s5 -; GFX11-NEXT: v_mul_lo_u32 v3, v3, s2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_sub_nc_u32_e32 v3, s4, v3 -; GFX11-NEXT: global_store_b16 v2, v3, s[0:1] -; GFX11-NEXT: s_cbranch_vccz .LBB7_1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_mul_lo_u32 v2, v2, s2 +; GFX11-NEXT: v_mov_b32_e32 v3, s5 +; GFX11-NEXT: v_sub_nc_u32_e32 v2, s4, v2 +; GFX11-NEXT: s_and_b32 s4, s3, 0xffff +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_cmpk_eq_i32 s4, 0x400 +; GFX11-NEXT: global_store_b16 v3, v2, s[0:1] +; GFX11-NEXT: s_cbranch_scc0 .LBB7_1 ; GFX11-NEXT: ; %bb.2: ; %bb2 ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) diff --git a/llvm/test/CodeGen/AMDGPU/imm16.ll b/llvm/test/CodeGen/AMDGPU/imm16.ll index f407a1c26dd3eb..ecece35337a7a8 100644 --- a/llvm/test/CodeGen/AMDGPU/imm16.ll +++ b/llvm/test/CodeGen/AMDGPU/imm16.ll @@ -1505,7 +1505,7 @@ define amdgpu_kernel void @add_inline_imm_neg_1_f16(ptr addrspace(1) %out, ptr a ; GFX10-NEXT: 
buffer_load_ushort v0, off, s[8:11], 0 ; encoding: [0x00,0x00,0x28,0xe0,0x00,0x00,0x02,0x80] ; GFX10-NEXT: s_mov_b32 s5, s1 ; encoding: [0x01,0x03,0x85,0xbe] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: v_add_nc_u16 v0, v0, -1 ; encoding: [0x00,0x00,0x03,0xd7,0x00,0x83,0x01,0x00] +; GFX10-NEXT: v_add_nc_u32_e32 v0, -1, v0 ; encoding: [0xc1,0x00,0x00,0x4a] ; GFX10-NEXT: buffer_store_short v0, off, s[4:7], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x01,0x80] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; @@ -1523,7 +1523,7 @@ define amdgpu_kernel void @add_inline_imm_neg_1_f16(ptr addrspace(1) %out, ptr a ; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 ; encoding: [0x00,0x00,0x48,0xe0,0x00,0x00,0x02,0x80] ; GFX11-NEXT: s_mov_b32 s5, s1 ; encoding: [0x01,0x00,0x85,0xbe] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] -; GFX11-NEXT: v_add_nc_u16 v0, v0, -1 ; encoding: [0x00,0x00,0x03,0xd7,0x00,0x83,0x01,0x00] +; GFX11-NEXT: v_add_nc_u32_e32 v0, -1, v0 ; encoding: [0xc1,0x00,0x00,0x4a] ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x01,0x80] ; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] @@ -1543,7 +1543,7 @@ define amdgpu_kernel void @add_inline_imm_neg_1_f16(ptr addrspace(1) %out, ptr a ; VI-NEXT: s_mov_b32 s4, s0 ; encoding: [0x00,0x00,0x84,0xbe] ; VI-NEXT: s_mov_b32 s5, s1 ; encoding: [0x01,0x00,0x85,0xbe] ; VI-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf] -; VI-NEXT: v_add_u16_e32 v0, -1, v0 ; encoding: [0xc1,0x00,0x00,0x4c] +; VI-NEXT: v_add_u32_e32 v0, vcc, -1, v0 ; encoding: [0xc1,0x00,0x00,0x32] ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x01,0x80] ; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; @@ -1586,7 +1586,7 @@ define amdgpu_kernel void @add_inline_imm_neg_2_f16(ptr addrspace(1) %out, ptr a ; GFX10-NEXT: buffer_load_ushort v0, off, s[8:11], 0 ; encoding: [0x00,0x00,0x28,0xe0,0x00,0x00,0x02,0x80] ; GFX10-NEXT: s_mov_b32 s5, s1 ; encoding: [0x01,0x03,0x85,0xbe] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: v_add_nc_u16 v0, v0, -2 ; encoding: [0x00,0x00,0x03,0xd7,0x00,0x85,0x01,0x00] +; GFX10-NEXT: v_add_nc_u32_e32 v0, 0xfffe, v0 ; encoding: [0xff,0x00,0x00,0x4a,0xfe,0xff,0x00,0x00] ; GFX10-NEXT: buffer_store_short v0, off, s[4:7], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x01,0x80] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; @@ -1604,7 +1604,7 @@ define amdgpu_kernel void @add_inline_imm_neg_2_f16(ptr addrspace(1) %out, ptr a ; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 ; encoding: [0x00,0x00,0x48,0xe0,0x00,0x00,0x02,0x80] ; GFX11-NEXT: s_mov_b32 s5, s1 ; encoding: [0x01,0x00,0x85,0xbe] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] -; GFX11-NEXT: v_add_nc_u16 v0, v0, -2 ; encoding: [0x00,0x00,0x03,0xd7,0x00,0x85,0x01,0x00] +; GFX11-NEXT: v_add_nc_u32_e32 v0, 0xfffe, v0 ; encoding: [0xff,0x00,0x00,0x4a,0xfe,0xff,0x00,0x00] ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x01,0x80] ; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] @@ -1624,7 +1624,7 @@ define amdgpu_kernel void @add_inline_imm_neg_2_f16(ptr addrspace(1) %out, ptr a ; VI-NEXT: s_mov_b32 s4, s0 ; encoding: [0x00,0x00,0x84,0xbe] 
; VI-NEXT: s_mov_b32 s5, s1 ; encoding: [0x01,0x00,0x85,0xbe] ; VI-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf] -; VI-NEXT: v_add_u16_e32 v0, -2, v0 ; encoding: [0xc2,0x00,0x00,0x4c] +; VI-NEXT: v_add_u32_e32 v0, vcc, 0xfffe, v0 ; encoding: [0xff,0x00,0x00,0x32,0xfe,0xff,0x00,0x00] ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x01,0x80] ; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; @@ -1667,7 +1667,7 @@ define amdgpu_kernel void @add_inline_imm_neg_16_f16(ptr addrspace(1) %out, ptr ; GFX10-NEXT: buffer_load_ushort v0, off, s[8:11], 0 ; encoding: [0x00,0x00,0x28,0xe0,0x00,0x00,0x02,0x80] ; GFX10-NEXT: s_mov_b32 s5, s1 ; encoding: [0x01,0x03,0x85,0xbe] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: v_add_nc_u16 v0, v0, -16 ; encoding: [0x00,0x00,0x03,0xd7,0x00,0xa1,0x01,0x00] +; GFX10-NEXT: v_add_nc_u32_e32 v0, 0xfff0, v0 ; encoding: [0xff,0x00,0x00,0x4a,0xf0,0xff,0x00,0x00] ; GFX10-NEXT: buffer_store_short v0, off, s[4:7], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x01,0x80] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; @@ -1685,7 +1685,7 @@ define amdgpu_kernel void @add_inline_imm_neg_16_f16(ptr addrspace(1) %out, ptr ; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 ; encoding: [0x00,0x00,0x48,0xe0,0x00,0x00,0x02,0x80] ; GFX11-NEXT: s_mov_b32 s5, s1 ; encoding: [0x01,0x00,0x85,0xbe] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] -; GFX11-NEXT: v_add_nc_u16 v0, v0, -16 ; encoding: [0x00,0x00,0x03,0xd7,0x00,0xa1,0x01,0x00] +; GFX11-NEXT: v_add_nc_u32_e32 v0, 0xfff0, v0 ; encoding: [0xff,0x00,0x00,0x4a,0xf0,0xff,0x00,0x00] ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x01,0x80] ; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] @@ -1705,7 +1705,7 @@ define amdgpu_kernel void @add_inline_imm_neg_16_f16(ptr addrspace(1) %out, ptr ; VI-NEXT: s_mov_b32 s4, s0 ; encoding: [0x00,0x00,0x84,0xbe] ; VI-NEXT: s_mov_b32 s5, s1 ; encoding: [0x01,0x00,0x85,0xbe] ; VI-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf] -; VI-NEXT: v_add_u16_e32 v0, -16, v0 ; encoding: [0xd0,0x00,0x00,0x4c] +; VI-NEXT: v_add_u32_e32 v0, vcc, 0xfff0, v0 ; encoding: [0xff,0x00,0x00,0x32,0xf0,0xff,0x00,0x00] ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x01,0x80] ; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll index cddfb21a6fbdf4..9d1368b2ec105a 100644 --- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll @@ -90,77 +90,85 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_mov_b32 s14, s23 ; GFX11-NEXT: s_mov_b64 s[2:3], s[6:7] -; GFX11-NEXT: s_mov_b32 s1, -1 -; GFX11-NEXT: s_cbranch_execz .LBB2_4 -; GFX11-NEXT: s_branch .LBB2_12 -; GFX11-NEXT: .LBB2_3: -; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: .LBB2_3: ; %Flow10 ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_vccnz .LBB2_12 -; GFX11-NEXT: .LBB2_4: ; %bb16 -; GFX11-NEXT: s_load_b32 s6, s[2:3], 0x54 +; GFX11-NEXT: ; %bb.4: ; %bb16 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x54 ; GFX11-NEXT: 
s_bitcmp1_b32 s19, 0 -; GFX11-NEXT: s_cselect_b32 s0, -1, 0 -; GFX11-NEXT: s_and_b32 s7, s19, 1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_bitcmp1_b32 s6, 0 -; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_cselect_b32 s8, -1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s7, 0 +; GFX11-NEXT: s_and_b32 s1, s19, 1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_bitcmp1_b32 s0, 0 +; GFX11-NEXT: s_mov_b32 s0, -1 +; GFX11-NEXT: s_cselect_b32 s7, -1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s1, 0 ; GFX11-NEXT: s_cbranch_scc0 .LBB2_8 ; GFX11-NEXT: ; %bb.5: ; %bb18.preheader ; GFX11-NEXT: s_load_b128 s[24:27], s[2:3], 0x44 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mul_hi_u32 s6, s25, s24 -; GFX11-NEXT: s_mul_i32 s7, s25, s24 +; GFX11-NEXT: s_mul_hi_u32 s0, s25, s24 +; GFX11-NEXT: s_mul_i32 s1, s25, s24 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_alignbit_b32 v0, s6, s7, 1 -; GFX11-NEXT: s_mov_b32 s7, 0 -; GFX11-NEXT: v_readfirstlane_b32 s6, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s22 +; GFX11-NEXT: v_alignbit_b32 v0, s0, s1, 1 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_or_b32 s6, s6, 1 -; GFX11-NEXT: s_lshr_b32 s6, s6, s26 +; GFX11-NEXT: s_or_b32 s0, s0, 1 +; GFX11-NEXT: s_lshr_b32 s0, s0, s26 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_mul_i32 s6, s6, s18 -; GFX11-NEXT: s_mul_i32 s6, s6, s16 +; GFX11-NEXT: s_mul_i32 s0, s0, s18 +; GFX11-NEXT: s_mul_i32 s0, s0, s16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_or_b32 s6, s21, s6 -; GFX11-NEXT: s_lshl_b64 s[18:19], s[6:7], 1 -; GFX11-NEXT: global_load_u16 v1, v2, s[18:19] +; GFX11-NEXT: s_or_b32 s0, s21, s0 +; GFX11-NEXT: s_lshl_b64 s[18:19], s[0:1], 1 +; GFX11-NEXT: s_mov_b32 s0, s1 +; GFX11-NEXT: global_load_u16 v1, v0, s[18:19] +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s22 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11-NEXT: s_mov_b32 vcc_lo, 0 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB2_6: ; %bb18 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_cmp_ne_u16_e64 s6, s7, 0 -; GFX11-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s6 -; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo -; GFX11-NEXT: s_and_b32 vcc_lo, s8, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v3, v1, v3, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc_lo -; GFX11-NEXT: s_mov_b32 vcc_lo, 0 -; GFX11-NEXT: v_readfirstlane_b32 s6, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX11-NEXT: s_bitcmp1_b32 s6, 0 -; GFX11-NEXT: s_cselect_b32 s6, 0x100, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_or_b32 s7, s6, s7 +; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX11-NEXT: v_readfirstlane_b32 s9, v0 +; GFX11-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-NEXT: s_cselect_b32 s1, -1, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | 
instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s1 +; GFX11-NEXT: s_and_b32 s1, s7, s1 +; GFX11-NEXT: s_and_b32 s1, s1, exec_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_readfirstlane_b32 s13, v2 +; GFX11-NEXT: s_cselect_b32 s1, s13, s9 +; GFX11-NEXT: s_and_b32 s9, 0xffff, s0 +; GFX11-NEXT: s_and_b32 s1, s1, 1 +; GFX11-NEXT: s_cmp_lg_u32 s9, 0 +; GFX11-NEXT: s_cselect_b32 s9, -1, 0 +; GFX11-NEXT: s_and_b32 s16, s8, exec_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s9 +; GFX11-NEXT: v_readfirstlane_b32 s9, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_readfirstlane_b32 s13, v2 +; GFX11-NEXT: s_cselect_b32 s9, s13, s9 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_bitcmp1_b32 s9, 0 +; GFX11-NEXT: s_cselect_b32 s9, 0x100, 0 +; GFX11-NEXT: s_or_b32 s0, s9, s0 ; GFX11-NEXT: s_cbranch_vccz .LBB2_6 ; GFX11-NEXT: ; %bb.7: ; %Flow -; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB2_8: ; %Flow12 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s6 +; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_vccz .LBB2_12 ; GFX11-NEXT: ; %bb.9: -; GFX11-NEXT: s_xor_b32 s0, s8, -1 +; GFX11-NEXT: s_xor_b32 s0, s7, -1 ; GFX11-NEXT: .LBB2_10: ; %bb17 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -169,7 +177,7 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg ; GFX11-NEXT: ; %bb.11: ; %Flow6 ; GFX11-NEXT: s_mov_b32 s17, -1 ; GFX11-NEXT: .LBB2_12: ; %Flow11 -; GFX11-NEXT: s_and_b32 s6, s1, exec_lo +; GFX11-NEXT: s_and_b32 s6, s6, exec_lo ; GFX11-NEXT: s_or_not1_b32 s0, s17, exec_lo ; GFX11-NEXT: .LBB2_13: ; %Flow9 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s20 diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll index 247ec407df5fd3..f7715364637787 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll @@ -473,101 +473,89 @@ define amdgpu_kernel void @byte16_inselt(ptr addrspace(1) %out, <16 x i8> %vec, ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_lshr_b32 s2, s7, 24 ; GCN-NEXT: s_cmp_lg_u32 s8, 15 -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_lshr_b32 s2, s7, 16 +; GCN-NEXT: s_cselect_b32 s2, s2, 1 +; GCN-NEXT: s_lshr_b32 s3, s7, 16 +; GCN-NEXT: s_lshl_b32 s2, s2, 8 ; GCN-NEXT: s_cmp_lg_u32 s8, 14 -; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v1, s2 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_lshr_b32 s2, s7, 8 -; GCN-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GCN-NEXT: v_cndmask_b32_e32 v1, 1, v1, vcc +; GCN-NEXT: s_cselect_b32 s3, s3, 1 +; GCN-NEXT: s_and_b32 s3, s3, 0xff +; GCN-NEXT: s_or_b32 s2, s3, s2 +; GCN-NEXT: s_lshr_b32 s9, s7, 8 +; GCN-NEXT: s_lshl_b32 s2, s2, 16 ; GCN-NEXT: s_cmp_lg_u32 s8, 13 -; GCN-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NEXT: v_mov_b32_e32 v1, s2 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s3, s9, 1 +; GCN-NEXT: s_lshl_b32 s3, s3, 8 ; GCN-NEXT: s_cmp_lg_u32 s8, 12 -; GCN-NEXT: v_cndmask_b32_e32 v1, 1, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v2, s7 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GCN-NEXT: 
v_cndmask_b32_e32 v2, 1, v2, vcc -; GCN-NEXT: s_lshr_b32 s2, s6, 24 -; GCN-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GCN-NEXT: s_cselect_b32 s7, s7, 1 +; GCN-NEXT: s_and_b32 s7, s7, 0xff +; GCN-NEXT: s_or_b32 s3, s7, s3 +; GCN-NEXT: s_and_b32 s3, s3, 0xffff +; GCN-NEXT: s_or_b32 s2, s3, s2 +; GCN-NEXT: s_lshr_b32 s3, s6, 24 ; GCN-NEXT: s_cmp_lg_u32 s8, 11 -; GCN-NEXT: v_or_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_lshr_b32 s2, s6, 16 +; GCN-NEXT: s_cselect_b32 s3, s3, 1 +; GCN-NEXT: s_lshl_b32 s3, s3, 8 +; GCN-NEXT: s_lshr_b32 s7, s6, 16 ; GCN-NEXT: s_cmp_lg_u32 s8, 10 -; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v1, s2 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_lshr_b32 s2, s6, 8 -; GCN-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GCN-NEXT: v_cndmask_b32_e32 v1, 1, v1, vcc +; GCN-NEXT: s_cselect_b32 s7, s7, 1 +; GCN-NEXT: s_and_b32 s7, s7, 0xff +; GCN-NEXT: s_or_b32 s3, s7, s3 +; GCN-NEXT: s_lshl_b32 s3, s3, 16 +; GCN-NEXT: s_lshr_b32 s7, s6, 8 ; GCN-NEXT: s_cmp_lg_u32 s8, 9 -; GCN-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NEXT: v_mov_b32_e32 v1, s2 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s7, s7, 1 +; GCN-NEXT: s_lshl_b32 s7, s7, 8 ; GCN-NEXT: s_cmp_lg_u32 s8, 8 -; GCN-NEXT: v_cndmask_b32_e32 v1, 1, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v2, s6 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GCN-NEXT: v_cndmask_b32_e32 v2, 1, v2, vcc -; GCN-NEXT: s_lshr_b32 s2, s5, 24 -; GCN-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GCN-NEXT: s_cselect_b32 s6, s6, 1 +; GCN-NEXT: s_and_b32 s6, s6, 0xff +; GCN-NEXT: s_or_b32 s6, s6, s7 +; GCN-NEXT: s_and_b32 s6, s6, 0xffff +; GCN-NEXT: s_or_b32 s3, s6, s3 +; GCN-NEXT: s_lshr_b32 s6, s5, 24 ; GCN-NEXT: s_cmp_lg_u32 s8, 7 -; GCN-NEXT: v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_lshr_b32 s2, s5, 16 +; GCN-NEXT: s_cselect_b32 s6, s6, 1 +; GCN-NEXT: s_lshl_b32 s6, s6, 8 +; GCN-NEXT: s_lshr_b32 s7, s5, 16 ; GCN-NEXT: s_cmp_lg_u32 s8, 6 -; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v1, s2 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_lshr_b32 s2, s5, 8 -; GCN-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GCN-NEXT: v_cndmask_b32_e32 v1, 1, v1, vcc +; GCN-NEXT: s_cselect_b32 s7, s7, 1 +; GCN-NEXT: s_and_b32 s7, s7, 0xff +; GCN-NEXT: s_or_b32 s6, s7, s6 +; GCN-NEXT: s_lshl_b32 s6, s6, 16 +; GCN-NEXT: s_lshr_b32 s7, s5, 8 ; GCN-NEXT: s_cmp_lg_u32 s8, 5 -; GCN-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NEXT: v_mov_b32_e32 v1, s2 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s7, s7, 1 +; GCN-NEXT: s_lshl_b32 s7, s7, 8 ; GCN-NEXT: s_cmp_lg_u32 s8, 4 -; GCN-NEXT: v_cndmask_b32_e32 v1, 1, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v4, s5 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GCN-NEXT: v_cndmask_b32_e32 v4, 1, v4, vcc -; GCN-NEXT: s_lshr_b32 s2, s4, 24 -; GCN-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GCN-NEXT: s_cselect_b32 s5, s5, 1 +; GCN-NEXT: 
s_and_b32 s5, s5, 0xff +; GCN-NEXT: s_or_b32 s5, s5, s7 +; GCN-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-NEXT: s_or_b32 s5, s5, s6 +; GCN-NEXT: s_lshr_b32 s6, s4, 24 ; GCN-NEXT: s_cmp_lg_u32 s8, 3 -; GCN-NEXT: v_or_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_lshr_b32 s2, s4, 16 +; GCN-NEXT: s_cselect_b32 s6, s6, 1 +; GCN-NEXT: s_lshl_b32 s6, s6, 8 +; GCN-NEXT: s_lshr_b32 s7, s4, 16 ; GCN-NEXT: s_cmp_lg_u32 s8, 2 -; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v4, s2 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_lshr_b32 s2, s4, 8 -; GCN-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GCN-NEXT: v_cndmask_b32_e32 v4, 1, v4, vcc +; GCN-NEXT: s_cselect_b32 s7, s7, 1 +; GCN-NEXT: s_and_b32 s7, s7, 0xff +; GCN-NEXT: s_or_b32 s6, s7, s6 +; GCN-NEXT: s_lshl_b32 s6, s6, 16 +; GCN-NEXT: s_lshr_b32 s7, s4, 8 ; GCN-NEXT: s_cmp_lg_u32 s8, 1 -; GCN-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NEXT: v_mov_b32_e32 v4, s2 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s7, s7, 1 +; GCN-NEXT: s_lshl_b32 s7, s7, 8 ; GCN-NEXT: s_cmp_lg_u32 s8, 0 -; GCN-NEXT: v_cndmask_b32_e32 v4, 1, v4, vcc -; GCN-NEXT: v_mov_b32_e32 v5, s4 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_lshlrev_b16_e32 v4, 8, v4 -; GCN-NEXT: v_cndmask_b32_e32 v5, 1, v5, vcc -; GCN-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GCN-NEXT: s_cselect_b32 s4, s4, 1 +; GCN-NEXT: s_and_b32 s4, s4, 0xff +; GCN-NEXT: s_or_b32 s4, s4, s7 +; GCN-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-NEXT: s_or_b32 s4, s4, s6 ; GCN-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: v_mov_b32_e32 v2, s3 +; GCN-NEXT: v_mov_b32_e32 v3, s2 ; GCN-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm @@ -971,22 +959,22 @@ define amdgpu_kernel void @bit4_inselt(ptr addrspace(1) %out, <4 x i1> %vec, i32 ; GCN-NEXT: s_mov_b32 s15, 0xe80000 ; GCN-NEXT: s_add_u32 s12, s12, s9 ; GCN-NEXT: s_addc_u32 s13, s13, 0 -; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_bfe_u32 s6, s2, 0x10003 +; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: s_bfe_u32 s5, s2, 0x20002 +; GCN-NEXT: buffer_store_byte v1, off, s[12:15], 0 +; GCN-NEXT: v_mov_b32_e32 v1, s6 +; GCN-NEXT: s_bfe_u32 s4, s2, 0x10001 +; GCN-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:3 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_and_b32 s3, s3, 3 -; GCN-NEXT: v_lshrrev_b16_e64 v2, 1, s2 -; GCN-NEXT: v_mov_b32_e32 v3, s2 -; GCN-NEXT: v_lshrrev_b16_e64 v4, 2, s2 -; GCN-NEXT: v_lshrrev_b16_e64 v5, 3, s2 -; GCN-NEXT: v_mov_b32_e32 v1, 1 +; GCN-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:2 +; GCN-NEXT: v_mov_b32_e32 v1, s4 ; GCN-NEXT: v_or_b32_e32 v0, s3, v0 -; GCN-NEXT: v_and_b32_e32 v2, 1, v2 -; GCN-NEXT: v_and_b32_e32 v4, 3, v4 -; GCN-NEXT: v_and_b32_e32 v5, 1, v5 -; GCN-NEXT: buffer_store_byte v3, off, s[12:15], 0 -; GCN-NEXT: buffer_store_byte v5, off, s[12:15], 0 offset:3 -; GCN-NEXT: buffer_store_byte v4, off, s[12:15], 0 offset:2 -; GCN-NEXT: buffer_store_byte v2, off, s[12:15], 0 offset:1 +; GCN-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:1 +; GCN-NEXT: v_mov_b32_e32 
v1, 1 ; GCN-NEXT: buffer_store_byte v1, v0, s[12:15], 0 offen ; GCN-NEXT: buffer_load_ubyte v0, off, s[12:15], 0 ; GCN-NEXT: buffer_load_ubyte v1, off, s[12:15], 0 offset:1 @@ -1020,909 +1008,838 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-LABEL: bit128_inselt: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GCN-NEXT: s_load_dword s8, s[2:3], 0x44 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_load_dword s2, s[2:3], 0x44 +; GCN-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshr_b32 s3, s4, 24 -; GCN-NEXT: s_lshr_b32 s8, s4, 16 -; GCN-NEXT: s_lshr_b32 s9, s4, 17 -; GCN-NEXT: s_lshr_b32 s10, s4, 18 -; GCN-NEXT: s_lshr_b32 s11, s4, 19 -; GCN-NEXT: s_lshr_b32 s12, s4, 20 -; GCN-NEXT: s_lshr_b32 s13, s4, 21 -; GCN-NEXT: s_lshr_b32 s14, s4, 22 -; GCN-NEXT: s_lshr_b32 s15, s4, 23 -; GCN-NEXT: s_lshr_b32 s16, s5, 24 -; GCN-NEXT: s_lshr_b32 s17, s5, 16 -; GCN-NEXT: s_lshr_b32 s18, s5, 17 -; GCN-NEXT: s_lshr_b32 s19, s5, 18 -; GCN-NEXT: s_lshr_b32 s20, s5, 19 -; GCN-NEXT: s_lshr_b32 s21, s5, 20 -; GCN-NEXT: s_lshr_b32 s22, s5, 21 -; GCN-NEXT: s_lshr_b32 s23, s5, 22 -; GCN-NEXT: s_lshr_b32 s24, s5, 23 -; GCN-NEXT: s_lshr_b32 s25, s6, 24 -; GCN-NEXT: s_lshr_b32 s26, s6, 16 -; GCN-NEXT: s_lshr_b32 s27, s6, 17 -; GCN-NEXT: s_lshr_b32 s28, s6, 18 -; GCN-NEXT: s_lshr_b32 s29, s6, 19 -; GCN-NEXT: s_lshr_b32 s30, s6, 20 -; GCN-NEXT: s_lshr_b32 s31, s6, 21 -; GCN-NEXT: s_lshr_b32 s33, s6, 22 -; GCN-NEXT: s_lshr_b32 s34, s6, 23 -; GCN-NEXT: s_lshr_b32 s35, s7, 24 -; GCN-NEXT: s_lshr_b32 s36, s7, 16 -; GCN-NEXT: s_lshr_b32 s37, s7, 17 -; GCN-NEXT: s_lshr_b32 s38, s7, 18 -; GCN-NEXT: s_lshr_b32 s39, s7, 19 -; GCN-NEXT: s_lshr_b32 s40, s7, 20 -; GCN-NEXT: s_lshr_b32 s41, s7, 21 -; GCN-NEXT: s_lshr_b32 s42, s7, 22 -; GCN-NEXT: s_lshr_b32 s43, s7, 23 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x77 -; GCN-NEXT: v_mov_b32_e32 v15, s43 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x76 -; GCN-NEXT: v_cndmask_b32_e32 v15, 1, v15, vcc -; GCN-NEXT: v_mov_b32_e32 v18, s42 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc -; GCN-NEXT: v_and_b32_e32 v18, 1, v18 -; GCN-NEXT: v_lshlrev_b16_e32 v15, 3, v15 -; GCN-NEXT: v_lshlrev_b16_e32 v18, 2, v18 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x75 -; GCN-NEXT: v_or_b32_e32 v15, v15, v18 -; GCN-NEXT: v_mov_b32_e32 v18, s41 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x74 -; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc -; GCN-NEXT: v_mov_b32_e32 v19, s40 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc -; GCN-NEXT: v_lshlrev_b16_e32 v18, 1, v18 -; GCN-NEXT: v_and_b32_e32 v19, 1, v19 -; GCN-NEXT: v_or_b32_e32 v18, v19, v18 -; GCN-NEXT: v_and_b32_e32 v18, 3, v18 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x73 -; GCN-NEXT: v_or_b32_e32 v15, v18, v15 -; GCN-NEXT: v_mov_b32_e32 v18, s39 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x72 -; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc -; GCN-NEXT: v_mov_b32_e32 v19, s38 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc -; GCN-NEXT: v_and_b32_e32 v19, 1, v19 -; GCN-NEXT: v_lshlrev_b16_e32 v18, 3, v18 -; GCN-NEXT: v_lshlrev_b16_e32 v19, 2, v19 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x71 -; GCN-NEXT: v_or_b32_e32 v18, v18, v19 -; GCN-NEXT: v_mov_b32_e32 v19, s37 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x70 -; GCN-NEXT: 
v_cndmask_b32_e32 v19, 1, v19, vcc -; GCN-NEXT: v_mov_b32_e32 v20, s36 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v20, 1, v20, vcc -; GCN-NEXT: v_lshlrev_b16_e32 v19, 1, v19 -; GCN-NEXT: v_and_b32_e32 v20, 1, v20 -; GCN-NEXT: v_or_b32_e32 v19, v20, v19 -; GCN-NEXT: v_and_b32_e32 v19, 3, v19 -; GCN-NEXT: v_or_b32_e32 v18, v19, v18 -; GCN-NEXT: v_lshlrev_b16_e32 v15, 4, v15 -; GCN-NEXT: v_and_b32_e32 v18, 15, v18 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x7f -; GCN-NEXT: v_or_b32_e32 v15, v18, v15 -; GCN-NEXT: v_lshrrev_b16_e64 v18, 7, s35 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x7e -; GCN-NEXT: v_lshrrev_b16_e64 v19, 6, s35 -; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc -; GCN-NEXT: v_and_b32_e32 v19, 1, v19 -; GCN-NEXT: v_lshlrev_b16_e32 v18, 3, v18 -; GCN-NEXT: v_lshlrev_b16_e32 v19, 2, v19 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x7d -; GCN-NEXT: v_or_b32_e32 v18, v18, v19 -; GCN-NEXT: v_lshrrev_b16_e64 v19, 5, s35 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x7c -; GCN-NEXT: v_lshrrev_b16_e64 v20, 4, s35 -; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v20, 1, v20, vcc -; GCN-NEXT: v_lshlrev_b16_e32 v19, 1, v19 -; GCN-NEXT: v_and_b32_e32 v20, 1, v20 -; GCN-NEXT: v_or_b32_e32 v19, v20, v19 -; GCN-NEXT: v_and_b32_e32 v19, 3, v19 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x7b -; GCN-NEXT: v_or_b32_e32 v18, v19, v18 -; GCN-NEXT: v_lshrrev_b16_e64 v19, 3, s35 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x7a -; GCN-NEXT: v_lshrrev_b16_e64 v20, 2, s35 -; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v20, 1, v20, vcc -; GCN-NEXT: v_and_b32_e32 v20, 1, v20 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x78 -; GCN-NEXT: v_mov_b32_e32 v13, s35 -; GCN-NEXT: v_lshlrev_b16_e32 v19, 3, v19 -; GCN-NEXT: v_lshlrev_b16_e32 v20, 2, v20 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x79 -; GCN-NEXT: v_or_b32_e32 v19, v19, v20 -; GCN-NEXT: v_lshrrev_b16_e64 v20, 1, s35 -; GCN-NEXT: v_cndmask_b32_e32 v13, 1, v13, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v20, 1, v20, vcc -; GCN-NEXT: v_and_b32_e32 v13, 1, v13 -; GCN-NEXT: v_lshlrev_b16_e32 v20, 1, v20 -; GCN-NEXT: v_or_b32_e32 v13, v13, v20 -; GCN-NEXT: v_and_b32_e32 v13, 3, v13 -; GCN-NEXT: v_or_b32_e32 v19, v13, v19 -; GCN-NEXT: v_mov_b32_e32 v13, 15 -; GCN-NEXT: v_lshlrev_b16_e32 v18, 12, v18 -; GCN-NEXT: v_and_b32_sdwa v19, v19, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GCN-NEXT: v_or_b32_e32 v18, v18, v19 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x6f -; GCN-NEXT: v_or_b32_sdwa v15, v15, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NEXT: v_lshrrev_b16_e64 v18, 15, s7 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x6e -; GCN-NEXT: v_lshrrev_b16_e64 v19, 14, s7 -; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc -; GCN-NEXT: v_and_b32_e32 v19, 1, v19 -; GCN-NEXT: v_lshlrev_b16_e32 v18, 3, v18 -; GCN-NEXT: v_lshlrev_b16_e32 v19, 2, v19 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x6d -; GCN-NEXT: v_or_b32_e32 v18, v18, v19 -; GCN-NEXT: v_lshrrev_b16_e64 v19, 13, s7 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x6c -; GCN-NEXT: v_lshrrev_b16_e64 v20, 12, s7 
-; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v20, 1, v20, vcc -; GCN-NEXT: v_lshlrev_b16_e32 v19, 1, v19 -; GCN-NEXT: v_and_b32_e32 v20, 1, v20 -; GCN-NEXT: v_or_b32_e32 v19, v20, v19 -; GCN-NEXT: v_and_b32_e32 v19, 3, v19 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x6b -; GCN-NEXT: v_or_b32_e32 v18, v19, v18 -; GCN-NEXT: v_lshrrev_b16_e64 v19, 11, s7 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x6a -; GCN-NEXT: v_lshrrev_b16_e64 v20, 10, s7 -; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v20, 1, v20, vcc -; GCN-NEXT: v_and_b32_e32 v20, 1, v20 -; GCN-NEXT: v_lshlrev_b16_e32 v19, 3, v19 -; GCN-NEXT: v_lshlrev_b16_e32 v20, 2, v20 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x69 -; GCN-NEXT: v_or_b32_e32 v19, v19, v20 -; GCN-NEXT: v_lshrrev_b16_e64 v20, 9, s7 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x68 -; GCN-NEXT: v_lshrrev_b16_e64 v17, 8, s7 -; GCN-NEXT: v_cndmask_b32_e32 v20, 1, v20, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc -; GCN-NEXT: v_lshlrev_b16_e32 v20, 1, v20 -; GCN-NEXT: v_and_b32_e32 v17, 1, v17 -; GCN-NEXT: v_or_b32_e32 v17, v17, v20 -; GCN-NEXT: v_and_b32_e32 v17, 3, v17 -; GCN-NEXT: v_or_b32_e32 v17, v17, v19 -; GCN-NEXT: v_lshlrev_b16_e32 v18, 12, v18 -; GCN-NEXT: v_and_b32_sdwa v17, v17, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x67 -; GCN-NEXT: v_or_b32_e32 v17, v18, v17 -; GCN-NEXT: v_lshrrev_b16_e64 v18, 7, s7 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x66 -; GCN-NEXT: v_lshrrev_b16_e64 v19, 6, s7 -; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc -; GCN-NEXT: v_and_b32_e32 v19, 1, v19 -; GCN-NEXT: v_lshlrev_b16_e32 v18, 3, v18 -; GCN-NEXT: v_lshlrev_b16_e32 v19, 2, v19 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x65 -; GCN-NEXT: v_or_b32_e32 v18, v18, v19 -; GCN-NEXT: v_lshrrev_b16_e64 v19, 5, s7 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x64 -; GCN-NEXT: v_lshrrev_b16_e64 v20, 4, s7 -; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v20, 1, v20, vcc -; GCN-NEXT: v_lshlrev_b16_e32 v19, 1, v19 -; GCN-NEXT: v_and_b32_e32 v20, 1, v20 -; GCN-NEXT: v_or_b32_e32 v19, v20, v19 -; GCN-NEXT: v_and_b32_e32 v19, 3, v19 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x63 -; GCN-NEXT: v_or_b32_e32 v18, v19, v18 -; GCN-NEXT: v_lshrrev_b16_e64 v19, 3, s7 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x62 -; GCN-NEXT: v_lshrrev_b16_e64 v20, 2, s7 -; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v20, 1, v20, vcc -; GCN-NEXT: v_and_b32_e32 v20, 1, v20 -; GCN-NEXT: v_lshlrev_b16_e32 v19, 3, v19 -; GCN-NEXT: v_lshlrev_b16_e32 v20, 2, v20 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x61 -; GCN-NEXT: v_or_b32_e32 v19, v19, v20 -; GCN-NEXT: v_lshrrev_b16_e64 v20, 1, s7 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x60 -; GCN-NEXT: v_mov_b32_e32 v16, s7 -; GCN-NEXT: v_cndmask_b32_e32 v20, 1, v20, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc -; GCN-NEXT: v_lshlrev_b16_e32 v20, 1, v20 -; GCN-NEXT: v_and_b32_e32 v16, 1, v16 -; GCN-NEXT: v_or_b32_e32 v16, v16, v20 -; GCN-NEXT: v_and_b32_e32 v16, 3, v16 -; 
GCN-NEXT: v_or_b32_e32 v16, v16, v19 -; GCN-NEXT: v_lshlrev_b16_e32 v18, 4, v18 -; GCN-NEXT: v_and_b32_e32 v16, 15, v16 -; GCN-NEXT: v_or_b32_e32 v16, v16, v18 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x57 -; GCN-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NEXT: v_mov_b32_e32 v17, s34 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x56 -; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc -; GCN-NEXT: v_mov_b32_e32 v18, s33 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc -; GCN-NEXT: v_and_b32_e32 v18, 1, v18 -; GCN-NEXT: v_lshlrev_b16_e32 v17, 3, v17 -; GCN-NEXT: v_lshlrev_b16_e32 v18, 2, v18 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x55 -; GCN-NEXT: v_or_b32_e32 v17, v17, v18 -; GCN-NEXT: v_mov_b32_e32 v18, s31 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x54 -; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc -; GCN-NEXT: v_mov_b32_e32 v19, s30 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc -; GCN-NEXT: v_lshlrev_b16_e32 v18, 1, v18 -; GCN-NEXT: v_and_b32_e32 v19, 1, v19 -; GCN-NEXT: v_or_b32_e32 v18, v19, v18 -; GCN-NEXT: v_and_b32_e32 v18, 3, v18 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x53 -; GCN-NEXT: v_or_b32_e32 v17, v18, v17 -; GCN-NEXT: v_mov_b32_e32 v18, s29 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x52 -; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc -; GCN-NEXT: v_mov_b32_e32 v19, s28 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc -; GCN-NEXT: v_and_b32_e32 v19, 1, v19 -; GCN-NEXT: v_lshlrev_b16_e32 v18, 3, v18 -; GCN-NEXT: v_lshlrev_b16_e32 v19, 2, v19 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x51 -; GCN-NEXT: v_or_b32_e32 v18, v18, v19 -; GCN-NEXT: v_mov_b32_e32 v19, s27 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x50 -; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc -; GCN-NEXT: v_mov_b32_e32 v20, s26 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v20, 1, v20, vcc -; GCN-NEXT: v_lshlrev_b16_e32 v19, 1, v19 -; GCN-NEXT: v_and_b32_e32 v20, 1, v20 -; GCN-NEXT: v_or_b32_e32 v19, v20, v19 -; GCN-NEXT: v_and_b32_e32 v19, 3, v19 -; GCN-NEXT: v_or_b32_e32 v18, v19, v18 -; GCN-NEXT: v_lshlrev_b16_e32 v17, 4, v17 -; GCN-NEXT: v_and_b32_e32 v18, 15, v18 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x5f -; GCN-NEXT: v_or_b32_e32 v17, v18, v17 -; GCN-NEXT: v_lshrrev_b16_e64 v18, 7, s25 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x5e -; GCN-NEXT: v_lshrrev_b16_e64 v19, 6, s25 -; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc -; GCN-NEXT: v_and_b32_e32 v19, 1, v19 -; GCN-NEXT: v_lshlrev_b16_e32 v18, 3, v18 -; GCN-NEXT: v_lshlrev_b16_e32 v19, 2, v19 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x5d -; GCN-NEXT: v_or_b32_e32 v18, v18, v19 -; GCN-NEXT: v_lshrrev_b16_e64 v19, 5, s25 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x5c -; GCN-NEXT: v_lshrrev_b16_e64 v20, 4, s25 -; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v20, 1, v20, vcc -; GCN-NEXT: v_lshlrev_b16_e32 v19, 1, v19 -; GCN-NEXT: v_and_b32_e32 v20, 1, v20 -; GCN-NEXT: v_or_b32_e32 v19, v20, v19 -; GCN-NEXT: v_and_b32_e32 v19, 3, v19 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x5b -; GCN-NEXT: v_or_b32_e32 v18, v19, v18 -; GCN-NEXT: v_lshrrev_b16_e64 v19, 3, s25 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: 
s_cmpk_lg_i32 s2, 0x5a -; GCN-NEXT: v_lshrrev_b16_e64 v20, 2, s25 -; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v20, 1, v20, vcc -; GCN-NEXT: v_and_b32_e32 v20, 1, v20 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x58 -; GCN-NEXT: v_mov_b32_e32 v3, s25 -; GCN-NEXT: v_lshlrev_b16_e32 v19, 3, v19 -; GCN-NEXT: v_lshlrev_b16_e32 v20, 2, v20 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x59 -; GCN-NEXT: v_or_b32_e32 v19, v19, v20 -; GCN-NEXT: v_lshrrev_b16_e64 v20, 1, s25 -; GCN-NEXT: v_cndmask_b32_e32 v3, 1, v3, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v20, 1, v20, vcc -; GCN-NEXT: v_and_b32_e32 v3, 1, v3 -; GCN-NEXT: v_lshlrev_b16_e32 v20, 1, v20 -; GCN-NEXT: v_or_b32_e32 v3, v3, v20 -; GCN-NEXT: v_and_b32_e32 v3, 3, v3 -; GCN-NEXT: v_or_b32_e32 v3, v3, v19 -; GCN-NEXT: v_lshlrev_b16_e32 v18, 12, v18 -; GCN-NEXT: v_and_b32_sdwa v3, v3, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GCN-NEXT: v_or_b32_e32 v3, v18, v3 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x4f -; GCN-NEXT: v_or_b32_sdwa v17, v17, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NEXT: v_lshrrev_b16_e64 v3, 15, s6 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x4e -; GCN-NEXT: v_lshrrev_b16_e64 v18, 14, s6 -; GCN-NEXT: v_cndmask_b32_e32 v3, 1, v3, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc -; GCN-NEXT: v_and_b32_e32 v18, 1, v18 -; GCN-NEXT: v_lshlrev_b16_e32 v3, 3, v3 -; GCN-NEXT: v_lshlrev_b16_e32 v18, 2, v18 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x4d -; GCN-NEXT: v_or_b32_e32 v3, v3, v18 -; GCN-NEXT: v_lshrrev_b16_e64 v18, 13, s6 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x4c -; GCN-NEXT: v_lshrrev_b16_e64 v19, 12, s6 -; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc -; GCN-NEXT: v_lshlrev_b16_e32 v18, 1, v18 -; GCN-NEXT: v_and_b32_e32 v19, 1, v19 -; GCN-NEXT: v_or_b32_e32 v18, v19, v18 -; GCN-NEXT: v_and_b32_e32 v18, 3, v18 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x4b -; GCN-NEXT: v_or_b32_e32 v3, v18, v3 -; GCN-NEXT: v_lshrrev_b16_e64 v18, 11, s6 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x4a -; GCN-NEXT: v_lshrrev_b16_e64 v19, 10, s6 -; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc -; GCN-NEXT: v_and_b32_e32 v19, 1, v19 -; GCN-NEXT: v_lshlrev_b16_e32 v18, 3, v18 -; GCN-NEXT: v_lshlrev_b16_e32 v19, 2, v19 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x49 -; GCN-NEXT: v_or_b32_e32 v18, v18, v19 -; GCN-NEXT: v_lshrrev_b16_e64 v19, 9, s6 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x48 -; GCN-NEXT: v_lshrrev_b16_e64 v20, 8, s6 -; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v20, 1, v20, vcc -; GCN-NEXT: v_lshlrev_b16_e32 v19, 1, v19 -; GCN-NEXT: v_and_b32_e32 v20, 1, v20 -; GCN-NEXT: v_or_b32_e32 v19, v20, v19 -; GCN-NEXT: v_and_b32_e32 v19, 3, v19 -; GCN-NEXT: v_or_b32_e32 v18, v19, v18 -; GCN-NEXT: v_lshlrev_b16_e32 v3, 12, v3 -; GCN-NEXT: v_and_b32_sdwa v18, v18, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x47 -; GCN-NEXT: v_or_b32_e32 v18, v3, v18 -; GCN-NEXT: v_lshrrev_b16_e64 v3, 7, s6 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 
0x46 -; GCN-NEXT: v_lshrrev_b16_e64 v19, 6, s6 -; GCN-NEXT: v_cndmask_b32_e32 v3, 1, v3, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc -; GCN-NEXT: v_and_b32_e32 v19, 1, v19 -; GCN-NEXT: v_lshlrev_b16_e32 v3, 3, v3 -; GCN-NEXT: v_lshlrev_b16_e32 v19, 2, v19 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x45 -; GCN-NEXT: v_or_b32_e32 v3, v3, v19 -; GCN-NEXT: v_lshrrev_b16_e64 v19, 5, s6 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x44 -; GCN-NEXT: v_lshrrev_b16_e64 v20, 4, s6 -; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v20, 1, v20, vcc -; GCN-NEXT: v_lshlrev_b16_e32 v19, 1, v19 -; GCN-NEXT: v_and_b32_e32 v20, 1, v20 -; GCN-NEXT: v_or_b32_e32 v19, v20, v19 -; GCN-NEXT: v_and_b32_e32 v19, 3, v19 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x43 -; GCN-NEXT: v_or_b32_e32 v19, v19, v3 -; GCN-NEXT: v_lshrrev_b16_e64 v3, 3, s6 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x42 -; GCN-NEXT: v_lshrrev_b16_e64 v20, 2, s6 -; GCN-NEXT: v_cndmask_b32_e32 v3, 1, v3, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v20, 1, v20, vcc -; GCN-NEXT: v_and_b32_e32 v20, 1, v20 -; GCN-NEXT: v_lshlrev_b16_e32 v3, 3, v3 -; GCN-NEXT: v_lshlrev_b16_e32 v20, 2, v20 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x41 -; GCN-NEXT: v_or_b32_e32 v3, v3, v20 -; GCN-NEXT: v_lshrrev_b16_e64 v20, 1, s6 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 64 -; GCN-NEXT: v_mov_b32_e32 v2, s6 -; GCN-NEXT: v_cndmask_b32_e32 v20, 1, v20, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v2, 1, v2, vcc -; GCN-NEXT: v_lshlrev_b16_e32 v20, 1, v20 -; GCN-NEXT: v_and_b32_e32 v2, 1, v2 -; GCN-NEXT: v_or_b32_e32 v2, v2, v20 -; GCN-NEXT: v_and_b32_e32 v2, 3, v2 -; GCN-NEXT: v_or_b32_e32 v2, v2, v3 -; GCN-NEXT: v_or_b32_sdwa v3, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GCN-NEXT: v_lshlrev_b16_e32 v15, 4, v19 -; GCN-NEXT: v_and_b32_e32 v2, 15, v2 -; GCN-NEXT: s_cmp_lg_u32 s2, 55 -; GCN-NEXT: v_or_b32_e32 v2, v2, v15 -; GCN-NEXT: v_mov_b32_e32 v15, s24 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 54 -; GCN-NEXT: v_cndmask_b32_e32 v15, 1, v15, vcc -; GCN-NEXT: v_mov_b32_e32 v16, s23 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc -; GCN-NEXT: v_and_b32_e32 v16, 1, v16 -; GCN-NEXT: v_lshlrev_b16_e32 v15, 3, v15 -; GCN-NEXT: v_lshlrev_b16_e32 v16, 2, v16 -; GCN-NEXT: s_cmp_lg_u32 s2, 53 -; GCN-NEXT: v_or_b32_sdwa v2, v2, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NEXT: v_or_b32_e32 v15, v15, v16 -; GCN-NEXT: v_mov_b32_e32 v16, s22 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 52 -; GCN-NEXT: v_or_b32_sdwa v2, v2, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc -; GCN-NEXT: v_mov_b32_e32 v17, s21 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc -; GCN-NEXT: v_lshlrev_b16_e32 v16, 1, v16 -; GCN-NEXT: v_and_b32_e32 v17, 1, v17 -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_and_b32_e32 v16, 3, v16 -; GCN-NEXT: s_cmp_lg_u32 s2, 51 -; GCN-NEXT: v_or_b32_e32 v15, v16, v15 -; GCN-NEXT: v_mov_b32_e32 v16, s20 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 50 -; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc -; GCN-NEXT: v_mov_b32_e32 v17, s19 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; 
GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc -; GCN-NEXT: v_and_b32_e32 v17, 1, v17 -; GCN-NEXT: v_lshlrev_b16_e32 v16, 3, v16 -; GCN-NEXT: v_lshlrev_b16_e32 v17, 2, v17 -; GCN-NEXT: s_cmp_lg_u32 s2, 49 -; GCN-NEXT: v_or_b32_e32 v16, v16, v17 -; GCN-NEXT: v_mov_b32_e32 v17, s18 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 48 -; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc -; GCN-NEXT: v_mov_b32_e32 v18, s17 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc -; GCN-NEXT: v_lshlrev_b16_e32 v17, 1, v17 -; GCN-NEXT: v_and_b32_e32 v18, 1, v18 -; GCN-NEXT: v_or_b32_e32 v17, v18, v17 -; GCN-NEXT: v_and_b32_e32 v17, 3, v17 -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_lshlrev_b16_e32 v15, 4, v15 -; GCN-NEXT: v_and_b32_e32 v16, 15, v16 -; GCN-NEXT: s_cmp_lg_u32 s2, 63 -; GCN-NEXT: v_or_b32_e32 v15, v16, v15 -; GCN-NEXT: v_lshrrev_b16_e64 v16, 7, s16 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 62 -; GCN-NEXT: v_lshrrev_b16_e64 v17, 6, s16 -; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc -; GCN-NEXT: v_and_b32_e32 v17, 1, v17 -; GCN-NEXT: v_lshlrev_b16_e32 v16, 3, v16 -; GCN-NEXT: v_lshlrev_b16_e32 v17, 2, v17 -; GCN-NEXT: s_cmp_lg_u32 s2, 61 -; GCN-NEXT: v_or_b32_e32 v16, v16, v17 -; GCN-NEXT: v_lshrrev_b16_e64 v17, 5, s16 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 60 -; GCN-NEXT: v_lshrrev_b16_e64 v18, 4, s16 -; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc -; GCN-NEXT: v_lshlrev_b16_e32 v17, 1, v17 -; GCN-NEXT: v_and_b32_e32 v18, 1, v18 -; GCN-NEXT: v_or_b32_e32 v17, v18, v17 -; GCN-NEXT: v_and_b32_e32 v17, 3, v17 -; GCN-NEXT: s_cmp_lg_u32 s2, 59 -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_lshrrev_b16_e64 v17, 3, s16 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 58 -; GCN-NEXT: v_lshrrev_b16_e64 v18, 2, s16 -; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc -; GCN-NEXT: v_and_b32_e32 v18, 1, v18 -; GCN-NEXT: s_cmp_lg_u32 s2, 56 -; GCN-NEXT: v_mov_b32_e32 v14, s16 -; GCN-NEXT: v_lshlrev_b16_e32 v17, 3, v17 -; GCN-NEXT: v_lshlrev_b16_e32 v18, 2, v18 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 57 -; GCN-NEXT: v_or_b32_e32 v17, v17, v18 -; GCN-NEXT: v_lshrrev_b16_e64 v18, 1, s16 -; GCN-NEXT: v_cndmask_b32_e32 v14, 1, v14, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc -; GCN-NEXT: v_and_b32_e32 v14, 1, v14 -; GCN-NEXT: v_lshlrev_b16_e32 v18, 1, v18 -; GCN-NEXT: v_or_b32_e32 v14, v14, v18 -; GCN-NEXT: v_and_b32_e32 v14, 3, v14 -; GCN-NEXT: v_or_b32_e32 v14, v14, v17 -; GCN-NEXT: v_lshlrev_b16_e32 v16, 12, v16 -; GCN-NEXT: v_and_b32_sdwa v14, v14, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GCN-NEXT: v_or_b32_e32 v14, v16, v14 -; GCN-NEXT: s_cmp_lg_u32 s2, 47 -; GCN-NEXT: v_or_b32_sdwa v15, v15, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NEXT: v_lshrrev_b16_e64 v14, 15, s5 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 46 -; GCN-NEXT: v_lshrrev_b16_e64 v16, 14, s5 -; GCN-NEXT: v_cndmask_b32_e32 v14, 1, v14, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc -; GCN-NEXT: v_and_b32_e32 v16, 1, v16 -; GCN-NEXT: 
v_lshlrev_b16_e32 v14, 3, v14 -; GCN-NEXT: v_lshlrev_b16_e32 v16, 2, v16 -; GCN-NEXT: s_cmp_lg_u32 s2, 45 -; GCN-NEXT: v_or_b32_e32 v14, v14, v16 -; GCN-NEXT: v_lshrrev_b16_e64 v16, 13, s5 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 44 -; GCN-NEXT: v_lshrrev_b16_e64 v17, 12, s5 -; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc -; GCN-NEXT: v_lshlrev_b16_e32 v16, 1, v16 -; GCN-NEXT: v_and_b32_e32 v17, 1, v17 -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_and_b32_e32 v16, 3, v16 -; GCN-NEXT: s_cmp_lg_u32 s2, 43 -; GCN-NEXT: v_or_b32_e32 v14, v16, v14 -; GCN-NEXT: v_lshrrev_b16_e64 v16, 11, s5 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 42 -; GCN-NEXT: v_lshrrev_b16_e64 v17, 10, s5 -; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc -; GCN-NEXT: v_and_b32_e32 v17, 1, v17 -; GCN-NEXT: v_lshlrev_b16_e32 v16, 3, v16 -; GCN-NEXT: v_lshlrev_b16_e32 v17, 2, v17 -; GCN-NEXT: s_cmp_lg_u32 s2, 41 -; GCN-NEXT: v_or_b32_e32 v16, v16, v17 -; GCN-NEXT: v_lshrrev_b16_e64 v17, 9, s5 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 40 -; GCN-NEXT: v_lshrrev_b16_e64 v18, 8, s5 -; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc -; GCN-NEXT: v_lshlrev_b16_e32 v17, 1, v17 -; GCN-NEXT: v_and_b32_e32 v18, 1, v18 -; GCN-NEXT: v_or_b32_e32 v17, v18, v17 -; GCN-NEXT: v_and_b32_e32 v17, 3, v17 -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_lshlrev_b16_e32 v14, 12, v14 -; GCN-NEXT: v_and_b32_sdwa v16, v16, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GCN-NEXT: s_cmp_lg_u32 s2, 39 -; GCN-NEXT: v_or_b32_e32 v16, v14, v16 -; GCN-NEXT: v_lshrrev_b16_e64 v14, 7, s5 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 38 -; GCN-NEXT: v_lshrrev_b16_e64 v17, 6, s5 -; GCN-NEXT: v_cndmask_b32_e32 v14, 1, v14, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc -; GCN-NEXT: v_and_b32_e32 v17, 1, v17 -; GCN-NEXT: v_lshlrev_b16_e32 v14, 3, v14 -; GCN-NEXT: v_lshlrev_b16_e32 v17, 2, v17 -; GCN-NEXT: s_cmp_lg_u32 s2, 37 -; GCN-NEXT: v_or_b32_e32 v14, v14, v17 -; GCN-NEXT: v_lshrrev_b16_e64 v17, 5, s5 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 36 -; GCN-NEXT: v_lshrrev_b16_e64 v18, 4, s5 -; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc -; GCN-NEXT: v_lshlrev_b16_e32 v17, 1, v17 -; GCN-NEXT: v_and_b32_e32 v18, 1, v18 -; GCN-NEXT: v_or_b32_e32 v17, v18, v17 -; GCN-NEXT: v_and_b32_e32 v17, 3, v17 -; GCN-NEXT: s_cmp_lg_u32 s2, 35 -; GCN-NEXT: v_or_b32_e32 v17, v17, v14 -; GCN-NEXT: v_lshrrev_b16_e64 v14, 3, s5 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 34 -; GCN-NEXT: v_lshrrev_b16_e64 v18, 2, s5 -; GCN-NEXT: v_cndmask_b32_e32 v14, 1, v14, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc -; GCN-NEXT: v_and_b32_e32 v18, 1, v18 -; GCN-NEXT: v_lshlrev_b16_e32 v14, 3, v14 -; GCN-NEXT: v_lshlrev_b16_e32 v18, 2, v18 -; GCN-NEXT: s_cmp_lg_u32 s2, 33 -; GCN-NEXT: v_or_b32_e32 v18, v14, v18 -; GCN-NEXT: v_lshrrev_b16_e64 v14, 1, s5 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 32 -; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: v_cndmask_b32_e32 
v14, 1, v14, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v1, 1, v1, vcc -; GCN-NEXT: v_lshlrev_b16_e32 v14, 1, v14 -; GCN-NEXT: v_and_b32_e32 v1, 1, v1 -; GCN-NEXT: v_or_b32_e32 v1, v1, v14 -; GCN-NEXT: v_and_b32_e32 v1, 3, v1 -; GCN-NEXT: v_or_b32_e32 v1, v1, v18 -; GCN-NEXT: v_lshlrev_b16_e32 v17, 4, v17 -; GCN-NEXT: v_and_b32_e32 v1, 15, v1 -; GCN-NEXT: v_or_b32_e32 v1, v1, v17 -; GCN-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NEXT: s_cmp_lg_u32 s2, 23 -; GCN-NEXT: v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GCN-NEXT: v_mov_b32_e32 v15, s15 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 22 -; GCN-NEXT: v_cndmask_b32_e32 v15, 1, v15, vcc -; GCN-NEXT: v_mov_b32_e32 v16, s14 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc -; GCN-NEXT: v_and_b32_e32 v16, 1, v16 -; GCN-NEXT: v_lshlrev_b16_e32 v15, 3, v15 -; GCN-NEXT: v_lshlrev_b16_e32 v16, 2, v16 -; GCN-NEXT: s_cmp_lg_u32 s2, 21 -; GCN-NEXT: v_or_b32_e32 v15, v15, v16 -; GCN-NEXT: v_mov_b32_e32 v16, s13 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 20 -; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc -; GCN-NEXT: v_mov_b32_e32 v17, s12 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc -; GCN-NEXT: v_lshlrev_b16_e32 v16, 1, v16 -; GCN-NEXT: v_and_b32_e32 v17, 1, v17 -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_and_b32_e32 v16, 3, v16 -; GCN-NEXT: s_cmp_lg_u32 s2, 19 -; GCN-NEXT: v_or_b32_e32 v15, v16, v15 -; GCN-NEXT: v_mov_b32_e32 v16, s11 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 18 -; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc -; GCN-NEXT: v_mov_b32_e32 v17, s10 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc -; GCN-NEXT: v_and_b32_e32 v17, 1, v17 -; GCN-NEXT: v_lshlrev_b16_e32 v16, 3, v16 -; GCN-NEXT: v_lshlrev_b16_e32 v17, 2, v17 -; GCN-NEXT: s_cmp_lg_u32 s2, 17 -; GCN-NEXT: v_or_b32_e32 v16, v16, v17 -; GCN-NEXT: v_mov_b32_e32 v17, s9 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 16 -; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc -; GCN-NEXT: v_mov_b32_e32 v19, s8 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc -; GCN-NEXT: v_lshlrev_b16_e32 v17, 1, v17 -; GCN-NEXT: v_and_b32_e32 v19, 1, v19 -; GCN-NEXT: v_or_b32_e32 v17, v19, v17 -; GCN-NEXT: v_and_b32_e32 v17, 3, v17 -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_lshlrev_b16_e32 v15, 4, v15 -; GCN-NEXT: v_and_b32_e32 v16, 15, v16 -; GCN-NEXT: s_cmp_lg_u32 s2, 31 -; GCN-NEXT: v_or_b32_e32 v15, v16, v15 -; GCN-NEXT: v_lshrrev_b16_e64 v16, 7, s3 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 30 -; GCN-NEXT: v_lshrrev_b16_e64 v17, 6, s3 -; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc -; GCN-NEXT: v_and_b32_e32 v17, 1, v17 -; GCN-NEXT: v_lshlrev_b16_e32 v16, 3, v16 -; GCN-NEXT: v_lshlrev_b16_e32 v17, 2, v17 -; GCN-NEXT: s_cmp_lg_u32 s2, 29 -; GCN-NEXT: v_or_b32_e32 v16, v16, v17 -; GCN-NEXT: v_lshrrev_b16_e64 v17, 5, s3 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 28 -; GCN-NEXT: v_lshrrev_b16_e64 v19, 4, s3 -; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc -; GCN-NEXT: 
v_lshlrev_b16_e32 v17, 1, v17 -; GCN-NEXT: v_and_b32_e32 v19, 1, v19 -; GCN-NEXT: v_or_b32_e32 v17, v19, v17 -; GCN-NEXT: v_and_b32_e32 v17, 3, v17 -; GCN-NEXT: s_cmp_lg_u32 s2, 27 -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_lshrrev_b16_e64 v17, 3, s3 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 26 -; GCN-NEXT: v_lshrrev_b16_e64 v19, 2, s3 -; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc -; GCN-NEXT: v_and_b32_e32 v19, 1, v19 -; GCN-NEXT: s_cmp_lg_u32 s2, 24 -; GCN-NEXT: v_mov_b32_e32 v18, s3 -; GCN-NEXT: v_lshlrev_b16_e32 v17, 3, v17 -; GCN-NEXT: v_lshlrev_b16_e32 v19, 2, v19 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 25 -; GCN-NEXT: v_or_b32_e32 v17, v17, v19 -; GCN-NEXT: v_lshrrev_b16_e64 v19, 1, s3 -; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc -; GCN-NEXT: v_and_b32_e32 v18, 1, v18 -; GCN-NEXT: v_lshlrev_b16_e32 v19, 1, v19 -; GCN-NEXT: v_or_b32_e32 v18, v18, v19 -; GCN-NEXT: v_and_b32_e32 v18, 3, v18 -; GCN-NEXT: v_or_b32_e32 v17, v18, v17 -; GCN-NEXT: v_lshlrev_b16_e32 v16, 12, v16 -; GCN-NEXT: v_and_b32_sdwa v17, v17, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GCN-NEXT: v_or_b32_e32 v16, v16, v17 -; GCN-NEXT: s_cmp_lg_u32 s2, 15 -; GCN-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NEXT: v_lshrrev_b16_e64 v16, 15, s4 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 14 -; GCN-NEXT: v_lshrrev_b16_e64 v17, 14, s4 -; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc -; GCN-NEXT: v_and_b32_e32 v17, 1, v17 -; GCN-NEXT: v_lshlrev_b16_e32 v16, 3, v16 -; GCN-NEXT: v_lshlrev_b16_e32 v17, 2, v17 -; GCN-NEXT: s_cmp_lg_u32 s2, 13 -; GCN-NEXT: v_or_b32_e32 v16, v16, v17 -; GCN-NEXT: v_lshrrev_b16_e64 v17, 13, s4 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 12 -; GCN-NEXT: v_lshrrev_b16_e64 v18, 12, s4 -; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc -; GCN-NEXT: v_lshlrev_b16_e32 v17, 1, v17 -; GCN-NEXT: v_and_b32_e32 v18, 1, v18 -; GCN-NEXT: v_or_b32_e32 v17, v18, v17 -; GCN-NEXT: s_cmp_lg_u32 s2, 11 -; GCN-NEXT: v_lshrrev_b16_e64 v19, 11, s4 -; GCN-NEXT: v_and_b32_e32 v17, 3, v17 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 10 -; GCN-NEXT: v_lshrrev_b16_e64 v14, 10, s4 -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v19, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 9 -; GCN-NEXT: v_lshrrev_b16_e64 v12, 9, s4 -; GCN-NEXT: v_cndmask_b32_e32 v14, 1, v14, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 8 -; GCN-NEXT: v_lshrrev_b16_e64 v11, 8, s4 -; GCN-NEXT: v_cndmask_b32_e32 v12, 1, v12, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 7 -; GCN-NEXT: v_lshrrev_b16_e64 v10, 7, s4 -; GCN-NEXT: v_cndmask_b32_e32 v11, 1, v11, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 6 -; GCN-NEXT: v_lshrrev_b16_e64 v9, 6, s4 -; GCN-NEXT: v_cndmask_b32_e32 v10, 1, v10, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 5 -; GCN-NEXT: v_lshrrev_b16_e64 v8, 5, s4 -; GCN-NEXT: v_cndmask_b32_e32 v9, 1, v9, vcc -; GCN-NEXT: 
s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 4 -; GCN-NEXT: v_lshrrev_b16_e64 v7, 4, s4 -; GCN-NEXT: v_cndmask_b32_e32 v8, 1, v8, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 3 -; GCN-NEXT: v_lshrrev_b16_e64 v6, 3, s4 -; GCN-NEXT: v_cndmask_b32_e32 v7, 1, v7, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 2 -; GCN-NEXT: v_lshrrev_b16_e64 v5, 2, s4 -; GCN-NEXT: v_cndmask_b32_e32 v6, 1, v6, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 1 -; GCN-NEXT: v_lshrrev_b16_e64 v4, 1, s4 -; GCN-NEXT: v_cndmask_b32_e32 v5, 1, v5, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 0 -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_cndmask_b32_e32 v4, 1, v4, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: v_and_b32_e32 v14, 1, v14 -; GCN-NEXT: v_lshlrev_b16_e32 v12, 1, v12 -; GCN-NEXT: v_and_b32_e32 v11, 1, v11 -; GCN-NEXT: v_and_b32_e32 v9, 1, v9 -; GCN-NEXT: v_lshlrev_b16_e32 v8, 1, v8 -; GCN-NEXT: v_and_b32_e32 v7, 1, v7 -; GCN-NEXT: v_and_b32_e32 v5, 1, v5 -; GCN-NEXT: v_lshlrev_b16_e32 v4, 1, v4 -; GCN-NEXT: v_and_b32_e32 v0, 1, v0 -; GCN-NEXT: v_lshlrev_b16_e32 v17, 3, v17 -; GCN-NEXT: v_lshlrev_b16_e32 v14, 2, v14 -; GCN-NEXT: v_or_b32_e32 v11, v11, v12 -; GCN-NEXT: v_lshlrev_b16_e32 v10, 3, v10 -; GCN-NEXT: v_lshlrev_b16_e32 v9, 2, v9 -; GCN-NEXT: v_or_b32_e32 v7, v7, v8 -; GCN-NEXT: v_lshlrev_b16_e32 v6, 3, v6 -; GCN-NEXT: v_lshlrev_b16_e32 v5, 2, v5 -; GCN-NEXT: v_or_b32_e32 v0, v0, v4 -; GCN-NEXT: v_or_b32_e32 v14, v17, v14 -; GCN-NEXT: v_and_b32_e32 v11, 3, v11 -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: v_and_b32_e32 v7, 3, v7 -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: v_and_b32_e32 v0, 3, v0 -; GCN-NEXT: v_or_b32_e32 v11, v11, v14 -; GCN-NEXT: v_or_b32_e32 v7, v7, v9 -; GCN-NEXT: v_or_b32_e32 v0, v0, v5 -; GCN-NEXT: v_lshlrev_b16_e32 v16, 12, v16 -; GCN-NEXT: v_and_b32_sdwa v11, v11, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GCN-NEXT: v_lshlrev_b16_e32 v7, 4, v7 -; GCN-NEXT: v_and_b32_e32 v0, 15, v0 -; GCN-NEXT: v_or_b32_e32 v11, v16, v11 -; GCN-NEXT: v_or_b32_e32 v0, v0, v7 -; GCN-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NEXT: v_mov_b32_e32 v5, s1 -; GCN-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GCN-NEXT: v_mov_b32_e32 v4, s0 -; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-NEXT: s_bfe_u32 s9, s4, 0xf0001 +; GCN-NEXT: s_lshr_b32 s42, s5, 16 +; GCN-NEXT: v_writelane_b32 v0, s0, 0 +; GCN-NEXT: v_writelane_b32 v0, s1, 1 +; GCN-NEXT: s_lshr_b32 s0, s4, 16 +; GCN-NEXT: v_writelane_b32 v0, s0, 2 +; GCN-NEXT: s_lshr_b32 s0, s4, 17 +; GCN-NEXT: v_writelane_b32 v0, s0, 3 +; GCN-NEXT: s_lshr_b32 s0, s4, 18 +; GCN-NEXT: v_writelane_b32 v0, s0, 4 +; GCN-NEXT: s_lshr_b32 s0, s4, 19 +; GCN-NEXT: v_writelane_b32 v0, s0, 5 +; GCN-NEXT: s_lshr_b32 s0, s4, 20 +; GCN-NEXT: v_writelane_b32 v0, s0, 6 +; GCN-NEXT: s_lshr_b32 s0, s4, 21 +; GCN-NEXT: v_writelane_b32 v0, s0, 7 +; GCN-NEXT: s_lshr_b32 s0, s4, 22 +; GCN-NEXT: v_writelane_b32 v0, s0, 8 +; GCN-NEXT: s_lshr_b32 s0, s4, 23 +; GCN-NEXT: v_writelane_b32 v0, s0, 9 +; GCN-NEXT: s_lshr_b32 s0, s4, 24 +; GCN-NEXT: v_writelane_b32 v0, s0, 10 +; GCN-NEXT: s_lshr_b32 s0, s4, 25 +; GCN-NEXT: v_writelane_b32 v0, s0, 11 +; GCN-NEXT: s_lshr_b32 s0, s4, 26 +; GCN-NEXT: v_writelane_b32 v0, s0, 12 +; GCN-NEXT: s_lshr_b32 s0, s4, 27 +; 
GCN-NEXT: v_writelane_b32 v0, s0, 13 +; GCN-NEXT: s_lshr_b32 s0, s4, 28 +; GCN-NEXT: v_writelane_b32 v0, s0, 14 +; GCN-NEXT: s_lshr_b32 s0, s4, 29 +; GCN-NEXT: v_writelane_b32 v0, s0, 15 +; GCN-NEXT: s_lshr_b32 s0, s4, 30 +; GCN-NEXT: v_writelane_b32 v0, s0, 16 +; GCN-NEXT: s_lshr_b32 s0, s4, 31 +; GCN-NEXT: v_writelane_b32 v0, s0, 17 +; GCN-NEXT: v_writelane_b32 v0, s9, 18 +; GCN-NEXT: s_bfe_u32 s9, s4, 0xe0002 +; GCN-NEXT: v_writelane_b32 v0, s9, 19 +; GCN-NEXT: s_bfe_u32 s9, s4, 0xd0003 +; GCN-NEXT: v_writelane_b32 v0, s9, 20 +; GCN-NEXT: s_bfe_u32 s9, s4, 0xc0004 +; GCN-NEXT: v_writelane_b32 v0, s9, 21 +; GCN-NEXT: s_bfe_u32 s9, s4, 0xb0005 +; GCN-NEXT: v_writelane_b32 v0, s9, 22 +; GCN-NEXT: s_bfe_u32 s9, s4, 0xa0006 +; GCN-NEXT: v_writelane_b32 v0, s9, 23 +; GCN-NEXT: s_bfe_u32 s9, s4, 0x90007 +; GCN-NEXT: v_writelane_b32 v0, s9, 24 +; GCN-NEXT: s_bfe_u32 s9, s4, 0x80008 +; GCN-NEXT: v_writelane_b32 v0, s9, 25 +; GCN-NEXT: s_bfe_u32 s9, s4, 0x70009 +; GCN-NEXT: v_writelane_b32 v0, s9, 26 +; GCN-NEXT: s_bfe_u32 s9, s4, 0x6000a +; GCN-NEXT: v_writelane_b32 v0, s9, 27 +; GCN-NEXT: s_bfe_u32 s9, s4, 0x5000b +; GCN-NEXT: v_writelane_b32 v0, s9, 28 +; GCN-NEXT: s_bfe_u32 s9, s4, 0x4000c +; GCN-NEXT: v_writelane_b32 v0, s9, 29 +; GCN-NEXT: s_bfe_u32 s9, s4, 0x3000d +; GCN-NEXT: v_writelane_b32 v0, s9, 30 +; GCN-NEXT: s_bfe_u32 s9, s4, 0x2000e +; GCN-NEXT: v_writelane_b32 v0, s9, 31 +; GCN-NEXT: s_bfe_u32 s9, s4, 0x1000f +; GCN-NEXT: v_writelane_b32 v0, s9, 32 +; GCN-NEXT: s_bfe_u32 s9, s5, 0xf0001 +; GCN-NEXT: s_lshr_b32 s43, s5, 17 +; GCN-NEXT: s_lshr_b32 s45, s5, 18 +; GCN-NEXT: s_lshr_b32 s47, s5, 19 +; GCN-NEXT: s_lshr_b32 s50, s5, 20 +; GCN-NEXT: s_lshr_b32 s51, s5, 21 +; GCN-NEXT: s_lshr_b32 s53, s5, 22 +; GCN-NEXT: s_lshr_b32 s55, s5, 23 +; GCN-NEXT: s_lshr_b32 s58, s5, 24 +; GCN-NEXT: s_lshr_b32 s59, s5, 25 +; GCN-NEXT: s_lshr_b32 s61, s5, 26 +; GCN-NEXT: s_lshr_b32 s63, s5, 27 +; GCN-NEXT: s_lshr_b32 s66, s5, 28 +; GCN-NEXT: s_lshr_b32 s67, s5, 29 +; GCN-NEXT: s_lshr_b32 s68, s5, 30 +; GCN-NEXT: s_lshr_b32 s69, s5, 31 +; GCN-NEXT: s_lshr_b32 s73, s6, 16 +; GCN-NEXT: s_lshr_b32 s74, s6, 17 +; GCN-NEXT: s_lshr_b32 s77, s6, 18 +; GCN-NEXT: s_lshr_b32 s78, s6, 19 +; GCN-NEXT: s_lshr_b32 s81, s6, 20 +; GCN-NEXT: s_lshr_b32 s82, s6, 21 +; GCN-NEXT: s_lshr_b32 s84, s6, 22 +; GCN-NEXT: s_lshr_b32 s86, s6, 23 +; GCN-NEXT: s_lshr_b32 s89, s6, 24 +; GCN-NEXT: s_lshr_b32 s90, s6, 25 +; GCN-NEXT: s_lshr_b32 s93, s6, 26 +; GCN-NEXT: s_lshr_b32 s94, s6, 27 +; GCN-NEXT: s_lshr_b32 vcc_hi, s6, 28 +; GCN-NEXT: s_lshr_b32 s39, s6, 29 +; GCN-NEXT: s_lshr_b32 s38, s6, 30 +; GCN-NEXT: s_lshr_b32 s37, s6, 31 +; GCN-NEXT: s_lshr_b32 s33, s7, 16 +; GCN-NEXT: s_lshr_b32 s31, s7, 17 +; GCN-NEXT: s_lshr_b32 s28, s7, 18 +; GCN-NEXT: s_lshr_b32 s27, s7, 19 +; GCN-NEXT: s_lshr_b32 s24, s7, 20 +; GCN-NEXT: s_lshr_b32 s23, s7, 21 +; GCN-NEXT: s_lshr_b32 s20, s7, 22 +; GCN-NEXT: s_lshr_b32 s19, s7, 23 +; GCN-NEXT: s_lshr_b32 s16, s7, 24 +; GCN-NEXT: s_lshr_b32 s15, s7, 25 +; GCN-NEXT: s_lshr_b32 s12, s7, 26 +; GCN-NEXT: s_lshr_b32 s11, s7, 27 +; GCN-NEXT: s_lshr_b32 s3, s7, 28 +; GCN-NEXT: s_lshr_b32 s2, s7, 29 +; GCN-NEXT: s_lshr_b32 s1, s7, 30 +; GCN-NEXT: s_lshr_b32 s0, s7, 31 +; GCN-NEXT: v_writelane_b32 v0, s9, 33 +; GCN-NEXT: s_bfe_u32 s40, s5, 0xe0002 +; GCN-NEXT: s_bfe_u32 s41, s5, 0xd0003 +; GCN-NEXT: s_bfe_u32 s44, s5, 0xc0004 +; GCN-NEXT: s_bfe_u32 s46, s5, 0xb0005 +; GCN-NEXT: s_bfe_u32 s48, s5, 0xa0006 +; GCN-NEXT: s_bfe_u32 s49, s5, 0x90007 +; GCN-NEXT: s_bfe_u32 s52, s5, 0x80008 +; GCN-NEXT: 
s_bfe_u32 s54, s5, 0x70009 +; GCN-NEXT: s_bfe_u32 s56, s5, 0x6000a +; GCN-NEXT: s_bfe_u32 s57, s5, 0x5000b +; GCN-NEXT: s_bfe_u32 s60, s5, 0x4000c +; GCN-NEXT: s_bfe_u32 s62, s5, 0x3000d +; GCN-NEXT: s_bfe_u32 s64, s5, 0x2000e +; GCN-NEXT: s_bfe_u32 s65, s5, 0x1000f +; GCN-NEXT: s_bfe_u32 s70, s6, 0xf0001 +; GCN-NEXT: s_bfe_u32 s71, s6, 0xe0002 +; GCN-NEXT: s_bfe_u32 s72, s6, 0xd0003 +; GCN-NEXT: s_bfe_u32 s75, s6, 0xc0004 +; GCN-NEXT: s_bfe_u32 s76, s6, 0xb0005 +; GCN-NEXT: s_bfe_u32 s79, s6, 0xa0006 +; GCN-NEXT: s_bfe_u32 s80, s6, 0x90007 +; GCN-NEXT: s_bfe_u32 s83, s6, 0x80008 +; GCN-NEXT: s_bfe_u32 s85, s6, 0x70009 +; GCN-NEXT: s_bfe_u32 s87, s6, 0x6000a +; GCN-NEXT: s_bfe_u32 s88, s6, 0x5000b +; GCN-NEXT: s_bfe_u32 s91, s6, 0x4000c +; GCN-NEXT: s_bfe_u32 s92, s6, 0x3000d +; GCN-NEXT: s_bfe_u32 s95, s6, 0x2000e +; GCN-NEXT: s_bfe_u32 vcc_lo, s6, 0x1000f +; GCN-NEXT: s_bfe_u32 s36, s7, 0xf0001 +; GCN-NEXT: s_bfe_u32 s35, s7, 0xe0002 +; GCN-NEXT: s_bfe_u32 s34, s7, 0xd0003 +; GCN-NEXT: s_bfe_u32 s30, s7, 0xc0004 +; GCN-NEXT: s_bfe_u32 s29, s7, 0xb0005 +; GCN-NEXT: s_bfe_u32 s26, s7, 0xa0006 +; GCN-NEXT: s_bfe_u32 s25, s7, 0x90007 +; GCN-NEXT: s_bfe_u32 s22, s7, 0x80008 +; GCN-NEXT: s_bfe_u32 s21, s7, 0x70009 +; GCN-NEXT: s_bfe_u32 s18, s7, 0x6000a +; GCN-NEXT: s_bfe_u32 s17, s7, 0x5000b +; GCN-NEXT: s_bfe_u32 s14, s7, 0x4000c +; GCN-NEXT: s_bfe_u32 s13, s7, 0x3000d +; GCN-NEXT: s_bfe_u32 s10, s7, 0x2000e +; GCN-NEXT: s_bfe_u32 s9, s7, 0x1000f +; GCN-NEXT: s_cmpk_lg_i32 s8, 0x7f +; GCN-NEXT: s_cselect_b32 s0, s0, 1 +; GCN-NEXT: s_lshl_b32 s0, s0, 3 +; GCN-NEXT: s_cmpk_lg_i32 s8, 0x7e +; GCN-NEXT: s_cselect_b32 s1, s1, 1 +; GCN-NEXT: s_and_b32 s1, s1, 1 +; GCN-NEXT: s_lshl_b32 s1, s1, 2 +; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: s_cmpk_lg_i32 s8, 0x7d +; GCN-NEXT: s_cselect_b32 s1, s2, 1 +; GCN-NEXT: s_lshl_b32 s1, s1, 1 +; GCN-NEXT: s_cmpk_lg_i32 s8, 0x7c +; GCN-NEXT: s_cselect_b32 s2, s3, 1 +; GCN-NEXT: s_and_b32 s2, s2, 1 +; GCN-NEXT: s_or_b32 s1, s2, s1 +; GCN-NEXT: s_and_b32 s1, s1, 3 +; GCN-NEXT: s_or_b32 s0, s1, s0 +; GCN-NEXT: s_lshl_b32 s0, s0, 12 +; GCN-NEXT: s_cmpk_lg_i32 s8, 0x7b +; GCN-NEXT: s_cselect_b32 s1, s11, 1 +; GCN-NEXT: s_lshl_b32 s1, s1, 3 +; GCN-NEXT: s_cmpk_lg_i32 s8, 0x7a +; GCN-NEXT: s_cselect_b32 s2, s12, 1 +; GCN-NEXT: s_and_b32 s2, s2, 1 +; GCN-NEXT: s_lshl_b32 s2, s2, 2 +; GCN-NEXT: s_or_b32 s1, s1, s2 +; GCN-NEXT: s_cmpk_lg_i32 s8, 0x79 +; GCN-NEXT: s_cselect_b32 s2, s15, 1 +; GCN-NEXT: s_lshl_b32 s2, s2, 1 +; GCN-NEXT: s_cmpk_lg_i32 s8, 0x78 +; GCN-NEXT: s_cselect_b32 s3, s16, 1 +; GCN-NEXT: s_and_b32 s3, s3, 1 +; GCN-NEXT: s_or_b32 s2, s3, s2 +; GCN-NEXT: s_and_b32 s2, s2, 3 +; GCN-NEXT: s_or_b32 s1, s2, s1 +; GCN-NEXT: s_and_b32 s1, s1, 15 +; GCN-NEXT: s_lshl_b32 s1, s1, 8 +; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: s_cmpk_lg_i32 s8, 0x77 +; GCN-NEXT: s_cselect_b32 s1, s19, 1 +; GCN-NEXT: s_lshl_b32 s1, s1, 3 +; GCN-NEXT: s_cmpk_lg_i32 s8, 0x76 +; GCN-NEXT: s_cselect_b32 s2, s20, 1 +; GCN-NEXT: s_and_b32 s2, s2, 1 +; GCN-NEXT: s_lshl_b32 s2, s2, 2 +; GCN-NEXT: s_or_b32 s1, s1, s2 +; GCN-NEXT: s_cmpk_lg_i32 s8, 0x75 +; GCN-NEXT: s_cselect_b32 s2, s23, 1 +; GCN-NEXT: s_lshl_b32 s2, s2, 1 +; GCN-NEXT: s_cmpk_lg_i32 s8, 0x74 +; GCN-NEXT: s_cselect_b32 s3, s24, 1 +; GCN-NEXT: s_and_b32 s3, s3, 1 +; GCN-NEXT: s_or_b32 s2, s3, s2 +; GCN-NEXT: s_and_b32 s2, s2, 3 +; GCN-NEXT: s_or_b32 s1, s2, s1 +; GCN-NEXT: s_lshl_b32 s1, s1, 4 +; GCN-NEXT: s_cmpk_lg_i32 s8, 0x73 +; GCN-NEXT: s_cselect_b32 s2, s27, 1 +; GCN-NEXT: s_lshl_b32 s2, s2, 3 +; GCN-NEXT: 
s_cmpk_lg_i32 s8, 0x72 +; GCN-NEXT: s_cselect_b32 s3, s28, 1 +; GCN-NEXT: s_and_b32 s3, s3, 1 +; GCN-NEXT: s_lshl_b32 s3, s3, 2 +; GCN-NEXT: s_or_b32 s2, s2, s3 +; GCN-NEXT: s_cmpk_lg_i32 s8, 0x71 +; GCN-NEXT: s_cselect_b32 s3, s31, 1 +; GCN-NEXT: s_lshl_b32 s3, s3, 1 +; GCN-NEXT: s_cmpk_lg_i32 s8, 0x70 +; GCN-NEXT: s_cselect_b32 s11, s33, 1 +; GCN-NEXT: s_and_b32 s11, s11, 1 +; GCN-NEXT: s_or_b32 s3, s11, s3 +; GCN-NEXT: s_and_b32 s3, s3, 3 +; GCN-NEXT: s_or_b32 s2, s3, s2 +; GCN-NEXT: s_and_b32 s2, s2, 15 +; GCN-NEXT: s_or_b32 s1, s2, s1 +; GCN-NEXT: s_and_b32 s1, s1, 0xff +; GCN-NEXT: s_or_b32 s0, s1, s0 +; GCN-NEXT: s_lshl_b32 s0, s0, 16 +; GCN-NEXT: s_cmpk_lg_i32 s8, 0x6f +; GCN-NEXT: s_cselect_b32 s1, s9, 1 +; GCN-NEXT: s_lshl_b32 s1, s1, 3 +; GCN-NEXT: s_cmpk_lg_i32 s8, 0x6e +; GCN-NEXT: s_cselect_b32 s2, s10, 1 +; GCN-NEXT: s_and_b32 s2, s2, 1 +; GCN-NEXT: s_lshl_b32 s2, s2, 2 +; GCN-NEXT: s_or_b32 s1, s1, s2 +; GCN-NEXT: s_cmpk_lg_i32 s8, 0x6d +; GCN-NEXT: s_cselect_b32 s2, s13, 1 +; GCN-NEXT: s_lshl_b32 s2, s2, 1 +; GCN-NEXT: s_cmpk_lg_i32 s8, 0x6c +; GCN-NEXT: s_cselect_b32 s3, s14, 1 +; GCN-NEXT: s_and_b32 s3, s3, 1 +; GCN-NEXT: s_or_b32 s2, s3, s2 +; GCN-NEXT: s_and_b32 s2, s2, 3 +; GCN-NEXT: s_or_b32 s1, s2, s1 +; GCN-NEXT: s_lshl_b32 s1, s1, 12 +; GCN-NEXT: s_cmpk_lg_i32 s8, 0x6b +; GCN-NEXT: s_cselect_b32 s2, s17, 1 +; GCN-NEXT: s_lshl_b32 s2, s2, 3 +; GCN-NEXT: s_cmpk_lg_i32 s8, 0x6a +; GCN-NEXT: s_cselect_b32 s3, s18, 1 +; GCN-NEXT: s_and_b32 s3, s3, 1 +; GCN-NEXT: s_lshl_b32 s3, s3, 2 +; GCN-NEXT: s_or_b32 s2, s2, s3 +; GCN-NEXT: s_cmpk_lg_i32 s8, 0x69 +; GCN-NEXT: s_cselect_b32 s3, s21, 1 +; GCN-NEXT: s_lshl_b32 s3, s3, 1 +; GCN-NEXT: s_cmpk_lg_i32 s8, 0x68 +; GCN-NEXT: s_cselect_b32 s9, s22, 1 +; GCN-NEXT: s_and_b32 s9, s9, 1 +; GCN-NEXT: s_or_b32 s3, s9, s3 +; GCN-NEXT: s_and_b32 s3, s3, 3 +; GCN-NEXT: s_or_b32 s2, s3, s2 +; GCN-NEXT: s_and_b32 s2, s2, 15 +; GCN-NEXT: s_lshl_b32 s2, s2, 8 +; GCN-NEXT: s_or_b32 s1, s1, s2 +; GCN-NEXT: s_cmpk_lg_i32 s8, 0x67 +; GCN-NEXT: s_cselect_b32 s2, s25, 1 +; GCN-NEXT: s_lshl_b32 s2, s2, 3 +; GCN-NEXT: s_cmpk_lg_i32 s8, 0x66 +; GCN-NEXT: s_cselect_b32 s3, s26, 1 +; GCN-NEXT: s_and_b32 s3, s3, 1 +; GCN-NEXT: s_lshl_b32 s3, s3, 2 +; GCN-NEXT: s_or_b32 s2, s2, s3 +; GCN-NEXT: s_cmpk_lg_i32 s8, 0x65 +; GCN-NEXT: s_cselect_b32 s3, s29, 1 +; GCN-NEXT: s_lshl_b32 s3, s3, 1 +; GCN-NEXT: s_cmpk_lg_i32 s8, 0x64 +; GCN-NEXT: s_cselect_b32 s9, s30, 1 +; GCN-NEXT: s_and_b32 s9, s9, 1 +; GCN-NEXT: s_or_b32 s3, s9, s3 +; GCN-NEXT: s_and_b32 s3, s3, 3 +; GCN-NEXT: s_or_b32 s2, s3, s2 +; GCN-NEXT: s_lshl_b32 s2, s2, 4 +; GCN-NEXT: s_cmpk_lg_i32 s8, 0x63 +; GCN-NEXT: s_cselect_b32 s3, s34, 1 +; GCN-NEXT: s_lshl_b32 s3, s3, 3 +; GCN-NEXT: s_cmpk_lg_i32 s8, 0x62 +; GCN-NEXT: s_cselect_b32 s9, s35, 1 +; GCN-NEXT: s_and_b32 s9, s9, 1 +; GCN-NEXT: s_lshl_b32 s9, s9, 2 +; GCN-NEXT: s_or_b32 s3, s3, s9 +; GCN-NEXT: s_cmpk_lg_i32 s8, 0x60 +; GCN-NEXT: s_cselect_b32 s7, s7, 1 +; GCN-NEXT: s_and_b32 s7, s7, 1 +; GCN-NEXT: s_cmpk_lg_i32 s8, 0x61 +; GCN-NEXT: s_cselect_b32 s9, s36, 1 +; GCN-NEXT: s_lshl_b32 s9, s9, 1 +; GCN-NEXT: s_or_b32 s7, s7, s9 +; GCN-NEXT: s_and_b32 s7, s7, 3 +; GCN-NEXT: s_or_b32 s3, s7, s3 +; GCN-NEXT: s_and_b32 s3, s3, 15 +; GCN-NEXT: s_or_b32 s2, s3, s2 +; GCN-NEXT: s_and_b32 s2, s2, 0xff +; GCN-NEXT: s_or_b32 s1, s2, s1 +; GCN-NEXT: s_and_b32 s1, s1, 0xffff +; GCN-NEXT: s_or_b32 s7, s1, s0 +; GCN-NEXT: s_cmpk_lg_i32 s8, 0x5f +; GCN-NEXT: s_cselect_b32 s0, s37, 1 +; GCN-NEXT: s_lshl_b32 s0, s0, 3 +; GCN-NEXT: s_cmpk_lg_i32 s8, 0x5e 
+; GCN-NEXT: s_cselect_b32 s1, s38, 1 +; GCN-NEXT: s_and_b32 s1, s1, 1 +; GCN-NEXT: s_lshl_b32 s1, s1, 2 +; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: s_cmpk_lg_i32 s8, 0x5d +; GCN-NEXT: s_cselect_b32 s1, s39, 1 +; GCN-NEXT: s_lshl_b32 s1, s1, 1 +; GCN-NEXT: s_cmpk_lg_i32 s8, 0x5c +; GCN-NEXT: s_cselect_b32 s2, vcc_hi, 1 +; GCN-NEXT: s_and_b32 s2, s2, 1 +; GCN-NEXT: s_or_b32 s1, s2, s1 +; GCN-NEXT: s_and_b32 s1, s1, 3 +; GCN-NEXT: s_or_b32 s0, s1, s0 +; GCN-NEXT: s_lshl_b32 s0, s0, 12 +; GCN-NEXT: s_cmpk_lg_i32 s8, 0x5b +; GCN-NEXT: s_cselect_b32 s1, s94, 1 +; GCN-NEXT: s_lshl_b32 s1, s1, 3 +; GCN-NEXT: s_cmpk_lg_i32 s8, 0x5a +; GCN-NEXT: s_cselect_b32 s2, s93, 1 +; GCN-NEXT: s_and_b32 s2, s2, 1 +; GCN-NEXT: s_lshl_b32 s2, s2, 2 +; GCN-NEXT: s_or_b32 s1, s1, s2 +; GCN-NEXT: s_cmpk_lg_i32 s8, 0x59 +; GCN-NEXT: s_cselect_b32 s2, s90, 1 +; GCN-NEXT: s_lshl_b32 s2, s2, 1 +; GCN-NEXT: s_cmpk_lg_i32 s8, 0x58 +; GCN-NEXT: s_cselect_b32 s3, s89, 1 +; GCN-NEXT: s_and_b32 s3, s3, 1 +; GCN-NEXT: s_or_b32 s2, s3, s2 +; GCN-NEXT: s_and_b32 s2, s2, 3 +; GCN-NEXT: s_or_b32 s1, s2, s1 +; GCN-NEXT: s_and_b32 s1, s1, 15 +; GCN-NEXT: s_lshl_b32 s1, s1, 8 +; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: s_cmpk_lg_i32 s8, 0x57 +; GCN-NEXT: s_cselect_b32 s1, s86, 1 +; GCN-NEXT: s_lshl_b32 s1, s1, 3 +; GCN-NEXT: s_cmpk_lg_i32 s8, 0x56 +; GCN-NEXT: s_cselect_b32 s2, s84, 1 +; GCN-NEXT: s_and_b32 s2, s2, 1 +; GCN-NEXT: s_lshl_b32 s2, s2, 2 +; GCN-NEXT: s_or_b32 s1, s1, s2 +; GCN-NEXT: s_cmpk_lg_i32 s8, 0x55 +; GCN-NEXT: s_cselect_b32 s2, s82, 1 +; GCN-NEXT: s_lshl_b32 s2, s2, 1 +; GCN-NEXT: s_cmpk_lg_i32 s8, 0x54 +; GCN-NEXT: s_cselect_b32 s3, s81, 1 +; GCN-NEXT: s_and_b32 s3, s3, 1 +; GCN-NEXT: s_or_b32 s2, s3, s2 +; GCN-NEXT: s_and_b32 s2, s2, 3 +; GCN-NEXT: s_or_b32 s1, s2, s1 +; GCN-NEXT: s_lshl_b32 s1, s1, 4 +; GCN-NEXT: s_cmpk_lg_i32 s8, 0x53 +; GCN-NEXT: s_cselect_b32 s2, s78, 1 +; GCN-NEXT: s_lshl_b32 s2, s2, 3 +; GCN-NEXT: s_cmpk_lg_i32 s8, 0x52 +; GCN-NEXT: s_cselect_b32 s3, s77, 1 +; GCN-NEXT: s_and_b32 s3, s3, 1 +; GCN-NEXT: s_lshl_b32 s3, s3, 2 +; GCN-NEXT: s_or_b32 s2, s2, s3 +; GCN-NEXT: s_cmpk_lg_i32 s8, 0x51 +; GCN-NEXT: s_cselect_b32 s3, s74, 1 +; GCN-NEXT: s_lshl_b32 s3, s3, 1 +; GCN-NEXT: s_cmpk_lg_i32 s8, 0x50 +; GCN-NEXT: s_cselect_b32 s9, s73, 1 +; GCN-NEXT: s_and_b32 s9, s9, 1 +; GCN-NEXT: s_or_b32 s3, s9, s3 +; GCN-NEXT: s_and_b32 s3, s3, 3 +; GCN-NEXT: s_or_b32 s2, s3, s2 +; GCN-NEXT: s_and_b32 s2, s2, 15 +; GCN-NEXT: s_or_b32 s1, s2, s1 +; GCN-NEXT: s_and_b32 s1, s1, 0xff +; GCN-NEXT: s_or_b32 s0, s1, s0 +; GCN-NEXT: s_lshl_b32 s0, s0, 16 +; GCN-NEXT: s_cmpk_lg_i32 s8, 0x4f +; GCN-NEXT: s_cselect_b32 s1, vcc_lo, 1 +; GCN-NEXT: s_lshl_b32 s1, s1, 3 +; GCN-NEXT: s_cmpk_lg_i32 s8, 0x4e +; GCN-NEXT: s_cselect_b32 s2, s95, 1 +; GCN-NEXT: s_and_b32 s2, s2, 1 +; GCN-NEXT: s_lshl_b32 s2, s2, 2 +; GCN-NEXT: s_or_b32 s1, s1, s2 +; GCN-NEXT: s_cmpk_lg_i32 s8, 0x4d +; GCN-NEXT: s_cselect_b32 s2, s92, 1 +; GCN-NEXT: s_lshl_b32 s2, s2, 1 +; GCN-NEXT: s_cmpk_lg_i32 s8, 0x4c +; GCN-NEXT: s_cselect_b32 s3, s91, 1 +; GCN-NEXT: s_and_b32 s3, s3, 1 +; GCN-NEXT: s_or_b32 s2, s3, s2 +; GCN-NEXT: s_and_b32 s2, s2, 3 +; GCN-NEXT: s_or_b32 s1, s2, s1 +; GCN-NEXT: s_lshl_b32 s1, s1, 12 +; GCN-NEXT: s_cmpk_lg_i32 s8, 0x4b +; GCN-NEXT: s_cselect_b32 s2, s88, 1 +; GCN-NEXT: s_lshl_b32 s2, s2, 3 +; GCN-NEXT: s_cmpk_lg_i32 s8, 0x4a +; GCN-NEXT: s_cselect_b32 s3, s87, 1 +; GCN-NEXT: s_and_b32 s3, s3, 1 +; GCN-NEXT: s_lshl_b32 s3, s3, 2 +; GCN-NEXT: s_or_b32 s2, s2, s3 +; GCN-NEXT: s_cmpk_lg_i32 s8, 0x49 +; GCN-NEXT: 
s_cselect_b32 s3, s85, 1 +; GCN-NEXT: s_lshl_b32 s3, s3, 1 +; GCN-NEXT: s_cmpk_lg_i32 s8, 0x48 +; GCN-NEXT: s_cselect_b32 s9, s83, 1 +; GCN-NEXT: s_and_b32 s9, s9, 1 +; GCN-NEXT: s_or_b32 s3, s9, s3 +; GCN-NEXT: s_and_b32 s3, s3, 3 +; GCN-NEXT: s_or_b32 s2, s3, s2 +; GCN-NEXT: s_and_b32 s2, s2, 15 +; GCN-NEXT: s_lshl_b32 s2, s2, 8 +; GCN-NEXT: s_or_b32 s1, s1, s2 +; GCN-NEXT: s_cmpk_lg_i32 s8, 0x47 +; GCN-NEXT: s_cselect_b32 s2, s80, 1 +; GCN-NEXT: s_lshl_b32 s2, s2, 3 +; GCN-NEXT: s_cmpk_lg_i32 s8, 0x46 +; GCN-NEXT: s_cselect_b32 s3, s79, 1 +; GCN-NEXT: s_and_b32 s3, s3, 1 +; GCN-NEXT: s_lshl_b32 s3, s3, 2 +; GCN-NEXT: s_or_b32 s2, s2, s3 +; GCN-NEXT: s_cmpk_lg_i32 s8, 0x45 +; GCN-NEXT: s_cselect_b32 s3, s76, 1 +; GCN-NEXT: s_lshl_b32 s3, s3, 1 +; GCN-NEXT: s_cmpk_lg_i32 s8, 0x44 +; GCN-NEXT: s_cselect_b32 s9, s75, 1 +; GCN-NEXT: s_and_b32 s9, s9, 1 +; GCN-NEXT: s_or_b32 s3, s9, s3 +; GCN-NEXT: s_and_b32 s3, s3, 3 +; GCN-NEXT: s_or_b32 s2, s3, s2 +; GCN-NEXT: s_lshl_b32 s2, s2, 4 +; GCN-NEXT: s_cmpk_lg_i32 s8, 0x43 +; GCN-NEXT: s_cselect_b32 s3, s72, 1 +; GCN-NEXT: s_lshl_b32 s3, s3, 3 +; GCN-NEXT: s_cmpk_lg_i32 s8, 0x42 +; GCN-NEXT: s_cselect_b32 s9, s71, 1 +; GCN-NEXT: s_and_b32 s9, s9, 1 +; GCN-NEXT: s_lshl_b32 s9, s9, 2 +; GCN-NEXT: s_or_b32 s3, s3, s9 +; GCN-NEXT: s_cmp_lg_u32 s8, 64 +; GCN-NEXT: s_cselect_b32 s6, s6, 1 +; GCN-NEXT: s_and_b32 s6, s6, 1 +; GCN-NEXT: s_cmpk_lg_i32 s8, 0x41 +; GCN-NEXT: s_cselect_b32 s9, s70, 1 +; GCN-NEXT: s_lshl_b32 s9, s9, 1 +; GCN-NEXT: s_or_b32 s6, s6, s9 +; GCN-NEXT: s_and_b32 s6, s6, 3 +; GCN-NEXT: s_or_b32 s3, s6, s3 +; GCN-NEXT: s_and_b32 s3, s3, 15 +; GCN-NEXT: s_or_b32 s2, s3, s2 +; GCN-NEXT: s_and_b32 s2, s2, 0xff +; GCN-NEXT: s_or_b32 s1, s2, s1 +; GCN-NEXT: s_and_b32 s1, s1, 0xffff +; GCN-NEXT: s_or_b32 s6, s1, s0 +; GCN-NEXT: s_cmp_lg_u32 s8, 63 +; GCN-NEXT: s_cselect_b32 s0, s69, 1 +; GCN-NEXT: s_lshl_b32 s0, s0, 3 +; GCN-NEXT: s_cmp_lg_u32 s8, 62 +; GCN-NEXT: s_cselect_b32 s1, s68, 1 +; GCN-NEXT: s_and_b32 s1, s1, 1 +; GCN-NEXT: s_lshl_b32 s1, s1, 2 +; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: s_cmp_lg_u32 s8, 61 +; GCN-NEXT: s_cselect_b32 s1, s67, 1 +; GCN-NEXT: s_lshl_b32 s1, s1, 1 +; GCN-NEXT: s_cmp_lg_u32 s8, 60 +; GCN-NEXT: s_cselect_b32 s2, s66, 1 +; GCN-NEXT: s_and_b32 s2, s2, 1 +; GCN-NEXT: s_or_b32 s1, s2, s1 +; GCN-NEXT: s_and_b32 s1, s1, 3 +; GCN-NEXT: s_or_b32 s0, s1, s0 +; GCN-NEXT: s_lshl_b32 s0, s0, 12 +; GCN-NEXT: s_cmp_lg_u32 s8, 59 +; GCN-NEXT: s_cselect_b32 s1, s63, 1 +; GCN-NEXT: s_lshl_b32 s1, s1, 3 +; GCN-NEXT: s_cmp_lg_u32 s8, 58 +; GCN-NEXT: s_cselect_b32 s2, s61, 1 +; GCN-NEXT: s_and_b32 s2, s2, 1 +; GCN-NEXT: s_lshl_b32 s2, s2, 2 +; GCN-NEXT: s_or_b32 s1, s1, s2 +; GCN-NEXT: s_cmp_lg_u32 s8, 57 +; GCN-NEXT: s_cselect_b32 s2, s59, 1 +; GCN-NEXT: s_lshl_b32 s2, s2, 1 +; GCN-NEXT: s_cmp_lg_u32 s8, 56 +; GCN-NEXT: s_cselect_b32 s3, s58, 1 +; GCN-NEXT: s_and_b32 s3, s3, 1 +; GCN-NEXT: s_or_b32 s2, s3, s2 +; GCN-NEXT: s_and_b32 s2, s2, 3 +; GCN-NEXT: s_or_b32 s1, s2, s1 +; GCN-NEXT: s_and_b32 s1, s1, 15 +; GCN-NEXT: s_lshl_b32 s1, s1, 8 +; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: s_cmp_lg_u32 s8, 55 +; GCN-NEXT: s_cselect_b32 s1, s55, 1 +; GCN-NEXT: s_lshl_b32 s1, s1, 3 +; GCN-NEXT: s_cmp_lg_u32 s8, 54 +; GCN-NEXT: s_cselect_b32 s2, s53, 1 +; GCN-NEXT: s_and_b32 s2, s2, 1 +; GCN-NEXT: s_lshl_b32 s2, s2, 2 +; GCN-NEXT: s_or_b32 s1, s1, s2 +; GCN-NEXT: s_cmp_lg_u32 s8, 53 +; GCN-NEXT: s_cselect_b32 s2, s51, 1 +; GCN-NEXT: s_lshl_b32 s2, s2, 1 +; GCN-NEXT: s_cmp_lg_u32 s8, 52 +; GCN-NEXT: s_cselect_b32 s3, s50, 1 +; 
GCN-NEXT: s_and_b32 s3, s3, 1 +; GCN-NEXT: s_or_b32 s2, s3, s2 +; GCN-NEXT: s_and_b32 s2, s2, 3 +; GCN-NEXT: s_or_b32 s1, s2, s1 +; GCN-NEXT: s_lshl_b32 s1, s1, 4 +; GCN-NEXT: s_cmp_lg_u32 s8, 51 +; GCN-NEXT: s_cselect_b32 s2, s47, 1 +; GCN-NEXT: s_lshl_b32 s2, s2, 3 +; GCN-NEXT: s_cmp_lg_u32 s8, 50 +; GCN-NEXT: s_cselect_b32 s3, s45, 1 +; GCN-NEXT: s_and_b32 s3, s3, 1 +; GCN-NEXT: s_lshl_b32 s3, s3, 2 +; GCN-NEXT: s_or_b32 s2, s2, s3 +; GCN-NEXT: s_cmp_lg_u32 s8, 49 +; GCN-NEXT: s_cselect_b32 s3, s43, 1 +; GCN-NEXT: s_lshl_b32 s3, s3, 1 +; GCN-NEXT: s_cmp_lg_u32 s8, 48 +; GCN-NEXT: s_cselect_b32 s9, s42, 1 +; GCN-NEXT: s_and_b32 s9, s9, 1 +; GCN-NEXT: s_or_b32 s3, s9, s3 +; GCN-NEXT: s_and_b32 s3, s3, 3 +; GCN-NEXT: s_or_b32 s2, s3, s2 +; GCN-NEXT: s_and_b32 s2, s2, 15 +; GCN-NEXT: s_or_b32 s1, s2, s1 +; GCN-NEXT: s_and_b32 s1, s1, 0xff +; GCN-NEXT: s_or_b32 s0, s1, s0 +; GCN-NEXT: s_lshl_b32 s0, s0, 16 +; GCN-NEXT: s_cmp_lg_u32 s8, 47 +; GCN-NEXT: s_cselect_b32 s1, s65, 1 +; GCN-NEXT: s_lshl_b32 s1, s1, 3 +; GCN-NEXT: s_cmp_lg_u32 s8, 46 +; GCN-NEXT: s_cselect_b32 s2, s64, 1 +; GCN-NEXT: s_and_b32 s2, s2, 1 +; GCN-NEXT: s_lshl_b32 s2, s2, 2 +; GCN-NEXT: s_or_b32 s1, s1, s2 +; GCN-NEXT: s_cmp_lg_u32 s8, 45 +; GCN-NEXT: s_cselect_b32 s2, s62, 1 +; GCN-NEXT: s_lshl_b32 s2, s2, 1 +; GCN-NEXT: s_cmp_lg_u32 s8, 44 +; GCN-NEXT: s_cselect_b32 s3, s60, 1 +; GCN-NEXT: s_and_b32 s3, s3, 1 +; GCN-NEXT: s_or_b32 s2, s3, s2 +; GCN-NEXT: s_and_b32 s2, s2, 3 +; GCN-NEXT: s_or_b32 s1, s2, s1 +; GCN-NEXT: s_lshl_b32 s1, s1, 12 +; GCN-NEXT: s_cmp_lg_u32 s8, 43 +; GCN-NEXT: s_cselect_b32 s2, s57, 1 +; GCN-NEXT: s_lshl_b32 s2, s2, 3 +; GCN-NEXT: s_cmp_lg_u32 s8, 42 +; GCN-NEXT: s_cselect_b32 s3, s56, 1 +; GCN-NEXT: s_and_b32 s3, s3, 1 +; GCN-NEXT: s_lshl_b32 s3, s3, 2 +; GCN-NEXT: s_or_b32 s2, s2, s3 +; GCN-NEXT: s_cmp_lg_u32 s8, 41 +; GCN-NEXT: s_cselect_b32 s3, s54, 1 +; GCN-NEXT: s_lshl_b32 s3, s3, 1 +; GCN-NEXT: s_cmp_lg_u32 s8, 40 +; GCN-NEXT: s_cselect_b32 s9, s52, 1 +; GCN-NEXT: s_and_b32 s9, s9, 1 +; GCN-NEXT: s_or_b32 s3, s9, s3 +; GCN-NEXT: s_and_b32 s3, s3, 3 +; GCN-NEXT: s_or_b32 s2, s3, s2 +; GCN-NEXT: s_and_b32 s2, s2, 15 +; GCN-NEXT: s_lshl_b32 s2, s2, 8 +; GCN-NEXT: s_or_b32 s1, s1, s2 +; GCN-NEXT: s_cmp_lg_u32 s8, 39 +; GCN-NEXT: s_cselect_b32 s2, s49, 1 +; GCN-NEXT: s_lshl_b32 s2, s2, 3 +; GCN-NEXT: s_cmp_lg_u32 s8, 38 +; GCN-NEXT: s_cselect_b32 s3, s48, 1 +; GCN-NEXT: s_and_b32 s3, s3, 1 +; GCN-NEXT: s_lshl_b32 s3, s3, 2 +; GCN-NEXT: s_or_b32 s2, s2, s3 +; GCN-NEXT: s_cmp_lg_u32 s8, 37 +; GCN-NEXT: s_cselect_b32 s3, s46, 1 +; GCN-NEXT: s_lshl_b32 s3, s3, 1 +; GCN-NEXT: s_cmp_lg_u32 s8, 36 +; GCN-NEXT: s_cselect_b32 s9, s44, 1 +; GCN-NEXT: s_and_b32 s9, s9, 1 +; GCN-NEXT: s_or_b32 s3, s9, s3 +; GCN-NEXT: s_and_b32 s3, s3, 3 +; GCN-NEXT: s_or_b32 s2, s3, s2 +; GCN-NEXT: s_lshl_b32 s2, s2, 4 +; GCN-NEXT: s_cmp_lg_u32 s8, 35 +; GCN-NEXT: s_cselect_b32 s3, s41, 1 +; GCN-NEXT: s_lshl_b32 s3, s3, 3 +; GCN-NEXT: s_cmp_lg_u32 s8, 34 +; GCN-NEXT: s_cselect_b32 s9, s40, 1 +; GCN-NEXT: s_and_b32 s9, s9, 1 +; GCN-NEXT: s_lshl_b32 s9, s9, 2 +; GCN-NEXT: s_or_b32 s3, s3, s9 +; GCN-NEXT: s_cmp_lg_u32 s8, 32 +; GCN-NEXT: s_cselect_b32 s5, s5, 1 +; GCN-NEXT: s_and_b32 s5, s5, 1 +; GCN-NEXT: s_cmp_lg_u32 s8, 33 +; GCN-NEXT: v_readlane_b32 s9, v0, 33 +; GCN-NEXT: s_cselect_b32 s9, s9, 1 +; GCN-NEXT: s_lshl_b32 s9, s9, 1 +; GCN-NEXT: s_or_b32 s5, s5, s9 +; GCN-NEXT: s_and_b32 s5, s5, 3 +; GCN-NEXT: s_or_b32 s3, s5, s3 +; GCN-NEXT: s_and_b32 s3, s3, 15 +; GCN-NEXT: s_or_b32 s2, s3, s2 +; GCN-NEXT: s_and_b32 s2, 
s2, 0xff +; GCN-NEXT: s_or_b32 s1, s2, s1 +; GCN-NEXT: s_and_b32 s1, s1, 0xffff +; GCN-NEXT: s_or_b32 s0, s1, s0 +; GCN-NEXT: s_cmp_lg_u32 s8, 31 +; GCN-NEXT: v_readlane_b32 s1, v0, 17 +; GCN-NEXT: s_cselect_b32 s1, s1, 1 +; GCN-NEXT: s_lshl_b32 s1, s1, 3 +; GCN-NEXT: s_cmp_lg_u32 s8, 30 +; GCN-NEXT: v_readlane_b32 s2, v0, 16 +; GCN-NEXT: s_cselect_b32 s2, s2, 1 +; GCN-NEXT: s_and_b32 s2, s2, 1 +; GCN-NEXT: s_lshl_b32 s2, s2, 2 +; GCN-NEXT: s_or_b32 s1, s1, s2 +; GCN-NEXT: s_cmp_lg_u32 s8, 29 +; GCN-NEXT: v_readlane_b32 s2, v0, 15 +; GCN-NEXT: s_cselect_b32 s2, s2, 1 +; GCN-NEXT: s_lshl_b32 s2, s2, 1 +; GCN-NEXT: s_cmp_lg_u32 s8, 28 +; GCN-NEXT: v_readlane_b32 s3, v0, 14 +; GCN-NEXT: s_cselect_b32 s3, s3, 1 +; GCN-NEXT: s_and_b32 s3, s3, 1 +; GCN-NEXT: s_or_b32 s2, s3, s2 +; GCN-NEXT: s_and_b32 s2, s2, 3 +; GCN-NEXT: s_or_b32 s1, s2, s1 +; GCN-NEXT: s_lshl_b32 s1, s1, 12 +; GCN-NEXT: s_cmp_lg_u32 s8, 27 +; GCN-NEXT: v_readlane_b32 s2, v0, 13 +; GCN-NEXT: s_cselect_b32 s2, s2, 1 +; GCN-NEXT: s_lshl_b32 s2, s2, 3 +; GCN-NEXT: s_cmp_lg_u32 s8, 26 +; GCN-NEXT: v_readlane_b32 s3, v0, 12 +; GCN-NEXT: s_cselect_b32 s3, s3, 1 +; GCN-NEXT: s_and_b32 s3, s3, 1 +; GCN-NEXT: s_lshl_b32 s3, s3, 2 +; GCN-NEXT: s_or_b32 s2, s2, s3 +; GCN-NEXT: s_cmp_lg_u32 s8, 25 +; GCN-NEXT: v_readlane_b32 s3, v0, 11 +; GCN-NEXT: s_cselect_b32 s3, s3, 1 +; GCN-NEXT: s_lshl_b32 s3, s3, 1 +; GCN-NEXT: s_cmp_lg_u32 s8, 24 +; GCN-NEXT: v_readlane_b32 s5, v0, 10 +; GCN-NEXT: s_cselect_b32 s5, s5, 1 +; GCN-NEXT: s_and_b32 s5, s5, 1 +; GCN-NEXT: s_or_b32 s3, s5, s3 +; GCN-NEXT: s_and_b32 s3, s3, 3 +; GCN-NEXT: s_or_b32 s2, s3, s2 +; GCN-NEXT: s_and_b32 s2, s2, 15 +; GCN-NEXT: s_lshl_b32 s2, s2, 8 +; GCN-NEXT: s_or_b32 s1, s1, s2 +; GCN-NEXT: s_cmp_lg_u32 s8, 23 +; GCN-NEXT: v_readlane_b32 s2, v0, 9 +; GCN-NEXT: s_cselect_b32 s2, s2, 1 +; GCN-NEXT: s_lshl_b32 s2, s2, 3 +; GCN-NEXT: s_cmp_lg_u32 s8, 22 +; GCN-NEXT: v_readlane_b32 s3, v0, 8 +; GCN-NEXT: s_cselect_b32 s3, s3, 1 +; GCN-NEXT: s_and_b32 s3, s3, 1 +; GCN-NEXT: s_lshl_b32 s3, s3, 2 +; GCN-NEXT: s_or_b32 s2, s2, s3 +; GCN-NEXT: s_cmp_lg_u32 s8, 21 +; GCN-NEXT: v_readlane_b32 s3, v0, 7 +; GCN-NEXT: s_cselect_b32 s3, s3, 1 +; GCN-NEXT: s_lshl_b32 s3, s3, 1 +; GCN-NEXT: s_cmp_lg_u32 s8, 20 +; GCN-NEXT: v_readlane_b32 s5, v0, 6 +; GCN-NEXT: s_cselect_b32 s5, s5, 1 +; GCN-NEXT: s_and_b32 s5, s5, 1 +; GCN-NEXT: s_or_b32 s3, s5, s3 +; GCN-NEXT: s_and_b32 s3, s3, 3 +; GCN-NEXT: s_or_b32 s2, s3, s2 +; GCN-NEXT: s_lshl_b32 s2, s2, 4 +; GCN-NEXT: s_cmp_lg_u32 s8, 19 +; GCN-NEXT: v_readlane_b32 s3, v0, 5 +; GCN-NEXT: s_cselect_b32 s3, s3, 1 +; GCN-NEXT: s_lshl_b32 s3, s3, 3 +; GCN-NEXT: s_cmp_lg_u32 s8, 18 +; GCN-NEXT: v_readlane_b32 s5, v0, 4 +; GCN-NEXT: s_cselect_b32 s5, s5, 1 +; GCN-NEXT: s_and_b32 s5, s5, 1 +; GCN-NEXT: s_lshl_b32 s5, s5, 2 +; GCN-NEXT: s_or_b32 s3, s3, s5 +; GCN-NEXT: s_cmp_lg_u32 s8, 17 +; GCN-NEXT: v_readlane_b32 s5, v0, 3 +; GCN-NEXT: s_cselect_b32 s5, s5, 1 +; GCN-NEXT: s_lshl_b32 s5, s5, 1 +; GCN-NEXT: s_cmp_lg_u32 s8, 16 +; GCN-NEXT: v_readlane_b32 s9, v0, 2 +; GCN-NEXT: s_cselect_b32 s9, s9, 1 +; GCN-NEXT: s_and_b32 s9, s9, 1 +; GCN-NEXT: s_or_b32 s5, s9, s5 +; GCN-NEXT: s_and_b32 s5, s5, 3 +; GCN-NEXT: s_or_b32 s3, s5, s3 +; GCN-NEXT: s_and_b32 s3, s3, 15 +; GCN-NEXT: s_or_b32 s2, s3, s2 +; GCN-NEXT: s_and_b32 s2, s2, 0xff +; GCN-NEXT: s_or_b32 s1, s2, s1 +; GCN-NEXT: s_lshl_b32 s1, s1, 16 +; GCN-NEXT: s_cmp_lg_u32 s8, 15 +; GCN-NEXT: v_readlane_b32 s2, v0, 32 +; GCN-NEXT: s_cselect_b32 s2, s2, 1 +; GCN-NEXT: s_lshl_b32 s2, s2, 3 +; GCN-NEXT: 
s_cmp_lg_u32 s8, 14 +; GCN-NEXT: v_readlane_b32 s3, v0, 31 +; GCN-NEXT: s_cselect_b32 s3, s3, 1 +; GCN-NEXT: s_and_b32 s3, s3, 1 +; GCN-NEXT: s_lshl_b32 s3, s3, 2 +; GCN-NEXT: s_or_b32 s2, s2, s3 +; GCN-NEXT: s_cmp_lg_u32 s8, 13 +; GCN-NEXT: v_readlane_b32 s3, v0, 30 +; GCN-NEXT: s_cselect_b32 s3, s3, 1 +; GCN-NEXT: s_lshl_b32 s3, s3, 1 +; GCN-NEXT: s_cmp_lg_u32 s8, 12 +; GCN-NEXT: v_readlane_b32 s5, v0, 29 +; GCN-NEXT: s_cselect_b32 s5, s5, 1 +; GCN-NEXT: s_and_b32 s5, s5, 1 +; GCN-NEXT: s_or_b32 s3, s5, s3 +; GCN-NEXT: s_and_b32 s3, s3, 3 +; GCN-NEXT: s_or_b32 s2, s3, s2 +; GCN-NEXT: s_lshl_b32 s2, s2, 12 +; GCN-NEXT: s_cmp_lg_u32 s8, 11 +; GCN-NEXT: v_readlane_b32 s3, v0, 28 +; GCN-NEXT: s_cselect_b32 s3, s3, 1 +; GCN-NEXT: s_lshl_b32 s3, s3, 3 +; GCN-NEXT: s_cmp_lg_u32 s8, 10 +; GCN-NEXT: v_readlane_b32 s5, v0, 27 +; GCN-NEXT: s_cselect_b32 s5, s5, 1 +; GCN-NEXT: s_and_b32 s5, s5, 1 +; GCN-NEXT: s_lshl_b32 s5, s5, 2 +; GCN-NEXT: s_or_b32 s3, s3, s5 +; GCN-NEXT: s_cmp_lg_u32 s8, 9 +; GCN-NEXT: v_readlane_b32 s5, v0, 26 +; GCN-NEXT: s_cselect_b32 s5, s5, 1 +; GCN-NEXT: s_lshl_b32 s5, s5, 1 +; GCN-NEXT: s_cmp_lg_u32 s8, 8 +; GCN-NEXT: v_readlane_b32 s9, v0, 25 +; GCN-NEXT: s_cselect_b32 s9, s9, 1 +; GCN-NEXT: s_and_b32 s9, s9, 1 +; GCN-NEXT: s_or_b32 s5, s9, s5 +; GCN-NEXT: s_and_b32 s5, s5, 3 +; GCN-NEXT: s_or_b32 s3, s5, s3 +; GCN-NEXT: s_and_b32 s3, s3, 15 +; GCN-NEXT: s_lshl_b32 s3, s3, 8 +; GCN-NEXT: s_or_b32 s2, s2, s3 +; GCN-NEXT: s_cmp_lg_u32 s8, 7 +; GCN-NEXT: v_readlane_b32 s3, v0, 24 +; GCN-NEXT: s_cselect_b32 s3, s3, 1 +; GCN-NEXT: s_lshl_b32 s3, s3, 3 +; GCN-NEXT: s_cmp_lg_u32 s8, 6 +; GCN-NEXT: v_readlane_b32 s5, v0, 23 +; GCN-NEXT: s_cselect_b32 s5, s5, 1 +; GCN-NEXT: s_and_b32 s5, s5, 1 +; GCN-NEXT: s_lshl_b32 s5, s5, 2 +; GCN-NEXT: s_or_b32 s3, s3, s5 +; GCN-NEXT: s_cmp_lg_u32 s8, 5 +; GCN-NEXT: v_readlane_b32 s5, v0, 22 +; GCN-NEXT: s_cselect_b32 s5, s5, 1 +; GCN-NEXT: s_lshl_b32 s5, s5, 1 +; GCN-NEXT: s_cmp_lg_u32 s8, 4 +; GCN-NEXT: v_readlane_b32 s9, v0, 21 +; GCN-NEXT: s_cselect_b32 s9, s9, 1 +; GCN-NEXT: s_and_b32 s9, s9, 1 +; GCN-NEXT: s_or_b32 s5, s9, s5 +; GCN-NEXT: s_and_b32 s5, s5, 3 +; GCN-NEXT: s_or_b32 s3, s5, s3 +; GCN-NEXT: s_lshl_b32 s3, s3, 4 +; GCN-NEXT: s_cmp_lg_u32 s8, 3 +; GCN-NEXT: v_readlane_b32 s5, v0, 20 +; GCN-NEXT: s_cselect_b32 s5, s5, 1 +; GCN-NEXT: s_lshl_b32 s5, s5, 3 +; GCN-NEXT: s_cmp_lg_u32 s8, 2 +; GCN-NEXT: v_readlane_b32 s9, v0, 19 +; GCN-NEXT: s_cselect_b32 s9, s9, 1 +; GCN-NEXT: s_and_b32 s9, s9, 1 +; GCN-NEXT: s_lshl_b32 s9, s9, 2 +; GCN-NEXT: s_or_b32 s5, s5, s9 +; GCN-NEXT: s_cmp_lg_u32 s8, 0 +; GCN-NEXT: s_cselect_b32 s4, s4, 1 +; GCN-NEXT: s_and_b32 s4, s4, 1 +; GCN-NEXT: s_cmp_lg_u32 s8, 1 +; GCN-NEXT: v_readlane_b32 s8, v0, 18 +; GCN-NEXT: s_cselect_b32 s8, s8, 1 +; GCN-NEXT: s_lshl_b32 s8, s8, 1 +; GCN-NEXT: s_or_b32 s4, s4, s8 +; GCN-NEXT: s_and_b32 s4, s4, 3 +; GCN-NEXT: s_or_b32 s4, s4, s5 +; GCN-NEXT: s_and_b32 s4, s4, 15 +; GCN-NEXT: s_or_b32 s3, s4, s3 +; GCN-NEXT: s_and_b32 s3, s3, 0xff +; GCN-NEXT: s_or_b32 s2, s3, s2 +; GCN-NEXT: s_and_b32 s2, s2, 0xffff +; GCN-NEXT: s_or_b32 s1, s2, s1 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: v_readlane_b32 s0, v0, 0 +; GCN-NEXT: v_readlane_b32 s1, v0, 1 +; GCN-NEXT: v_mov_b32_e32 v6, s1 +; GCN-NEXT: v_mov_b32_e32 v3, s6 +; GCN-NEXT: v_mov_b32_e32 v4, s7 +; GCN-NEXT: v_mov_b32_e32 v5, s0 +; GCN-NEXT: flat_store_dwordx4 v[5:6], v[1:4] +; GCN-NEXT: ; kill: killed $vgpr0 ; GCN-NEXT: s_endpgm entry: %v = insertelement <128 x i1> %vec, i1 1, i32 
%sel diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll index 2a8eac8712e52a..213813a94fc859 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -1612,16 +1612,16 @@ define amdgpu_kernel void @dynamic_insertelement_v2i8(ptr addrspace(1) %out, [8 ; VI-NEXT: s_load_dword s4, s[6:7], 0x4c ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_load_dword s5, s[6:7], 0x28 -; VI-NEXT: v_mov_b32_e32 v0, 0xff ; VI-NEXT: s_mov_b32 s3, 0x1100f000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b32 s4, s4, 3 -; VI-NEXT: v_lshlrev_b16_e32 v0, s4, v0 -; VI-NEXT: v_not_b32_e32 v1, v0 -; VI-NEXT: v_and_b32_e32 v1, s5, v1 -; VI-NEXT: v_and_b32_e32 v0, 0x505, v0 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: s_lshl_b32 s4, 0xff, s4 +; VI-NEXT: s_and_b32 s6, s4, 0x505 +; VI-NEXT: s_xor_b32 s4, s4, 0xffff +; VI-NEXT: s_and_b32 s4, s4, s5 +; VI-NEXT: s_or_b32 s4, s6, s4 +; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <2 x i8> %a, i8 5, i32 %b @@ -1871,100 +1871,88 @@ define amdgpu_kernel void @dynamic_insertelement_v16i8(ptr addrspace(1) %out, <1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s5, s11, 24 ; VI-NEXT: s_cmp_lg_u32 s4, 15 -; VI-NEXT: v_mov_b32_e32 v0, s5 -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_lshr_b32 s5, s11, 16 +; VI-NEXT: s_cselect_b32 s5, s5, 5 +; VI-NEXT: s_lshl_b32 s5, s5, 8 +; VI-NEXT: s_lshr_b32 s6, s11, 16 ; VI-NEXT: s_cmp_lg_u32 s4, 14 -; VI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_lshr_b32 s5, s11, 8 -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc +; VI-NEXT: s_cselect_b32 s6, s6, 5 +; VI-NEXT: s_and_b32 s6, s6, 0xff +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_lshr_b32 s6, s11, 8 ; VI-NEXT: s_cmp_lg_u32 s4, 13 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cselect_b32 s6, s6, 5 +; VI-NEXT: s_lshl_b32 s6, s6, 8 ; VI-NEXT: s_cmp_lg_u32 s4, 12 -; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc -; VI-NEXT: v_mov_b32_e32 v2, s11 -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: v_cndmask_b32_e32 v2, 5, v2, vcc -; VI-NEXT: s_lshr_b32 s5, s10, 24 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_cselect_b32 s7, s11, 5 +; VI-NEXT: s_and_b32 s7, s7, 0xff +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_lshr_b32 s6, s10, 24 ; VI-NEXT: s_cmp_lg_u32 s4, 11 -; VI-NEXT: v_or_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v0, s5 -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_lshr_b32 s5, s10, 16 +; VI-NEXT: s_cselect_b32 s6, s6, 5 +; VI-NEXT: s_lshl_b32 s6, s6, 8 +; VI-NEXT: s_lshr_b32 s7, s10, 16 ; VI-NEXT: s_cmp_lg_u32 s4, 10 -; VI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_lshr_b32 s5, s10, 8 -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc +; VI-NEXT: s_cselect_b32 s7, s7, 5 +; 
VI-NEXT: s_and_b32 s7, s7, 0xff +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_lshr_b32 s7, s10, 8 ; VI-NEXT: s_cmp_lg_u32 s4, 9 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cselect_b32 s7, s7, 5 +; VI-NEXT: s_lshl_b32 s7, s7, 8 ; VI-NEXT: s_cmp_lg_u32 s4, 8 -; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc -; VI-NEXT: v_mov_b32_e32 v2, s10 -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: v_cndmask_b32_e32 v2, 5, v2, vcc -; VI-NEXT: s_lshr_b32 s5, s9, 24 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_cselect_b32 s10, s10, 5 +; VI-NEXT: s_and_b32 s10, s10, 0xff +; VI-NEXT: s_or_b32 s7, s10, s7 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_lshr_b32 s7, s9, 24 ; VI-NEXT: s_cmp_lg_u32 s4, 7 -; VI-NEXT: v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v0, s5 -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_lshr_b32 s5, s9, 16 +; VI-NEXT: s_cselect_b32 s7, s7, 5 +; VI-NEXT: s_lshl_b32 s7, s7, 8 +; VI-NEXT: s_lshr_b32 s10, s9, 16 ; VI-NEXT: s_cmp_lg_u32 s4, 6 -; VI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_lshr_b32 s5, s9, 8 -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc +; VI-NEXT: s_cselect_b32 s10, s10, 5 +; VI-NEXT: s_and_b32 s10, s10, 0xff +; VI-NEXT: s_or_b32 s7, s10, s7 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_lshr_b32 s10, s9, 8 ; VI-NEXT: s_cmp_lg_u32 s4, 5 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cselect_b32 s10, s10, 5 +; VI-NEXT: s_lshl_b32 s10, s10, 8 ; VI-NEXT: s_cmp_lg_u32 s4, 4 -; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc -; VI-NEXT: v_mov_b32_e32 v4, s9 -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc -; VI-NEXT: s_lshr_b32 s5, s8, 24 -; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_cselect_b32 s9, s9, 5 +; VI-NEXT: s_and_b32 s9, s9, 0xff +; VI-NEXT: s_or_b32 s9, s9, s10 +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_or_b32 s7, s9, s7 +; VI-NEXT: s_lshr_b32 s9, s8, 24 ; VI-NEXT: s_cmp_lg_u32 s4, 3 -; VI-NEXT: v_or_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v0, s5 -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_lshr_b32 s5, s8, 16 +; VI-NEXT: s_cselect_b32 s9, s9, 5 +; VI-NEXT: s_lshl_b32 s9, s9, 8 +; VI-NEXT: s_lshr_b32 s10, s8, 16 ; VI-NEXT: s_cmp_lg_u32 s4, 2 -; VI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc -; VI-NEXT: v_mov_b32_e32 v4, s5 -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_lshr_b32 s5, s8, 8 -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc +; VI-NEXT: s_cselect_b32 s10, s10, 5 +; VI-NEXT: s_and_b32 s10, s10, 0xff +; VI-NEXT: s_or_b32 s9, s10, s9 +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_lshr_b32 s10, s8, 8 ; VI-NEXT: s_cmp_lg_u32 s4, 1 -; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: 
v_mov_b32_e32 v4, s5 -; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cselect_b32 s10, s10, 5 +; VI-NEXT: s_lshl_b32 s10, s10, 8 ; VI-NEXT: s_cmp_lg_u32 s4, 0 -; VI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc -; VI-NEXT: v_mov_b32_e32 v5, s8 -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: v_lshlrev_b16_e32 v4, 8, v4 -; VI-NEXT: v_cndmask_b32_e32 v5, 5, v5, vcc -; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_cselect_b32 s4, s8, 5 +; VI-NEXT: s_and_b32 s4, s4, 0xff +; VI-NEXT: s_or_b32 s4, s4, s10 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s9 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <16 x i8> %a, i8 5, i32 %b diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll index 2e8049e9765e18..f86c8294ab3c00 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll @@ -30,13 +30,12 @@ define amdgpu_kernel void @sgpr_isnan_bf16(ptr addrspace(1) %out, bfloat %x) { ; GFX8CHECK: ; %bb.0: ; GFX8CHECK-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX8CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX8CHECK-NEXT: v_mov_b32_e32 v0, 0x7fff -; GFX8CHECK-NEXT: s_movk_i32 s2, 0x7f80 ; GFX8CHECK-NEXT: s_waitcnt lgkmcnt(0) -; GFX8CHECK-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX8CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s2, v0 +; GFX8CHECK-NEXT: s_and_b32 s2, s4, 0x7fff +; GFX8CHECK-NEXT: s_cmpk_gt_i32 s2, 0x7f80 +; GFX8CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX8CHECK-NEXT: v_mov_b32_e32 v0, s0 -; GFX8CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc +; GFX8CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[2:3] ; GFX8CHECK-NEXT: v_mov_b32_e32 v1, s1 ; GFX8CHECK-NEXT: flat_store_dword v[0:1], v2 ; GFX8CHECK-NEXT: s_endpgm @@ -45,13 +44,12 @@ define amdgpu_kernel void @sgpr_isnan_bf16(ptr addrspace(1) %out, bfloat %x) { ; GFX9CHECK: ; %bb.0: ; GFX9CHECK-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX9CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9CHECK-NEXT: v_mov_b32_e32 v1, 0x7fff -; GFX9CHECK-NEXT: s_movk_i32 s2, 0x7f80 ; GFX9CHECK-NEXT: v_mov_b32_e32 v0, 0 ; GFX9CHECK-NEXT: s_waitcnt lgkmcnt(0) -; GFX9CHECK-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX9CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s2, v1 -; GFX9CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; GFX9CHECK-NEXT: s_and_b32 s2, s4, 0x7fff +; GFX9CHECK-NEXT: s_cmpk_gt_i32 s2, 0x7f80 +; GFX9CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX9CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[2:3] ; GFX9CHECK-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9CHECK-NEXT: s_endpgm ; @@ -60,12 +58,13 @@ define amdgpu_kernel void @sgpr_isnan_bf16(ptr addrspace(1) %out, bfloat %x) { ; GFX10CHECK-NEXT: s_clause 0x1 ; GFX10CHECK-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX10CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX10CHECK-NEXT: v_mov_b32_e32 v1, 0 +; GFX10CHECK-NEXT: v_mov_b32_e32 v0, 0 ; GFX10CHECK-NEXT: s_waitcnt lgkmcnt(0) -; GFX10CHECK-NEXT: v_and_b32_e64 v0, 0x7fff, s4 -; GFX10CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0 -; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo -; GFX10CHECK-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10CHECK-NEXT: s_and_b32 s2, s4, 0x7fff +; GFX10CHECK-NEXT: s_cmpk_gt_i32 s2, 
0x7f80 +; GFX10CHECK-NEXT: s_cselect_b32 s2, -1, 0 +; GFX10CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2 +; GFX10CHECK-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10CHECK-NEXT: s_endpgm ; ; GFX11CHECK-LABEL: sgpr_isnan_bf16: @@ -73,12 +72,13 @@ define amdgpu_kernel void @sgpr_isnan_bf16(ptr addrspace(1) %out, bfloat %x) { ; GFX11CHECK-NEXT: s_clause 0x1 ; GFX11CHECK-NEXT: s_load_b32 s4, s[2:3], 0x2c ; GFX11CHECK-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11CHECK-NEXT: v_mov_b32_e32 v1, 0 +; GFX11CHECK-NEXT: v_mov_b32_e32 v0, 0 ; GFX11CHECK-NEXT: s_waitcnt lgkmcnt(0) -; GFX11CHECK-NEXT: v_and_b32_e64 v0, 0x7fff, s4 -; GFX11CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0 -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo -; GFX11CHECK-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11CHECK-NEXT: s_and_b32 s2, s4, 0x7fff +; GFX11CHECK-NEXT: s_cmpk_gt_i32 s2, 0x7f80 +; GFX11CHECK-NEXT: s_cselect_b32 s2, -1, 0 +; GFX11CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2 +; GFX11CHECK-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11CHECK-NEXT: s_nop 0 ; GFX11CHECK-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11CHECK-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll index 0221f9992ad43e..ec7c04a82a1eed 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll @@ -851,6 +851,7 @@ define amdgpu_kernel void @constant_zextload_v2i1_to_v2i32(ptr addrspace(1) %out ; GFX8-LABEL: constant_zextload_v2i1_to_v2i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: v_mov_b32_e32 v3, 1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -858,9 +859,8 @@ define amdgpu_kernel void @constant_zextload_v2i1_to_v2i32(ptr addrspace(1) %out ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v4, 1, v2 -; GFX8-NEXT: v_lshrrev_b16_e32 v3, 1, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v4 +; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_and_b32_e32 v2, 1, v2 ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm ; @@ -889,11 +889,10 @@ define amdgpu_kernel void @constant_zextload_v2i1_to_v2i32(ptr addrspace(1) %out ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v2, s[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b16 v1, 1, v0 +; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v0 ; GFX12-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_lshrrev_b32_e32 v1, 1, v1 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -934,9 +933,8 @@ define amdgpu_kernel void @constant_sextload_v2i1_to_v2i32(ptr addrspace(1) %out ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b16_e32 v3, 1, v2 +; GFX8-NEXT: v_bfe_i32 v3, v2, 1, 1 ; GFX8-NEXT: v_bfe_i32 v2, v2, 0, 1 -; GFX8-NEXT: v_bfe_i32 v3, v3, 0, 1 ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm ; @@ -962,14 +960,14 @@ define amdgpu_kernel void @constant_sextload_v2i1_to_v2i32(ptr addrspace(1) %out ; GFX12-LABEL: 
constant_sextload_v2i1_to_v2i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_u8 v0, v2, s[2:3] -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b16 v1, 1, v0 -; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 1 +; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_bfe_i32 s3, s2, 0x10000 +; GFX12-NEXT: s_bfe_i32 s2, s2, 0x10001 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-NEXT: v_mov_b32_e32 v0, s3 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1008,16 +1006,14 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i32(ptr addrspace(1) %out ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: flat_load_ubyte v0, v[0:1] +; GFX8-NEXT: flat_load_ubyte v1, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, 2 ; GFX8-NEXT: v_mov_b32_e32 v3, s0 ; GFX8-NEXT: v_mov_b32_e32 v4, s1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b16_e32 v1, 1, v0 -; GFX8-NEXT: v_and_b32_e32 v5, 1, v0 -; GFX8-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX8-NEXT: v_lshrrev_b16_e32 v2, 2, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v5 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v1 +; GFX8-NEXT: v_bfe_u32 v1, v1, 1, 1 ; GFX8-NEXT: flat_store_dwordx3 v[3:4], v[0:2] ; GFX8-NEXT: s_endpgm ; @@ -1049,17 +1045,14 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i32(ptr addrspace(1) %out ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v3, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_u8 v0, v3, s[2:3] +; GFX12-NEXT: global_load_u8 v1, v3, s[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b16 v1, 1, v0 -; GFX12-NEXT: v_lshrrev_b16 v2, 2, v0 -; GFX12-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_lshrrev_b32_e32 v2, 2, v0 +; GFX12-NEXT: v_and_b32_e32 v0, 1, v1 +; GFX12-NEXT: v_bfe_u32 v1, v1, 1, 1 ; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX12-NEXT: global_store_b96 v3, v[0:2], s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1102,11 +1095,9 @@ define amdgpu_kernel void @constant_sextload_v3i1_to_v3i32(ptr addrspace(1) %out ; GFX8-NEXT: v_mov_b32_e32 v3, s0 ; GFX8-NEXT: v_mov_b32_e32 v4, s1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b16_e32 v1, 1, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v2, 2, v0 +; GFX8-NEXT: v_bfe_i32 v2, v0, 2, 1 +; GFX8-NEXT: v_bfe_i32 v1, v0, 1, 1 ; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 1 -; GFX8-NEXT: v_bfe_i32 v2, v2, 0, 1 -; GFX8-NEXT: v_bfe_i32 v1, v1, 0, 1 ; GFX8-NEXT: flat_store_dwordx3 v[3:4], v[0:2] ; GFX8-NEXT: s_endpgm ; @@ -1138,16 +1129,14 @@ define amdgpu_kernel void 
@constant_sextload_v3i1_to_v3i32(ptr addrspace(1) %out ; GFX12-LABEL: constant_sextload_v3i1_to_v3i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_mov_b32_e32 v3, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_u8 v0, v3, s[2:3] -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b16 v1, 2, v0 -; GFX12-NEXT: v_lshrrev_b16 v4, 1, v0 -; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_i32 v2, v1, 0, 1 -; GFX12-NEXT: v_bfe_i32 v1, v4, 0, 1 +; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_bfe_i32 s3, s2, 0x10002 +; GFX12-NEXT: s_bfe_i32 s4, s2, 0x10000 +; GFX12-NEXT: s_bfe_i32 s2, s2, 0x10001 +; GFX12-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s4 +; GFX12-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s3 ; GFX12-NEXT: global_store_b96 v3, v[0:2], s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1186,19 +1175,15 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i32(ptr addrspace(1) %out ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: flat_load_ubyte v0, v[0:1] +; GFX8-NEXT: flat_load_ubyte v1, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, 3 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b16_e32 v1, 1, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v2, 2, v0 -; GFX8-NEXT: v_and_b32_e32 v6, 1, v0 -; GFX8-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX8-NEXT: v_lshrrev_b16_e32 v3, 3, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v6 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v1 +; GFX8-NEXT: v_bfe_u32 v2, v1, 2, 1 +; GFX8-NEXT: v_bfe_u32 v1, v1, 1, 1 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm ; @@ -1229,21 +1214,14 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i32(ptr addrspace(1) %out ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_u8 v0, v4, s[2:3] +; GFX12-NEXT: global_load_u8 v1, v4, s[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b16 v1, 2, v0 -; GFX12-NEXT: v_lshrrev_b16 v2, 1, v0 -; GFX12-NEXT: v_lshrrev_b16 v3, 3, v0 -; GFX12-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX12-NEXT: v_and_b32_e32 v5, 1, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v1 -; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v1 +; GFX12-NEXT: v_and_b32_e32 v0, 1, v1 +; GFX12-NEXT: v_bfe_u32 v2, v1, 2, 1 +; GFX12-NEXT: v_bfe_u32 v1, v1, 1, 1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX12-NEXT: v_lshrrev_b32_e32 v3, 3, v3 ; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1286,13 +1264,10 @@ define amdgpu_kernel void 
@constant_sextload_v4i1_to_v4i32(ptr addrspace(1) %out ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b16_e32 v1, 1, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v2, 2, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v3, 3, v0 +; GFX8-NEXT: v_bfe_i32 v3, v0, 3, 1 +; GFX8-NEXT: v_bfe_i32 v2, v0, 2, 1 +; GFX8-NEXT: v_bfe_i32 v1, v0, 1, 1 ; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 1 -; GFX8-NEXT: v_bfe_i32 v3, v3, 0, 1 -; GFX8-NEXT: v_bfe_i32 v2, v2, 0, 1 -; GFX8-NEXT: v_bfe_i32 v1, v1, 0, 1 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm ; @@ -1324,19 +1299,17 @@ define amdgpu_kernel void @constant_sextload_v4i1_to_v4i32(ptr addrspace(1) %out ; GFX12-LABEL: constant_sextload_v4i1_to_v4i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_u8 v0, v4, s[2:3] -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b16 v1, 3, v0 -; GFX12-NEXT: v_lshrrev_b16 v2, 2, v0 -; GFX12-NEXT: v_lshrrev_b16 v5, 1, v0 -; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_bfe_i32 v3, v1, 0, 1 -; GFX12-NEXT: v_bfe_i32 v2, v2, 0, 1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX12-NEXT: v_bfe_i32 v1, v5, 0, 1 +; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_bfe_i32 s3, s2, 0x10003 +; GFX12-NEXT: s_bfe_i32 s4, s2, 0x10002 +; GFX12-NEXT: s_bfe_i32 s5, s2, 0x10000 +; GFX12-NEXT: s_bfe_i32 s2, s2, 0x10001 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: v_mov_b32_e32 v2, s4 ; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1380,33 +1353,32 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i32(ptr addrspace(1) %out ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: flat_load_ubyte v1, v[0:1] -; GFX8-NEXT: s_add_u32 s2, s0, 16 -; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v11, s3 +; GFX8-NEXT: flat_load_ubyte v0, v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v9, s1 -; GFX8-NEXT: v_mov_b32_e32 v10, s2 ; GFX8-NEXT: v_mov_b32_e32 v8, s0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b16_e32 v2, 5, v1 -; GFX8-NEXT: v_lshrrev_b16_e32 v5, 1, v1 -; GFX8-NEXT: v_lshrrev_b16_e32 v12, 3, v1 -; GFX8-NEXT: v_lshrrev_b16_e32 v3, 4, v1 -; GFX8-NEXT: v_lshrrev_b16_e32 v4, 6, v1 -; GFX8-NEXT: v_lshrrev_b16_e32 v6, 2, v1 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v1 -; GFX8-NEXT: v_lshrrev_b16_e32 v7, 7, v1 -; GFX8-NEXT: v_and_b32_e32 v13, 1, v2 -; GFX8-NEXT: v_and_b32_e32 v1, 1, v5 -; GFX8-NEXT: v_and_b32_e32 v5, 1, v12 -; GFX8-NEXT: v_and_b32_e32 v2, 1, v6 -; GFX8-NEXT: v_and_b32_e32 v6, 1, v4 -; GFX8-NEXT: v_and_b32_e32 v4, 1, v3 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff, v5 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff, v13 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[4:7] -; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; GFX8-NEXT: v_readfirstlane_b32 s2, v0 +; GFX8-NEXT: s_bfe_u32 s3, s2, 0x10003 +; GFX8-NEXT: s_bfe_u32 s4, s2, 0x10001 +; GFX8-NEXT: s_bfe_u32 s5, s2, 0x10005 +; GFX8-NEXT: s_and_b32 s6, s2, 1 +; GFX8-NEXT: s_bfe_u32 s7, s2, 0x10002 +; GFX8-NEXT: s_bfe_u32 s2, s2, 0x10004 +; GFX8-NEXT: 
s_add_u32 s0, s0, 16 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX8-NEXT: v_mov_b32_e32 v11, s1 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 7, v0 +; GFX8-NEXT: v_bfe_u32 v2, v0, 6, 1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v10, s0 +; GFX8-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: v_mov_b32_e32 v6, s7 +; GFX8-NEXT: v_mov_b32_e32 v7, s3 +; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[0:3] +; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[4:7] ; GFX8-NEXT: s_endpgm ; ; EG-LABEL: constant_zextload_v8i1_to_v8i32: @@ -1448,27 +1420,24 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i32(ptr addrspace(1) %out ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v8, s[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b16 v2, 5, v0 -; GFX12-NEXT: v_lshrrev_b16 v5, 1, v0 -; GFX12-NEXT: v_lshrrev_b16 v6, 3, v0 -; GFX12-NEXT: v_lshrrev_b16 v1, 4, v0 -; GFX12-NEXT: v_lshrrev_b16 v3, 6, v0 -; GFX12-NEXT: v_and_b32_e32 v9, 1, v2 -; GFX12-NEXT: v_lshrrev_b16 v4, 7, v0 -; GFX12-NEXT: v_lshrrev_b16 v7, 2, v0 -; GFX12-NEXT: v_and_b32_e32 v10, 1, v5 -; GFX12-NEXT: v_and_b32_e32 v5, 1, v6 -; GFX12-NEXT: v_and_b32_e32 v6, 1, v3 -; GFX12-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX12-NEXT: v_and_b32_e32 v2, 1, v7 -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 1, v1 -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v5 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v9 -; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; GFX12-NEXT: v_readfirstlane_b32 s2, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10001 +; GFX12-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_and_b32 v0, 0xffff, v0 +; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10003 +; GFX12-NEXT: s_bfe_u32 s5, s2, 0x10005 +; GFX12-NEXT: s_and_b32 s6, s2, 1 +; GFX12-NEXT: s_bfe_u32 s7, s2, 0x10002 +; GFX12-NEXT: s_bfe_u32 s2, s2, 0x10004 +; GFX12-NEXT: v_lshrrev_b32_e32 v3, 7, v0 +; GFX12-NEXT: v_bfe_u32 v2, v0, 6, 1 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v7, s3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v4, s6 +; GFX12-NEXT: v_mov_b32_e32 v6, s7 ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16 -; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1] +; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16 +; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -1511,7 +1480,7 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i32(ptr addrspace(1) %out ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: flat_load_ubyte v0, v[0:1] +; GFX8-NEXT: flat_load_ubyte v4, v[0:1] ; GFX8-NEXT: s_add_u32 s2, s0, 16 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v11, s3 @@ -1519,21 +1488,14 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i32(ptr addrspace(1) %out ; GFX8-NEXT: v_mov_b32_e32 v10, s2 ; GFX8-NEXT: v_mov_b32_e32 v8, s0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b16_e32 v4, 4, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v5, 5, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v6, 6, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v7, 7, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v1, 1, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v2, 2, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v3, 3, v0 -; GFX8-NEXT: v_bfe_i32 v7, v7, 0, 
1 -; GFX8-NEXT: v_bfe_i32 v6, v6, 0, 1 -; GFX8-NEXT: v_bfe_i32 v5, v5, 0, 1 -; GFX8-NEXT: v_bfe_i32 v4, v4, 0, 1 -; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 1 -; GFX8-NEXT: v_bfe_i32 v3, v3, 0, 1 -; GFX8-NEXT: v_bfe_i32 v2, v2, 0, 1 -; GFX8-NEXT: v_bfe_i32 v1, v1, 0, 1 +; GFX8-NEXT: v_bfe_i32 v3, v4, 3, 1 +; GFX8-NEXT: v_bfe_i32 v2, v4, 2, 1 +; GFX8-NEXT: v_bfe_i32 v1, v4, 1, 1 +; GFX8-NEXT: v_bfe_i32 v0, v4, 0, 1 +; GFX8-NEXT: v_bfe_i32 v7, v4, 7, 1 +; GFX8-NEXT: v_bfe_i32 v6, v4, 6, 1 +; GFX8-NEXT: v_bfe_i32 v5, v4, 5, 1 +; GFX8-NEXT: v_bfe_i32 v4, v4, 4, 1 ; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[4:7] ; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GFX8-NEXT: s_endpgm @@ -1579,28 +1541,26 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i32(ptr addrspace(1) %out ; GFX12-LABEL: constant_sextload_v8i1_to_v8i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_mov_b32_e32 v8, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_u8 v0, v8, s[2:3] -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b16 v1, 4, v0 -; GFX12-NEXT: v_lshrrev_b16 v4, 5, v0 -; GFX12-NEXT: v_lshrrev_b16 v5, 6, v0 -; GFX12-NEXT: v_lshrrev_b16 v2, 3, v0 -; GFX12-NEXT: v_lshrrev_b16 v6, 2, v0 -; GFX12-NEXT: v_lshrrev_b16 v7, 7, v0 -; GFX12-NEXT: v_lshrrev_b16 v9, 1, v0 -; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 1 -; GFX12-NEXT: v_bfe_i32 v3, v2, 0, 1 -; GFX12-NEXT: v_bfe_i32 v2, v6, 0, 1 -; GFX12-NEXT: v_bfe_i32 v7, v7, 0, 1 -; GFX12-NEXT: v_bfe_i32 v6, v5, 0, 1 -; GFX12-NEXT: v_bfe_i32 v5, v4, 0, 1 -; GFX12-NEXT: v_bfe_i32 v4, v1, 0, 1 -; GFX12-NEXT: v_bfe_i32 v1, v9, 0, 1 +; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_bfe_i32 s3, s2, 0x10003 +; GFX12-NEXT: s_bfe_i32 s4, s2, 0x10002 +; GFX12-NEXT: s_bfe_i32 s5, s2, 0x10001 +; GFX12-NEXT: s_bfe_i32 s6, s2, 0x10000 +; GFX12-NEXT: s_bfe_i32 s7, s2, 0x10007 +; GFX12-NEXT: s_bfe_i32 s8, s2, 0x10006 +; GFX12-NEXT: s_bfe_i32 s9, s2, 0x10004 +; GFX12-NEXT: s_bfe_i32 s2, s2, 0x10005 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-NEXT: v_dual_mov_b32 v0, s9 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: v_dual_mov_b32 v2, s8 :: v_dual_mov_b32 v5, s5 +; GFX12-NEXT: v_dual_mov_b32 v4, s6 :: v_dual_mov_b32 v7, s3 +; GFX12-NEXT: v_mov_b32_e32 v6, s4 ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16 -; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1] +; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16 +; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -1653,62 +1613,59 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i32(ptr addrspace(1) %o ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: flat_load_ushort v1, v[0:1] +; GFX8-NEXT: flat_load_ushort v0, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v17, s1 +; GFX8-NEXT: v_mov_b32_e32 v16, s0 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_readfirstlane_b32 s2, v0 +; GFX8-NEXT: s_and_b32 s6, 0xffff, s2 +; GFX8-NEXT: s_bfe_u32 s3, s2, 0x10003 +; GFX8-NEXT: s_bfe_u32 s4, s2, 0x10001 +; GFX8-NEXT: s_bfe_u32 s5, s2, 0x10007 +; GFX8-NEXT: s_bfe_u32 s7, s2, 0x10009 +; GFX8-NEXT: s_bfe_u32 s8, s2, 0x1000d +; GFX8-NEXT: s_and_b32 s9, s2, 1 +; GFX8-NEXT: s_bfe_u32 s10, s2, 0x1000a +; GFX8-NEXT: s_bfe_u32 s2, s2, 0x1000c +; GFX8-NEXT: s_bfe_u32 s11, s6, 0x10005 +; GFX8-NEXT: s_bfe_u32 
s12, s6, 0x1000b +; GFX8-NEXT: s_lshr_b32 s13, s6, 15 +; GFX8-NEXT: s_bfe_u32 s14, s6, 0x10002 +; GFX8-NEXT: s_bfe_u32 s15, s6, 0x10006 +; GFX8-NEXT: s_bfe_u32 s16, s6, 0x10004 +; GFX8-NEXT: s_bfe_u32 s17, s6, 0x10008 +; GFX8-NEXT: s_bfe_u32 s6, s6, 0x1000e +; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 48 +; GFX8-NEXT: v_mov_b32_e32 v15, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v19, s3 +; GFX8-NEXT: v_mov_b32_e32 v1, s8 +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s13 ; GFX8-NEXT: v_mov_b32_e32 v18, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 32 -; GFX8-NEXT: v_mov_b32_e32 v0, 1 -; GFX8-NEXT: v_mov_b32_e32 v17, s1 +; GFX8-NEXT: flat_store_dwordx4 v[18:19], v[0:3] ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v16, s0 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NEXT: v_mov_b32_e32 v6, s10 +; GFX8-NEXT: v_mov_b32_e32 v4, s17 +; GFX8-NEXT: v_mov_b32_e32 v7, s12 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: s_add_u32 s0, s0, 16 +; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v21, s3 -; GFX8-NEXT: v_mov_b32_e32 v23, s1 -; GFX8-NEXT: v_mov_b32_e32 v20, s2 -; GFX8-NEXT: v_mov_b32_e32 v22, s0 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b16_e32 v3, 12, v1 -; GFX8-NEXT: v_lshrrev_b16_e32 v8, 3, v1 -; GFX8-NEXT: v_and_b32_e32 v12, 1, v8 -; GFX8-NEXT: v_and_b32_e32 v8, 1, v3 -; GFX8-NEXT: v_lshrrev_b16_e32 v3, 1, v1 -; GFX8-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX8-NEXT: v_and_b32_e32 v13, 0xffff, v3 -; GFX8-NEXT: v_lshrrev_b16_e32 v3, 7, v1 -; GFX8-NEXT: v_lshrrev_b16_e32 v7, 14, v1 -; GFX8-NEXT: v_lshrrev_b16_e32 v11, 2, v1 -; GFX8-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX8-NEXT: v_lshrrev_b16_e32 v5, 13, v1 -; GFX8-NEXT: v_lshrrev_b16_e32 v9, 9, v1 -; GFX8-NEXT: v_lshrrev_b16_e32 v2, 10, v1 -; GFX8-NEXT: v_lshrrev_b16_e32 v4, 4, v1 -; GFX8-NEXT: v_lshrrev_b16_e32 v6, 6, v1 -; GFX8-NEXT: v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_e32 v10, 1, v7 -; GFX8-NEXT: v_and_b32_e32 v14, 1, v11 -; GFX8-NEXT: v_and_b32_e32 v15, 0xffff, v12 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff, v3 -; GFX8-NEXT: v_lshrrev_b16_e32 v3, 11, v1 -; GFX8-NEXT: v_and_b32_e32 v12, 1, v1 -; GFX8-NEXT: v_lshrrev_b16_e32 v11, 15, v1 -; GFX8-NEXT: v_lshrrev_b16_e32 v1, 5, v1 -; GFX8-NEXT: v_and_b32_e32 v24, 1, v5 -; GFX8-NEXT: v_and_b32_e32 v9, 1, v9 -; GFX8-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX8-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff, v1 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v9 -; GFX8-NEXT: v_and_b32_e32 v9, 0xffff, v24 -; GFX8-NEXT: v_and_b32_e32 v6, 1, v6 -; GFX8-NEXT: v_and_b32_e32 v4, 1, v4 -; GFX8-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX8-NEXT: flat_store_dwordx4 v[18:19], v[8:11] -; GFX8-NEXT: flat_store_dwordx4 v[20:21], v[0:3] -; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[4:7] +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v11, s5 +; GFX8-NEXT: v_mov_b32_e32 v8, s16 +; GFX8-NEXT: v_mov_b32_e32 v9, s11 +; GFX8-NEXT: v_mov_b32_e32 v10, s15 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v12, s9 +; GFX8-NEXT: v_mov_b32_e32 v13, s4 +; GFX8-NEXT: v_mov_b32_e32 v14, s14 +; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; GFX8-NEXT: flat_store_dwordx4 v[16:17], v[12:15] ; GFX8-NEXT: s_endpgm ; @@ -1772,49 +1729,39 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i32(ptr addrspace(1) %o ; 
GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v0, v16, s[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b16 v2, 13, v0 -; GFX12-NEXT: v_lshrrev_b16 v13, 1, v0 -; GFX12-NEXT: v_lshrrev_b16 v15, 3, v0 -; GFX12-NEXT: v_lshrrev_b16 v4, 9, v0 -; GFX12-NEXT: v_lshrrev_b16 v6, 11, v0 -; GFX12-NEXT: v_and_b32_e32 v17, 1, v2 -; GFX12-NEXT: v_lshrrev_b16 v10, 5, v0 -; GFX12-NEXT: v_lshrrev_b16 v12, 7, v0 -; GFX12-NEXT: v_lshrrev_b16 v1, 12, v0 -; GFX12-NEXT: v_lshrrev_b16 v3, 14, v0 -; GFX12-NEXT: v_lshrrev_b16 v5, 15, v0 -; GFX12-NEXT: v_lshrrev_b16 v14, 2, v0 -; GFX12-NEXT: v_and_b32_e32 v22, 1, v13 -; GFX12-NEXT: v_and_b32_e32 v13, 1, v15 -; GFX12-NEXT: v_lshrrev_b16 v7, 8, v0 -; GFX12-NEXT: v_lshrrev_b16 v8, 10, v0 -; GFX12-NEXT: v_lshrrev_b16 v9, 4, v0 -; GFX12-NEXT: v_lshrrev_b16 v11, 6, v0 -; GFX12-NEXT: v_and_b32_e32 v18, 1, v4 -; GFX12-NEXT: v_and_b32_e32 v19, 1, v6 -; GFX12-NEXT: v_and_b32_e32 v20, 1, v10 -; GFX12-NEXT: v_and_b32_e32 v21, 1, v12 -; GFX12-NEXT: v_and_b32_e32 v2, 1, v14 -; GFX12-NEXT: v_and_b32_e32 v15, 0xffff, v5 -; GFX12-NEXT: v_and_b32_e32 v14, 1, v3 -; GFX12-NEXT: v_and_b32_e32 v12, 1, v1 -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v13 -; GFX12-NEXT: v_and_b32_e32 v13, 0xffff, v17 -; GFX12-NEXT: v_and_b32_e32 v6, 1, v11 -; GFX12-NEXT: v_and_b32_e32 v4, 1, v9 -; GFX12-NEXT: v_and_b32_e32 v10, 1, v8 -; GFX12-NEXT: v_and_b32_e32 v8, 1, v7 -; GFX12-NEXT: v_and_b32_e32 v11, 0xffff, v19 -; GFX12-NEXT: v_and_b32_e32 v9, 0xffff, v18 -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff, v21 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v20 -; GFX12-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; GFX12-NEXT: v_readfirstlane_b32 s2, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_and_b32 s6, 0xffff, s2 +; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10003 +; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10001 +; GFX12-NEXT: s_bfe_u32 s5, s2, 0x10007 +; GFX12-NEXT: s_bfe_u32 s7, s2, 0x10009 +; GFX12-NEXT: s_bfe_u32 s8, s2, 0x1000d +; GFX12-NEXT: s_and_b32 s9, s2, 1 +; GFX12-NEXT: v_mov_b32_e32 v1, s8 +; GFX12-NEXT: s_bfe_u32 s10, s2, 0x1000a +; GFX12-NEXT: s_bfe_u32 s2, s2, 0x1000c +; GFX12-NEXT: s_bfe_u32 s11, s6, 0x10005 +; GFX12-NEXT: s_bfe_u32 s12, s6, 0x1000b +; GFX12-NEXT: s_lshr_b32 s13, s6, 15 +; GFX12-NEXT: s_bfe_u32 s14, s6, 0x10002 +; GFX12-NEXT: s_bfe_u32 s15, s6, 0x10006 +; GFX12-NEXT: s_bfe_u32 s16, s6, 0x10004 +; GFX12-NEXT: s_bfe_u32 s17, s6, 0x10008 +; GFX12-NEXT: s_bfe_u32 s6, s6, 0x1000e +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v5, s7 +; GFX12-NEXT: v_dual_mov_b32 v15, s3 :: v_dual_mov_b32 v2, s6 +; GFX12-NEXT: v_dual_mov_b32 v3, s13 :: v_dual_mov_b32 v4, s17 +; GFX12-NEXT: v_dual_mov_b32 v6, s10 :: v_dual_mov_b32 v11, s5 +; GFX12-NEXT: v_dual_mov_b32 v7, s12 :: v_dual_mov_b32 v8, s16 +; GFX12-NEXT: v_dual_mov_b32 v9, s11 :: v_dual_mov_b32 v10, s15 +; GFX12-NEXT: v_dual_mov_b32 v12, s9 :: v_dual_mov_b32 v13, s4 +; GFX12-NEXT: v_mov_b32_e32 v14, s14 ; GFX12-NEXT: s_clause 0x3 -; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1] offset:48 -; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:32 -; GFX12-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:16 -; GFX12-NEXT: global_store_b128 v16, v[0:3], s[0:1] +; GFX12-NEXT: global_store_b128 v16, v[0:3], s[0:1] offset:48 +; GFX12-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:32 +; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:16 +; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -1867,7 +1814,7 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i32(ptr addrspace(1) %o ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: flat_load_ushort v0, v[0:1] +; GFX8-NEXT: flat_load_ushort v12, v[0:1] ; GFX8-NEXT: s_add_u32 s2, s0, 48 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v19, s3 @@ -1883,37 +1830,22 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i32(ptr addrspace(1) %o ; GFX8-NEXT: v_mov_b32_e32 v20, s2 ; GFX8-NEXT: v_mov_b32_e32 v22, s0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b16_e32 v12, 12, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v13, 13, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v14, 14, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v15, 15, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v8, 8, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v9, 9, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v10, 10, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v11, 11, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v4, 4, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v5, 5, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v6, 6, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v7, 7, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v1, 1, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v2, 2, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v3, 3, v0 -; GFX8-NEXT: v_bfe_i32 v15, v15, 0, 1 -; GFX8-NEXT: v_bfe_i32 v14, v14, 0, 1 -; GFX8-NEXT: v_bfe_i32 v13, v13, 0, 1 -; GFX8-NEXT: v_bfe_i32 v12, v12, 0, 1 -; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 1 -; GFX8-NEXT: v_bfe_i32 v3, v3, 0, 1 -; GFX8-NEXT: v_bfe_i32 v2, v2, 0, 1 -; GFX8-NEXT: v_bfe_i32 v1, v1, 0, 1 -; GFX8-NEXT: v_bfe_i32 v7, v7, 0, 1 -; GFX8-NEXT: v_bfe_i32 v6, v6, 0, 1 -; GFX8-NEXT: v_bfe_i32 v5, v5, 0, 1 -; GFX8-NEXT: v_bfe_i32 v4, v4, 0, 1 -; GFX8-NEXT: v_bfe_i32 v11, v11, 0, 1 -; GFX8-NEXT: v_bfe_i32 v10, v10, 0, 1 -; GFX8-NEXT: v_bfe_i32 v9, v9, 0, 1 -; GFX8-NEXT: v_bfe_i32 v8, v8, 0, 1 +; GFX8-NEXT: v_bfe_i32 v3, v12, 3, 1 +; GFX8-NEXT: v_bfe_i32 v2, v12, 2, 1 +; GFX8-NEXT: v_bfe_i32 v1, v12, 1, 1 +; GFX8-NEXT: v_bfe_i32 v0, v12, 0, 1 +; GFX8-NEXT: v_bfe_i32 v7, v12, 7, 1 +; GFX8-NEXT: v_bfe_i32 v6, v12, 6, 1 +; GFX8-NEXT: v_bfe_i32 v5, v12, 5, 1 +; GFX8-NEXT: v_bfe_i32 v4, v12, 4, 1 +; GFX8-NEXT: v_bfe_i32 v11, v12, 11, 1 +; GFX8-NEXT: v_bfe_i32 v10, v12, 10, 1 +; GFX8-NEXT: v_bfe_i32 v9, v12, 9, 1 +; GFX8-NEXT: v_bfe_i32 v8, v12, 8, 1 +; GFX8-NEXT: v_bfe_i32 v15, v12, 15, 1 +; GFX8-NEXT: v_bfe_i32 v14, v12, 14, 1 +; GFX8-NEXT: v_bfe_i32 v13, v12, 13, 1 +; GFX8-NEXT: v_bfe_i32 v12, v12, 12, 1 ; GFX8-NEXT: flat_store_dwordx4 v[18:19], v[12:15] ; GFX8-NEXT: flat_store_dwordx4 v[20:21], v[8:11] ; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[4:7] @@ -1991,46 +1923,40 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i32(ptr addrspace(1) %o ; GFX12-LABEL: constant_sextload_v16i1_to_v16i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_mov_b32_e32 v16, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_u16 v0, v16, s[2:3] -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b16 v1, 12, v0 -; GFX12-NEXT: v_lshrrev_b16 v4, 13, v0 -; GFX12-NEXT: v_lshrrev_b16 v8, 14, v0 -; GFX12-NEXT: v_lshrrev_b16 v12, 15, v0 -; GFX12-NEXT: v_lshrrev_b16 v2, 3, v0 -; GFX12-NEXT: v_lshrrev_b16 v7, 2, v0 -; GFX12-NEXT: v_lshrrev_b16 v13, 7, v0 -; GFX12-NEXT: v_lshrrev_b16 v17, 8, v0 -; GFX12-NEXT: v_lshrrev_b16 v9, 9, v0 -; GFX12-NEXT: v_lshrrev_b16 v10, 10, v0 -; GFX12-NEXT: v_lshrrev_b16 v11, 11, v0 -; GFX12-NEXT: v_lshrrev_b16 v18, 4, v0 -; GFX12-NEXT: v_lshrrev_b16 v5, 5, v0 
-; GFX12-NEXT: v_lshrrev_b16 v6, 6, v0 -; GFX12-NEXT: v_lshrrev_b16 v19, 1, v0 -; GFX12-NEXT: v_bfe_i32 v3, v2, 0, 1 -; GFX12-NEXT: v_bfe_i32 v2, v7, 0, 1 -; GFX12-NEXT: v_bfe_i32 v7, v13, 0, 1 -; GFX12-NEXT: v_bfe_i32 v15, v12, 0, 1 -; GFX12-NEXT: v_bfe_i32 v14, v8, 0, 1 -; GFX12-NEXT: v_bfe_i32 v13, v4, 0, 1 -; GFX12-NEXT: v_bfe_i32 v12, v1, 0, 1 -; GFX12-NEXT: v_bfe_i32 v11, v11, 0, 1 -; GFX12-NEXT: v_bfe_i32 v10, v10, 0, 1 -; GFX12-NEXT: v_bfe_i32 v9, v9, 0, 1 -; GFX12-NEXT: v_bfe_i32 v8, v17, 0, 1 -; GFX12-NEXT: v_bfe_i32 v6, v6, 0, 1 -; GFX12-NEXT: v_bfe_i32 v5, v5, 0, 1 -; GFX12-NEXT: v_bfe_i32 v4, v18, 0, 1 -; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 1 -; GFX12-NEXT: v_bfe_i32 v1, v19, 0, 1 +; GFX12-NEXT: s_load_u16 s2, s[2:3], 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_bfe_i32 s3, s2, 0x10003 +; GFX12-NEXT: s_bfe_i32 s4, s2, 0x10002 +; GFX12-NEXT: s_bfe_i32 s5, s2, 0x10001 +; GFX12-NEXT: s_bfe_i32 s6, s2, 0x10000 +; GFX12-NEXT: s_bfe_i32 s7, s2, 0x10007 +; GFX12-NEXT: s_bfe_i32 s8, s2, 0x10006 +; GFX12-NEXT: s_bfe_i32 s9, s2, 0x10005 +; GFX12-NEXT: s_bfe_i32 s10, s2, 0x10004 +; GFX12-NEXT: s_bfe_i32 s11, s2, 0x1000b +; GFX12-NEXT: s_bfe_i32 s12, s2, 0x1000a +; GFX12-NEXT: s_bfe_i32 s13, s2, 0x10009 +; GFX12-NEXT: s_bfe_i32 s14, s2, 0x10008 +; GFX12-NEXT: s_bfe_i32 s15, s2, 0x1000f +; GFX12-NEXT: s_bfe_i32 s16, s2, 0x1000e +; GFX12-NEXT: s_bfe_i32 s17, s2, 0x1000c +; GFX12-NEXT: s_bfe_i32 s2, s2, 0x1000d +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-NEXT: v_dual_mov_b32 v0, s17 :: v_dual_mov_b32 v3, s15 +; GFX12-NEXT: v_dual_mov_b32 v2, s16 :: v_dual_mov_b32 v5, s13 +; GFX12-NEXT: v_dual_mov_b32 v4, s14 :: v_dual_mov_b32 v7, s11 +; GFX12-NEXT: v_dual_mov_b32 v6, s12 :: v_dual_mov_b32 v9, s9 +; GFX12-NEXT: v_dual_mov_b32 v8, s10 :: v_dual_mov_b32 v11, s7 +; GFX12-NEXT: v_dual_mov_b32 v10, s8 :: v_dual_mov_b32 v13, s5 +; GFX12-NEXT: v_dual_mov_b32 v12, s6 :: v_dual_mov_b32 v15, s3 +; GFX12-NEXT: v_mov_b32_e32 v14, s4 ; GFX12-NEXT: s_clause 0x3 -; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1] offset:48 -; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:32 -; GFX12-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:16 -; GFX12-NEXT: global_store_b128 v16, v[0:3], s[0:1] +; GFX12-NEXT: global_store_b128 v16, v[0:3], s[0:1] offset:48 +; GFX12-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:32 +; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:16 +; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -2134,118 +2060,108 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i32(ptr addrspace(1) %o ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s4, s[2:3], 0x0 +; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b16_e64 v1, 13, s4 -; GFX8-NEXT: v_and_b32_e32 v24, 1, v1 -; GFX8-NEXT: v_lshrrev_b16_e64 v1, 5, s4 -; GFX8-NEXT: v_and_b32_e32 v22, 1, v1 -; GFX8-NEXT: v_lshrrev_b16_e64 v1, 7, s4 -; GFX8-NEXT: v_and_b32_e32 v15, 1, v1 -; GFX8-NEXT: v_lshrrev_b16_e64 v1, 1, s4 -; GFX8-NEXT: v_and_b32_e32 v23, 1, v1 -; GFX8-NEXT: v_lshrrev_b16_e64 v1, 3, s4 -; GFX8-NEXT: s_lshr_b32 s2, s4, 24 -; GFX8-NEXT: v_lshrrev_b16_e64 v2, 9, s4 -; GFX8-NEXT: v_lshrrev_b16_e64 v3, 11, s4 -; GFX8-NEXT: v_and_b32_e32 v26, 1, v1 -; GFX8-NEXT: v_lshrrev_b16_e64 v1, 5, s2 -; 
GFX8-NEXT: v_and_b32_e32 v17, 1, v2 -; GFX8-NEXT: v_and_b32_e32 v18, 1, v3 -; GFX8-NEXT: v_lshrrev_b16_e64 v4, 4, s2 -; GFX8-NEXT: v_and_b32_e32 v5, 1, v1 -; GFX8-NEXT: v_lshrrev_b16_e64 v6, 6, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v1, 1, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v2, 2, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v3, 3, s2 -; GFX8-NEXT: s_bfe_u32 s5, s4, 0x10018 -; GFX8-NEXT: v_lshrrev_b16_e64 v7, 7, s2 -; GFX8-NEXT: s_and_b32 s6, s4, 1 -; GFX8-NEXT: s_bfe_u32 s7, s4, 0x10013 -; GFX8-NEXT: s_bfe_u32 s8, s4, 0x10012 -; GFX8-NEXT: s_bfe_u32 s9, s4, 0x10011 -; GFX8-NEXT: s_bfe_u32 s10, s4, 0x10010 -; GFX8-NEXT: s_bfe_u32 s2, s4, 0x10017 -; GFX8-NEXT: s_bfe_u32 s3, s4, 0x10016 -; GFX8-NEXT: s_bfe_u32 s11, s4, 0x10015 -; GFX8-NEXT: s_bfe_u32 s12, s4, 0x10014 -; GFX8-NEXT: v_mov_b32_e32 v11, s2 +; GFX8-NEXT: s_bfe_u32 s4, s2, 0x10003 +; GFX8-NEXT: s_bfe_u32 s5, s2, 0x10001 +; GFX8-NEXT: s_bfe_u32 s6, s2, 0x10007 +; GFX8-NEXT: s_bfe_u32 s7, s2, 0x10005 +; GFX8-NEXT: s_bfe_u32 s8, s2, 0x1000b +; GFX8-NEXT: s_bfe_u32 s9, s2, 0x10009 +; GFX8-NEXT: s_bfe_u32 s10, s2, 0x1000f +; GFX8-NEXT: s_bfe_u32 s11, s2, 0x1000d +; GFX8-NEXT: s_bfe_u32 s12, s2, 0x10013 +; GFX8-NEXT: s_bfe_u32 s13, s2, 0x10011 +; GFX8-NEXT: s_bfe_u32 s14, s2, 0x10017 +; GFX8-NEXT: s_bfe_u32 s15, s2, 0x1001b +; GFX8-NEXT: s_bfe_u32 s16, s2, 0x10019 +; GFX8-NEXT: s_lshr_b32 s3, s2, 31 +; GFX8-NEXT: s_bfe_u32 s17, s2, 0x1001d +; GFX8-NEXT: s_and_b32 s18, s2, 1 +; GFX8-NEXT: s_bfe_u32 s19, s2, 0x10002 +; GFX8-NEXT: s_bfe_u32 s20, s2, 0x10006 +; GFX8-NEXT: s_bfe_u32 s21, s2, 0x10004 +; GFX8-NEXT: s_bfe_u32 s22, s2, 0x1000a +; GFX8-NEXT: s_bfe_u32 s23, s2, 0x10008 +; GFX8-NEXT: s_bfe_u32 s24, s2, 0x1000e +; GFX8-NEXT: s_bfe_u32 s25, s2, 0x1000c +; GFX8-NEXT: s_bfe_u32 s26, s2, 0x10012 +; GFX8-NEXT: s_bfe_u32 s27, s2, 0x10010 +; GFX8-NEXT: s_bfe_u32 s28, s2, 0x10016 +; GFX8-NEXT: s_bfe_u32 s29, s2, 0x10015 +; GFX8-NEXT: s_bfe_u32 s30, s2, 0x10014 +; GFX8-NEXT: s_bfe_u32 s31, s2, 0x1001a +; GFX8-NEXT: s_bfe_u32 s33, s2, 0x10018 +; GFX8-NEXT: s_bfe_u32 s34, s2, 0x1001e +; GFX8-NEXT: s_bfe_u32 s2, s2, 0x1001c +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: s_add_u32 s2, s0, 0x70 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: s_add_u32 s2, s0, 0x60 +; GFX8-NEXT: v_mov_b32_e32 v1, s17 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x50 -; GFX8-NEXT: v_mov_b32_e32 v10, s3 +; GFX8-NEXT: v_mov_b32_e32 v0, s33 +; GFX8-NEXT: v_mov_b32_e32 v1, s16 +; GFX8-NEXT: v_mov_b32_e32 v2, s31 +; GFX8-NEXT: v_mov_b32_e32 v3, s15 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v13, s3 -; GFX8-NEXT: v_mov_b32_e32 v12, s2 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 64 -; GFX8-NEXT: v_mov_b32_e32 v8, s12 -; GFX8-NEXT: v_mov_b32_e32 v9, s11 +; GFX8-NEXT: v_mov_b32_e32 v0, s30 +; GFX8-NEXT: v_mov_b32_e32 v1, s29 +; GFX8-NEXT: v_mov_b32_e32 v2, s28 +; GFX8-NEXT: v_mov_b32_e32 v3, s14 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[8:11] -; GFX8-NEXT: v_mov_b32_e32 v13, s3 -; GFX8-NEXT: v_mov_b32_e32 v12, s2 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_add_u32 
s2, s0, 48 -; GFX8-NEXT: v_lshrrev_b16_e64 v21, 14, s4 -; GFX8-NEXT: v_lshrrev_b16_e64 v25, 2, s4 -; GFX8-NEXT: v_mov_b32_e32 v8, s10 -; GFX8-NEXT: v_mov_b32_e32 v9, s9 -; GFX8-NEXT: v_mov_b32_e32 v10, s8 -; GFX8-NEXT: v_mov_b32_e32 v11, s7 +; GFX8-NEXT: v_mov_b32_e32 v0, s27 +; GFX8-NEXT: v_mov_b32_e32 v1, s13 +; GFX8-NEXT: v_mov_b32_e32 v2, s26 +; GFX8-NEXT: v_mov_b32_e32 v3, s12 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[8:11] -; GFX8-NEXT: v_and_b32_e32 v13, 0xffff, v22 -; GFX8-NEXT: v_and_b32_e32 v10, 1, v25 -; GFX8-NEXT: v_and_b32_e32 v22, 1, v21 -; GFX8-NEXT: v_and_b32_e32 v21, 0xffff, v24 -; GFX8-NEXT: v_mov_b32_e32 v25, s3 -; GFX8-NEXT: v_lshrrev_b16_e64 v20, 12, s4 -; GFX8-NEXT: v_mov_b32_e32 v24, s2 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 32 -; GFX8-NEXT: v_and_b32_e32 v9, 0xffff, v23 -; GFX8-NEXT: v_lshrrev_b16_e64 v23, 15, s4 -; GFX8-NEXT: v_and_b32_e32 v20, 1, v20 +; GFX8-NEXT: v_mov_b32_e32 v0, s25 +; GFX8-NEXT: v_mov_b32_e32 v1, s11 +; GFX8-NEXT: v_mov_b32_e32 v2, s24 +; GFX8-NEXT: v_mov_b32_e32 v3, s10 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[24:25], v[20:23] -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v21, s3 -; GFX8-NEXT: v_lshrrev_b16_e64 v16, 10, s4 -; GFX8-NEXT: v_lshrrev_b16_e64 v19, 4, s4 -; GFX8-NEXT: v_mov_b32_e32 v8, 1 -; GFX8-NEXT: v_mov_b32_e32 v20, s2 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 16 -; GFX8-NEXT: v_and_b32_e32 v12, 1, v19 -; GFX8-NEXT: v_and_b32_e32 v19, 0xffff, v18 -; GFX8-NEXT: v_and_b32_e32 v18, 1, v16 -; GFX8-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GFX8-NEXT: v_and_b32_sdwa v16, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_lshrrev_b16_e64 v14, 6, s4 -; GFX8-NEXT: flat_store_dwordx4 v[20:21], v[16:19] -; GFX8-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GFX8-NEXT: v_mov_b32_e32 v17, s3 -; GFX8-NEXT: v_and_b32_e32 v14, 1, v14 -; GFX8-NEXT: v_mov_b32_e32 v16, s2 -; GFX8-NEXT: flat_store_dwordx4 v[16:17], v[12:15] -; GFX8-NEXT: s_add_u32 s2, s0, 0x70 -; GFX8-NEXT: v_mov_b32_e32 v13, s1 -; GFX8-NEXT: v_and_b32_e32 v11, 0xffff, v26 -; GFX8-NEXT: v_mov_b32_e32 v8, s6 -; GFX8-NEXT: v_mov_b32_e32 v12, s0 +; GFX8-NEXT: v_mov_b32_e32 v0, s23 +; GFX8-NEXT: v_mov_b32_e32 v1, s9 +; GFX8-NEXT: v_mov_b32_e32 v2, s22 +; GFX8-NEXT: v_mov_b32_e32 v3, s8 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[8:11] -; GFX8-NEXT: s_add_u32 s0, s0, 0x60 -; GFX8-NEXT: v_mov_b32_e32 v9, s3 -; GFX8-NEXT: v_and_b32_e32 v6, 1, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX8-NEXT: v_and_b32_e32 v4, 1, v4 -; GFX8-NEXT: v_mov_b32_e32 v8, s2 -; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX8-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[4:7] -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: v_mov_b32_e32 v0, s21 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s20 +; GFX8-NEXT: v_mov_b32_e32 v3, s6 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX8-NEXT: 
v_mov_b32_e32 v0, s5 +; GFX8-NEXT: v_mov_b32_e32 v0, s18 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s19 +; GFX8-NEXT: v_mov_b32_e32 v3, s4 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm @@ -2353,84 +2269,65 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i32(ptr addrspace(1) %o ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshr_b32 s3, s2, 24 -; GFX12-NEXT: v_lshrrev_b16 v1, 13, s2 -; GFX12-NEXT: v_lshrrev_b16 v2, 9, s2 -; GFX12-NEXT: v_lshrrev_b16 v4, 11, s2 -; GFX12-NEXT: v_lshrrev_b16 v6, 5, s2 -; GFX12-NEXT: v_lshrrev_b16 v9, 7, s2 -; GFX12-NEXT: v_lshrrev_b16 v13, 3, s2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_lshrrev_b16 v14, 5, s3 -; GFX12-NEXT: v_lshrrev_b16 v18, 1, s3 -; GFX12-NEXT: v_lshrrev_b16 v21, 3, s3 -; GFX12-NEXT: v_lshrrev_b16 v10, 1, s2 -; GFX12-NEXT: v_dual_mov_b32 v32, 0 :: v_dual_and_b32 v33, 1, v1 -; GFX12-NEXT: v_lshrrev_b16 v0, 12, s2 -; GFX12-NEXT: v_lshrrev_b16 v12, 14, s2 -; GFX12-NEXT: v_lshrrev_b16 v20, 15, s2 -; GFX12-NEXT: v_lshrrev_b16 v8, 8, s2 -; GFX12-NEXT: v_lshrrev_b16 v3, 10, s2 -; GFX12-NEXT: v_lshrrev_b16 v5, 4, s2 -; GFX12-NEXT: v_lshrrev_b16 v7, 6, s2 -; GFX12-NEXT: v_lshrrev_b16 v11, 2, s2 -; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10018 -; GFX12-NEXT: s_and_b32 s5, s2, 1 -; GFX12-NEXT: v_lshrrev_b16 v15, 4, s3 -; GFX12-NEXT: v_lshrrev_b16 v16, 6, s3 -; GFX12-NEXT: v_lshrrev_b16 v17, 7, s3 -; GFX12-NEXT: v_lshrrev_b16 v19, 2, s3 -; GFX12-NEXT: v_and_b32_e32 v25, 1, v14 -; GFX12-NEXT: v_and_b32_e32 v26, 1, v18 -; GFX12-NEXT: v_and_b32_e32 v21, 1, v21 -; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10013 -; GFX12-NEXT: s_bfe_u32 s6, s2, 0x10012 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v30, s6 :: v_dual_and_b32 v13, 1, v13 -; GFX12-NEXT: s_bfe_u32 s7, s2, 0x10011 -; GFX12-NEXT: s_bfe_u32 s8, s2, 0x10010 -; GFX12-NEXT: s_bfe_u32 s9, s2, 0x10017 -; GFX12-NEXT: v_dual_mov_b32 v27, s9 :: v_dual_and_b32 v24, 1, v6 -; GFX12-NEXT: s_bfe_u32 s10, s2, 0x10016 -; GFX12-NEXT: v_and_b32_e32 v9, 1, v9 -; GFX12-NEXT: s_bfe_u32 s11, s2, 0x10014 -; GFX12-NEXT: v_and_b32_e32 v23, 1, v4 -; GFX12-NEXT: s_bfe_u32 s2, s2, 0x10015 -; GFX12-NEXT: v_and_b32_e32 v22, 1, v2 -; GFX12-NEXT: v_dual_mov_b32 v28, s8 :: v_dual_and_b32 v1, 1, v10 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v29, s7 :: v_dual_and_b32 v2, 1, v11 -; GFX12-NEXT: v_dual_mov_b32 v31, s3 :: v_dual_and_b32 v6, 1, v7 -; GFX12-NEXT: v_and_b32_e32 v4, 1, v5 -; GFX12-NEXT: v_and_b32_e32 v10, 1, v3 -; GFX12-NEXT: v_and_b32_e32 v14, 1, v19 -; GFX12-NEXT: v_and_b32_e32 v19, 0xffff, v17 -; GFX12-NEXT: v_and_b32_e32 v18, 1, v16 -; GFX12-NEXT: v_and_b32_e32 v16, 1, v15 -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v13 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v24 -; GFX12-NEXT: v_dual_mov_b32 v24, s11 :: v_dual_and_b32 v13, 0xffff, v26 -; GFX12-NEXT: v_and_b32_e32 v11, 0xffff, v23 -; GFX12-NEXT: v_dual_mov_b32 v26, s10 :: v_dual_and_b32 v23, 0xffff, v20 -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff, v9 -; GFX12-NEXT: v_and_b32_e32 v20, 1, v0 -; GFX12-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_and_b32 v17, 0xffff, v25 -; GFX12-NEXT: v_mov_b32_e32 v25, s2 -; GFX12-NEXT: v_and_b32_e32 v9, 0xffff, v22 -; GFX12-NEXT: v_and_b32_e32 v22, 1, v12 -; GFX12-NEXT: v_dual_mov_b32 v12, s4 :: v_dual_and_b32 v15, 0xffff, v21 -; GFX12-NEXT: v_and_b32_e32 v21, 0xffff, v33 -; GFX12-NEXT: v_and_b32_e32 
v8, 1, v8 -; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX12-NEXT: s_clause 0x7 -; GFX12-NEXT: global_store_b128 v32, v[24:27], s[0:1] offset:80 -; GFX12-NEXT: global_store_b128 v32, v[28:31], s[0:1] offset:64 -; GFX12-NEXT: global_store_b128 v32, v[20:23], s[0:1] offset:48 -; GFX12-NEXT: global_store_b128 v32, v[8:11], s[0:1] offset:32 -; GFX12-NEXT: global_store_b128 v32, v[4:7], s[0:1] offset:16 -; GFX12-NEXT: global_store_b128 v32, v[0:3], s[0:1] -; GFX12-NEXT: global_store_b128 v32, v[16:19], s[0:1] offset:112 -; GFX12-NEXT: global_store_b128 v32, v[12:15], s[0:1] offset:96 +; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10003 +; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10001 +; GFX12-NEXT: s_bfe_u32 s5, s2, 0x10007 +; GFX12-NEXT: s_bfe_u32 s6, s2, 0x10005 +; GFX12-NEXT: s_bfe_u32 s7, s2, 0x1000b +; GFX12-NEXT: s_bfe_u32 s8, s2, 0x10009 +; GFX12-NEXT: s_bfe_u32 s9, s2, 0x1000f +; GFX12-NEXT: s_bfe_u32 s10, s2, 0x1000d +; GFX12-NEXT: s_bfe_u32 s11, s2, 0x10013 +; GFX12-NEXT: s_bfe_u32 s12, s2, 0x10011 +; GFX12-NEXT: s_bfe_u32 s13, s2, 0x10017 +; GFX12-NEXT: s_bfe_u32 s14, s2, 0x1001b +; GFX12-NEXT: s_bfe_u32 s15, s2, 0x10019 +; GFX12-NEXT: s_lshr_b32 s16, s2, 31 +; GFX12-NEXT: s_bfe_u32 s17, s2, 0x1001d +; GFX12-NEXT: s_and_b32 s18, s2, 1 +; GFX12-NEXT: s_bfe_u32 s19, s2, 0x10002 +; GFX12-NEXT: s_bfe_u32 s20, s2, 0x10006 +; GFX12-NEXT: s_bfe_u32 s21, s2, 0x10004 +; GFX12-NEXT: s_bfe_u32 s22, s2, 0x1000a +; GFX12-NEXT: s_bfe_u32 s23, s2, 0x10008 +; GFX12-NEXT: s_bfe_u32 s24, s2, 0x1000e +; GFX12-NEXT: s_bfe_u32 s25, s2, 0x1000c +; GFX12-NEXT: s_bfe_u32 s26, s2, 0x10012 +; GFX12-NEXT: s_bfe_u32 s27, s2, 0x10010 +; GFX12-NEXT: s_bfe_u32 s28, s2, 0x10016 +; GFX12-NEXT: s_bfe_u32 s29, s2, 0x10015 +; GFX12-NEXT: s_bfe_u32 s30, s2, 0x10014 +; GFX12-NEXT: s_bfe_u32 s31, s2, 0x1001a +; GFX12-NEXT: s_bfe_u32 s33, s2, 0x10018 +; GFX12-NEXT: s_bfe_u32 s34, s2, 0x1001c +; GFX12-NEXT: s_bfe_u32 s2, s2, 0x1001e +; GFX12-NEXT: v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v1, s17 +; GFX12-NEXT: v_dual_mov_b32 v0, s34 :: v_dual_mov_b32 v3, s16 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s15 +; GFX12-NEXT: v_dual_mov_b32 v4, s33 :: v_dual_mov_b32 v7, s14 +; GFX12-NEXT: v_dual_mov_b32 v6, s31 :: v_dual_mov_b32 v9, s29 +; GFX12-NEXT: v_dual_mov_b32 v8, s30 :: v_dual_mov_b32 v11, s13 +; GFX12-NEXT: v_mov_b32_e32 v10, s28 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v24, v[0:3], s[0:1] offset:112 +; GFX12-NEXT: global_store_b128 v24, v[4:7], s[0:1] offset:96 +; GFX12-NEXT: v_dual_mov_b32 v1, s12 :: v_dual_mov_b32 v0, s27 +; GFX12-NEXT: v_dual_mov_b32 v3, s11 :: v_dual_mov_b32 v2, s26 +; GFX12-NEXT: v_dual_mov_b32 v5, s10 :: v_dual_mov_b32 v4, s25 +; GFX12-NEXT: v_dual_mov_b32 v7, s9 :: v_dual_mov_b32 v6, s24 +; GFX12-NEXT: v_dual_mov_b32 v13, s8 :: v_dual_mov_b32 v12, s23 +; GFX12-NEXT: v_dual_mov_b32 v15, s7 :: v_dual_mov_b32 v14, s22 +; GFX12-NEXT: v_dual_mov_b32 v17, s6 :: v_dual_mov_b32 v16, s21 +; GFX12-NEXT: v_dual_mov_b32 v19, s5 :: v_dual_mov_b32 v18, s20 +; GFX12-NEXT: v_dual_mov_b32 v21, s4 :: v_dual_mov_b32 v20, s18 +; GFX12-NEXT: v_dual_mov_b32 v23, s3 :: v_dual_mov_b32 v22, s19 +; GFX12-NEXT: s_clause 0x5 +; GFX12-NEXT: global_store_b128 v24, v[8:11], s[0:1] offset:80 +; GFX12-NEXT: global_store_b128 v24, v[0:3], s[0:1] offset:64 +; GFX12-NEXT: global_store_b128 v24, v[4:7], s[0:1] offset:48 +; GFX12-NEXT: global_store_b128 v24, v[12:15], s[0:1] offset:32 +; GFX12-NEXT: global_store_b128 v24, v[16:19], s[0:1] offset:16 +; GFX12-NEXT: global_store_b128 v24, v[20:23], s[0:1] ; GFX12-NEXT: 
s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -2536,107 +2433,106 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i32(ptr addrspace(1) %o ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshr_b32 s3, s2, 24 -; GFX8-NEXT: v_lshrrev_b16_e64 v8, 12, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v20, 13, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v21, 14, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v22, 15, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v16, 8, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v17, 9, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v18, 10, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v19, 11, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v12, 4, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v13, 5, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v14, 6, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v15, 7, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v9, 1, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v10, 2, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v11, 3, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v23, 4, s3 -; GFX8-NEXT: v_lshrrev_b16_e64 v24, 5, s3 -; GFX8-NEXT: v_lshrrev_b16_e64 v6, 6, s3 -; GFX8-NEXT: v_lshrrev_b16_e64 v7, 7, s3 -; GFX8-NEXT: v_lshrrev_b16_e64 v25, 1, s3 -; GFX8-NEXT: v_lshrrev_b16_e64 v26, 2, s3 -; GFX8-NEXT: v_lshrrev_b16_e64 v27, 3, s3 -; GFX8-NEXT: s_bfe_i32 s4, s2, 0x10018 -; GFX8-NEXT: s_bfe_i32 s5, s2, 0x10000 -; GFX8-NEXT: s_bfe_i32 s6, s2, 0x10013 -; GFX8-NEXT: s_bfe_i32 s7, s2, 0x10012 -; GFX8-NEXT: s_bfe_i32 s8, s2, 0x10011 -; GFX8-NEXT: s_bfe_i32 s9, s2, 0x10010 -; GFX8-NEXT: s_bfe_i32 s3, s2, 0x10017 -; GFX8-NEXT: s_bfe_i32 s10, s2, 0x10016 -; GFX8-NEXT: s_bfe_i32 s11, s2, 0x10015 -; GFX8-NEXT: s_bfe_i32 s2, s2, 0x10014 +; GFX8-NEXT: s_bfe_i32 s4, s2, 0x10003 +; GFX8-NEXT: s_bfe_i32 s5, s2, 0x10002 +; GFX8-NEXT: s_bfe_i32 s6, s2, 0x10001 +; GFX8-NEXT: s_bfe_i32 s7, s2, 0x10000 +; GFX8-NEXT: s_bfe_i32 s8, s2, 0x10007 +; GFX8-NEXT: s_bfe_i32 s9, s2, 0x10006 +; GFX8-NEXT: s_bfe_i32 s10, s2, 0x10005 +; GFX8-NEXT: s_bfe_i32 s11, s2, 0x10004 +; GFX8-NEXT: s_bfe_i32 s12, s2, 0x1000b +; GFX8-NEXT: s_bfe_i32 s13, s2, 0x1000a +; GFX8-NEXT: s_bfe_i32 s14, s2, 0x10009 +; GFX8-NEXT: s_bfe_i32 s15, s2, 0x10008 +; GFX8-NEXT: s_bfe_i32 s16, s2, 0x1000f +; GFX8-NEXT: s_bfe_i32 s17, s2, 0x1000e +; GFX8-NEXT: s_bfe_i32 s18, s2, 0x1000d +; GFX8-NEXT: s_bfe_i32 s19, s2, 0x1000c +; GFX8-NEXT: s_bfe_i32 s20, s2, 0x10013 +; GFX8-NEXT: s_bfe_i32 s21, s2, 0x10012 +; GFX8-NEXT: s_bfe_i32 s22, s2, 0x10011 +; GFX8-NEXT: s_bfe_i32 s23, s2, 0x10010 +; GFX8-NEXT: s_bfe_i32 s24, s2, 0x10017 +; GFX8-NEXT: s_bfe_i32 s25, s2, 0x10016 +; GFX8-NEXT: s_bfe_i32 s26, s2, 0x10015 +; GFX8-NEXT: s_bfe_i32 s27, s2, 0x10014 +; GFX8-NEXT: s_bfe_i32 s28, s2, 0x1001b +; GFX8-NEXT: s_bfe_i32 s29, s2, 0x1001a +; GFX8-NEXT: s_bfe_i32 s30, s2, 0x10019 +; GFX8-NEXT: s_bfe_i32 s31, s2, 0x10018 +; GFX8-NEXT: s_ashr_i32 s3, s2, 31 +; GFX8-NEXT: s_bfe_i32 s33, s2, 0x1001e +; GFX8-NEXT: s_bfe_i32 s34, s2, 0x1001d +; GFX8-NEXT: s_bfe_i32 s2, s2, 0x1001c ; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: s_add_u32 s2, s0, 0x50 +; GFX8-NEXT: s_add_u32 s2, s0, 0x70 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: s_add_u32 s2, s0, 0x60 +; GFX8-NEXT: v_mov_b32_e32 v1, s34 +; GFX8-NEXT: v_mov_b32_e32 v2, s33 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: s_add_u32 s2, s0, 0x50 +; GFX8-NEXT: 
v_mov_b32_e32 v0, s31 +; GFX8-NEXT: v_mov_b32_e32 v1, s30 +; GFX8-NEXT: v_mov_b32_e32 v2, s29 +; GFX8-NEXT: v_mov_b32_e32 v3, s28 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 64 -; GFX8-NEXT: v_mov_b32_e32 v1, s11 -; GFX8-NEXT: v_mov_b32_e32 v2, s10 +; GFX8-NEXT: v_mov_b32_e32 v0, s27 +; GFX8-NEXT: v_mov_b32_e32 v1, s26 +; GFX8-NEXT: v_mov_b32_e32 v2, s25 +; GFX8-NEXT: v_mov_b32_e32 v3, s24 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 48 -; GFX8-NEXT: v_mov_b32_e32 v0, s9 -; GFX8-NEXT: v_mov_b32_e32 v1, s8 -; GFX8-NEXT: v_mov_b32_e32 v2, s7 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, s23 +; GFX8-NEXT: v_mov_b32_e32 v1, s22 +; GFX8-NEXT: v_mov_b32_e32 v2, s21 +; GFX8-NEXT: v_mov_b32_e32 v3, s20 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_bfe_i32 v5, v24, 0, 1 -; GFX8-NEXT: v_bfe_i32 v1, v25, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v25, s3 -; GFX8-NEXT: v_mov_b32_e32 v24, s2 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 32 -; GFX8-NEXT: v_bfe_i32 v4, v23, 0, 1 -; GFX8-NEXT: v_bfe_i32 v23, v22, 0, 1 -; GFX8-NEXT: v_bfe_i32 v22, v21, 0, 1 -; GFX8-NEXT: v_bfe_i32 v21, v20, 0, 1 -; GFX8-NEXT: v_bfe_i32 v20, v8, 0, 1 +; GFX8-NEXT: v_mov_b32_e32 v0, s19 +; GFX8-NEXT: v_mov_b32_e32 v1, s18 +; GFX8-NEXT: v_mov_b32_e32 v2, s17 +; GFX8-NEXT: v_mov_b32_e32 v3, s16 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[24:25], v[20:23] -; GFX8-NEXT: v_bfe_i32 v19, v19, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v21, s3 -; GFX8-NEXT: v_mov_b32_e32 v20, s2 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 16 -; GFX8-NEXT: v_bfe_i32 v18, v18, 0, 1 -; GFX8-NEXT: v_bfe_i32 v17, v17, 0, 1 -; GFX8-NEXT: v_bfe_i32 v16, v16, 0, 1 -; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[20:21], v[16:19] -; GFX8-NEXT: v_bfe_i32 v15, v15, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v17, s3 -; GFX8-NEXT: v_bfe_i32 v14, v14, 0, 1 -; GFX8-NEXT: v_bfe_i32 v13, v13, 0, 1 -; GFX8-NEXT: v_bfe_i32 v12, v12, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v16, s2 -; GFX8-NEXT: flat_store_dwordx4 v[16:17], v[12:15] -; GFX8-NEXT: s_add_u32 s2, s0, 0x70 -; GFX8-NEXT: v_mov_b32_e32 v13, s1 -; GFX8-NEXT: v_bfe_i32 v11, v11, 0, 1 -; GFX8-NEXT: v_bfe_i32 v10, v10, 0, 1 -; GFX8-NEXT: v_bfe_i32 v9, v9, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v8, s5 -; GFX8-NEXT: v_mov_b32_e32 v12, s0 +; GFX8-NEXT: v_mov_b32_e32 v0, s15 +; GFX8-NEXT: v_mov_b32_e32 v1, s14 +; GFX8-NEXT: v_mov_b32_e32 v2, s13 +; GFX8-NEXT: v_mov_b32_e32 v3, s12 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[8:11] -; GFX8-NEXT: s_add_u32 s0, s0, 0x60 -; GFX8-NEXT: v_mov_b32_e32 v9, s3 -; GFX8-NEXT: v_bfe_i32 v7, v7, 0, 1 -; GFX8-NEXT: v_bfe_i32 v6, v6, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v8, s2 -; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[4:7] -; GFX8-NEXT: v_bfe_i32 v3, v27, 0, 1 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: v_mov_b32_e32 v0, s11 +; GFX8-NEXT: v_mov_b32_e32 v1, s10 +; GFX8-NEXT: v_mov_b32_e32 v2, s9 +; GFX8-NEXT: v_mov_b32_e32 v3, s8 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 
+; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_bfe_i32 v2, v26, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v0, s7 +; GFX8-NEXT: v_mov_b32_e32 v1, s6 +; GFX8-NEXT: v_mov_b32_e32 v2, s5 +; GFX8-NEXT: v_mov_b32_e32 v3, s4 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm @@ -2776,80 +2672,66 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i32(ptr addrspace(1) %o ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_lshrrev_b16 v0, 12, s2 -; GFX12-NEXT: v_lshrrev_b16 v12, 13, s2 -; GFX12-NEXT: v_lshrrev_b16 v13, 14, s2 -; GFX12-NEXT: v_lshrrev_b16 v14, 15, s2 -; GFX12-NEXT: v_lshrrev_b16 v8, 8, s2 -; GFX12-NEXT: v_lshrrev_b16 v9, 9, s2 -; GFX12-NEXT: v_lshrrev_b16 v10, 10, s2 -; GFX12-NEXT: v_lshrrev_b16 v11, 11, s2 -; GFX12-NEXT: s_lshr_b32 s3, s2, 24 -; GFX12-NEXT: v_lshrrev_b16 v4, 4, s2 -; GFX12-NEXT: v_lshrrev_b16 v5, 5, s2 -; GFX12-NEXT: v_lshrrev_b16 v6, 6, s2 -; GFX12-NEXT: v_lshrrev_b16 v7, 7, s2 -; GFX12-NEXT: v_lshrrev_b16 v1, 1, s2 -; GFX12-NEXT: v_lshrrev_b16 v2, 2, s2 -; GFX12-NEXT: v_lshrrev_b16 v3, 3, s2 -; GFX12-NEXT: s_bfe_i32 s4, s2, 0x10018 -; GFX12-NEXT: s_bfe_i32 s5, s2, 0x10000 -; GFX12-NEXT: s_bfe_i32 s6, s2, 0x10013 -; GFX12-NEXT: s_bfe_i32 s7, s2, 0x10012 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_lshrrev_b16 v16, 4, s3 -; GFX12-NEXT: v_lshrrev_b16 v20, 5, s3 -; GFX12-NEXT: v_lshrrev_b16 v21, 6, s3 -; GFX12-NEXT: v_lshrrev_b16 v22, 7, s3 -; GFX12-NEXT: v_lshrrev_b16 v17, 1, s3 -; GFX12-NEXT: v_lshrrev_b16 v18, 2, s3 -; GFX12-NEXT: v_lshrrev_b16 v19, 3, s3 -; GFX12-NEXT: s_bfe_i32 s3, s2, 0x10011 -; GFX12-NEXT: s_bfe_i32 s8, s2, 0x10010 -; GFX12-NEXT: s_bfe_i32 s9, s2, 0x10017 -; GFX12-NEXT: s_bfe_i32 s10, s2, 0x10016 -; GFX12-NEXT: s_bfe_i32 s11, s2, 0x10014 -; GFX12-NEXT: s_bfe_i32 s2, s2, 0x10015 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v32, 0 :: v_dual_mov_b32 v25, s2 -; GFX12-NEXT: v_bfe_i32 v15, v14, 0, 1 -; GFX12-NEXT: v_dual_mov_b32 v24, s11 :: v_dual_mov_b32 v27, s9 -; GFX12-NEXT: v_dual_mov_b32 v26, s10 :: v_dual_mov_b32 v29, s3 -; GFX12-NEXT: v_bfe_i32 v14, v13, 0, 1 -; GFX12-NEXT: v_bfe_i32 v13, v12, 0, 1 -; GFX12-NEXT: v_bfe_i32 v12, v0, 0, 1 -; GFX12-NEXT: v_bfe_i32 v11, v11, 0, 1 -; GFX12-NEXT: v_bfe_i32 v10, v10, 0, 1 -; GFX12-NEXT: v_bfe_i32 v9, v9, 0, 1 -; GFX12-NEXT: v_bfe_i32 v8, v8, 0, 1 -; GFX12-NEXT: v_dual_mov_b32 v28, s8 :: v_dual_mov_b32 v31, s6 -; GFX12-NEXT: v_mov_b32_e32 v30, s7 -; GFX12-NEXT: v_bfe_i32 v7, v7, 0, 1 -; GFX12-NEXT: v_bfe_i32 v6, v6, 0, 1 -; GFX12-NEXT: v_bfe_i32 v5, v5, 0, 1 -; GFX12-NEXT: v_bfe_i32 v4, v4, 0, 1 -; GFX12-NEXT: v_bfe_i32 v3, v3, 0, 1 -; GFX12-NEXT: v_bfe_i32 v2, v2, 0, 1 -; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 1 -; GFX12-NEXT: v_mov_b32_e32 v0, s5 -; GFX12-NEXT: v_bfe_i32 v23, v22, 0, 1 -; GFX12-NEXT: v_bfe_i32 v22, v21, 0, 1 -; GFX12-NEXT: v_bfe_i32 v21, v20, 0, 1 -; GFX12-NEXT: v_bfe_i32 v20, v16, 0, 1 -; GFX12-NEXT: v_bfe_i32 v19, v19, 0, 1 -; GFX12-NEXT: v_bfe_i32 v18, v18, 0, 1 -; GFX12-NEXT: v_bfe_i32 v17, v17, 0, 1 +; GFX12-NEXT: s_bfe_i32 s3, s2, 0x10003 +; GFX12-NEXT: s_bfe_i32 s4, s2, 0x10002 +; GFX12-NEXT: s_bfe_i32 s5, s2, 0x10001 +; GFX12-NEXT: s_bfe_i32 s6, s2, 0x10000 +; GFX12-NEXT: s_bfe_i32 s7, s2, 0x10007 +; GFX12-NEXT: s_bfe_i32 s8, s2, 0x10006 +; GFX12-NEXT: s_bfe_i32 s9, s2, 0x10005 +; GFX12-NEXT: s_bfe_i32 s10, s2, 0x10004 +; GFX12-NEXT: s_bfe_i32 s11, s2, 
0x1000b +; GFX12-NEXT: s_bfe_i32 s12, s2, 0x1000a +; GFX12-NEXT: s_bfe_i32 s13, s2, 0x10009 +; GFX12-NEXT: s_bfe_i32 s14, s2, 0x10008 +; GFX12-NEXT: s_bfe_i32 s15, s2, 0x1000f +; GFX12-NEXT: s_bfe_i32 s16, s2, 0x1000e +; GFX12-NEXT: s_bfe_i32 s17, s2, 0x1000d +; GFX12-NEXT: s_bfe_i32 s18, s2, 0x1000c +; GFX12-NEXT: s_bfe_i32 s19, s2, 0x10013 +; GFX12-NEXT: s_bfe_i32 s20, s2, 0x10012 +; GFX12-NEXT: s_bfe_i32 s21, s2, 0x10011 +; GFX12-NEXT: s_bfe_i32 s22, s2, 0x10010 +; GFX12-NEXT: s_bfe_i32 s23, s2, 0x10017 +; GFX12-NEXT: s_bfe_i32 s24, s2, 0x10016 +; GFX12-NEXT: s_bfe_i32 s25, s2, 0x10015 +; GFX12-NEXT: s_bfe_i32 s26, s2, 0x10014 +; GFX12-NEXT: s_bfe_i32 s27, s2, 0x1001b +; GFX12-NEXT: s_bfe_i32 s28, s2, 0x1001a +; GFX12-NEXT: s_bfe_i32 s29, s2, 0x10019 +; GFX12-NEXT: s_bfe_i32 s30, s2, 0x10018 +; GFX12-NEXT: s_ashr_i32 s31, s2, 31 +; GFX12-NEXT: s_bfe_i32 s33, s2, 0x1001e +; GFX12-NEXT: s_bfe_i32 s34, s2, 0x1001c +; GFX12-NEXT: s_bfe_i32 s2, s2, 0x1001d +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-NEXT: v_dual_mov_b32 v0, s34 :: v_dual_mov_b32 v3, s31 +; GFX12-NEXT: v_dual_mov_b32 v2, s33 :: v_dual_mov_b32 v5, s29 +; GFX12-NEXT: v_dual_mov_b32 v4, s30 :: v_dual_mov_b32 v7, s27 +; GFX12-NEXT: v_dual_mov_b32 v6, s28 :: v_dual_mov_b32 v9, s25 +; GFX12-NEXT: v_dual_mov_b32 v8, s26 :: v_dual_mov_b32 v11, s23 +; GFX12-NEXT: v_mov_b32_e32 v10, s24 ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v32, v[24:27], s[0:1] offset:80 -; GFX12-NEXT: global_store_b128 v32, v[28:31], s[0:1] offset:64 -; GFX12-NEXT: v_mov_b32_e32 v16, s4 +; GFX12-NEXT: global_store_b128 v24, v[0:3], s[0:1] offset:112 +; GFX12-NEXT: global_store_b128 v24, v[4:7], s[0:1] offset:96 +; GFX12-NEXT: v_dual_mov_b32 v1, s21 :: v_dual_mov_b32 v0, s22 +; GFX12-NEXT: v_dual_mov_b32 v3, s19 :: v_dual_mov_b32 v2, s20 +; GFX12-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v4, s18 +; GFX12-NEXT: v_dual_mov_b32 v7, s15 :: v_dual_mov_b32 v6, s16 +; GFX12-NEXT: v_dual_mov_b32 v13, s13 :: v_dual_mov_b32 v12, s14 +; GFX12-NEXT: v_dual_mov_b32 v15, s11 :: v_dual_mov_b32 v14, s12 +; GFX12-NEXT: v_dual_mov_b32 v17, s9 :: v_dual_mov_b32 v16, s10 +; GFX12-NEXT: v_dual_mov_b32 v19, s7 :: v_dual_mov_b32 v18, s8 +; GFX12-NEXT: v_dual_mov_b32 v21, s5 :: v_dual_mov_b32 v20, s6 +; GFX12-NEXT: v_dual_mov_b32 v23, s3 :: v_dual_mov_b32 v22, s4 ; GFX12-NEXT: s_clause 0x5 -; GFX12-NEXT: global_store_b128 v32, v[12:15], s[0:1] offset:48 -; GFX12-NEXT: global_store_b128 v32, v[8:11], s[0:1] offset:32 -; GFX12-NEXT: global_store_b128 v32, v[4:7], s[0:1] offset:16 -; GFX12-NEXT: global_store_b128 v32, v[0:3], s[0:1] -; GFX12-NEXT: global_store_b128 v32, v[20:23], s[0:1] offset:112 -; GFX12-NEXT: global_store_b128 v32, v[16:19], s[0:1] offset:96 +; GFX12-NEXT: global_store_b128 v24, v[8:11], s[0:1] offset:80 +; GFX12-NEXT: global_store_b128 v24, v[0:3], s[0:1] offset:64 +; GFX12-NEXT: global_store_b128 v24, v[4:7], s[0:1] offset:48 +; GFX12-NEXT: global_store_b128 v24, v[12:15], s[0:1] offset:32 +; GFX12-NEXT: global_store_b128 v24, v[16:19], s[0:1] offset:16 +; GFX12-NEXT: global_store_b128 v24, v[20:23], s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -3030,233 +2912,218 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX8-NEXT: 
s_load_dwordx2 s[26:27], s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshr_b32 s6, s3, 24 -; GFX8-NEXT: s_lshr_b32 s8, s2, 24 -; GFX8-NEXT: s_bfe_u32 s4, s2, 0x10018 -; GFX8-NEXT: s_bfe_u32 s5, s3, 0x10018 -; GFX8-NEXT: s_and_b32 s7, s3, 1 -; GFX8-NEXT: s_and_b32 s9, s2, 1 -; GFX8-NEXT: s_bfe_u32 s12, s2, 0x10013 -; GFX8-NEXT: s_bfe_u32 s13, s2, 0x10012 -; GFX8-NEXT: s_bfe_u32 s14, s2, 0x10011 -; GFX8-NEXT: s_bfe_u32 s15, s2, 0x10010 -; GFX8-NEXT: s_bfe_u32 s16, s2, 0x10017 -; GFX8-NEXT: s_bfe_u32 s17, s2, 0x10016 -; GFX8-NEXT: s_bfe_u32 s18, s2, 0x10015 -; GFX8-NEXT: s_bfe_u32 s19, s2, 0x10014 -; GFX8-NEXT: s_bfe_u32 s20, s3, 0x10013 -; GFX8-NEXT: s_bfe_u32 s21, s3, 0x10012 -; GFX8-NEXT: s_bfe_u32 s22, s3, 0x10011 -; GFX8-NEXT: s_bfe_u32 s23, s3, 0x10010 -; GFX8-NEXT: s_bfe_u32 s10, s3, 0x10017 -; GFX8-NEXT: s_bfe_u32 s11, s3, 0x10016 -; GFX8-NEXT: s_bfe_u32 s24, s3, 0x10015 -; GFX8-NEXT: s_bfe_u32 s25, s3, 0x10014 -; GFX8-NEXT: v_mov_b32_e32 v25, s10 -; GFX8-NEXT: s_add_u32 s10, s0, 0xd0 -; GFX8-NEXT: v_mov_b32_e32 v24, s11 -; GFX8-NEXT: s_addc_u32 s11, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v27, s11 -; GFX8-NEXT: v_mov_b32_e32 v26, s10 -; GFX8-NEXT: s_add_u32 s10, s0, 0xc0 -; GFX8-NEXT: v_mov_b32_e32 v22, s25 -; GFX8-NEXT: v_mov_b32_e32 v23, s24 -; GFX8-NEXT: s_addc_u32 s11, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25] -; GFX8-NEXT: v_mov_b32_e32 v27, s11 -; GFX8-NEXT: v_mov_b32_e32 v26, s10 -; GFX8-NEXT: s_add_u32 s10, s0, 0x50 -; GFX8-NEXT: v_mov_b32_e32 v22, s23 -; GFX8-NEXT: v_mov_b32_e32 v23, s22 -; GFX8-NEXT: v_mov_b32_e32 v24, s21 -; GFX8-NEXT: v_mov_b32_e32 v25, s20 -; GFX8-NEXT: s_addc_u32 s11, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25] -; GFX8-NEXT: v_mov_b32_e32 v27, s11 -; GFX8-NEXT: v_mov_b32_e32 v26, s10 -; GFX8-NEXT: s_add_u32 s10, s0, 64 -; GFX8-NEXT: v_mov_b32_e32 v22, s19 -; GFX8-NEXT: v_mov_b32_e32 v23, s18 -; GFX8-NEXT: v_mov_b32_e32 v24, s17 -; GFX8-NEXT: v_mov_b32_e32 v25, s16 -; GFX8-NEXT: s_addc_u32 s11, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25] -; GFX8-NEXT: v_mov_b32_e32 v27, s11 -; GFX8-NEXT: v_mov_b32_e32 v26, s10 -; GFX8-NEXT: s_add_u32 s10, s0, 48 -; GFX8-NEXT: v_mov_b32_e32 v22, s15 -; GFX8-NEXT: v_mov_b32_e32 v23, s14 -; GFX8-NEXT: v_mov_b32_e32 v24, s13 -; GFX8-NEXT: v_mov_b32_e32 v25, s12 -; GFX8-NEXT: s_addc_u32 s11, s1, 0 -; GFX8-NEXT: v_lshrrev_b16_e64 v0, 13, s2 -; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25] -; GFX8-NEXT: v_lshrrev_b16_e64 v19, 12, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v22, 7, s3 -; GFX8-NEXT: v_mov_b32_e32 v25, s11 -; GFX8-NEXT: v_lshrrev_b16_e64 v20, 14, s2 -; GFX8-NEXT: v_and_b32_e32 v21, 1, v0 -; GFX8-NEXT: v_and_b32_e32 v27, 1, v22 -; GFX8-NEXT: v_lshrrev_b16_e64 v22, 1, s3 -; GFX8-NEXT: v_mov_b32_e32 v24, s10 -; GFX8-NEXT: s_add_u32 s10, s0, 32 -; GFX8-NEXT: v_lshrrev_b16_e64 v1, 9, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v2, 11, s2 -; GFX8-NEXT: v_and_b32_e32 v28, 1, v22 -; GFX8-NEXT: v_and_b32_e32 v22, 1, v20 -; GFX8-NEXT: v_lshrrev_b16_e64 v23, 15, s2 -; GFX8-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GFX8-NEXT: v_and_b32_e32 v20, 1, v19 -; GFX8-NEXT: s_addc_u32 s11, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v14, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v16, 10, s2 -; GFX8-NEXT: v_and_b32_e32 v17, 1, v1 -; GFX8-NEXT: v_and_b32_e32 v18, 1, v2 -; GFX8-NEXT: v_lshrrev_b16_e64 v0, 5, s2 -; GFX8-NEXT: flat_store_dwordx4 v[24:25], v[20:23] -; GFX8-NEXT: v_lshrrev_b16_e64 v19, 3, s3 -; GFX8-NEXT: v_mov_b32_e32 v25, 1 -; GFX8-NEXT: v_mov_b32_e32 v21, s11 -; GFX8-NEXT: v_and_b32_e32 v12, 1, v0 -; 
GFX8-NEXT: v_lshrrev_b16_e64 v0, 7, s2 -; GFX8-NEXT: v_and_b32_e32 v23, 1, v19 -; GFX8-NEXT: v_and_b32_e32 v19, 0xffff, v18 -; GFX8-NEXT: v_and_b32_e32 v18, 1, v16 -; GFX8-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GFX8-NEXT: v_and_b32_sdwa v16, v14, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX8-NEXT: v_mov_b32_e32 v20, s10 -; GFX8-NEXT: v_lshrrev_b16_e64 v14, 5, s6 -; GFX8-NEXT: v_and_b32_e32 v15, 1, v0 -; GFX8-NEXT: flat_store_dwordx4 v[20:21], v[16:19] -; GFX8-NEXT: v_and_b32_e32 v20, 1, v14 -; GFX8-NEXT: v_lshrrev_b16_e64 v14, 1, s6 -; GFX8-NEXT: s_add_u32 s10, s0, 16 -; GFX8-NEXT: v_and_b32_e32 v17, 1, v14 -; GFX8-NEXT: v_and_b32_e32 v14, 0xffff, v15 -; GFX8-NEXT: v_lshrrev_b16_e64 v15, 3, s6 -; GFX8-NEXT: s_addc_u32 s11, s1, 0 -; GFX8-NEXT: v_lshrrev_b16_e64 v11, 4, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v13, 6, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v0, 1, s2 -; GFX8-NEXT: v_and_b32_e32 v19, 1, v15 -; GFX8-NEXT: v_mov_b32_e32 v16, s11 -; GFX8-NEXT: v_and_b32_e32 v8, 1, v0 -; GFX8-NEXT: v_lshrrev_b16_e64 v0, 3, s2 -; GFX8-NEXT: v_and_b32_e32 v13, 1, v13 -; GFX8-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX8-NEXT: v_and_b32_e32 v11, 1, v11 -; GFX8-NEXT: v_mov_b32_e32 v15, s10 -; GFX8-NEXT: v_lshrrev_b16_e64 v9, 2, s2 -; GFX8-NEXT: v_and_b32_e32 v10, 1, v0 -; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[11:14] -; GFX8-NEXT: v_lshrrev_b16_e64 v0, 13, s3 -; GFX8-NEXT: v_lshrrev_b16_e64 v11, 5, s8 -; GFX8-NEXT: v_mov_b32_e32 v13, s1 -; GFX8-NEXT: v_and_b32_e32 v15, 1, v11 -; GFX8-NEXT: v_and_b32_e32 v11, 0xffff, v10 -; GFX8-NEXT: v_and_b32_e32 v10, 1, v9 -; GFX8-NEXT: v_and_b32_e32 v9, 0xffff, v8 -; GFX8-NEXT: v_mov_b32_e32 v8, s9 -; GFX8-NEXT: v_mov_b32_e32 v12, s0 -; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[8:11] -; GFX8-NEXT: s_add_u32 s10, s0, 0xb0 -; GFX8-NEXT: v_lshrrev_b16_e64 v8, 1, s8 -; GFX8-NEXT: v_lshrrev_b16_e64 v5, 12, s3 -; GFX8-NEXT: v_and_b32_e32 v6, 1, v0 -; GFX8-NEXT: v_lshrrev_b16_e64 v7, 14, s3 -; GFX8-NEXT: v_lshrrev_b16_e64 v0, 9, s3 -; GFX8-NEXT: v_and_b32_e32 v11, 1, v8 -; GFX8-NEXT: v_lshrrev_b16_e64 v8, 3, s8 -; GFX8-NEXT: s_addc_u32 s11, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v9, s10 -; GFX8-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX8-NEXT: v_and_b32_e32 v13, 1, v8 -; GFX8-NEXT: v_and_b32_e32 v7, 1, v7 -; GFX8-NEXT: v_lshrrev_b16_e64 v8, 15, s3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 1, v5 -; GFX8-NEXT: v_mov_b32_e32 v10, s11 -; GFX8-NEXT: v_lshrrev_b16_e64 v3, 10, s3 -; GFX8-NEXT: v_lshrrev_b16_e64 v0, 11, s3 -; GFX8-NEXT: v_lshrrev_b16_e64 v18, 2, s6 -; GFX8-NEXT: v_lshrrev_b16_e64 v14, 4, s8 -; GFX8-NEXT: v_lshrrev_b16_e64 v16, 6, s8 -; GFX8-NEXT: flat_store_dwordx4 v[9:10], v[5:8] -; GFX8-NEXT: s_add_u32 s2, s0, 0xa0 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff, v13 -; GFX8-NEXT: v_and_b32_e32 v13, 0xffff, v17 -; GFX8-NEXT: v_and_b32_e32 v17, 0xffff, v2 -; GFX8-NEXT: v_mov_b32_e32 v2, s3 -; GFX8-NEXT: v_and_b32_e32 v4, 1, v0 -; GFX8-NEXT: v_lshrrev_b16_e64 v0, 4, s3 -; GFX8-NEXT: v_lshrrev_b16_e64 v1, 5, s3 -; GFX8-NEXT: v_lshrrev_b16_e64 v26, 6, s3 -; GFX8-NEXT: v_lshrrev_b16_e64 v22, 2, s3 -; GFX8-NEXT: v_and_b32_e32 v10, 1, v16 -; GFX8-NEXT: v_and_b32_e32 v8, 1, v14 -; GFX8-NEXT: v_and_b32_e32 v14, 1, v18 -; GFX8-NEXT: v_and_b32_e32 v18, 1, v3 -; GFX8-NEXT: v_and_b32_sdwa v16, v2, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX8-NEXT: v_and_b32_e32 v9, 0xffff, v15 -; GFX8-NEXT: 
v_and_b32_e32 v15, 0xffff, v19 -; GFX8-NEXT: v_and_b32_e32 v19, 0xffff, v4 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: s_add_u32 s2, s0, 0x90 -; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[16:19] -; GFX8-NEXT: v_and_b32_e32 v21, 0xffff, v1 -; GFX8-NEXT: v_and_b32_e32 v17, 0xffff, v20 -; GFX8-NEXT: v_and_b32_e32 v20, 1, v0 -; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v4, 6, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: s_add_u32 s2, s0, 0x80 -; GFX8-NEXT: v_and_b32_e32 v18, 1, v4 -; GFX8-NEXT: v_and_b32_e32 v4, 0xffff, v23 -; GFX8-NEXT: v_and_b32_e32 v3, 1, v22 -; GFX8-NEXT: v_and_b32_e32 v23, 0xffff, v27 -; GFX8-NEXT: v_and_b32_e32 v22, 1, v26 -; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[20:23] -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v28 -; GFX8-NEXT: v_mov_b32_e32 v21, s3 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_mov_b32_e32 v20, s2 -; GFX8-NEXT: s_add_u32 s2, s0, 0xf0 -; GFX8-NEXT: v_lshrrev_b16_e64 v24, 4, s6 -; GFX8-NEXT: flat_store_dwordx4 v[20:21], v[1:4] -; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v19, 7, s6 -; GFX8-NEXT: v_and_b32_e32 v16, 1, v24 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: s_add_u32 s2, s0, 0xe0 -; GFX8-NEXT: v_lshrrev_b16_e64 v12, 2, s8 -; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[16:19] -; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_and_b32_e32 v6, 1, v12 -; GFX8-NEXT: v_mov_b32_e32 v12, s5 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: s_add_u32 s2, s0, 0x70 -; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[12:15] -; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff, v11 -; GFX8-NEXT: v_lshrrev_b16_e64 v11, 7, s8 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: s_add_u32 s0, s0, 0x60 -; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[8:11] -; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: s_bfe_u32 s2, s26, 0x10003 +; GFX8-NEXT: s_bfe_u32 s3, s26, 0x10001 +; GFX8-NEXT: s_bfe_u32 s4, s26, 0x10007 +; GFX8-NEXT: s_bfe_u32 s5, s26, 0x10005 +; GFX8-NEXT: s_bfe_u32 s6, s26, 0x1000b +; GFX8-NEXT: s_bfe_u32 s9, s26, 0x10009 +; GFX8-NEXT: s_bfe_u32 s11, s26, 0x1000f +; GFX8-NEXT: s_bfe_u32 s13, s26, 0x1000d +; GFX8-NEXT: s_bfe_u32 s15, s26, 0x10013 +; GFX8-NEXT: s_bfe_u32 s17, s26, 0x10011 +; GFX8-NEXT: s_bfe_u32 s19, s26, 0x10017 +; GFX8-NEXT: s_bfe_u32 s21, s26, 0x1001b +; GFX8-NEXT: s_bfe_u32 s23, s26, 0x10019 +; GFX8-NEXT: s_lshr_b32 s25, s26, 31 +; GFX8-NEXT: s_bfe_u32 s28, s26, 0x1001d +; GFX8-NEXT: s_bfe_u32 s29, s27, 0x10003 +; GFX8-NEXT: s_bfe_u32 s30, s27, 0x10001 +; GFX8-NEXT: s_bfe_u32 s31, s27, 0x10007 +; GFX8-NEXT: s_bfe_u32 s33, s27, 0x10005 +; GFX8-NEXT: s_bfe_u32 s34, s27, 0x1000b +; GFX8-NEXT: s_bfe_u32 s35, s27, 0x10009 +; GFX8-NEXT: s_bfe_u32 s36, s27, 0x1000f +; GFX8-NEXT: s_bfe_u32 s37, s27, 0x1000d +; GFX8-NEXT: s_bfe_u32 s38, s27, 0x10013 +; GFX8-NEXT: s_bfe_u32 s39, s27, 0x10011 +; GFX8-NEXT: s_bfe_u32 s40, s27, 0x10017 +; GFX8-NEXT: s_bfe_u32 s41, s27, 0x1001b +; GFX8-NEXT: s_bfe_u32 s42, s27, 0x10019 +; GFX8-NEXT: s_lshr_b32 s43, s27, 31 +; GFX8-NEXT: s_bfe_u32 s44, s27, 0x1001d +; GFX8-NEXT: s_and_b32 s8, s26, 1 +; GFX8-NEXT: s_bfe_u32 s7, s26, 0x10002 +; GFX8-NEXT: s_bfe_u32 s10, s26, 0x10006 +; GFX8-NEXT: s_bfe_u32 s12, s26, 0x10004 +; GFX8-NEXT: s_bfe_u32 s14, s26, 0x1000a +; GFX8-NEXT: s_bfe_u32 s16, s26, 0x10008 +; GFX8-NEXT: s_bfe_u32 s18, s26, 0x1000e +; 
GFX8-NEXT: s_bfe_u32 s20, s26, 0x1000c +; GFX8-NEXT: s_bfe_u32 s22, s26, 0x10012 +; GFX8-NEXT: s_bfe_u32 s24, s26, 0x10010 +; GFX8-NEXT: s_bfe_u32 s45, s26, 0x10016 +; GFX8-NEXT: s_bfe_u32 s46, s26, 0x10015 +; GFX8-NEXT: s_bfe_u32 s47, s26, 0x10014 +; GFX8-NEXT: s_bfe_u32 s48, s26, 0x1001a +; GFX8-NEXT: s_bfe_u32 s49, s26, 0x10018 +; GFX8-NEXT: s_bfe_u32 s50, s26, 0x1001e +; GFX8-NEXT: s_bfe_u32 s51, s26, 0x1001c +; GFX8-NEXT: s_and_b32 s52, s27, 1 +; GFX8-NEXT: s_bfe_u32 s53, s27, 0x10002 +; GFX8-NEXT: s_bfe_u32 s54, s27, 0x10006 +; GFX8-NEXT: s_bfe_u32 s55, s27, 0x10004 +; GFX8-NEXT: s_bfe_u32 s56, s27, 0x1000a +; GFX8-NEXT: s_bfe_u32 s57, s27, 0x10008 +; GFX8-NEXT: s_bfe_u32 s58, s27, 0x1000e +; GFX8-NEXT: s_bfe_u32 s59, s27, 0x1000c +; GFX8-NEXT: s_bfe_u32 s60, s27, 0x10012 +; GFX8-NEXT: s_bfe_u32 s61, s27, 0x10010 +; GFX8-NEXT: s_bfe_u32 s62, s27, 0x10016 +; GFX8-NEXT: s_bfe_u32 s63, s27, 0x10015 +; GFX8-NEXT: s_bfe_u32 s64, s27, 0x10014 +; GFX8-NEXT: s_bfe_u32 s65, s27, 0x1001a +; GFX8-NEXT: s_bfe_u32 s66, s27, 0x10018 +; GFX8-NEXT: s_bfe_u32 s26, s27, 0x1001e +; GFX8-NEXT: s_bfe_u32 s27, s27, 0x1001c +; GFX8-NEXT: v_mov_b32_e32 v2, s26 +; GFX8-NEXT: s_add_u32 s26, s0, 0xf0 +; GFX8-NEXT: v_mov_b32_e32 v0, s27 +; GFX8-NEXT: s_addc_u32 s27, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s26 +; GFX8-NEXT: v_mov_b32_e32 v1, s44 +; GFX8-NEXT: v_mov_b32_e32 v3, s43 +; GFX8-NEXT: v_mov_b32_e32 v5, s27 +; GFX8-NEXT: s_add_u32 s26, s0, 0xe0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_addc_u32 s27, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s26 +; GFX8-NEXT: v_mov_b32_e32 v0, s66 +; GFX8-NEXT: v_mov_b32_e32 v1, s42 +; GFX8-NEXT: v_mov_b32_e32 v2, s65 +; GFX8-NEXT: v_mov_b32_e32 v3, s41 +; GFX8-NEXT: v_mov_b32_e32 v5, s27 +; GFX8-NEXT: s_add_u32 s26, s0, 0xd0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_addc_u32 s27, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s26 +; GFX8-NEXT: v_mov_b32_e32 v0, s64 +; GFX8-NEXT: v_mov_b32_e32 v1, s63 +; GFX8-NEXT: v_mov_b32_e32 v2, s62 +; GFX8-NEXT: v_mov_b32_e32 v3, s40 +; GFX8-NEXT: v_mov_b32_e32 v5, s27 +; GFX8-NEXT: s_add_u32 s26, s0, 0xc0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_addc_u32 s27, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s26 +; GFX8-NEXT: v_mov_b32_e32 v0, s61 +; GFX8-NEXT: v_mov_b32_e32 v1, s39 +; GFX8-NEXT: v_mov_b32_e32 v2, s60 +; GFX8-NEXT: v_mov_b32_e32 v3, s38 +; GFX8-NEXT: v_mov_b32_e32 v5, s27 +; GFX8-NEXT: s_add_u32 s26, s0, 0xb0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_addc_u32 s27, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s26 +; GFX8-NEXT: v_mov_b32_e32 v0, s59 +; GFX8-NEXT: v_mov_b32_e32 v1, s37 +; GFX8-NEXT: v_mov_b32_e32 v2, s58 +; GFX8-NEXT: v_mov_b32_e32 v3, s36 +; GFX8-NEXT: v_mov_b32_e32 v5, s27 +; GFX8-NEXT: s_add_u32 s26, s0, 0xa0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_addc_u32 s27, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s26 +; GFX8-NEXT: v_mov_b32_e32 v0, s57 +; GFX8-NEXT: v_mov_b32_e32 v1, s35 +; GFX8-NEXT: v_mov_b32_e32 v2, s56 +; GFX8-NEXT: v_mov_b32_e32 v3, s34 +; GFX8-NEXT: v_mov_b32_e32 v5, s27 +; GFX8-NEXT: s_add_u32 s26, s0, 0x90 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_addc_u32 s27, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s26 +; GFX8-NEXT: v_mov_b32_e32 v0, s55 +; GFX8-NEXT: v_mov_b32_e32 v1, s33 +; GFX8-NEXT: v_mov_b32_e32 v2, s54 +; GFX8-NEXT: v_mov_b32_e32 v3, s31 +; GFX8-NEXT: v_mov_b32_e32 v5, s27 +; GFX8-NEXT: s_add_u32 s26, s0, 0x80 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_addc_u32 
s27, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s26 +; GFX8-NEXT: v_mov_b32_e32 v0, s52 +; GFX8-NEXT: v_mov_b32_e32 v1, s30 +; GFX8-NEXT: v_mov_b32_e32 v2, s53 +; GFX8-NEXT: v_mov_b32_e32 v3, s29 +; GFX8-NEXT: v_mov_b32_e32 v5, s27 +; GFX8-NEXT: s_add_u32 s26, s0, 0x70 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_addc_u32 s27, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s26 +; GFX8-NEXT: v_mov_b32_e32 v0, s51 +; GFX8-NEXT: v_mov_b32_e32 v1, s28 +; GFX8-NEXT: v_mov_b32_e32 v2, s50 +; GFX8-NEXT: v_mov_b32_e32 v3, s25 +; GFX8-NEXT: v_mov_b32_e32 v5, s27 +; GFX8-NEXT: s_add_u32 s26, s0, 0x60 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_addc_u32 s27, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s26 +; GFX8-NEXT: v_mov_b32_e32 v0, s49 +; GFX8-NEXT: v_mov_b32_e32 v1, s23 +; GFX8-NEXT: v_mov_b32_e32 v2, s48 +; GFX8-NEXT: v_mov_b32_e32 v3, s21 +; GFX8-NEXT: v_mov_b32_e32 v5, s27 +; GFX8-NEXT: s_add_u32 s26, s0, 0x50 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_addc_u32 s27, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s26 +; GFX8-NEXT: v_mov_b32_e32 v0, s47 +; GFX8-NEXT: v_mov_b32_e32 v1, s46 +; GFX8-NEXT: v_mov_b32_e32 v2, s45 +; GFX8-NEXT: v_mov_b32_e32 v3, s19 +; GFX8-NEXT: v_mov_b32_e32 v5, s27 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s22 +; GFX8-NEXT: s_add_u32 s22, s0, 64 +; GFX8-NEXT: s_addc_u32 s23, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s22 +; GFX8-NEXT: v_mov_b32_e32 v0, s24 +; GFX8-NEXT: v_mov_b32_e32 v1, s17 +; GFX8-NEXT: v_mov_b32_e32 v3, s15 +; GFX8-NEXT: v_mov_b32_e32 v5, s23 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s18 +; GFX8-NEXT: s_add_u32 s18, s0, 48 +; GFX8-NEXT: s_addc_u32 s19, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s18 +; GFX8-NEXT: v_mov_b32_e32 v0, s20 +; GFX8-NEXT: v_mov_b32_e32 v1, s13 +; GFX8-NEXT: v_mov_b32_e32 v3, s11 +; GFX8-NEXT: v_mov_b32_e32 v5, s19 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s14 +; GFX8-NEXT: s_add_u32 s14, s0, 32 +; GFX8-NEXT: s_addc_u32 s15, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s14 +; GFX8-NEXT: v_mov_b32_e32 v0, s16 +; GFX8-NEXT: v_mov_b32_e32 v1, s9 +; GFX8-NEXT: v_mov_b32_e32 v3, s6 +; GFX8-NEXT: v_mov_b32_e32 v5, s15 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s4 +; GFX8-NEXT: s_add_u32 s4, s0, 16 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_addc_u32 s5, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7] +; GFX8-NEXT: v_mov_b32_e32 v0, s12 +; GFX8-NEXT: v_mov_b32_e32 v2, s10 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s7 +; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm ; ; EG-LABEL: constant_zextload_v64i1_to_v64i32: @@ -3451,168 +3318,124 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_lshrrev_b16 v1, 13, s2 -; GFX12-NEXT: s_lshr_b32 s4, s3, 24 -; GFX12-NEXT: v_lshrrev_b16 v2, 9, s2 -; GFX12-NEXT: v_lshrrev_b16 v3, 11, s2 -; GFX12-NEXT: v_lshrrev_b16 v9, 13, s3 -; GFX12-NEXT: 
v_and_b32_e32 v44, 1, v1 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_lshrrev_b16 v1, 1, s4 -; GFX12-NEXT: s_lshr_b32 s5, s2, 24 -; GFX12-NEXT: v_dual_mov_b32 v64, 0 :: v_dual_and_b32 v41, 1, v2 -; GFX12-NEXT: v_lshrrev_b16 v4, 5, s2 -; GFX12-NEXT: v_lshrrev_b16 v5, 7, s2 -; GFX12-NEXT: v_lshrrev_b16 v6, 1, s2 -; GFX12-NEXT: v_lshrrev_b16 v7, 3, s2 -; GFX12-NEXT: v_lshrrev_b16 v10, 9, s3 -; GFX12-NEXT: v_lshrrev_b16 v11, 11, s3 -; GFX12-NEXT: v_lshrrev_b16 v12, 5, s3 -; GFX12-NEXT: v_lshrrev_b16 v13, 7, s3 -; GFX12-NEXT: v_lshrrev_b16 v14, 1, s3 -; GFX12-NEXT: v_lshrrev_b16 v17, 5, s4 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_lshrrev_b16 v2, 5, s5 -; GFX12-NEXT: s_and_b32 s7, s2, 1 -; GFX12-NEXT: s_bfe_u32 s18, s3, 0x10010 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v52, s18 :: v_dual_and_b32 v35, 1, v9 -; GFX12-NEXT: v_and_b32_e32 v9, 1, v1 -; GFX12-NEXT: v_lshrrev_b16 v1, 3, s4 -; GFX12-NEXT: s_bfe_u32 s19, s3, 0x10017 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v51, s19 :: v_dual_and_b32 v42, 1, v3 -; GFX12-NEXT: v_lshrrev_b16 v3, 3, s5 -; GFX12-NEXT: v_lshrrev_b16 v15, 3, s3 -; GFX12-NEXT: v_lshrrev_b16 v28, 12, s3 -; GFX12-NEXT: v_lshrrev_b16 v29, 14, s3 -; GFX12-NEXT: v_lshrrev_b16 v30, 15, s3 -; GFX12-NEXT: v_lshrrev_b16 v25, 10, s3 -; GFX12-NEXT: v_lshrrev_b16 v20, 4, s3 -; GFX12-NEXT: v_lshrrev_b16 v21, 6, s3 -; GFX12-NEXT: v_and_b32_e32 v27, 1, v12 -; GFX12-NEXT: s_and_b32 s6, s3, 1 -; GFX12-NEXT: s_bfe_u32 s9, s2, 0x10012 -; GFX12-NEXT: s_bfe_u32 s10, s2, 0x10011 -; GFX12-NEXT: s_bfe_u32 s12, s2, 0x10017 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v59, s12 :: v_dual_and_b32 v22, 1, v13 -; GFX12-NEXT: v_dual_mov_b32 v62, s9 :: v_dual_and_b32 v13, 1, v17 -; GFX12-NEXT: v_lshrrev_b16 v17, 6, s5 -; GFX12-NEXT: s_bfe_u32 s13, s2, 0x10016 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v58, s13 :: v_dual_and_b32 v23, 1, v14 -; GFX12-NEXT: s_bfe_u32 s14, s2, 0x10015 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v57, s14 :: v_dual_and_b32 v26, 1, v11 -; GFX12-NEXT: v_and_b32_e32 v11, 1, v1 -; GFX12-NEXT: v_lshrrev_b16 v1, 1, s5 -; GFX12-NEXT: s_bfe_u32 s15, s3, 0x10013 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v55, s15 :: v_dual_and_b32 v34, 1, v7 -; GFX12-NEXT: v_lshrrev_b16 v7, 7, s5 -; GFX12-NEXT: s_bfe_u32 s16, s3, 0x10012 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v54, s16 :: v_dual_and_b32 v31, 1, v10 -; GFX12-NEXT: s_bfe_u32 s17, s3, 0x10011 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v53, s17 :: v_dual_and_b32 v38, 1, v5 -; GFX12-NEXT: s_bfe_u32 s20, s3, 0x10016 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v50, s20 :: v_dual_and_b32 v39, 1, v6 -; GFX12-NEXT: v_lshrrev_b16 v6, 2, s5 -; GFX12-NEXT: s_bfe_u32 s21, s3, 0x10014 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v48, s21 :: v_dual_and_b32 v43, 1, v4 -; GFX12-NEXT: v_lshrrev_b16 v4, 4, s5 -; GFX12-NEXT: s_bfe_u32 s5, s3, 0x10018 -; GFX12-NEXT: v_and_b32_e32 v5, 1, v2 -; GFX12-NEXT: v_dual_mov_b32 v61, s10 :: v_dual_and_b32 v2, 1, v3 -; GFX12-NEXT: v_lshrrev_b16 v16, 15, s2 -; GFX12-NEXT: v_lshrrev_b16 v24, 8, s3 -; GFX12-NEXT: v_lshrrev_b16 v18, 2, s3 -; GFX12-NEXT: s_bfe_u32 s11, s2, 0x10010 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v60, s11 :: v_dual_and_b32 v19, 1, v15 -; GFX12-NEXT: v_lshrrev_b16 v0, 12, s2 -; GFX12-NEXT: v_lshrrev_b16 v8, 14, s2 -; GFX12-NEXT: v_lshrrev_b16 v12, 4, s4 -; GFX12-NEXT: 
v_lshrrev_b16 v14, 6, s4 -; GFX12-NEXT: v_lshrrev_b16 v15, 7, s4 -; GFX12-NEXT: v_lshrrev_b16 v32, 8, s2 -; GFX12-NEXT: v_lshrrev_b16 v40, 10, s2 -; GFX12-NEXT: v_lshrrev_b16 v36, 4, s2 -; GFX12-NEXT: v_lshrrev_b16 v37, 6, s2 -; GFX12-NEXT: v_lshrrev_b16 v33, 2, s2 -; GFX12-NEXT: v_lshrrev_b16 v10, 2, s4 -; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10018 -; GFX12-NEXT: s_bfe_u32 s8, s2, 0x10013 -; GFX12-NEXT: s_bfe_u32 s2, s2, 0x10014 -; GFX12-NEXT: s_bfe_u32 s3, s3, 0x10015 -; GFX12-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v2 +; GFX12-NEXT: s_lshr_b32 s33, s3, 31 +; GFX12-NEXT: s_bfe_u32 s34, s3, 0x1001d +; GFX12-NEXT: s_bfe_u32 s65, s3, 0x1001c +; GFX12-NEXT: s_bfe_u32 s66, s3, 0x1001e +; GFX12-NEXT: s_bfe_u32 s30, s3, 0x1001b +; GFX12-NEXT: s_bfe_u32 s31, s3, 0x10019 +; GFX12-NEXT: s_bfe_u32 s63, s3, 0x1001a +; GFX12-NEXT: s_bfe_u32 s64, s3, 0x10018 +; GFX12-NEXT: v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v1, s34 +; GFX12-NEXT: s_bfe_u32 s29, s3, 0x10017 +; GFX12-NEXT: s_bfe_u32 s60, s3, 0x10016 +; GFX12-NEXT: s_bfe_u32 s61, s3, 0x10015 +; GFX12-NEXT: s_bfe_u32 s62, s3, 0x10014 +; GFX12-NEXT: v_dual_mov_b32 v0, s65 :: v_dual_mov_b32 v3, s33 +; GFX12-NEXT: v_dual_mov_b32 v2, s66 :: v_dual_mov_b32 v5, s31 +; GFX12-NEXT: s_bfe_u32 s27, s3, 0x10013 +; GFX12-NEXT: s_bfe_u32 s28, s3, 0x10011 +; GFX12-NEXT: s_bfe_u32 s58, s3, 0x10012 +; GFX12-NEXT: s_bfe_u32 s59, s3, 0x10010 +; GFX12-NEXT: v_dual_mov_b32 v4, s64 :: v_dual_mov_b32 v7, s30 +; GFX12-NEXT: v_dual_mov_b32 v6, s63 :: v_dual_mov_b32 v9, s61 +; GFX12-NEXT: v_dual_mov_b32 v8, s62 :: v_dual_mov_b32 v11, s29 +; GFX12-NEXT: v_dual_mov_b32 v10, s60 :: v_dual_mov_b32 v13, s28 +; GFX12-NEXT: s_bfe_u32 s19, s3, 0x10003 +; GFX12-NEXT: s_bfe_u32 s20, s3, 0x10001 +; GFX12-NEXT: s_bfe_u32 s21, s3, 0x10007 +; GFX12-NEXT: s_bfe_u32 s22, s3, 0x10005 +; GFX12-NEXT: s_bfe_u32 s23, s3, 0x1000b +; GFX12-NEXT: s_bfe_u32 s24, s3, 0x10009 +; GFX12-NEXT: s_bfe_u32 s25, s3, 0x1000f +; GFX12-NEXT: s_bfe_u32 s26, s3, 0x1000d +; GFX12-NEXT: s_and_b32 s51, s3, 1 +; GFX12-NEXT: s_bfe_u32 s52, s3, 0x10002 +; GFX12-NEXT: s_bfe_u32 s53, s3, 0x10006 +; GFX12-NEXT: s_bfe_u32 s54, s3, 0x10004 +; GFX12-NEXT: s_bfe_u32 s55, s3, 0x1000a +; GFX12-NEXT: s_bfe_u32 s56, s3, 0x10008 +; GFX12-NEXT: s_bfe_u32 s57, s3, 0x1000e +; GFX12-NEXT: v_dual_mov_b32 v12, s59 :: v_dual_mov_b32 v15, s27 +; GFX12-NEXT: v_mov_b32_e32 v14, s58 +; GFX12-NEXT: s_bfe_u32 s3, s3, 0x1000c +; GFX12-NEXT: s_clause 0x3 +; GFX12-NEXT: global_store_b128 v24, v[0:3], s[0:1] offset:240 +; GFX12-NEXT: global_store_b128 v24, v[4:7], s[0:1] offset:224 +; GFX12-NEXT: global_store_b128 v24, v[8:11], s[0:1] offset:208 +; GFX12-NEXT: global_store_b128 v24, v[12:15], s[0:1] offset:192 +; GFX12-NEXT: v_dual_mov_b32 v1, s26 :: v_dual_mov_b32 v0, s3 +; GFX12-NEXT: v_dual_mov_b32 v3, s25 :: v_dual_mov_b32 v2, s57 +; GFX12-NEXT: v_dual_mov_b32 v5, s24 :: v_dual_mov_b32 v4, s56 +; GFX12-NEXT: v_dual_mov_b32 v7, s23 :: v_dual_mov_b32 v6, s55 +; GFX12-NEXT: v_mov_b32_e32 v9, s22 +; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10003 +; GFX12-NEXT: s_bfe_u32 s5, s2, 0x10001 +; GFX12-NEXT: s_bfe_u32 s6, s2, 0x10007 +; GFX12-NEXT: s_bfe_u32 s7, s2, 0x10005 +; GFX12-NEXT: s_bfe_u32 s8, s2, 0x1000b +; GFX12-NEXT: s_bfe_u32 s9, s2, 0x10009 +; GFX12-NEXT: s_bfe_u32 s10, s2, 0x1000f +; GFX12-NEXT: s_bfe_u32 s11, s2, 0x1000d +; GFX12-NEXT: s_bfe_u32 s12, s2, 0x10013 +; GFX12-NEXT: s_bfe_u32 s13, s2, 0x10011 +; GFX12-NEXT: s_bfe_u32 s14, s2, 0x10017 +; GFX12-NEXT: s_bfe_u32 s15, s2, 0x1001b +; GFX12-NEXT: s_bfe_u32 s16, s2, 
0x10019 +; GFX12-NEXT: s_lshr_b32 s17, s2, 31 +; GFX12-NEXT: s_bfe_u32 s18, s2, 0x1001d +; GFX12-NEXT: s_and_b32 s35, s2, 1 +; GFX12-NEXT: s_bfe_u32 s36, s2, 0x10002 +; GFX12-NEXT: s_bfe_u32 s37, s2, 0x10006 +; GFX12-NEXT: s_bfe_u32 s38, s2, 0x10004 +; GFX12-NEXT: s_bfe_u32 s39, s2, 0x1000a +; GFX12-NEXT: s_bfe_u32 s40, s2, 0x10008 +; GFX12-NEXT: s_bfe_u32 s41, s2, 0x1000e +; GFX12-NEXT: s_bfe_u32 s42, s2, 0x1000c +; GFX12-NEXT: s_bfe_u32 s43, s2, 0x10012 +; GFX12-NEXT: s_bfe_u32 s44, s2, 0x10010 +; GFX12-NEXT: s_bfe_u32 s45, s2, 0x10016 +; GFX12-NEXT: s_bfe_u32 s46, s2, 0x10015 +; GFX12-NEXT: s_bfe_u32 s47, s2, 0x10014 +; GFX12-NEXT: s_bfe_u32 s48, s2, 0x1001a +; GFX12-NEXT: s_bfe_u32 s49, s2, 0x10018 +; GFX12-NEXT: s_bfe_u32 s50, s2, 0x1001e +; GFX12-NEXT: s_bfe_u32 s2, s2, 0x1001c +; GFX12-NEXT: v_dual_mov_b32 v8, s54 :: v_dual_mov_b32 v11, s21 +; GFX12-NEXT: v_dual_mov_b32 v10, s53 :: v_dual_mov_b32 v13, s20 +; GFX12-NEXT: v_dual_mov_b32 v12, s51 :: v_dual_mov_b32 v15, s19 +; GFX12-NEXT: v_dual_mov_b32 v14, s52 :: v_dual_mov_b32 v17, s18 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v63, s8 :: v_dual_and_b32 v2, 1, v6 -; GFX12-NEXT: v_and_b32_e32 v6, 1, v17 -; GFX12-NEXT: v_and_b32_e32 v17, 0xffff, v23 -; GFX12-NEXT: v_and_b32_e32 v23, 0xffff, v22 -; GFX12-NEXT: v_and_b32_e32 v22, 1, v21 -; GFX12-NEXT: v_and_b32_e32 v20, 1, v20 -; GFX12-NEXT: v_dual_mov_b32 v49, s3 :: v_dual_and_b32 v28, 1, v28 -; GFX12-NEXT: v_dual_mov_b32 v56, s2 :: v_dual_and_b32 v21, 0xffff, v27 -; GFX12-NEXT: v_and_b32_e32 v27, 0xffff, v26 -; GFX12-NEXT: v_and_b32_e32 v26, 1, v25 -; GFX12-NEXT: v_and_b32_e32 v25, 0xffff, v31 -; GFX12-NEXT: v_and_b32_e32 v31, 0xffff, v30 -; GFX12-NEXT: v_and_b32_e32 v30, 1, v29 -; GFX12-NEXT: v_and_b32_e32 v29, 0xffff, v35 -; GFX12-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GFX12-NEXT: v_and_b32_e32 v18, 1, v18 -; GFX12-NEXT: v_and_b32_e32 v24, 1, v24 -; GFX12-NEXT: v_dual_mov_b32 v16, s6 :: v_dual_and_b32 v47, 0xffff, v16 -; GFX12-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GFX12-NEXT: v_and_b32_e32 v14, 1, v14 -; GFX12-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GFX12-NEXT: v_and_b32_e32 v12, 1, v12 -; GFX12-NEXT: v_and_b32_e32 v46, 1, v8 -; GFX12-NEXT: v_and_b32_e32 v45, 0xffff, v44 -; GFX12-NEXT: v_and_b32_e32 v44, 1, v0 -; GFX12-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX12-NEXT: v_and_b32_e32 v10, 1, v10 -; GFX12-NEXT: v_and_b32_e32 v35, 0xffff, v34 -; GFX12-NEXT: v_and_b32_e32 v34, 1, v33 -; GFX12-NEXT: v_and_b32_e32 v33, 0xffff, v39 -; GFX12-NEXT: v_and_b32_e32 v39, 0xffff, v38 -; GFX12-NEXT: v_and_b32_e32 v38, 1, v37 -; GFX12-NEXT: v_and_b32_e32 v37, 0xffff, v43 -; GFX12-NEXT: v_and_b32_e32 v43, 0xffff, v42 -; GFX12-NEXT: v_and_b32_e32 v42, 1, v40 -; GFX12-NEXT: v_and_b32_e32 v41, 0xffff, v41 -; GFX12-NEXT: v_and_b32_e32 v40, 1, v32 -; GFX12-NEXT: v_dual_mov_b32 v32, s7 :: v_dual_and_b32 v9, 0xffff, v9 -; GFX12-NEXT: v_dual_mov_b32 v8, s5 :: v_dual_and_b32 v7, 0xffff, v7 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX12-NEXT: v_and_b32_e32 v4, 1, v4 -; GFX12-NEXT: v_and_b32_e32 v36, 1, v36 -; GFX12-NEXT: s_clause 0x9 -; GFX12-NEXT: global_store_b128 v64, v[48:51], s[0:1] offset:208 -; GFX12-NEXT: global_store_b128 v64, v[52:55], s[0:1] offset:192 -; GFX12-NEXT: global_store_b128 v64, v[56:59], s[0:1] offset:80 -; GFX12-NEXT: global_store_b128 v64, v[60:63], s[0:1] offset:64 -; GFX12-NEXT: global_store_b128 v64, v[44:47], s[0:1] offset:48 -; GFX12-NEXT: global_store_b128 v64, v[40:43], s[0:1] offset:32 -; GFX12-NEXT: global_store_b128 v64, v[36:39], s[0:1] offset:16 
-; GFX12-NEXT: global_store_b128 v64, v[32:35], s[0:1] -; GFX12-NEXT: global_store_b128 v64, v[28:31], s[0:1] offset:176 -; GFX12-NEXT: global_store_b128 v64, v[24:27], s[0:1] offset:160 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_and_b32 v1, 0xffff, v1 +; GFX12-NEXT: v_dual_mov_b32 v16, s2 :: v_dual_mov_b32 v19, s17 +; GFX12-NEXT: v_dual_mov_b32 v18, s50 :: v_dual_mov_b32 v21, s16 +; GFX12-NEXT: v_dual_mov_b32 v20, s49 :: v_dual_mov_b32 v23, s15 +; GFX12-NEXT: v_mov_b32_e32 v22, s48 +; GFX12-NEXT: s_clause 0x5 +; GFX12-NEXT: global_store_b128 v24, v[0:3], s[0:1] offset:176 +; GFX12-NEXT: global_store_b128 v24, v[4:7], s[0:1] offset:160 +; GFX12-NEXT: global_store_b128 v24, v[8:11], s[0:1] offset:144 +; GFX12-NEXT: global_store_b128 v24, v[12:15], s[0:1] offset:128 +; GFX12-NEXT: global_store_b128 v24, v[16:19], s[0:1] offset:112 +; GFX12-NEXT: global_store_b128 v24, v[20:23], s[0:1] offset:96 +; GFX12-NEXT: v_dual_mov_b32 v1, s46 :: v_dual_mov_b32 v0, s47 +; GFX12-NEXT: v_dual_mov_b32 v3, s14 :: v_dual_mov_b32 v2, s45 +; GFX12-NEXT: v_dual_mov_b32 v5, s13 :: v_dual_mov_b32 v4, s44 +; GFX12-NEXT: v_dual_mov_b32 v7, s12 :: v_dual_mov_b32 v6, s43 +; GFX12-NEXT: v_dual_mov_b32 v9, s11 :: v_dual_mov_b32 v8, s42 +; GFX12-NEXT: v_dual_mov_b32 v11, s10 :: v_dual_mov_b32 v10, s41 +; GFX12-NEXT: v_dual_mov_b32 v13, s9 :: v_dual_mov_b32 v12, s40 +; GFX12-NEXT: v_dual_mov_b32 v15, s8 :: v_dual_mov_b32 v14, s39 +; GFX12-NEXT: v_dual_mov_b32 v17, s7 :: v_dual_mov_b32 v16, s38 +; GFX12-NEXT: v_dual_mov_b32 v19, s6 :: v_dual_mov_b32 v18, s37 +; GFX12-NEXT: v_dual_mov_b32 v21, s5 :: v_dual_mov_b32 v20, s35 +; GFX12-NEXT: v_dual_mov_b32 v23, s4 :: v_dual_mov_b32 v22, s36 ; GFX12-NEXT: s_clause 0x5 -; GFX12-NEXT: global_store_b128 v64, v[20:23], s[0:1] offset:144 -; GFX12-NEXT: global_store_b128 v64, v[16:19], s[0:1] offset:128 -; GFX12-NEXT: global_store_b128 v64, v[12:15], s[0:1] offset:240 -; GFX12-NEXT: global_store_b128 v64, v[8:11], s[0:1] offset:224 -; GFX12-NEXT: global_store_b128 v64, v[4:7], s[0:1] offset:112 -; GFX12-NEXT: global_store_b128 v64, v[0:3], s[0:1] offset:96 +; GFX12-NEXT: global_store_b128 v24, v[0:3], s[0:1] offset:80 +; GFX12-NEXT: global_store_b128 v24, v[4:7], s[0:1] offset:64 +; GFX12-NEXT: global_store_b128 v24, v[8:11], s[0:1] offset:48 +; GFX12-NEXT: global_store_b128 v24, v[12:15], s[0:1] offset:32 +; GFX12-NEXT: global_store_b128 v24, v[16:19], s[0:1] offset:16 +; GFX12-NEXT: global_store_b128 v24, v[20:23], s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -3793,216 +3616,219 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[26:27], s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b16_e64 v18, 12, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v19, 13, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v20, 14, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v21, 15, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v14, 8, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v15, 9, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v16, 10, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v17, 11, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v10, 4, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v11, 5, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v12, 6, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v13, 7, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v7, 1, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v8, 2, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 
v9, 3, s2 -; GFX8-NEXT: s_lshr_b32 s7, s3, 24 -; GFX8-NEXT: s_lshr_b32 s8, s2, 24 -; GFX8-NEXT: s_bfe_i32 s4, s2, 0x10018 -; GFX8-NEXT: s_bfe_i32 s5, s3, 0x10018 -; GFX8-NEXT: s_bfe_i32 s6, s3, 0x10000 -; GFX8-NEXT: s_bfe_i32 s9, s2, 0x10000 -; GFX8-NEXT: s_bfe_i32 s12, s2, 0x10013 -; GFX8-NEXT: s_bfe_i32 s13, s2, 0x10012 -; GFX8-NEXT: s_bfe_i32 s14, s2, 0x10011 -; GFX8-NEXT: s_bfe_i32 s15, s2, 0x10010 -; GFX8-NEXT: s_bfe_i32 s16, s2, 0x10017 -; GFX8-NEXT: s_bfe_i32 s17, s2, 0x10016 -; GFX8-NEXT: s_bfe_i32 s18, s2, 0x10015 -; GFX8-NEXT: s_bfe_i32 s2, s2, 0x10014 -; GFX8-NEXT: s_bfe_i32 s19, s3, 0x10013 -; GFX8-NEXT: s_bfe_i32 s20, s3, 0x10012 -; GFX8-NEXT: s_bfe_i32 s21, s3, 0x10011 -; GFX8-NEXT: s_bfe_i32 s22, s3, 0x10010 -; GFX8-NEXT: s_bfe_i32 s10, s3, 0x10017 -; GFX8-NEXT: s_bfe_i32 s11, s3, 0x10016 -; GFX8-NEXT: s_bfe_i32 s23, s3, 0x10015 -; GFX8-NEXT: s_bfe_i32 s24, s3, 0x10014 -; GFX8-NEXT: v_mov_b32_e32 v25, s10 -; GFX8-NEXT: s_add_u32 s10, s0, 0xd0 -; GFX8-NEXT: v_mov_b32_e32 v24, s11 -; GFX8-NEXT: s_addc_u32 s11, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v27, s11 -; GFX8-NEXT: v_mov_b32_e32 v26, s10 -; GFX8-NEXT: s_add_u32 s10, s0, 0xc0 -; GFX8-NEXT: v_mov_b32_e32 v22, s24 -; GFX8-NEXT: v_mov_b32_e32 v23, s23 -; GFX8-NEXT: s_addc_u32 s11, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25] -; GFX8-NEXT: v_mov_b32_e32 v27, s11 -; GFX8-NEXT: v_mov_b32_e32 v26, s10 -; GFX8-NEXT: s_add_u32 s10, s0, 0x50 -; GFX8-NEXT: v_mov_b32_e32 v22, s22 -; GFX8-NEXT: v_mov_b32_e32 v23, s21 -; GFX8-NEXT: v_mov_b32_e32 v24, s20 -; GFX8-NEXT: v_mov_b32_e32 v25, s19 -; GFX8-NEXT: s_addc_u32 s11, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25] -; GFX8-NEXT: v_mov_b32_e32 v27, s11 -; GFX8-NEXT: v_mov_b32_e32 v26, s10 -; GFX8-NEXT: s_add_u32 s10, s0, 64 -; GFX8-NEXT: v_mov_b32_e32 v22, s2 -; GFX8-NEXT: v_mov_b32_e32 v23, s18 -; GFX8-NEXT: v_mov_b32_e32 v24, s17 -; GFX8-NEXT: v_mov_b32_e32 v25, s16 -; GFX8-NEXT: s_addc_u32 s11, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25] -; GFX8-NEXT: v_mov_b32_e32 v27, s11 -; GFX8-NEXT: v_mov_b32_e32 v26, s10 -; GFX8-NEXT: s_add_u32 s10, s0, 48 -; GFX8-NEXT: v_mov_b32_e32 v22, s15 -; GFX8-NEXT: v_mov_b32_e32 v23, s14 -; GFX8-NEXT: v_mov_b32_e32 v24, s13 -; GFX8-NEXT: v_mov_b32_e32 v25, s12 +; GFX8-NEXT: s_bfe_i32 s2, s26, 0x10003 +; GFX8-NEXT: s_bfe_i32 s3, s26, 0x10002 +; GFX8-NEXT: s_bfe_i32 s4, s26, 0x10001 +; GFX8-NEXT: s_bfe_i32 s5, s26, 0x10000 +; GFX8-NEXT: s_bfe_i32 s6, s26, 0x10007 +; GFX8-NEXT: s_bfe_i32 s7, s26, 0x10006 +; GFX8-NEXT: s_bfe_i32 s8, s26, 0x10005 +; GFX8-NEXT: s_bfe_i32 s9, s26, 0x10004 +; GFX8-NEXT: s_bfe_i32 s10, s26, 0x1000b +; GFX8-NEXT: s_bfe_i32 s11, s26, 0x1000a +; GFX8-NEXT: s_bfe_i32 s12, s26, 0x10009 +; GFX8-NEXT: s_bfe_i32 s13, s26, 0x10008 +; GFX8-NEXT: s_bfe_i32 s14, s26, 0x1000f +; GFX8-NEXT: s_bfe_i32 s15, s26, 0x1000e +; GFX8-NEXT: s_bfe_i32 s16, s26, 0x1000d +; GFX8-NEXT: s_bfe_i32 s17, s26, 0x1000c +; GFX8-NEXT: s_bfe_i32 s18, s26, 0x10013 +; GFX8-NEXT: s_bfe_i32 s19, s26, 0x10012 +; GFX8-NEXT: s_bfe_i32 s20, s26, 0x10011 +; GFX8-NEXT: s_bfe_i32 s21, s26, 0x10010 +; GFX8-NEXT: s_bfe_i32 s22, s26, 0x10017 +; GFX8-NEXT: s_bfe_i32 s23, s26, 0x10016 +; GFX8-NEXT: s_bfe_i32 s24, s26, 0x10015 +; GFX8-NEXT: s_bfe_i32 s25, s26, 0x10014 +; GFX8-NEXT: s_bfe_i32 s28, s26, 0x1001b +; GFX8-NEXT: s_bfe_i32 s29, s26, 0x1001a +; GFX8-NEXT: s_bfe_i32 s30, s26, 0x10019 +; GFX8-NEXT: s_bfe_i32 s31, s26, 0x10018 +; GFX8-NEXT: s_ashr_i32 s33, s26, 31 +; GFX8-NEXT: s_bfe_i32 s34, s26, 0x1001e +; GFX8-NEXT: s_bfe_i32 s35, s26, 
0x1001d +; GFX8-NEXT: s_bfe_i32 s36, s26, 0x1001c +; GFX8-NEXT: s_bfe_i32 s37, s27, 0x10003 +; GFX8-NEXT: s_bfe_i32 s38, s27, 0x10002 +; GFX8-NEXT: s_bfe_i32 s39, s27, 0x10001 +; GFX8-NEXT: s_bfe_i32 s40, s27, 0x10000 +; GFX8-NEXT: s_bfe_i32 s41, s27, 0x10007 +; GFX8-NEXT: s_bfe_i32 s42, s27, 0x10006 +; GFX8-NEXT: s_bfe_i32 s43, s27, 0x10005 +; GFX8-NEXT: s_bfe_i32 s44, s27, 0x10004 +; GFX8-NEXT: s_bfe_i32 s45, s27, 0x1000b +; GFX8-NEXT: s_bfe_i32 s46, s27, 0x1000a +; GFX8-NEXT: s_bfe_i32 s47, s27, 0x10009 +; GFX8-NEXT: s_bfe_i32 s48, s27, 0x10008 +; GFX8-NEXT: s_bfe_i32 s49, s27, 0x1000f +; GFX8-NEXT: s_bfe_i32 s50, s27, 0x1000e +; GFX8-NEXT: s_bfe_i32 s51, s27, 0x1000d +; GFX8-NEXT: s_bfe_i32 s52, s27, 0x1000c +; GFX8-NEXT: s_bfe_i32 s53, s27, 0x10013 +; GFX8-NEXT: s_bfe_i32 s54, s27, 0x10012 +; GFX8-NEXT: s_bfe_i32 s55, s27, 0x10011 +; GFX8-NEXT: s_bfe_i32 s56, s27, 0x10010 +; GFX8-NEXT: s_bfe_i32 s57, s27, 0x10017 +; GFX8-NEXT: s_bfe_i32 s58, s27, 0x10016 +; GFX8-NEXT: s_bfe_i32 s59, s27, 0x10015 +; GFX8-NEXT: s_bfe_i32 s60, s27, 0x10014 +; GFX8-NEXT: s_bfe_i32 s61, s27, 0x1001b +; GFX8-NEXT: s_bfe_i32 s62, s27, 0x1001a +; GFX8-NEXT: s_bfe_i32 s63, s27, 0x10019 +; GFX8-NEXT: s_bfe_i32 s64, s27, 0x10018 +; GFX8-NEXT: s_ashr_i32 s26, s27, 31 +; GFX8-NEXT: s_bfe_i32 s65, s27, 0x1001e +; GFX8-NEXT: s_bfe_i32 s66, s27, 0x1001d +; GFX8-NEXT: s_bfe_i32 s27, s27, 0x1001c +; GFX8-NEXT: v_mov_b32_e32 v3, s26 +; GFX8-NEXT: s_add_u32 s26, s0, 0xf0 +; GFX8-NEXT: v_mov_b32_e32 v0, s27 +; GFX8-NEXT: s_addc_u32 s27, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s26 +; GFX8-NEXT: v_mov_b32_e32 v1, s66 +; GFX8-NEXT: v_mov_b32_e32 v2, s65 +; GFX8-NEXT: v_mov_b32_e32 v5, s27 +; GFX8-NEXT: s_add_u32 s26, s0, 0xe0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_addc_u32 s27, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s26 +; GFX8-NEXT: v_mov_b32_e32 v0, s64 +; GFX8-NEXT: v_mov_b32_e32 v1, s63 +; GFX8-NEXT: v_mov_b32_e32 v2, s62 +; GFX8-NEXT: v_mov_b32_e32 v3, s61 +; GFX8-NEXT: v_mov_b32_e32 v5, s27 +; GFX8-NEXT: s_add_u32 s26, s0, 0xd0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_addc_u32 s27, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s26 +; GFX8-NEXT: v_mov_b32_e32 v0, s60 +; GFX8-NEXT: v_mov_b32_e32 v1, s59 +; GFX8-NEXT: v_mov_b32_e32 v2, s58 +; GFX8-NEXT: v_mov_b32_e32 v3, s57 +; GFX8-NEXT: v_mov_b32_e32 v5, s27 +; GFX8-NEXT: s_add_u32 s26, s0, 0xc0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_addc_u32 s27, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s26 +; GFX8-NEXT: v_mov_b32_e32 v0, s56 +; GFX8-NEXT: v_mov_b32_e32 v1, s55 +; GFX8-NEXT: v_mov_b32_e32 v2, s54 +; GFX8-NEXT: v_mov_b32_e32 v3, s53 +; GFX8-NEXT: v_mov_b32_e32 v5, s27 +; GFX8-NEXT: s_add_u32 s26, s0, 0xb0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_addc_u32 s27, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s26 +; GFX8-NEXT: v_mov_b32_e32 v0, s52 +; GFX8-NEXT: v_mov_b32_e32 v1, s51 +; GFX8-NEXT: v_mov_b32_e32 v2, s50 +; GFX8-NEXT: v_mov_b32_e32 v3, s49 +; GFX8-NEXT: v_mov_b32_e32 v5, s27 +; GFX8-NEXT: s_add_u32 s26, s0, 0xa0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_addc_u32 s27, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s26 +; GFX8-NEXT: v_mov_b32_e32 v0, s48 +; GFX8-NEXT: v_mov_b32_e32 v1, s47 +; GFX8-NEXT: v_mov_b32_e32 v2, s46 +; GFX8-NEXT: v_mov_b32_e32 v3, s45 +; GFX8-NEXT: v_mov_b32_e32 v5, s27 +; GFX8-NEXT: s_add_u32 s26, s0, 0x90 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_addc_u32 s27, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s26 +; GFX8-NEXT: 
v_mov_b32_e32 v0, s44 +; GFX8-NEXT: v_mov_b32_e32 v1, s43 +; GFX8-NEXT: v_mov_b32_e32 v2, s42 +; GFX8-NEXT: v_mov_b32_e32 v3, s41 +; GFX8-NEXT: v_mov_b32_e32 v5, s27 +; GFX8-NEXT: s_add_u32 s26, s0, 0x80 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_addc_u32 s27, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s26 +; GFX8-NEXT: v_mov_b32_e32 v0, s40 +; GFX8-NEXT: v_mov_b32_e32 v1, s39 +; GFX8-NEXT: v_mov_b32_e32 v2, s38 +; GFX8-NEXT: v_mov_b32_e32 v3, s37 +; GFX8-NEXT: v_mov_b32_e32 v5, s27 +; GFX8-NEXT: s_add_u32 s26, s0, 0x70 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_addc_u32 s27, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s26 +; GFX8-NEXT: v_mov_b32_e32 v0, s36 +; GFX8-NEXT: v_mov_b32_e32 v1, s35 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v3, s33 +; GFX8-NEXT: v_mov_b32_e32 v5, s27 +; GFX8-NEXT: s_add_u32 s26, s0, 0x60 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_addc_u32 s27, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s26 +; GFX8-NEXT: v_mov_b32_e32 v0, s31 +; GFX8-NEXT: v_mov_b32_e32 v1, s30 +; GFX8-NEXT: v_mov_b32_e32 v2, s29 +; GFX8-NEXT: v_mov_b32_e32 v3, s28 +; GFX8-NEXT: v_mov_b32_e32 v5, s27 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s22 +; GFX8-NEXT: s_add_u32 s22, s0, 0x50 +; GFX8-NEXT: v_mov_b32_e32 v2, s23 +; GFX8-NEXT: s_addc_u32 s23, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s22 +; GFX8-NEXT: v_mov_b32_e32 v0, s25 +; GFX8-NEXT: v_mov_b32_e32 v1, s24 +; GFX8-NEXT: v_mov_b32_e32 v5, s23 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s18 +; GFX8-NEXT: s_add_u32 s18, s0, 64 +; GFX8-NEXT: v_mov_b32_e32 v2, s19 +; GFX8-NEXT: s_addc_u32 s19, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s18 +; GFX8-NEXT: v_mov_b32_e32 v0, s21 +; GFX8-NEXT: v_mov_b32_e32 v1, s20 +; GFX8-NEXT: v_mov_b32_e32 v5, s19 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s14 +; GFX8-NEXT: s_add_u32 s14, s0, 48 +; GFX8-NEXT: v_mov_b32_e32 v2, s15 +; GFX8-NEXT: s_addc_u32 s15, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s14 +; GFX8-NEXT: v_mov_b32_e32 v0, s17 +; GFX8-NEXT: v_mov_b32_e32 v1, s16 +; GFX8-NEXT: v_mov_b32_e32 v5, s15 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s10 +; GFX8-NEXT: s_add_u32 s10, s0, 32 +; GFX8-NEXT: v_mov_b32_e32 v2, s11 ; GFX8-NEXT: s_addc_u32 s11, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25] -; GFX8-NEXT: v_bfe_i32 v21, v21, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v23, s11 -; GFX8-NEXT: v_bfe_i32 v20, v20, 0, 1 -; GFX8-NEXT: v_bfe_i32 v19, v19, 0, 1 -; GFX8-NEXT: v_bfe_i32 v18, v18, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v22, s10 -; GFX8-NEXT: s_add_u32 s2, s0, 32 -; GFX8-NEXT: v_lshrrev_b16_e64 v3, 12, s3 -; GFX8-NEXT: v_lshrrev_b16_e64 v4, 13, s3 -; GFX8-NEXT: v_lshrrev_b16_e64 v5, 14, s3 -; GFX8-NEXT: v_lshrrev_b16_e64 v6, 15, s3 -; GFX8-NEXT: v_lshrrev_b16_e64 v0, 8, s3 -; GFX8-NEXT: v_lshrrev_b16_e64 v1, 9, s3 -; GFX8-NEXT: v_lshrrev_b16_e64 v2, 10, s3 -; GFX8-NEXT: v_lshrrev_b16_e64 v24, 11, s3 -; GFX8-NEXT: v_lshrrev_b16_e64 v26, 4, s3 -; GFX8-NEXT: v_lshrrev_b16_e64 v27, 5, s3 -; GFX8-NEXT: v_lshrrev_b16_e64 v28, 6, s3 -; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[18:21] -; GFX8-NEXT: v_lshrrev_b16_e64 v22, 7, s3 -; GFX8-NEXT: v_lshrrev_b16_e64 v23, 1, s3 -; GFX8-NEXT: v_lshrrev_b16_e64 v20, 2, s3 -; GFX8-NEXT: v_lshrrev_b16_e64 v21, 3, s3 -; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v19, s3 
-; GFX8-NEXT: v_mov_b32_e32 v18, s2 -; GFX8-NEXT: s_add_u32 s2, s0, 16 -; GFX8-NEXT: v_bfe_i32 v17, v17, 0, 1 -; GFX8-NEXT: v_bfe_i32 v16, v16, 0, 1 -; GFX8-NEXT: v_bfe_i32 v15, v15, 0, 1 -; GFX8-NEXT: v_bfe_i32 v14, v14, 0, 1 -; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[18:19], v[14:17] -; GFX8-NEXT: v_bfe_i32 v13, v13, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v15, s3 -; GFX8-NEXT: v_bfe_i32 v12, v12, 0, 1 -; GFX8-NEXT: v_bfe_i32 v11, v11, 0, 1 -; GFX8-NEXT: v_bfe_i32 v10, v10, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v14, s2 -; GFX8-NEXT: flat_store_dwordx4 v[14:15], v[10:13] -; GFX8-NEXT: s_add_u32 s2, s0, 0xb0 -; GFX8-NEXT: v_mov_b32_e32 v12, s1 -; GFX8-NEXT: v_bfe_i32 v10, v9, 0, 1 -; GFX8-NEXT: v_bfe_i32 v9, v8, 0, 1 -; GFX8-NEXT: v_bfe_i32 v8, v7, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v7, s9 -; GFX8-NEXT: v_mov_b32_e32 v11, s0 -; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[11:12], v[7:10] -; GFX8-NEXT: v_lshrrev_b16_e64 v11, 6, s8 -; GFX8-NEXT: v_mov_b32_e32 v8, s3 -; GFX8-NEXT: v_lshrrev_b16_e64 v10, 5, s8 -; GFX8-NEXT: v_bfe_i32 v6, v6, 0, 1 -; GFX8-NEXT: v_bfe_i32 v5, v5, 0, 1 -; GFX8-NEXT: v_bfe_i32 v4, v4, 0, 1 -; GFX8-NEXT: v_bfe_i32 v3, v3, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v7, s2 -; GFX8-NEXT: s_add_u32 s2, s0, 0xa0 -; GFX8-NEXT: v_lshrrev_b16_e64 v13, 4, s8 -; GFX8-NEXT: v_lshrrev_b16_e64 v12, 1, s8 -; GFX8-NEXT: flat_store_dwordx4 v[7:8], v[3:6] -; GFX8-NEXT: v_bfe_i32 v8, v11, 0, 1 -; GFX8-NEXT: v_lshrrev_b16_e64 v3, 2, s8 -; GFX8-NEXT: v_lshrrev_b16_e64 v4, 3, s8 -; GFX8-NEXT: v_bfe_i32 v7, v10, 0, 1 -; GFX8-NEXT: v_bfe_i32 v11, v1, 0, 1 -; GFX8-NEXT: v_bfe_i32 v10, v0, 0, 1 -; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_bfe_i32 v5, v4, 0, 1 -; GFX8-NEXT: v_bfe_i32 v4, v3, 0, 1 -; GFX8-NEXT: v_bfe_i32 v3, v12, 0, 1 -; GFX8-NEXT: v_bfe_i32 v6, v13, 0, 1 -; GFX8-NEXT: v_bfe_i32 v13, v24, 0, 1 -; GFX8-NEXT: v_bfe_i32 v12, v2, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: s_add_u32 s2, s0, 0x90 -; GFX8-NEXT: v_lshrrev_b16_e64 v19, 5, s7 -; GFX8-NEXT: v_lshrrev_b16_e64 v15, 2, s7 -; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[10:13] -; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_bfe_i32 v12, v15, 0, 1 -; GFX8-NEXT: v_bfe_i32 v15, v19, 0, 1 -; GFX8-NEXT: v_bfe_i32 v19, v23, 0, 1 -; GFX8-NEXT: v_bfe_i32 v25, v22, 0, 1 -; GFX8-NEXT: v_bfe_i32 v24, v28, 0, 1 -; GFX8-NEXT: v_bfe_i32 v23, v27, 0, 1 -; GFX8-NEXT: v_bfe_i32 v22, v26, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: s_add_u32 s2, s0, 0x80 -; GFX8-NEXT: v_lshrrev_b16_e64 v18, 4, s7 -; GFX8-NEXT: v_lshrrev_b16_e64 v14, 1, s7 -; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[22:25] -; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_bfe_i32 v11, v14, 0, 1 -; GFX8-NEXT: v_bfe_i32 v14, v18, 0, 1 -; GFX8-NEXT: v_bfe_i32 v21, v21, 0, 1 -; GFX8-NEXT: v_bfe_i32 v20, v20, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v18, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: s_add_u32 s2, s0, 0xf0 -; GFX8-NEXT: v_lshrrev_b16_e64 v16, 6, s7 -; GFX8-NEXT: v_lshrrev_b16_e64 v17, 7, s7 -; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[18:21] -; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_bfe_i32 v17, v17, 0, 1 -; GFX8-NEXT: v_bfe_i32 v16, v16, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: s_add_u32 s2, s0, 0xe0 -; GFX8-NEXT: v_lshrrev_b16_e64 v2, 3, s7 -; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[14:17] -; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 
-; GFX8-NEXT: v_bfe_i32 v13, v2, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v10, s5 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: s_add_u32 s2, s0, 0x70 -; GFX8-NEXT: v_lshrrev_b16_e64 v9, 7, s8 -; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[10:13] -; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_bfe_i32 v9, v9, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: s_add_u32 s0, s0, 0x60 -; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[6:9] -; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[2:5] +; GFX8-NEXT: v_mov_b32_e32 v4, s10 +; GFX8-NEXT: v_mov_b32_e32 v0, s13 +; GFX8-NEXT: v_mov_b32_e32 v1, s12 +; GFX8-NEXT: v_mov_b32_e32 v5, s11 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s6 +; GFX8-NEXT: s_add_u32 s6, s0, 16 +; GFX8-NEXT: v_mov_b32_e32 v2, s7 +; GFX8-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, s9 +; GFX8-NEXT: v_mov_b32_e32 v1, s8 +; GFX8-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: v_mov_b32_e32 v2, s3 +; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm ; ; EG-LABEL: constant_sextload_v64i1_to_v64i32: @@ -4256,150 +4082,124 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_lshrrev_b16 v28, 12, s3 -; GFX12-NEXT: v_lshrrev_b16 v29, 13, s3 -; GFX12-NEXT: v_lshrrev_b16 v30, 14, s3 -; GFX12-NEXT: v_lshrrev_b16 v31, 15, s3 -; GFX12-NEXT: v_lshrrev_b16 v20, 4, s3 -; GFX12-NEXT: v_lshrrev_b16 v21, 5, s3 -; GFX12-NEXT: v_lshrrev_b16 v22, 6, s3 -; GFX12-NEXT: v_lshrrev_b16 v23, 7, s3 -; GFX12-NEXT: s_lshr_b32 s4, s3, 24 -; GFX12-NEXT: s_lshr_b32 s5, s2, 24 -; GFX12-NEXT: v_lshrrev_b16 v16, 14, s2 -; GFX12-NEXT: v_lshrrev_b16 v24, 8, s3 -; GFX12-NEXT: v_lshrrev_b16 v25, 9, s3 -; GFX12-NEXT: v_lshrrev_b16 v26, 10, s3 -; GFX12-NEXT: v_lshrrev_b16 v27, 11, s3 -; GFX12-NEXT: v_lshrrev_b16 v17, 1, s3 -; GFX12-NEXT: v_lshrrev_b16 v18, 2, s3 -; GFX12-NEXT: v_lshrrev_b16 v19, 3, s3 -; GFX12-NEXT: v_lshrrev_b16 v0, 12, s2 -; GFX12-NEXT: v_lshrrev_b16 v8, 13, s2 -; GFX12-NEXT: v_lshrrev_b16 v32, 15, s2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_lshrrev_b16 v12, 4, s4 -; GFX12-NEXT: v_lshrrev_b16 v13, 5, s4 -; GFX12-NEXT: v_lshrrev_b16 v14, 6, s4 -; GFX12-NEXT: v_lshrrev_b16 v15, 7, s4 -; GFX12-NEXT: v_lshrrev_b16 v1, 3, s5 -; GFX12-NEXT: v_lshrrev_b16 v7, 1, s5 -; GFX12-NEXT: v_lshrrev_b16 v44, 7, s5 -; GFX12-NEXT: v_lshrrev_b16 v40, 8, s2 -; GFX12-NEXT: v_lshrrev_b16 v41, 9, s2 -; GFX12-NEXT: v_lshrrev_b16 v42, 10, s2 -; GFX12-NEXT: v_lshrrev_b16 v43, 11, s2 -; GFX12-NEXT: v_lshrrev_b16 v36, 4, s2 -; GFX12-NEXT: v_lshrrev_b16 v37, 5, s2 -; GFX12-NEXT: v_lshrrev_b16 v38, 6, s2 -; GFX12-NEXT: v_lshrrev_b16 v39, 7, s2 -; GFX12-NEXT: v_lshrrev_b16 v33, 1, s2 -; GFX12-NEXT: v_lshrrev_b16 v34, 2, s2 -; GFX12-NEXT: v_lshrrev_b16 v35, 3, s2 -; GFX12-NEXT: v_lshrrev_b16 v9, 1, s4 -; GFX12-NEXT: v_lshrrev_b16 v10, 2, s4 -; GFX12-NEXT: v_lshrrev_b16 v11, 3, s4 -; GFX12-NEXT: v_lshrrev_b16 v4, 4, s5 -; GFX12-NEXT: v_lshrrev_b16 v5, 5, s5 -; GFX12-NEXT: 
v_lshrrev_b16 v6, 6, s5 -; GFX12-NEXT: v_lshrrev_b16 v2, 2, s5 -; GFX12-NEXT: s_bfe_i32 s4, s2, 0x10018 -; GFX12-NEXT: s_bfe_i32 s5, s3, 0x10018 -; GFX12-NEXT: s_bfe_i32 s6, s3, 0x10000 +; GFX12-NEXT: s_ashr_i32 s63, s3, 31 +; GFX12-NEXT: s_bfe_i32 s64, s3, 0x1001e +; GFX12-NEXT: s_bfe_i32 s65, s3, 0x1001c +; GFX12-NEXT: s_bfe_i32 s66, s3, 0x1001d +; GFX12-NEXT: s_bfe_i32 s59, s3, 0x1001b +; GFX12-NEXT: s_bfe_i32 s60, s3, 0x1001a +; GFX12-NEXT: s_bfe_i32 s61, s3, 0x10019 +; GFX12-NEXT: s_bfe_i32 s62, s3, 0x10018 +; GFX12-NEXT: v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v1, s66 +; GFX12-NEXT: s_bfe_i32 s55, s3, 0x10017 +; GFX12-NEXT: s_bfe_i32 s56, s3, 0x10016 +; GFX12-NEXT: s_bfe_i32 s57, s3, 0x10015 +; GFX12-NEXT: s_bfe_i32 s58, s3, 0x10014 +; GFX12-NEXT: v_dual_mov_b32 v0, s65 :: v_dual_mov_b32 v3, s63 +; GFX12-NEXT: v_dual_mov_b32 v2, s64 :: v_dual_mov_b32 v5, s61 +; GFX12-NEXT: s_bfe_i32 s51, s3, 0x10013 +; GFX12-NEXT: s_bfe_i32 s52, s3, 0x10012 +; GFX12-NEXT: s_bfe_i32 s53, s3, 0x10011 +; GFX12-NEXT: s_bfe_i32 s54, s3, 0x10010 +; GFX12-NEXT: v_dual_mov_b32 v4, s62 :: v_dual_mov_b32 v7, s59 +; GFX12-NEXT: v_dual_mov_b32 v6, s60 :: v_dual_mov_b32 v9, s57 +; GFX12-NEXT: v_dual_mov_b32 v8, s58 :: v_dual_mov_b32 v11, s55 +; GFX12-NEXT: v_dual_mov_b32 v10, s56 :: v_dual_mov_b32 v13, s53 +; GFX12-NEXT: s_bfe_i32 s36, s3, 0x10003 +; GFX12-NEXT: s_bfe_i32 s37, s3, 0x10002 +; GFX12-NEXT: s_bfe_i32 s38, s3, 0x10001 +; GFX12-NEXT: s_bfe_i32 s39, s3, 0x10000 +; GFX12-NEXT: s_bfe_i32 s40, s3, 0x10007 +; GFX12-NEXT: s_bfe_i32 s41, s3, 0x10006 +; GFX12-NEXT: s_bfe_i32 s42, s3, 0x10005 +; GFX12-NEXT: s_bfe_i32 s43, s3, 0x10004 +; GFX12-NEXT: s_bfe_i32 s44, s3, 0x1000b +; GFX12-NEXT: s_bfe_i32 s45, s3, 0x1000a +; GFX12-NEXT: s_bfe_i32 s46, s3, 0x10009 +; GFX12-NEXT: s_bfe_i32 s47, s3, 0x10008 +; GFX12-NEXT: s_bfe_i32 s48, s3, 0x1000f +; GFX12-NEXT: s_bfe_i32 s49, s3, 0x1000e +; GFX12-NEXT: s_bfe_i32 s50, s3, 0x1000d +; GFX12-NEXT: v_dual_mov_b32 v12, s54 :: v_dual_mov_b32 v15, s51 +; GFX12-NEXT: v_mov_b32_e32 v14, s52 +; GFX12-NEXT: s_bfe_i32 s3, s3, 0x1000c +; GFX12-NEXT: s_clause 0x3 +; GFX12-NEXT: global_store_b128 v24, v[0:3], s[0:1] offset:240 +; GFX12-NEXT: global_store_b128 v24, v[4:7], s[0:1] offset:224 +; GFX12-NEXT: global_store_b128 v24, v[8:11], s[0:1] offset:208 +; GFX12-NEXT: global_store_b128 v24, v[12:15], s[0:1] offset:192 +; GFX12-NEXT: v_dual_mov_b32 v1, s50 :: v_dual_mov_b32 v0, s3 +; GFX12-NEXT: v_dual_mov_b32 v3, s48 :: v_dual_mov_b32 v2, s49 +; GFX12-NEXT: v_dual_mov_b32 v5, s46 :: v_dual_mov_b32 v4, s47 +; GFX12-NEXT: v_dual_mov_b32 v7, s44 :: v_dual_mov_b32 v6, s45 +; GFX12-NEXT: v_mov_b32_e32 v9, s42 +; GFX12-NEXT: s_bfe_i32 s4, s2, 0x10003 +; GFX12-NEXT: s_bfe_i32 s5, s2, 0x10002 +; GFX12-NEXT: s_bfe_i32 s6, s2, 0x10001 ; GFX12-NEXT: s_bfe_i32 s7, s2, 0x10000 -; GFX12-NEXT: s_bfe_i32 s8, s2, 0x10013 -; GFX12-NEXT: s_bfe_i32 s9, s2, 0x10012 -; GFX12-NEXT: s_bfe_i32 s10, s2, 0x10011 -; GFX12-NEXT: s_bfe_i32 s11, s2, 0x10010 -; GFX12-NEXT: s_bfe_i32 s12, s2, 0x10017 -; GFX12-NEXT: s_bfe_i32 s13, s2, 0x10016 -; GFX12-NEXT: s_bfe_i32 s14, s2, 0x10015 -; GFX12-NEXT: s_bfe_i32 s2, s2, 0x10014 -; GFX12-NEXT: s_bfe_i32 s15, s3, 0x10013 -; GFX12-NEXT: s_bfe_i32 s16, s3, 0x10012 -; GFX12-NEXT: s_bfe_i32 s17, s3, 0x10011 -; GFX12-NEXT: s_bfe_i32 s18, s3, 0x10010 -; GFX12-NEXT: s_bfe_i32 s19, s3, 0x10017 -; GFX12-NEXT: s_bfe_i32 s20, s3, 0x10016 -; GFX12-NEXT: s_bfe_i32 s21, s3, 0x10014 -; GFX12-NEXT: s_bfe_i32 s3, s3, 0x10015 +; GFX12-NEXT: s_bfe_i32 s8, s2, 0x10007 +; GFX12-NEXT: s_bfe_i32 
s9, s2, 0x10006 +; GFX12-NEXT: s_bfe_i32 s10, s2, 0x10005 +; GFX12-NEXT: s_bfe_i32 s11, s2, 0x10004 +; GFX12-NEXT: s_bfe_i32 s12, s2, 0x1000b +; GFX12-NEXT: s_bfe_i32 s13, s2, 0x1000a +; GFX12-NEXT: s_bfe_i32 s14, s2, 0x10009 +; GFX12-NEXT: s_bfe_i32 s15, s2, 0x10008 +; GFX12-NEXT: s_bfe_i32 s16, s2, 0x1000f +; GFX12-NEXT: s_bfe_i32 s17, s2, 0x1000e +; GFX12-NEXT: s_bfe_i32 s18, s2, 0x1000d +; GFX12-NEXT: s_bfe_i32 s19, s2, 0x1000c +; GFX12-NEXT: s_bfe_i32 s20, s2, 0x10013 +; GFX12-NEXT: s_bfe_i32 s21, s2, 0x10012 +; GFX12-NEXT: s_bfe_i32 s22, s2, 0x10011 +; GFX12-NEXT: s_bfe_i32 s23, s2, 0x10010 +; GFX12-NEXT: s_bfe_i32 s24, s2, 0x10017 +; GFX12-NEXT: s_bfe_i32 s25, s2, 0x10016 +; GFX12-NEXT: s_bfe_i32 s26, s2, 0x10015 +; GFX12-NEXT: s_bfe_i32 s27, s2, 0x10014 +; GFX12-NEXT: s_bfe_i32 s28, s2, 0x1001b +; GFX12-NEXT: s_bfe_i32 s29, s2, 0x1001a +; GFX12-NEXT: s_bfe_i32 s30, s2, 0x10019 +; GFX12-NEXT: s_bfe_i32 s31, s2, 0x10018 +; GFX12-NEXT: s_ashr_i32 s33, s2, 31 +; GFX12-NEXT: s_bfe_i32 s34, s2, 0x1001e +; GFX12-NEXT: s_bfe_i32 s35, s2, 0x1001d +; GFX12-NEXT: s_bfe_i32 s2, s2, 0x1001c +; GFX12-NEXT: v_dual_mov_b32 v8, s43 :: v_dual_mov_b32 v11, s40 +; GFX12-NEXT: v_dual_mov_b32 v10, s41 :: v_dual_mov_b32 v13, s38 +; GFX12-NEXT: v_dual_mov_b32 v12, s39 :: v_dual_mov_b32 v15, s36 +; GFX12-NEXT: v_dual_mov_b32 v14, s37 :: v_dual_mov_b32 v17, s35 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v64, 0 :: v_dual_mov_b32 v49, s3 -; GFX12-NEXT: v_bfe_i32 v23, v23, 0, 1 -; GFX12-NEXT: v_bfe_i32 v22, v22, 0, 1 -; GFX12-NEXT: v_bfe_i32 v21, v21, 0, 1 -; GFX12-NEXT: v_bfe_i32 v20, v20, 0, 1 -; GFX12-NEXT: v_bfe_i32 v31, v31, 0, 1 -; GFX12-NEXT: v_bfe_i32 v30, v30, 0, 1 -; GFX12-NEXT: v_bfe_i32 v29, v29, 0, 1 -; GFX12-NEXT: v_bfe_i32 v28, v28, 0, 1 -; GFX12-NEXT: v_dual_mov_b32 v48, s21 :: v_dual_mov_b32 v51, s19 -; GFX12-NEXT: v_dual_mov_b32 v50, s20 :: v_dual_mov_b32 v53, s17 -; GFX12-NEXT: v_dual_mov_b32 v54, s16 :: v_dual_mov_b32 v57, s14 -; GFX12-NEXT: v_dual_mov_b32 v56, s2 :: v_dual_mov_b32 v59, s12 -; GFX12-NEXT: v_dual_mov_b32 v58, s13 :: v_dual_mov_b32 v61, s10 -; GFX12-NEXT: v_bfe_i32 v19, v19, 0, 1 -; GFX12-NEXT: v_bfe_i32 v18, v18, 0, 1 -; GFX12-NEXT: v_bfe_i32 v17, v17, 0, 1 -; GFX12-NEXT: v_bfe_i32 v27, v27, 0, 1 -; GFX12-NEXT: v_bfe_i32 v26, v26, 0, 1 -; GFX12-NEXT: v_bfe_i32 v25, v25, 0, 1 -; GFX12-NEXT: v_bfe_i32 v24, v24, 0, 1 -; GFX12-NEXT: v_bfe_i32 v46, v16, 0, 1 -; GFX12-NEXT: v_dual_mov_b32 v52, s18 :: v_dual_mov_b32 v55, s15 -; GFX12-NEXT: v_dual_mov_b32 v60, s11 :: v_dual_mov_b32 v63, s8 -; GFX12-NEXT: v_mov_b32_e32 v62, s9 -; GFX12-NEXT: v_mov_b32_e32 v16, s6 -; GFX12-NEXT: v_bfe_i32 v3, v1, 0, 1 -; GFX12-NEXT: v_bfe_i32 v1, v7, 0, 1 -; GFX12-NEXT: v_bfe_i32 v7, v44, 0, 1 -; GFX12-NEXT: v_bfe_i32 v15, v15, 0, 1 -; GFX12-NEXT: v_bfe_i32 v14, v14, 0, 1 -; GFX12-NEXT: v_bfe_i32 v13, v13, 0, 1 -; GFX12-NEXT: v_bfe_i32 v12, v12, 0, 1 -; GFX12-NEXT: v_bfe_i32 v47, v32, 0, 1 -; GFX12-NEXT: v_bfe_i32 v45, v8, 0, 1 -; GFX12-NEXT: v_bfe_i32 v44, v0, 0, 1 -; GFX12-NEXT: v_bfe_i32 v11, v11, 0, 1 -; GFX12-NEXT: v_bfe_i32 v10, v10, 0, 1 -; GFX12-NEXT: v_bfe_i32 v9, v9, 0, 1 -; GFX12-NEXT: v_bfe_i32 v43, v43, 0, 1 -; GFX12-NEXT: v_bfe_i32 v42, v42, 0, 1 -; GFX12-NEXT: v_bfe_i32 v41, v41, 0, 1 -; GFX12-NEXT: v_bfe_i32 v40, v40, 0, 1 -; GFX12-NEXT: v_mov_b32_e32 v8, s5 -; GFX12-NEXT: v_bfe_i32 v6, v6, 0, 1 -; GFX12-NEXT: v_bfe_i32 v5, v5, 0, 1 -; GFX12-NEXT: v_bfe_i32 v4, v4, 0, 1 -; GFX12-NEXT: v_bfe_i32 v39, v39, 0, 1 -; GFX12-NEXT: v_bfe_i32 v38, v38, 0, 1 -; GFX12-NEXT: 
v_bfe_i32 v37, v37, 0, 1 -; GFX12-NEXT: v_bfe_i32 v36, v36, 0, 1 -; GFX12-NEXT: v_bfe_i32 v2, v2, 0, 1 -; GFX12-NEXT: v_bfe_i32 v35, v35, 0, 1 -; GFX12-NEXT: v_bfe_i32 v34, v34, 0, 1 -; GFX12-NEXT: v_bfe_i32 v33, v33, 0, 1 -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v64, v[48:51], s[0:1] offset:208 -; GFX12-NEXT: global_store_b128 v64, v[52:55], s[0:1] offset:192 -; GFX12-NEXT: v_mov_b32_e32 v32, s7 -; GFX12-NEXT: s_clause 0x7 -; GFX12-NEXT: global_store_b128 v64, v[56:59], s[0:1] offset:80 -; GFX12-NEXT: global_store_b128 v64, v[60:63], s[0:1] offset:64 -; GFX12-NEXT: global_store_b128 v64, v[44:47], s[0:1] offset:48 -; GFX12-NEXT: global_store_b128 v64, v[40:43], s[0:1] offset:32 -; GFX12-NEXT: global_store_b128 v64, v[36:39], s[0:1] offset:16 -; GFX12-NEXT: global_store_b128 v64, v[32:35], s[0:1] -; GFX12-NEXT: global_store_b128 v64, v[28:31], s[0:1] offset:176 -; GFX12-NEXT: global_store_b128 v64, v[24:27], s[0:1] offset:160 -; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_dual_mov_b32 v16, s2 :: v_dual_mov_b32 v19, s33 +; GFX12-NEXT: v_dual_mov_b32 v18, s34 :: v_dual_mov_b32 v21, s30 +; GFX12-NEXT: v_dual_mov_b32 v20, s31 :: v_dual_mov_b32 v23, s28 +; GFX12-NEXT: v_mov_b32_e32 v22, s29 +; GFX12-NEXT: s_clause 0x5 +; GFX12-NEXT: global_store_b128 v24, v[0:3], s[0:1] offset:176 +; GFX12-NEXT: global_store_b128 v24, v[4:7], s[0:1] offset:160 +; GFX12-NEXT: global_store_b128 v24, v[8:11], s[0:1] offset:144 +; GFX12-NEXT: global_store_b128 v24, v[12:15], s[0:1] offset:128 +; GFX12-NEXT: global_store_b128 v24, v[16:19], s[0:1] offset:112 +; GFX12-NEXT: global_store_b128 v24, v[20:23], s[0:1] offset:96 +; GFX12-NEXT: v_dual_mov_b32 v1, s26 :: v_dual_mov_b32 v0, s27 +; GFX12-NEXT: v_dual_mov_b32 v3, s24 :: v_dual_mov_b32 v2, s25 +; GFX12-NEXT: v_dual_mov_b32 v5, s22 :: v_dual_mov_b32 v4, s23 +; GFX12-NEXT: v_dual_mov_b32 v7, s20 :: v_dual_mov_b32 v6, s21 +; GFX12-NEXT: v_dual_mov_b32 v9, s18 :: v_dual_mov_b32 v8, s19 +; GFX12-NEXT: v_dual_mov_b32 v11, s16 :: v_dual_mov_b32 v10, s17 +; GFX12-NEXT: v_dual_mov_b32 v13, s14 :: v_dual_mov_b32 v12, s15 +; GFX12-NEXT: v_dual_mov_b32 v15, s12 :: v_dual_mov_b32 v14, s13 +; GFX12-NEXT: v_dual_mov_b32 v17, s10 :: v_dual_mov_b32 v16, s11 +; GFX12-NEXT: v_dual_mov_b32 v19, s8 :: v_dual_mov_b32 v18, s9 +; GFX12-NEXT: v_dual_mov_b32 v21, s6 :: v_dual_mov_b32 v20, s7 +; GFX12-NEXT: v_dual_mov_b32 v23, s4 :: v_dual_mov_b32 v22, s5 ; GFX12-NEXT: s_clause 0x5 -; GFX12-NEXT: global_store_b128 v64, v[20:23], s[0:1] offset:144 -; GFX12-NEXT: global_store_b128 v64, v[16:19], s[0:1] offset:128 -; GFX12-NEXT: global_store_b128 v64, v[12:15], s[0:1] offset:240 -; GFX12-NEXT: global_store_b128 v64, v[8:11], s[0:1] offset:224 -; GFX12-NEXT: global_store_b128 v64, v[4:7], s[0:1] offset:112 -; GFX12-NEXT: global_store_b128 v64, v[0:3], s[0:1] offset:96 +; GFX12-NEXT: global_store_b128 v24, v[0:3], s[0:1] offset:80 +; GFX12-NEXT: global_store_b128 v24, v[4:7], s[0:1] offset:64 +; GFX12-NEXT: global_store_b128 v24, v[8:11], s[0:1] offset:48 +; GFX12-NEXT: global_store_b128 v24, v[12:15], s[0:1] offset:32 +; GFX12-NEXT: global_store_b128 v24, v[16:19], s[0:1] offset:16 +; GFX12-NEXT: global_store_b128 v24, v[20:23], s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -4720,6 +4520,7 @@ define amdgpu_kernel void @constant_zextload_v2i1_to_v2i64(ptr addrspace(1) %out ; GFX8-LABEL: constant_zextload_v2i1_to_v2i64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: 
v_mov_b32_e32 v2, 1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -4727,11 +4528,10 @@ define amdgpu_kernel void @constant_zextload_v2i1_to_v2i64(ptr addrspace(1) %out ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v3, 1, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v2, 1, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v3 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm ; @@ -4762,11 +4562,13 @@ define amdgpu_kernel void @constant_zextload_v2i1_to_v2i64(ptr addrspace(1) %out ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v1, s[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b16 v2, 1, v0 +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v0 ; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v0, 1, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-NEXT: v_lshrrev_b32_e32 v2, 1, v2 ; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -4810,7 +4612,7 @@ define amdgpu_kernel void @constant_sextload_v2i1_to_v2i64(ptr addrspace(1) %out ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b16_e32 v2, 1, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 1 ; GFX8-NEXT: v_bfe_i32 v2, v2, 0, 1 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 @@ -4846,7 +4648,7 @@ define amdgpu_kernel void @constant_sextload_v2i1_to_v2i64(ptr addrspace(1) %out ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v4, s[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b16 v1, 1, v0 +; GFX12-NEXT: v_lshrrev_b32_e32 v1, 1, v0 ; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_bfe_i32 v2, v1, 0, 1 @@ -4891,28 +4693,26 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i64(ptr addrspace(1) %out ; GFX8-LABEL: constant_zextload_v3i1_to_v3i64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: v_mov_b32_e32 v10, 2 ; GFX8-NEXT: v_mov_b32_e32 v5, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, v5 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: flat_load_ubyte v0, v[0:1] +; GFX8-NEXT: flat_load_ubyte v4, v[0:1] ; GFX8-NEXT: s_add_u32 s2, s0, 16 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v7, s3 -; GFX8-NEXT: v_mov_b32_e32 v6, s2 +; GFX8-NEXT: v_mov_b32_e32 v9, s3 +; GFX8-NEXT: v_mov_b32_e32 v7, s1 +; GFX8-NEXT: v_mov_b32_e32 v8, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: v_mov_b32_e32 v6, s0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b16_e32 v2, 1, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v4, 2, v0 -; GFX8-NEXT: v_and_b32_e32 v8, 1, v0 -; GFX8-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX8-NEXT: flat_store_dwordx2 v[6:7], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v8 -; 
GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: v_and_b32_e32 v0, 1, v4 +; GFX8-NEXT: v_bfe_u32 v2, v4, 1, 1 +; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v10, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: flat_store_dwordx2 v[8:9], v[4:5] +; GFX8-NEXT: flat_store_dwordx4 v[6:7], v[0:3] ; GFX8-NEXT: s_endpgm ; ; EG-LABEL: constant_zextload_v3i1_to_v3i64: @@ -4948,15 +4748,16 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i64(ptr addrspace(1) %out ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v5, s[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX12-NEXT: v_lshrrev_b16 v1, 1, v0 -; GFX12-NEXT: v_lshrrev_b16 v3, 2, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v2 -; GFX12-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_and_b32 v6, 1, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_dual_mov_b32 v3, v5 :: v_dual_and_b32 v4, 0xffff, v3 -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v6 +; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v0 +; GFX12-NEXT: v_bfe_u32 v2, v0, 1, 1 +; GFX12-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_lshrrev_b32_e32 v4, 2, v1 +; GFX12-NEXT: v_mov_b32_e32 v3, v5 +; GFX12-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_and_b32 v2, 0xffff, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b64 v5, v[4:5], s[0:1] offset:16 ; GFX12-NEXT: global_store_b128 v5, v[0:3], s[0:1] @@ -5010,8 +4811,8 @@ define amdgpu_kernel void @constant_sextload_v3i1_to_v3i64(ptr addrspace(1) %out ; GFX8-NEXT: v_mov_b32_e32 v6, s2 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b16_e32 v3, 2, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v2, 1, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 2, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: v_bfe_i32 v8, v3, 0, 1 ; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 1 ; GFX8-NEXT: v_bfe_i32 v2, v2, 0, 1 @@ -5058,8 +4859,8 @@ define amdgpu_kernel void @constant_sextload_v3i1_to_v3i64(ptr addrspace(1) %out ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v6, s[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b16 v1, 2, v0 -; GFX12-NEXT: v_lshrrev_b16 v2, 1, v0 +; GFX12-NEXT: v_lshrrev_b32_e32 v1, 2, v0 +; GFX12-NEXT: v_lshrrev_b32_e32 v2, 1, v0 ; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_bfe_i32 v4, v1, 0, 1 @@ -5111,6 +4912,7 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i64(ptr addrspace(1) %out ; GFX8-LABEL: constant_zextload_v4i1_to_v4i64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: v_mov_b32_e32 v2, 3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -5126,15 +4928,10 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i64(ptr addrspace(1) %out ; GFX8-NEXT: v_mov_b32_e32 v7, v1 ; GFX8-NEXT: v_mov_b32_e32 v8, s0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b16_e32 v4, 2, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v6, 1, v0 -; GFX8-NEXT: v_and_b32_e32 
v12, 1, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v2, 3, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 1, v6 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX8-NEXT: v_and_b32_e32 v4, 0xffff, v12 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX8-NEXT: v_and_b32_e32 v4, 1, v0 +; GFX8-NEXT: v_bfe_u32 v6, v0, 1, 1 +; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_bfe_u32 v0, v0, 2, 1 ; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[0:3] ; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[4:7] ; GFX8-NEXT: s_endpgm @@ -5175,23 +4972,25 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i64(ptr addrspace(1) %out ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v1, s[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_and_b32_e32 v6, 1, v0 -; GFX12-NEXT: v_lshrrev_b16 v2, 2, v0 -; GFX12-NEXT: v_lshrrev_b16 v4, 1, v0 -; GFX12-NEXT: v_mov_b32_e32 v3, v1 -; GFX12-NEXT: v_lshrrev_b16 v0, 3, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_and_b32 v8, 1, v2 -; GFX12-NEXT: v_and_b32_e32 v9, 1, v4 -; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_and_b32 v4, 0xffff, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v0 -; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v8 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff, v9 -; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: v_readfirstlane_b32 s2, v0 +; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10002 +; GFX12-NEXT: v_lshrrev_b32_e32 v2, 3, v0 +; GFX12-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v0, s3 +; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10001 +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-NEXT: s_and_b32 s2, s2, 1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX12-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16 -; GFX12-NEXT: global_store_b128 v1, v[4:7], s[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -5245,9 +5044,9 @@ define amdgpu_kernel void @constant_sextload_v4i1_to_v4i64(ptr addrspace(1) %out ; GFX8-NEXT: v_mov_b32_e32 v10, s2 ; GFX8-NEXT: v_mov_b32_e32 v8, s0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b16_e32 v3, 2, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v4, 3, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v2, 1, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 2, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 3, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: v_bfe_i32 v6, v4, 0, 1 ; GFX8-NEXT: v_bfe_i32 v4, v3, 0, 1 ; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 1 @@ -5299,9 +5098,9 @@ define amdgpu_kernel void @constant_sextload_v4i1_to_v4i64(ptr addrspace(1) %out ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v8, s[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b16 v1, 3, v0 -; GFX12-NEXT: v_lshrrev_b16 v2, 2, v0 -; GFX12-NEXT: v_lshrrev_b16 v3, 1, v0 +; GFX12-NEXT: v_lshrrev_b32_e32 v1, 3, v0 +; GFX12-NEXT: 
v_lshrrev_b32_e32 v2, 2, v0 +; GFX12-NEXT: v_lshrrev_b32_e32 v3, 1, v0 ; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX12-NEXT: v_bfe_i32 v6, v1, 0, 1 @@ -5380,39 +5179,31 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(ptr addrspace(1) %out ; GFX8-NEXT: v_mov_b32_e32 v16, s0 ; GFX8-NEXT: s_add_u32 s0, s0, 16 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NEXT: v_mov_b32_e32 v21, s3 -; GFX8-NEXT: v_mov_b32_e32 v23, s1 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v5, v1 ; GFX8-NEXT: v_mov_b32_e32 v7, v1 +; GFX8-NEXT: v_mov_b32_e32 v20, s2 +; GFX8-NEXT: v_mov_b32_e32 v23, s1 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NEXT: v_mov_b32_e32 v9, v1 ; GFX8-NEXT: v_mov_b32_e32 v11, v1 ; GFX8-NEXT: v_mov_b32_e32 v13, v1 ; GFX8-NEXT: v_mov_b32_e32 v15, v1 -; GFX8-NEXT: v_mov_b32_e32 v20, s2 ; GFX8-NEXT: v_mov_b32_e32 v22, s0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b16_e32 v2, 2, v0 -; GFX8-NEXT: v_and_b32_e32 v8, 1, v2 -; GFX8-NEXT: v_lshrrev_b16_e32 v2, 4, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v6, 6, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v10, 5, v0 -; GFX8-NEXT: v_and_b32_e32 v4, 1, v2 -; GFX8-NEXT: v_lshrrev_b16_e32 v14, 3, v0 +; GFX8-NEXT: v_and_b32_e32 v24, 0xffff, v0 +; GFX8-NEXT: v_bfe_u32 v6, v0, 5, 1 +; GFX8-NEXT: v_bfe_u32 v4, v0, 4, 1 +; GFX8-NEXT: v_bfe_u32 v10, v0, 3, 1 +; GFX8-NEXT: v_bfe_u32 v14, v0, 1, 1 ; GFX8-NEXT: v_and_b32_e32 v12, 1, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v2, 7, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v0, 1, v0 -; GFX8-NEXT: v_and_b32_e32 v24, 1, v10 -; GFX8-NEXT: v_and_b32_e32 v10, 1, v14 -; GFX8-NEXT: v_and_b32_e32 v14, 1, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v6 -; GFX8-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GFX8-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff, v24 -; GFX8-NEXT: flat_store_dwordx4 v[18:19], v[0:3] +; GFX8-NEXT: v_bfe_u32 v8, v0, 2, 1 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 7, v24 +; GFX8-NEXT: v_bfe_u32 v0, v24, 6, 1 ; GFX8-NEXT: flat_store_dwordx4 v[20:21], v[4:7] ; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[8:11] +; GFX8-NEXT: flat_store_dwordx4 v[18:19], v[0:3] ; GFX8-NEXT: flat_store_dwordx4 v[16:17], v[12:15] ; GFX8-NEXT: s_endpgm ; @@ -5468,27 +5259,24 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(ptr addrspace(1) %out ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_u8 v0, v1, s[2:3] +; GFX12-NEXT: global_load_u8 v12, v1, s[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_and_b32_e32 v12, 1, v0 -; GFX12-NEXT: v_lshrrev_b16 v4, 5, v0 -; GFX12-NEXT: v_lshrrev_b16 v8, 3, v0 -; GFX12-NEXT: v_lshrrev_b16 v14, 1, v0 -; GFX12-NEXT: v_lshrrev_b16 v2, 7, v0 -; GFX12-NEXT: v_lshrrev_b16 v6, 6, v0 -; GFX12-NEXT: v_lshrrev_b16 v10, 4, v0 -; GFX12-NEXT: v_and_b32_e32 v17, 1, v4 -; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v18, 1, v8 -; GFX12-NEXT: v_lshrrev_b16 v16, 2, v0 -; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_and_b32 v14, 1, v14 -; GFX12-NEXT: v_dual_mov_b32 v11, v1 :: v_dual_and_b32 v0, 1, v6 -; GFX12-NEXT: v_dual_mov_b32 v13, v1 :: v_dual_and_b32 v2, 0xffff, v2 +; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v0, 0xffff, v12 +; GFX12-NEXT: v_mov_b32_e32 v5, v1 +; GFX12-NEXT: v_mov_b32_e32 v7, v1 +; GFX12-NEXT: v_bfe_u32 v6, v12, 5, 1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX12-NEXT: v_dual_mov_b32 v7, v1 :: 
v_dual_and_b32 v8, 1, v16 -; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_and_b32 v4, 1, v10 -; GFX12-NEXT: v_dual_mov_b32 v15, v1 :: v_dual_and_b32 v6, 0xffff, v17 -; GFX12-NEXT: v_and_b32_e32 v10, 0xffff, v18 -; GFX12-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX12-NEXT: v_lshrrev_b32_e32 v2, 7, v0 +; GFX12-NEXT: v_bfe_u32 v0, v0, 6, 1 +; GFX12-NEXT: v_bfe_u32 v4, v12, 4, 1 +; GFX12-NEXT: v_mov_b32_e32 v9, v1 +; GFX12-NEXT: v_mov_b32_e32 v11, v1 +; GFX12-NEXT: v_bfe_u32 v10, v12, 3, 1 +; GFX12-NEXT: v_bfe_u32 v8, v12, 2, 1 +; GFX12-NEXT: v_mov_b32_e32 v13, v1 +; GFX12-NEXT: v_mov_b32_e32 v15, v1 +; GFX12-NEXT: v_bfe_u32 v14, v12, 1, 1 +; GFX12-NEXT: v_and_b32_e32 v12, 1, v12 ; GFX12-NEXT: s_clause 0x3 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:48 ; GFX12-NEXT: global_store_b128 v1, v[4:7], s[0:1] offset:32 @@ -5554,47 +5342,56 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i64(ptr addrspace(1) %out ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_load_ubyte v0, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v17, s1 +; GFX8-NEXT: v_mov_b32_e32 v16, s0 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_readfirstlane_b32 s3, v0 +; GFX8-NEXT: s_lshr_b32 s2, s3, 6 +; GFX8-NEXT: s_lshr_b32 s4, s3, 7 +; GFX8-NEXT: s_lshr_b32 s6, s3, 4 +; GFX8-NEXT: s_lshr_b32 s8, s3, 5 +; GFX8-NEXT: s_lshr_b32 s10, s3, 2 +; GFX8-NEXT: s_lshr_b32 s12, s3, 3 +; GFX8-NEXT: s_lshr_b32 s14, s3, 1 +; GFX8-NEXT: v_mov_b32_e32 v0, s3 +; GFX8-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 48 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v19, s3 ; GFX8-NEXT: v_mov_b32_e32 v18, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 32 -; GFX8-NEXT: v_mov_b32_e32 v17, s1 +; GFX8-NEXT: v_mov_b32_e32 v6, s4 +; GFX8-NEXT: v_mov_b32_e32 v7, s5 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v16, s0 +; GFX8-NEXT: flat_store_dwordx4 v[18:19], v[4:7] ; GFX8-NEXT: s_add_u32 s0, s0, 16 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: v_mov_b32_e32 v8, s6 +; GFX8-NEXT: v_mov_b32_e32 v9, s7 +; GFX8-NEXT: v_mov_b32_e32 v10, s8 +; GFX8-NEXT: v_mov_b32_e32 v11, s9 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v21, s3 -; GFX8-NEXT: v_mov_b32_e32 v23, s1 -; GFX8-NEXT: v_mov_b32_e32 v20, s2 -; GFX8-NEXT: v_mov_b32_e32 v22, s0 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b16_e32 v3, 6, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v5, 7, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v7, 4, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v8, 5, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v4, 2, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v6, 3, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v2, 1, v0 -; GFX8-NEXT: v_bfe_i32 v14, v5, 0, 1 -; GFX8-NEXT: v_bfe_i32 v12, v3, 0, 1 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[8:11] +; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 1 -; GFX8-NEXT: v_bfe_i32 v2, v2, 0, 1 -; GFX8-NEXT: v_bfe_i32 v6, v6, 0, 1 -; GFX8-NEXT: v_bfe_i32 v4, v4, 0, 1 -; GFX8-NEXT: v_bfe_i32 v10, v8, 0, 1 -; GFX8-NEXT: v_bfe_i32 v8, v7, 0, 1 -; GFX8-NEXT: v_ashrrev_i32_e32 v15, 31, v14 -; GFX8-NEXT: v_ashrrev_i32_e32 v13, 31, v12 -; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, 
v0 -; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GFX8-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; GFX8-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; GFX8-NEXT: v_ashrrev_i32_e32 v11, 31, v10 -; GFX8-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GFX8-NEXT: flat_store_dwordx4 v[18:19], v[12:15] -; GFX8-NEXT: flat_store_dwordx4 v[20:21], v[8:11] -; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[4:7] +; GFX8-NEXT: v_mov_b32_e32 v12, s10 +; GFX8-NEXT: v_mov_b32_e32 v13, s11 +; GFX8-NEXT: v_mov_b32_e32 v14, s12 +; GFX8-NEXT: v_mov_b32_e32 v15, s13 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, s14 +; GFX8-NEXT: v_mov_b32_e32 v3, s15 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[12:15] ; GFX8-NEXT: flat_store_dwordx4 v[16:17], v[0:3] ; GFX8-NEXT: s_endpgm ; @@ -5657,36 +5454,41 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i64(ptr addrspace(1) %out ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v16, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_u8 v1, v16, s[2:3] +; GFX12-NEXT: global_load_u8 v0, v16, s[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b16 v3, 6, v1 -; GFX12-NEXT: v_lshrrev_b16 v5, 7, v1 -; GFX12-NEXT: v_lshrrev_b16 v7, 4, v1 -; GFX12-NEXT: v_lshrrev_b16 v4, 3, v1 -; GFX12-NEXT: v_lshrrev_b16 v8, 2, v1 -; GFX12-NEXT: v_lshrrev_b16 v9, 5, v1 -; GFX12-NEXT: v_lshrrev_b16 v2, 1, v1 -; GFX12-NEXT: v_bfe_i32 v14, v5, 0, 1 -; GFX12-NEXT: v_bfe_i32 v12, v3, 0, 1 -; GFX12-NEXT: v_bfe_i32 v6, v4, 0, 1 -; GFX12-NEXT: v_bfe_i32 v4, v8, 0, 1 -; GFX12-NEXT: v_bfe_i32 v10, v9, 0, 1 -; GFX12-NEXT: v_bfe_i32 v8, v7, 0, 1 -; GFX12-NEXT: v_bfe_i32 v0, v1, 0, 1 -; GFX12-NEXT: v_bfe_i32 v2, v2, 0, 1 -; GFX12-NEXT: v_ashrrev_i32_e32 v15, 31, v14 +; GFX12-NEXT: v_readfirstlane_b32 s3, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v9, s3 +; GFX12-NEXT: s_lshr_b32 s2, s3, 6 +; GFX12-NEXT: s_lshr_b32 s4, s3, 7 +; GFX12-NEXT: s_lshr_b32 s6, s3, 4 +; GFX12-NEXT: s_lshr_b32 s8, s3, 5 +; GFX12-NEXT: s_lshr_b32 s10, s3, 2 +; GFX12-NEXT: s_lshr_b32 s12, s3, 3 +; GFX12-NEXT: s_lshr_b32 s14, s3, 1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000 +; GFX12-NEXT: v_bfe_i32 v12, v9, 0, 1 +; GFX12-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 +; GFX12-NEXT: v_dual_mov_b32 v4, s6 :: v_dual_mov_b32 v5, s7 +; GFX12-NEXT: v_dual_mov_b32 v6, s8 :: v_dual_mov_b32 v7, s9 +; GFX12-NEXT: v_dual_mov_b32 v8, s10 :: v_dual_mov_b32 v9, s11 +; GFX12-NEXT: v_dual_mov_b32 v10, s12 :: v_dual_mov_b32 v11, s13 +; GFX12-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s15 ; GFX12-NEXT: v_ashrrev_i32_e32 v13, 31, v12 -; GFX12-NEXT: v_ashrrev_i32_e32 v11, 31, v10 -; GFX12-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GFX12-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; GFX12-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; GFX12-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX12-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX12-NEXT: s_clause 0x3 -; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1] offset:48 -; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:32 -; GFX12-NEXT: global_store_b128 v16, 
v[4:7], s[0:1] offset:16 -; GFX12-NEXT: global_store_b128 v16, v[0:3], s[0:1] +; GFX12-NEXT: global_store_b128 v16, v[0:3], s[0:1] offset:48 +; GFX12-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:32 +; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:16 +; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -5759,102 +5561,85 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o ; GFX8-LABEL: constant_zextload_v16i1_to_v16i64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: v_mov_b32_e32 v6, v2 -; GFX8-NEXT: v_mov_b32_e32 v8, v2 -; GFX8-NEXT: v_mov_b32_e32 v4, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_load_ushort v0, v[0:1] -; GFX8-NEXT: s_add_u32 s2, s0, 0x70 -; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: s_add_u32 s4, s0, 0x50 -; GFX8-NEXT: s_addc_u32 s5, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v23, s5 -; GFX8-NEXT: v_mov_b32_e32 v22, s4 -; GFX8-NEXT: v_mov_b32_e32 v9, v2 -; GFX8-NEXT: v_mov_b32_e32 v11, v2 -; GFX8-NEXT: v_mov_b32_e32 v12, v2 -; GFX8-NEXT: v_mov_b32_e32 v14, v2 -; GFX8-NEXT: v_mov_b32_e32 v15, v2 -; GFX8-NEXT: v_mov_b32_e32 v17, v2 -; GFX8-NEXT: v_mov_b32_e32 v19, v2 -; GFX8-NEXT: v_mov_b32_e32 v21, v2 -; GFX8-NEXT: v_mov_b32_e32 v25, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v5, v1 +; GFX8-NEXT: v_mov_b32_e32 v7, v1 +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v11, v1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b16_e32 v1, 10, v0 -; GFX8-NEXT: v_and_b32_e32 v5, 1, v1 -; GFX8-NEXT: v_lshrrev_b16_e32 v1, 11, v0 -; GFX8-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff, v1 -; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[5:8] -; GFX8-NEXT: v_mov_b32_e32 v23, s3 -; GFX8-NEXT: v_lshrrev_b16_e32 v1, 14, v0 -; GFX8-NEXT: v_mov_b32_e32 v22, s2 -; GFX8-NEXT: v_lshrrev_b16_e32 v3, 15, v0 -; GFX8-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX8-NEXT: v_readfirstlane_b32 s2, v0 +; GFX8-NEXT: s_bfe_u32 s3, s2, 0x10009 +; GFX8-NEXT: s_bfe_u32 s4, s2, 0x1000d +; GFX8-NEXT: s_bfe_u32 s5, s2, 0x10007 +; GFX8-NEXT: s_bfe_u32 s6, s2, 0x10003 +; GFX8-NEXT: s_bfe_u32 s7, s2, 0x10001 +; GFX8-NEXT: s_and_b32 s8, s2, 1 +; GFX8-NEXT: s_bfe_u32 s9, s2, 0x10002 +; GFX8-NEXT: s_bfe_u32 s10, s2, 0x10004 +; GFX8-NEXT: s_bfe_u32 s11, s2, 0x10006 +; GFX8-NEXT: s_bfe_u32 s12, s2, 0x1000c +; GFX8-NEXT: s_bfe_u32 s2, s2, 0x1000a +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: s_add_u32 s2, s0, 0x50 +; GFX8-NEXT: v_mov_b32_e32 v6, s3 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v13, s3 +; GFX8-NEXT: v_mov_b32_e32 v12, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 64 -; GFX8-NEXT: v_mov_b32_e32 v5, v2 -; GFX8-NEXT: v_mov_b32_e32 v7, v2 -; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[1:4] -; GFX8-NEXT: v_mov_b32_e32 v23, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, 1 +; GFX8-NEXT: v_bfe_u32 v2, v4, 11, 1 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v1, s2 -; GFX8-NEXT: v_mov_b32_e32 v2, s3 -; GFX8-NEXT: v_and_b32_sdwa v8, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX8-NEXT: v_lshrrev_b16_e32 v3, 9, v0 +; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v13, s3 +; GFX8-NEXT: v_mov_b32_e32 v12, s2 +; GFX8-NEXT: s_add_u32 s2, 
s0, 0x70 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 15, v4 +; GFX8-NEXT: v_bfe_u32 v14, v4, 5, 1 +; GFX8-NEXT: v_bfe_u32 v8, v4, 14, 1 +; GFX8-NEXT: v_bfe_u32 v4, v4, 8, 1 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[4:7] +; GFX8-NEXT: v_mov_b32_e32 v0, s12 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x60 -; GFX8-NEXT: v_and_b32_e32 v3, 1, v3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_and_b32_e32 v10, 0xffff, v3 -; GFX8-NEXT: v_mov_b32_e32 v4, s3 -; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[8:11] +; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 48 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_lshrrev_b16_e32 v6, 12, v0 -; GFX8-NEXT: flat_store_dwordx4 v[1:2], v[8:11] -; GFX8-NEXT: v_mov_b32_e32 v2, s1 -; GFX8-NEXT: v_mov_b32_e32 v9, s3 -; GFX8-NEXT: v_and_b32_e32 v11, 1, v6 -; GFX8-NEXT: v_lshrrev_b16_e32 v6, 13, v0 -; GFX8-NEXT: v_mov_b32_e32 v8, s2 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 32 -; GFX8-NEXT: v_and_b32_e32 v6, 1, v6 +; GFX8-NEXT: v_mov_b32_e32 v0, s11 +; GFX8-NEXT: v_mov_b32_e32 v2, s5 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_and_b32_e32 v13, 0xffff, v6 -; GFX8-NEXT: v_mov_b32_e32 v1, s0 -; GFX8-NEXT: flat_store_dwordx4 v[3:4], v[11:14] -; GFX8-NEXT: s_add_u32 s0, s0, 16 -; GFX8-NEXT: v_lshrrev_b16_e32 v3, 7, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v6, 6, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v14, 4, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v16, 5, v0 -; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_lshrrev_b16_e32 v4, 2, v0 -; GFX8-NEXT: v_and_b32_e32 v18, 1, v14 -; GFX8-NEXT: v_and_b32_e32 v14, 1, v6 -; GFX8-NEXT: v_lshrrev_b16_e32 v6, 3, v0 -; GFX8-NEXT: v_and_b32_e32 v22, 1, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v0, 1, v0 -; GFX8-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX8-NEXT: v_and_b32_e32 v16, 1, v16 -; GFX8-NEXT: v_mov_b32_e32 v11, s3 -; GFX8-NEXT: v_mov_b32_e32 v13, s1 -; GFX8-NEXT: v_and_b32_e32 v6, 1, v6 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: v_and_b32_e32 v20, 0xffff, v16 -; GFX8-NEXT: v_and_b32_e32 v16, 0xffff, v3 -; GFX8-NEXT: v_mov_b32_e32 v10, s2 -; GFX8-NEXT: v_mov_b32_e32 v12, s0 -; GFX8-NEXT: v_and_b32_e32 v4, 1, v4 -; GFX8-NEXT: v_and_b32_e32 v24, 0xffff, v0 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[14:17] -; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[18:21] -; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[4:7] -; GFX8-NEXT: flat_store_dwordx4 v[1:2], v[22:25] +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: s_add_u32 s2, s0, 16 +; GFX8-NEXT: v_mov_b32_e32 v0, s10 +; GFX8-NEXT: v_mov_b32_e32 v2, v14 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: v_mov_b32_e32 v0, s9 +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: v_mov_b32_e32 v2, s7 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm ; ; EG-LABEL: constant_zextload_v16i1_to_v16i64: @@ -5947,56 +5732,55 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr 
addrspace(1) %o ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v0, v1, s[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_and_b32_e32 v28, 1, v0 -; GFX12-NEXT: v_lshrrev_b16 v4, 11, v0 -; GFX12-NEXT: v_lshrrev_b16 v8, 9, v0 -; GFX12-NEXT: v_lshrrev_b16 v12, 13, v0 -; GFX12-NEXT: v_lshrrev_b16 v16, 7, v0 -; GFX12-NEXT: v_lshrrev_b16 v2, 15, v0 -; GFX12-NEXT: v_lshrrev_b16 v6, 14, v0 -; GFX12-NEXT: v_lshrrev_b16 v10, 10, v0 -; GFX12-NEXT: v_lshrrev_b16 v20, 5, v0 -; GFX12-NEXT: v_lshrrev_b16 v24, 3, v0 -; GFX12-NEXT: v_lshrrev_b16 v32, 1, v0 -; GFX12-NEXT: v_and_b32_e32 v33, 1, v4 -; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v34, 1, v8 -; GFX12-NEXT: v_lshrrev_b16 v14, 8, v0 -; GFX12-NEXT: v_lshrrev_b16 v18, 12, v0 -; GFX12-NEXT: v_and_b32_e32 v35, 1, v12 -; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_and_b32 v36, 1, v16 -; GFX12-NEXT: v_lshrrev_b16 v22, 6, v0 -; GFX12-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_and_b32 v38, 1, v24 -; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_and_b32 v32, 1, v32 -; GFX12-NEXT: v_dual_mov_b32 v21, v1 :: v_dual_and_b32 v4, 1, v10 -; GFX12-NEXT: v_mov_b32_e32 v23, v1 -; GFX12-NEXT: v_dual_mov_b32 v25, v1 :: v_dual_and_b32 v2, 0xffff, v2 -; GFX12-NEXT: v_mov_b32_e32 v31, v1 -; GFX12-NEXT: v_lshrrev_b16 v26, 4, v0 -; GFX12-NEXT: v_lshrrev_b16 v30, 2, v0 -; GFX12-NEXT: v_and_b32_e32 v37, 1, v20 -; GFX12-NEXT: v_and_b32_e32 v0, 1, v6 -; GFX12-NEXT: v_dual_mov_b32 v11, v1 :: v_dual_and_b32 v6, 0xffff, v33 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX12-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_and_b32 v24, 1, v30 -; GFX12-NEXT: v_and_b32_e32 v8, 1, v14 -; GFX12-NEXT: v_dual_mov_b32 v13, v1 :: v_dual_and_b32 v10, 0xffff, v34 -; GFX12-NEXT: v_dual_mov_b32 v15, v1 :: v_dual_and_b32 v20, 1, v26 -; GFX12-NEXT: v_dual_mov_b32 v17, v1 :: v_dual_and_b32 v16, 1, v22 -; GFX12-NEXT: v_and_b32_e32 v12, 1, v18 -; GFX12-NEXT: v_and_b32_e32 v14, 0xffff, v35 -; GFX12-NEXT: v_dual_mov_b32 v27, v1 :: v_dual_and_b32 v18, 0xffff, v36 -; GFX12-NEXT: v_dual_mov_b32 v29, v1 :: v_dual_and_b32 v30, 0xffff, v32 -; GFX12-NEXT: v_and_b32_e32 v26, 0xffff, v38 -; GFX12-NEXT: v_and_b32_e32 v22, 0xffff, v37 -; GFX12-NEXT: s_clause 0x7 -; GFX12-NEXT: global_store_b128 v1, v[4:7], s[0:1] offset:80 -; GFX12-NEXT: global_store_b128 v1, v[8:11], s[0:1] offset:64 -; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:112 -; GFX12-NEXT: global_store_b128 v1, v[12:15], s[0:1] offset:96 -; GFX12-NEXT: global_store_b128 v1, v[16:19], s[0:1] offset:48 -; GFX12-NEXT: global_store_b128 v1, v[20:23], s[0:1] offset:32 -; GFX12-NEXT: global_store_b128 v1, v[24:27], s[0:1] offset:16 -; GFX12-NEXT: global_store_b128 v1, v[28:31], s[0:1] +; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v0 +; GFX12-NEXT: v_readfirstlane_b32 s2, v0 +; GFX12-NEXT: v_mov_b32_e32 v7, v1 +; GFX12-NEXT: v_mov_b32_e32 v11, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_bfe_u32 v2, v4, 11, 1 +; GFX12-NEXT: s_bfe_u32 s3, s2, 0x1000a +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v0, s3 +; GFX12-NEXT: s_bfe_u32 s3, s2, 0x1000d +; GFX12-NEXT: s_bfe_u32 s4, s2, 0x1000c +; GFX12-NEXT: v_mov_b32_e32 v5, v1 +; GFX12-NEXT: v_bfe_u32 v6, v4, 5, 1 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:80 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10007 +; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10006 +; GFX12-NEXT: 
v_mov_b32_e32 v9, v1 +; GFX12-NEXT: s_bfe_u32 s6, s2, 0x10002 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:96 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10004 +; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10009 +; GFX12-NEXT: s_bfe_u32 s5, s2, 0x10001 +; GFX12-NEXT: v_lshrrev_b32_e32 v10, 15, v4 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:48 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_mov_b32_e32 v2, v6 +; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10003 +; GFX12-NEXT: s_and_b32 s2, s2, 1 +; GFX12-NEXT: v_bfe_u32 v8, v4, 14, 1 +; GFX12-NEXT: v_bfe_u32 v4, v4, 8, 1 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:32 +; GFX12-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-NEXT: v_mov_b32_e32 v6, s3 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-NEXT: v_mov_b32_e32 v2, s5 +; GFX12-NEXT: s_clause 0x2 +; GFX12-NEXT: global_store_b128 v1, v[8:11], s[0:1] offset:112 +; GFX12-NEXT: global_store_b128 v1, v[4:7], s[0:1] offset:64 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -6086,92 +5870,109 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(ptr addrspace(1) %o ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_load_ushort v0, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v27, s1 +; GFX8-NEXT: v_mov_b32_e32 v26, s0 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_readfirstlane_b32 s3, v0 +; GFX8-NEXT: s_lshr_b32 s2, s3, 14 +; GFX8-NEXT: s_lshr_b32 s4, s3, 15 +; GFX8-NEXT: s_lshr_b32 s6, s3, 12 +; GFX8-NEXT: s_lshr_b32 s8, s3, 13 +; GFX8-NEXT: s_lshr_b32 s10, s3, 10 +; GFX8-NEXT: s_lshr_b32 s12, s3, 11 +; GFX8-NEXT: s_lshr_b32 s14, s3, 8 +; GFX8-NEXT: s_lshr_b32 s16, s3, 9 +; GFX8-NEXT: s_lshr_b32 s18, s3, 6 +; GFX8-NEXT: s_lshr_b32 s20, s3, 7 +; GFX8-NEXT: s_lshr_b32 s22, s3, 4 +; GFX8-NEXT: s_lshr_b32 s24, s3, 5 +; GFX8-NEXT: s_lshr_b32 s26, s3, 2 +; GFX8-NEXT: s_lshr_b32 s28, s3, 3 +; GFX8-NEXT: s_lshr_b32 s30, s3, 1 +; GFX8-NEXT: v_mov_b32_e32 v0, s3 +; GFX8-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x70 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s3 -; GFX8-NEXT: v_mov_b32_e32 v5, s2 +; GFX8-NEXT: v_mov_b32_e32 v23, s3 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: v_mov_b32_e32 v22, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x60 +; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[2:5] ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: 
v_mov_b32_e32 v8, s3 -; GFX8-NEXT: v_mov_b32_e32 v7, s2 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: s_add_u32 s2, s0, 0x50 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: v_mov_b32_e32 v7, s7 +; GFX8-NEXT: v_mov_b32_e32 v8, s8 +; GFX8-NEXT: v_mov_b32_e32 v9, s9 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v10, s3 -; GFX8-NEXT: v_mov_b32_e32 v9, s2 +; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[6:9] +; GFX8-NEXT: v_mov_b32_e32 v10, s10 +; GFX8-NEXT: v_mov_b32_e32 v9, s3 +; GFX8-NEXT: v_mov_b32_e32 v8, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 64 +; GFX8-NEXT: v_mov_b32_e32 v11, s11 +; GFX8-NEXT: v_mov_b32_e32 v12, s12 +; GFX8-NEXT: v_mov_b32_e32 v13, s13 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v16, s3 -; GFX8-NEXT: v_mov_b32_e32 v15, s2 +; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[10:13] +; GFX8-NEXT: v_mov_b32_e32 v9, s3 +; GFX8-NEXT: v_mov_b32_e32 v8, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 48 +; GFX8-NEXT: v_mov_b32_e32 v14, s14 +; GFX8-NEXT: v_mov_b32_e32 v15, s15 +; GFX8-NEXT: v_mov_b32_e32 v16, s16 +; GFX8-NEXT: v_mov_b32_e32 v17, s17 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v23, s3 -; GFX8-NEXT: v_mov_b32_e32 v22, s2 +; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[14:17] +; GFX8-NEXT: v_mov_b32_e32 v9, s3 +; GFX8-NEXT: v_mov_b32_e32 v8, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 32 -; GFX8-NEXT: v_mov_b32_e32 v21, s1 +; GFX8-NEXT: v_mov_b32_e32 v18, s18 +; GFX8-NEXT: v_mov_b32_e32 v19, s19 +; GFX8-NEXT: v_mov_b32_e32 v20, s20 +; GFX8-NEXT: v_mov_b32_e32 v21, s21 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v20, s0 +; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[18:21] +; GFX8-NEXT: v_mov_b32_e32 v9, s3 ; GFX8-NEXT: s_add_u32 s0, s0, 16 +; GFX8-NEXT: v_mov_b32_e32 v22, s22 +; GFX8-NEXT: v_mov_b32_e32 v23, s23 +; GFX8-NEXT: v_mov_b32_e32 v24, s24 +; GFX8-NEXT: v_mov_b32_e32 v25, s25 +; GFX8-NEXT: v_mov_b32_e32 v8, s2 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v25, s3 -; GFX8-NEXT: v_mov_b32_e32 v27, s1 -; GFX8-NEXT: v_mov_b32_e32 v24, s2 -; GFX8-NEXT: v_mov_b32_e32 v26, s0 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b16_e32 v1, 14, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v2, 15, v0 -; GFX8-NEXT: v_bfe_i32 v3, v2, 0, 1 -; GFX8-NEXT: v_bfe_i32 v1, v1, 0, 1 -; GFX8-NEXT: v_lshrrev_b16_e32 v11, 12, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v12, 13, v0 -; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v1 -; GFX8-NEXT: flat_store_dwordx4 v[5:6], v[1:4] -; GFX8-NEXT: v_lshrrev_b16_e32 v13, 10, v0 -; GFX8-NEXT: v_bfe_i32 v3, v12, 0, 1 -; GFX8-NEXT: v_bfe_i32 v1, v11, 0, 1 -; GFX8-NEXT: v_lshrrev_b16_e32 v14, 11, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v6, 3, v0 -; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v1 -; GFX8-NEXT: v_lshrrev_b16_e32 v5, 2, v0 -; GFX8-NEXT: flat_store_dwordx4 v[7:8], v[1:4] -; GFX8-NEXT: v_lshrrev_b16_e32 v17, 8, v0 -; GFX8-NEXT: v_bfe_i32 v3, v6, 0, 1 -; GFX8-NEXT: v_bfe_i32 v6, v14, 0, 1 -; GFX8-NEXT: v_bfe_i32 v4, v13, 0, 1 -; GFX8-NEXT: v_lshrrev_b16_e32 v18, 9, v0 -; GFX8-NEXT: v_bfe_i32 v1, v5, 0, 1 -; GFX8-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; GFX8-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; GFX8-NEXT: flat_store_dwordx4 v[9:10], v[4:7] -; GFX8-NEXT: v_bfe_i32 v10, v18, 0, 1 -; GFX8-NEXT: v_bfe_i32 v8, v17, 0, 1 -; GFX8-NEXT: v_lshrrev_b16_e32 v19, 6, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v28, 4, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v29, 5, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v2, 1, v0 -; GFX8-NEXT: v_bfe_i32 v12, 
v0, 0, 1 -; GFX8-NEXT: v_lshrrev_b16_e32 v0, 7, v0 -; GFX8-NEXT: v_ashrrev_i32_e32 v11, 31, v10 -; GFX8-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[8:11] -; GFX8-NEXT: v_bfe_i32 v18, v0, 0, 1 -; GFX8-NEXT: v_bfe_i32 v16, v19, 0, 1 -; GFX8-NEXT: v_bfe_i32 v7, v29, 0, 1 -; GFX8-NEXT: v_bfe_i32 v5, v28, 0, 1 -; GFX8-NEXT: v_bfe_i32 v14, v2, 0, 1 -; GFX8-NEXT: v_ashrrev_i32_e32 v19, 31, v18 -; GFX8-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GFX8-NEXT: v_ashrrev_i32_e32 v13, 31, v12 -; GFX8-NEXT: v_ashrrev_i32_e32 v15, 31, v14 -; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v1 -; GFX8-NEXT: v_ashrrev_i32_e32 v8, 31, v7 -; GFX8-NEXT: v_ashrrev_i32_e32 v6, 31, v5 -; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[16:19] -; GFX8-NEXT: flat_store_dwordx4 v[24:25], v[5:8] -; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[1:4] -; GFX8-NEXT: flat_store_dwordx4 v[20:21], v[12:15] +; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[22:25] +; GFX8-NEXT: v_mov_b32_e32 v9, s1 +; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 1 +; GFX8-NEXT: v_mov_b32_e32 v4, s26 +; GFX8-NEXT: v_mov_b32_e32 v5, s27 +; GFX8-NEXT: v_mov_b32_e32 v6, s28 +; GFX8-NEXT: v_mov_b32_e32 v7, s29 +; GFX8-NEXT: v_mov_b32_e32 v8, s0 +; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, s30 +; GFX8-NEXT: v_mov_b32_e32 v3, s31 +; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[0:3] ; GFX8-NEXT: s_endpgm ; ; EG-LABEL: constant_sextload_v16i1_to_v16i64: @@ -6278,64 +6079,71 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(ptr addrspace(1) %o ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v32, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_u16 v1, v32, s[2:3] +; GFX12-NEXT: global_load_u16 v0, v32, s[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b16 v3, 14, v1 -; GFX12-NEXT: v_lshrrev_b16 v5, 15, v1 -; GFX12-NEXT: v_lshrrev_b16 v7, 12, v1 -; GFX12-NEXT: v_lshrrev_b16 v9, 13, v1 -; GFX12-NEXT: v_lshrrev_b16 v11, 10, v1 -; GFX12-NEXT: v_lshrrev_b16 v13, 11, v1 -; GFX12-NEXT: v_lshrrev_b16 v15, 8, v1 -; GFX12-NEXT: v_lshrrev_b16 v16, 9, v1 -; GFX12-NEXT: v_lshrrev_b16 v12, 6, v1 -; GFX12-NEXT: v_lshrrev_b16 v14, 7, v1 -; GFX12-NEXT: v_lshrrev_b16 v8, 4, v1 -; GFX12-NEXT: v_lshrrev_b16 v2, 1, v1 -; GFX12-NEXT: v_lshrrev_b16 v4, 3, v1 -; GFX12-NEXT: v_lshrrev_b16 v10, 2, v1 -; GFX12-NEXT: v_lshrrev_b16 v17, 5, v1 -; GFX12-NEXT: v_bfe_i32 v30, v5, 0, 1 -; GFX12-NEXT: v_bfe_i32 v28, v3, 0, 1 -; GFX12-NEXT: v_bfe_i32 v26, v9, 0, 1 -; GFX12-NEXT: v_bfe_i32 v24, v7, 0, 1 -; GFX12-NEXT: v_bfe_i32 v22, v13, 0, 1 -; GFX12-NEXT: v_bfe_i32 v20, v11, 0, 1 -; GFX12-NEXT: v_bfe_i32 v18, v16, 0, 1 -; GFX12-NEXT: v_bfe_i32 v16, v15, 0, 1 -; GFX12-NEXT: v_bfe_i32 v14, v14, 0, 1 -; GFX12-NEXT: v_bfe_i32 v12, v12, 0, 1 -; GFX12-NEXT: v_bfe_i32 v0, v1, 0, 1 -; GFX12-NEXT: v_bfe_i32 v2, v2, 0, 1 -; GFX12-NEXT: v_bfe_i32 v6, v4, 0, 1 -; GFX12-NEXT: v_bfe_i32 v4, v10, 0, 1 -; GFX12-NEXT: v_bfe_i32 v10, v17, 0, 1 -; GFX12-NEXT: v_bfe_i32 v8, v8, 0, 1 -; GFX12-NEXT: v_ashrrev_i32_e32 v31, 31, v30 +; GFX12-NEXT: v_readfirstlane_b32 s3, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_lshr_b32 s4, s3, 15 +; GFX12-NEXT: s_lshr_b32 s2, s3, 14 +; GFX12-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000 +; GFX12-NEXT: v_dual_mov_b32 v28, s3 :: v_dual_mov_b32 v3, s5 +; GFX12-NEXT: s_lshr_b32 s6, s3, 12 +; GFX12-NEXT: s_lshr_b32 s8, s3, 13 +; GFX12-NEXT: 
s_lshr_b32 s10, s3, 10 +; GFX12-NEXT: s_lshr_b32 s12, s3, 11 +; GFX12-NEXT: s_lshr_b32 s14, s3, 8 +; GFX12-NEXT: s_lshr_b32 s16, s3, 9 +; GFX12-NEXT: s_lshr_b32 s18, s3, 6 +; GFX12-NEXT: s_lshr_b32 s20, s3, 7 +; GFX12-NEXT: s_lshr_b32 s22, s3, 4 +; GFX12-NEXT: s_lshr_b32 s24, s3, 5 +; GFX12-NEXT: s_lshr_b32 s26, s3, 2 +; GFX12-NEXT: s_lshr_b32 s28, s3, 3 +; GFX12-NEXT: s_lshr_b32 s30, s3, 1 +; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v5, s7 +; GFX12-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v2, s4 +; GFX12-NEXT: v_dual_mov_b32 v7, s9 :: v_dual_mov_b32 v4, s6 +; GFX12-NEXT: v_dual_mov_b32 v9, s11 :: v_dual_mov_b32 v6, s8 +; GFX12-NEXT: v_dual_mov_b32 v11, s13 :: v_dual_mov_b32 v8, s10 +; GFX12-NEXT: v_dual_mov_b32 v13, s15 :: v_dual_mov_b32 v10, s12 +; GFX12-NEXT: v_mov_b32_e32 v15, s17 +; GFX12-NEXT: v_bfe_i32 v28, v28, 0, 1 +; GFX12-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 +; GFX12-NEXT: v_dual_mov_b32 v12, s14 :: v_dual_mov_b32 v17, s19 +; GFX12-NEXT: v_dual_mov_b32 v14, s16 :: v_dual_mov_b32 v19, s21 +; GFX12-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 +; GFX12-NEXT: v_dual_mov_b32 v16, s18 :: v_dual_mov_b32 v21, s23 +; GFX12-NEXT: v_dual_mov_b32 v18, s20 :: v_dual_mov_b32 v23, s25 +; GFX12-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 +; GFX12-NEXT: v_dual_mov_b32 v20, s22 :: v_dual_mov_b32 v25, s27 +; GFX12-NEXT: v_dual_mov_b32 v22, s24 :: v_dual_mov_b32 v27, s29 +; GFX12-NEXT: v_dual_mov_b32 v24, s26 :: v_dual_mov_b32 v31, s31 +; GFX12-NEXT: v_mov_b32_e32 v26, s28 +; GFX12-NEXT: v_mov_b32_e32 v30, s30 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v32, v[0:3], s[0:1] offset:112 +; GFX12-NEXT: global_store_b128 v32, v[4:7], s[0:1] offset:96 ; GFX12-NEXT: v_ashrrev_i32_e32 v29, 31, v28 -; GFX12-NEXT: v_ashrrev_i32_e32 v27, 31, v26 -; GFX12-NEXT: v_ashrrev_i32_e32 v25, 31, v24 -; GFX12-NEXT: v_ashrrev_i32_e32 v23, 31, v22 -; GFX12-NEXT: v_ashrrev_i32_e32 v21, 31, v20 -; GFX12-NEXT: v_ashrrev_i32_e32 v19, 31, v18 -; GFX12-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GFX12-NEXT: v_ashrrev_i32_e32 v15, 31, v14 -; GFX12-NEXT: v_ashrrev_i32_e32 v13, 31, v12 -; GFX12-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX12-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GFX12-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; GFX12-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; GFX12-NEXT: v_ashrrev_i32_e32 v11, 31, v10 -; GFX12-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GFX12-NEXT: s_clause 0x7 -; GFX12-NEXT: global_store_b128 v32, v[28:31], s[0:1] offset:112 -; GFX12-NEXT: global_store_b128 v32, v[24:27], s[0:1] offset:96 -; GFX12-NEXT: global_store_b128 v32, v[20:23], s[0:1] offset:80 -; GFX12-NEXT: global_store_b128 v32, v[16:19], s[0:1] offset:64 -; GFX12-NEXT: global_store_b128 v32, v[12:15], s[0:1] offset:48 -; GFX12-NEXT: global_store_b128 v32, v[8:11], s[0:1] offset:32 -; GFX12-NEXT: global_store_b128 v32, v[4:7], s[0:1] offset:16 -; GFX12-NEXT: global_store_b128 v32, v[0:3], s[0:1] 
+; GFX12-NEXT: s_clause 0x5 +; GFX12-NEXT: global_store_b128 v32, v[8:11], s[0:1] offset:80 +; GFX12-NEXT: global_store_b128 v32, v[12:15], s[0:1] offset:64 +; GFX12-NEXT: global_store_b128 v32, v[16:19], s[0:1] offset:48 +; GFX12-NEXT: global_store_b128 v32, v[20:23], s[0:1] offset:32 +; GFX12-NEXT: global_store_b128 v32, v[24:27], s[0:1] offset:16 +; GFX12-NEXT: global_store_b128 v32, v[28:31], s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -6456,177 +6264,153 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o ; GFX8-LABEL: constant_zextload_v32i1_to_v32i64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX8-NEXT: s_load_dword s6, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b16_e64 v1, 13, s2 -; GFX8-NEXT: v_and_b32_e32 v15, 1, v1 -; GFX8-NEXT: v_lshrrev_b16_e64 v1, 9, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v2, 11, s2 -; GFX8-NEXT: v_and_b32_e32 v8, 1, v1 -; GFX8-NEXT: v_lshrrev_b16_e64 v1, 7, s2 -; GFX8-NEXT: v_and_b32_e32 v11, 1, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 1, v1 -; GFX8-NEXT: v_lshrrev_b16_e64 v1, 5, s2 -; GFX8-NEXT: s_lshr_b32 s14, s2, 24 -; GFX8-NEXT: v_and_b32_e32 v5, 1, v1 -; GFX8-NEXT: v_lshrrev_b16_e64 v1, 3, s2 -; GFX8-NEXT: s_bfe_u32 s10, s2, 0x10018 -; GFX8-NEXT: s_and_b32 s11, s2, 1 -; GFX8-NEXT: s_bfe_u32 s15, s2, 0x10011 -; GFX8-NEXT: s_bfe_u32 s16, s2, 0x10010 -; GFX8-NEXT: s_bfe_u32 s17, s2, 0x10012 -; GFX8-NEXT: s_bfe_u32 s18, s2, 0x10013 -; GFX8-NEXT: s_bfe_u32 s19, s2, 0x10014 -; GFX8-NEXT: s_bfe_u32 s20, s2, 0x10015 -; GFX8-NEXT: s_bfe_u32 s21, s2, 0x10016 -; GFX8-NEXT: s_bfe_u32 s22, s2, 0x10017 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v4, 14, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v14, 12, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v9, 10, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v3, 6, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v7, 4, s2 -; GFX8-NEXT: v_and_b32_e32 v10, 1, v1 -; GFX8-NEXT: v_lshrrev_b16_e64 v12, 2, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v1, 1, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v20, 15, s2 -; GFX8-NEXT: s_add_u32 s2, s0, 0xb0 -; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: s_add_u32 s4, s0, 0xa0 -; GFX8-NEXT: s_addc_u32 s5, s1, 0 -; GFX8-NEXT: s_add_u32 s6, s0, 0x90 +; GFX8-NEXT: s_lshr_b32 s7, s6, 31 +; GFX8-NEXT: s_bfe_u32 s8, s6, 0x1001d +; GFX8-NEXT: s_bfe_u32 s9, s6, 0x1001b +; GFX8-NEXT: s_bfe_u32 s10, s6, 0x10019 +; GFX8-NEXT: s_bfe_u32 s11, s6, 0x10017 +; GFX8-NEXT: s_bfe_u32 s12, s6, 0x10013 +; GFX8-NEXT: s_bfe_u32 s13, s6, 0x10011 +; GFX8-NEXT: s_bfe_u32 s14, s6, 0x1000f +; GFX8-NEXT: s_bfe_u32 s15, s6, 0x1000d +; GFX8-NEXT: s_bfe_u32 s16, s6, 0x1000b +; GFX8-NEXT: s_bfe_u32 s17, s6, 0x10009 +; GFX8-NEXT: s_bfe_u32 s18, s6, 0x10007 +; GFX8-NEXT: s_bfe_u32 s19, s6, 0x10005 +; GFX8-NEXT: s_bfe_u32 s4, s6, 0x10003 +; GFX8-NEXT: s_bfe_u32 s2, s6, 0x10001 +; GFX8-NEXT: s_and_b32 s3, s6, 1 +; GFX8-NEXT: s_bfe_u32 s5, s6, 0x10002 +; GFX8-NEXT: s_bfe_u32 s20, s6, 0x10004 +; GFX8-NEXT: s_bfe_u32 s21, s6, 0x10006 +; GFX8-NEXT: s_bfe_u32 s22, s6, 0x10008 +; GFX8-NEXT: s_bfe_u32 s23, s6, 0x1000a +; GFX8-NEXT: s_bfe_u32 s24, s6, 0x1000c +; GFX8-NEXT: s_bfe_u32 s25, s6, 0x1000e +; GFX8-NEXT: s_bfe_u32 s26, s6, 0x10010 +; GFX8-NEXT: s_bfe_u32 s27, s6, 0x10012 +; GFX8-NEXT: s_bfe_u32 s28, s6, 0x10014 +; GFX8-NEXT: s_bfe_u32 s29, s6, 0x10015 +; GFX8-NEXT: s_bfe_u32 s30, 
s6, 0x10016 +; GFX8-NEXT: s_bfe_u32 s31, s6, 0x10018 +; GFX8-NEXT: s_bfe_u32 s33, s6, 0x1001a +; GFX8-NEXT: s_bfe_u32 s34, s6, 0x1001c +; GFX8-NEXT: s_bfe_u32 s6, s6, 0x1001e +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: s_add_u32 s6, s0, 0xf0 +; GFX8-NEXT: v_mov_b32_e32 v2, s7 ; GFX8-NEXT: s_addc_u32 s7, s1, 0 -; GFX8-NEXT: s_add_u32 s8, s0, 0x80 -; GFX8-NEXT: s_addc_u32 s9, s1, 0 -; GFX8-NEXT: s_add_u32 s12, s0, 0x70 -; GFX8-NEXT: v_and_b32_e32 v16, 1, v1 -; GFX8-NEXT: v_lshrrev_b16_e64 v1, 5, s14 -; GFX8-NEXT: s_addc_u32 s13, s1, 0 -; GFX8-NEXT: v_and_b32_e32 v17, 1, v1 -; GFX8-NEXT: v_lshrrev_b16_e64 v1, 3, s14 -; GFX8-NEXT: v_mov_b32_e32 v23, s13 -; GFX8-NEXT: v_and_b32_e32 v25, 1, v1 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_mov_b32_e32 v22, s12 -; GFX8-NEXT: s_add_u32 s12, s0, 0xf0 -; GFX8-NEXT: v_and_b32_e32 v18, 1, v4 -; GFX8-NEXT: v_mov_b32_e32 v19, v1 -; GFX8-NEXT: v_mov_b32_e32 v21, v1 -; GFX8-NEXT: s_addc_u32 s13, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[18:21] -; GFX8-NEXT: v_mov_b32_e32 v23, s13 -; GFX8-NEXT: v_lshrrev_b16_e64 v6, 6, s14 -; GFX8-NEXT: v_mov_b32_e32 v22, s12 -; GFX8-NEXT: s_add_u32 s12, s0, 0x60 -; GFX8-NEXT: v_and_b32_e32 v18, 1, v6 -; GFX8-NEXT: v_lshrrev_b16_e64 v20, 7, s14 -; GFX8-NEXT: s_addc_u32 s13, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[18:21] -; GFX8-NEXT: v_lshrrev_b16_e64 v24, 4, s14 -; GFX8-NEXT: v_and_b32_e32 v18, 1, v14 -; GFX8-NEXT: v_and_b32_e32 v20, 0xffff, v15 -; GFX8-NEXT: v_mov_b32_e32 v15, s13 -; GFX8-NEXT: v_mov_b32_e32 v14, s12 -; GFX8-NEXT: s_add_u32 s12, s0, 0x50 -; GFX8-NEXT: s_addc_u32 s13, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v23, s13 -; GFX8-NEXT: v_mov_b32_e32 v22, s12 -; GFX8-NEXT: s_add_u32 s12, s0, 64 -; GFX8-NEXT: flat_store_dwordx4 v[14:15], v[18:21] -; GFX8-NEXT: s_addc_u32 s13, s1, 0 -; GFX8-NEXT: v_and_b32_e32 v18, 1, v9 -; GFX8-NEXT: v_and_b32_e32 v20, 0xffff, v11 -; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[18:21] -; GFX8-NEXT: v_and_b32_e32 v11, 0xffff, v10 -; GFX8-NEXT: v_mov_b32_e32 v10, 1 -; GFX8-NEXT: v_mov_b32_e32 v23, s13 -; GFX8-NEXT: v_and_b32_sdwa v18, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_e32 v20, 0xffff, v8 -; GFX8-NEXT: v_mov_b32_e32 v22, s12 -; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[18:21] -; GFX8-NEXT: v_and_b32_e32 v23, 0xffff, v2 -; GFX8-NEXT: v_and_b32_e32 v18, 1, v7 -; GFX8-NEXT: v_mov_b32_e32 v8, s3 -; GFX8-NEXT: v_and_b32_e32 v21, 1, v3 -; GFX8-NEXT: v_mov_b32_e32 v0, s21 -; GFX8-NEXT: v_mov_b32_e32 v2, s22 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v7, s2 -; GFX8-NEXT: flat_store_dwordx4 v[7:8], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v8, s5 -; GFX8-NEXT: v_mov_b32_e32 v0, s19 -; GFX8-NEXT: v_mov_b32_e32 v2, s20 -; GFX8-NEXT: v_mov_b32_e32 v7, s4 -; GFX8-NEXT: flat_store_dwordx4 v[7:8], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v8, s7 -; GFX8-NEXT: v_mov_b32_e32 v0, s17 -; GFX8-NEXT: v_mov_b32_e32 v2, s18 -; GFX8-NEXT: v_mov_b32_e32 v7, s6 -; GFX8-NEXT: flat_store_dwordx4 v[7:8], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v7, s8 -; GFX8-NEXT: v_mov_b32_e32 v0, s16 -; GFX8-NEXT: v_mov_b32_e32 v2, s15 -; GFX8-NEXT: v_mov_b32_e32 v8, s9 -; GFX8-NEXT: s_add_u32 s2, s0, 48 -; GFX8-NEXT: flat_store_dwordx4 v[7:8], v[0:3] -; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_and_b32_e32 v15, 1, v24 -; GFX8-NEXT: v_mov_b32_e32 v22, v1 -; GFX8-NEXT: v_mov_b32_e32 v24, v1 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: s_add_u32 s2, s0, 32 -; GFX8-NEXT: flat_store_dwordx4 v[2:3], 
v[21:24] -; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_and_b32_e32 v20, 0xffff, v5 -; GFX8-NEXT: v_mov_b32_e32 v21, v1 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: s_add_u32 s2, s0, 16 -; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[18:21] -; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_and_b32_e32 v14, 0xffff, v16 -; GFX8-NEXT: v_and_b32_e32 v9, 1, v12 -; GFX8-NEXT: v_mov_b32_e32 v10, v1 -; GFX8-NEXT: v_mov_b32_e32 v12, v1 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: v_mov_b32_e32 v8, s1 -; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[9:12] -; GFX8-NEXT: v_mov_b32_e32 v0, s11 -; GFX8-NEXT: v_mov_b32_e32 v2, v14 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v7, s0 -; GFX8-NEXT: s_add_u32 s2, s0, 0xe0 -; GFX8-NEXT: flat_store_dwordx4 v[7:8], v[0:3] -; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: s_add_u32 s2, s0, 0xd0 -; GFX8-NEXT: v_lshrrev_b16_e64 v4, 1, s14 -; GFX8-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GFX8-NEXT: v_mov_b32_e32 v16, v1 -; GFX8-NEXT: v_mov_b32_e32 v18, v1 -; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_lshrrev_b16_e64 v26, 2, s14 -; GFX8-NEXT: v_and_b32_e32 v4, 1, v4 -; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[15:18] -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: s_add_u32 s0, s0, 0xc0 -; GFX8-NEXT: v_and_b32_e32 v13, 0xffff, v4 -; GFX8-NEXT: v_and_b32_e32 v4, 1, v26 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff, v25 -; GFX8-NEXT: v_mov_b32_e32 v5, v1 -; GFX8-NEXT: v_mov_b32_e32 v7, v1 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[4:7] -; GFX8-NEXT: v_mov_b32_e32 v0, s10 -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, v13 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NEXT: s_add_u32 s6, s0, 0xe0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, s34 +; GFX8-NEXT: v_mov_b32_e32 v2, s8 +; GFX8-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NEXT: s_add_u32 s6, s0, 0xd0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, s33 +; GFX8-NEXT: v_mov_b32_e32 v2, s9 +; GFX8-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NEXT: s_add_u32 s6, s0, 0xc0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, s31 +; GFX8-NEXT: v_mov_b32_e32 v2, s10 +; GFX8-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NEXT: s_add_u32 s6, s0, 0xb0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, s30 +; GFX8-NEXT: v_mov_b32_e32 v2, s11 +; GFX8-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NEXT: s_add_u32 s6, s0, 0xa0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, s28 +; GFX8-NEXT: v_mov_b32_e32 v2, s29 +; GFX8-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NEXT: s_add_u32 s6, s0, 0x90 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, s27 +; GFX8-NEXT: v_mov_b32_e32 v2, s12 +; GFX8-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NEXT: s_add_u32 s6, 
s0, 0x80 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, s26 +; GFX8-NEXT: v_mov_b32_e32 v2, s13 +; GFX8-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NEXT: s_add_u32 s6, s0, 0x70 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, s25 +; GFX8-NEXT: v_mov_b32_e32 v2, s14 +; GFX8-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NEXT: s_add_u32 s6, s0, 0x60 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, s24 +; GFX8-NEXT: v_mov_b32_e32 v2, s15 +; GFX8-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NEXT: s_add_u32 s6, s0, 0x50 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, s23 +; GFX8-NEXT: v_mov_b32_e32 v2, s16 +; GFX8-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NEXT: s_add_u32 s6, s0, 64 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, s22 +; GFX8-NEXT: v_mov_b32_e32 v2, s17 +; GFX8-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NEXT: s_add_u32 s6, s0, 48 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, s21 +; GFX8-NEXT: v_mov_b32_e32 v2, s18 +; GFX8-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NEXT: s_add_u32 s6, s0, 32 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, s20 +; GFX8-NEXT: v_mov_b32_e32 v2, s19 +; GFX8-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: s_add_u32 s4, s0, 16 +; GFX8-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NEXT: s_addc_u32 s5, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm ; @@ -6795,114 +6579,102 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_lshrrev_b16 v0, 13, s2 -; GFX12-NEXT: v_lshrrev_b16 v3, 11, s2 -; GFX12-NEXT: s_lshr_b32 s3, s2, 24 -; GFX12-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-NEXT: v_lshrrev_b16 v2, 12, s2 -; GFX12-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX12-NEXT: v_lshrrev_b16 v4, 9, s2 -; GFX12-NEXT: v_lshrrev_b16 v8, 7, s2 +; GFX12-NEXT: s_bfe_u32 s3, s2, 0x1001e +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s3 +; GFX12-NEXT: s_lshr_b32 s4, s2, 31 +; GFX12-NEXT: s_bfe_u32 s3, s2, 0x1001d +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, v1 +; GFX12-NEXT: s_bfe_u32 s4, s2, 0x1001c +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:240 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-NEXT: s_bfe_u32 s3, s2, 0x1001b +; GFX12-NEXT: s_bfe_u32 s4, s2, 0x1001a +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:224 +; 
GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10019 +; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10018 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:208 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_lshrrev_b16 v16, 7, s3 -; GFX12-NEXT: v_lshrrev_b16 v18, 6, s3 -; GFX12-NEXT: v_lshrrev_b16 v17, 5, s3 -; GFX12-NEXT: v_lshrrev_b16 v20, 4, s3 -; GFX12-NEXT: v_lshrrev_b16 v21, 3, s3 -; GFX12-NEXT: v_lshrrev_b16 v22, 2, s3 -; GFX12-NEXT: v_lshrrev_b16 v23, 1, s3 -; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10016 -; GFX12-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX12-NEXT: s_bfe_u32 s6, s2, 0x10017 -; GFX12-NEXT: v_lshrrev_b16 v11, 5, s2 -; GFX12-NEXT: v_lshrrev_b16 v13, 3, s2 -; GFX12-NEXT: v_and_b32_e32 v24, 1, v4 -; GFX12-NEXT: v_and_b32_e32 v25, 1, v8 -; GFX12-NEXT: v_and_b32_e32 v28, 1, v21 -; GFX12-NEXT: v_dual_mov_b32 v42, v1 :: v_dual_and_b32 v31, 1, v2 -; GFX12-NEXT: v_dual_mov_b32 v32, v1 :: v_dual_and_b32 v33, 0xffff, v0 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10017 +; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10016 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:192 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_and_b32 v21, 0xffff, v3 -; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, v1 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_mov_b32_e32 v2, s3 ; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10014 -; GFX12-NEXT: s_bfe_u32 s6, s2, 0x10015 -; GFX12-NEXT: v_lshrrev_b16 v9, 8, s2 -; GFX12-NEXT: v_lshrrev_b16 v15, 1, s2 -; GFX12-NEXT: v_and_b32_e32 v11, 1, v11 -; GFX12-NEXT: v_and_b32_e32 v13, 1, v13 +; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10015 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:176 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v2, s6 ; GFX12-NEXT: v_mov_b32_e32 v0, s3 -; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10012 -; GFX12-NEXT: s_bfe_u32 s6, s2, 0x10013 -; GFX12-NEXT: v_lshrrev_b16 v6, 10, s2 -; GFX12-NEXT: v_and_b32_e32 v26, 1, v15 -; GFX12-NEXT: v_dual_mov_b32 v36, v1 :: v_dual_and_b32 v15, 1, v9 -; GFX12-NEXT: v_and_b32_e32 v9, 1, v17 -; GFX12-NEXT: v_and_b32_e32 v29, 1, v23 +; GFX12-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10013 +; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10012 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:160 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v2, s6 -; GFX12-NEXT: v_mov_b32_e32 v0, s3 -; GFX12-NEXT: v_lshrrev_b16 v5, 15, s2 -; GFX12-NEXT: v_lshrrev_b16 v7, 14, s2 -; GFX12-NEXT: v_lshrrev_b16 v10, 6, s2 -; GFX12-NEXT: v_lshrrev_b16 v12, 4, s2 -; GFX12-NEXT: v_lshrrev_b16 v14, 2, s2 -; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10018 -; GFX12-NEXT: s_and_b32 s5, s2, 1 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_mov_b32_e32 v2, s3 ; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10011 -; GFX12-NEXT: s_bfe_u32 s2, s2, 0x10010 -; GFX12-NEXT: v_dual_mov_b32 v38, v1 :: v_dual_and_b32 v19, 1, v6 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff, v13 -; GFX12-NEXT: v_and_b32_e32 v17, 0xffff, v24 +; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10010 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:144 ; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: v_mov_b32_e32 v2, s3 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v13, v1 -; GFX12-NEXT: v_and_b32_e32 v43, 0xffff, v26 -; GFX12-NEXT: v_and_b32_e32 v4, 1, v14 -; GFX12-NEXT: v_and_b32_e32 v8, 1, v12 -; GFX12-NEXT: v_and_b32_e32 v44, 0xffff, v29 -; 
GFX12-NEXT: v_dual_mov_b32 v34, v1 :: v_dual_and_b32 v35, 1, v18 -; GFX12-NEXT: v_and_b32_e32 v37, 0xffff, v16 -; GFX12-NEXT: v_and_b32_e32 v39, 1, v7 -; GFX12-NEXT: v_dual_mov_b32 v16, v1 :: v_dual_and_b32 v41, 0xffff, v5 +; GFX12-NEXT: s_bfe_u32 s3, s2, 0x1000f +; GFX12-NEXT: s_bfe_u32 s4, s2, 0x1000e ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:128 -; GFX12-NEXT: v_mov_b32_e32 v5, v1 -; GFX12-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v0, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, v43 :: v_dual_and_b32 v29, 0xffff, v9 -; GFX12-NEXT: v_dual_mov_b32 v40, v1 :: v_dual_and_b32 v23, 1, v22 -; GFX12-NEXT: v_dual_mov_b32 v30, v1 :: v_dual_and_b32 v27, 1, v20 -; GFX12-NEXT: v_mov_b32_e32 v20, v1 -; GFX12-NEXT: v_mov_b32_e32 v22, v1 -; GFX12-NEXT: v_mov_b32_e32 v18, v1 -; GFX12-NEXT: v_and_b32_e32 v12, 1, v10 -; GFX12-NEXT: v_and_b32_e32 v10, 0xffff, v11 -; GFX12-NEXT: v_and_b32_e32 v14, 0xffff, v25 -; GFX12-NEXT: v_mov_b32_e32 v24, v1 -; GFX12-NEXT: s_clause 0x4 -; GFX12-NEXT: global_store_b128 v1, v[35:38], s[0:1] offset:240 -; GFX12-NEXT: global_store_b128 v1, v[39:42], s[0:1] offset:112 -; GFX12-NEXT: global_store_b128 v1, v[31:34], s[0:1] offset:96 -; GFX12-NEXT: global_store_b128 v1, v[19:22], s[0:1] offset:80 -; GFX12-NEXT: global_store_b128 v1, v[15:18], s[0:1] offset:64 -; GFX12-NEXT: v_mov_b32_e32 v15, v1 -; GFX12-NEXT: v_mov_b32_e32 v11, v1 -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v1, v[4:7], s[0:1] offset:16 -; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s4 -; GFX12-NEXT: v_dual_mov_b32 v2, v44 :: v_dual_mov_b32 v9, v1 -; GFX12-NEXT: v_dual_mov_b32 v26, v1 :: v_dual_and_b32 v25, 0xffff, v28 -; GFX12-NEXT: v_mov_b32_e32 v28, v1 -; GFX12-NEXT: s_clause 0x4 -; GFX12-NEXT: global_store_b128 v1, v[12:15], s[0:1] offset:48 -; GFX12-NEXT: global_store_b128 v1, v[8:11], s[0:1] offset:32 -; GFX12-NEXT: global_store_b128 v1, v[27:30], s[0:1] offset:224 -; GFX12-NEXT: global_store_b128 v1, v[23:26], s[0:1] offset:208 -; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:192 +; GFX12-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-NEXT: s_bfe_u32 s3, s2, 0x1000d +; GFX12-NEXT: s_bfe_u32 s4, s2, 0x1000c +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:112 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-NEXT: s_bfe_u32 s3, s2, 0x1000b +; GFX12-NEXT: s_bfe_u32 s4, s2, 0x1000a +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:96 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10009 +; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10008 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:80 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10007 +; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10006 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:64 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10005 +; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10004 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:48 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10003 +; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10002 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:32 +; GFX12-NEXT: 
s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10001 +; GFX12-NEXT: s_and_b32 s2, s2, 1 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -7082,189 +6854,220 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s4, s[2:3], 0x0 +; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshr_b32 s6, s4, 22 -; GFX8-NEXT: s_lshr_b32 s8, s4, 23 -; GFX8-NEXT: s_lshr_b32 s10, s4, 20 -; GFX8-NEXT: s_lshr_b32 s12, s4, 21 -; GFX8-NEXT: s_lshr_b32 s14, s4, 18 -; GFX8-NEXT: s_lshr_b32 s16, s4, 19 -; GFX8-NEXT: s_lshr_b32 s18, s4, 16 -; GFX8-NEXT: s_lshr_b32 s20, s4, 17 -; GFX8-NEXT: s_lshr_b32 s2, s4, 24 -; GFX8-NEXT: v_lshrrev_b16_e64 v2, 14, s4 -; GFX8-NEXT: v_lshrrev_b16_e64 v3, 15, s4 -; GFX8-NEXT: v_lshrrev_b16_e64 v15, 12, s4 -; GFX8-NEXT: v_lshrrev_b16_e64 v17, 13, s4 -; GFX8-NEXT: v_lshrrev_b16_e64 v13, 10, s4 -; GFX8-NEXT: v_lshrrev_b16_e64 v14, 11, s4 -; GFX8-NEXT: v_lshrrev_b16_e64 v9, 8, s4 -; GFX8-NEXT: v_lshrrev_b16_e64 v10, 9, s4 -; GFX8-NEXT: v_lshrrev_b16_e64 v6, 6, s4 -; GFX8-NEXT: v_lshrrev_b16_e64 v8, 7, s4 -; GFX8-NEXT: v_lshrrev_b16_e64 v4, 4, s4 -; GFX8-NEXT: v_lshrrev_b16_e64 v5, 5, s4 -; GFX8-NEXT: v_lshrrev_b16_e64 v0, 2, s4 -; GFX8-NEXT: v_lshrrev_b16_e64 v1, 3, s4 -; GFX8-NEXT: v_lshrrev_b16_e64 v7, 1, s4 -; GFX8-NEXT: v_lshrrev_b16_e64 v11, 6, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v12, 7, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v16, 4, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v18, 5, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v19, 2, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v20, 3, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v27, 1, s2 -; GFX8-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 +; GFX8-NEXT: s_lshr_b32 s44, s2, 30 +; GFX8-NEXT: s_lshr_b32 s46, s2, 31 +; GFX8-NEXT: s_lshr_b32 s48, s2, 28 +; GFX8-NEXT: s_lshr_b32 s50, s2, 29 +; GFX8-NEXT: s_lshr_b32 s52, s2, 26 +; GFX8-NEXT: s_lshr_b32 s54, s2, 27 +; GFX8-NEXT: s_lshr_b32 s56, s2, 24 +; GFX8-NEXT: s_lshr_b32 s58, s2, 25 +; GFX8-NEXT: s_lshr_b32 s60, s2, 22 +; GFX8-NEXT: s_lshr_b32 s62, s2, 23 +; GFX8-NEXT: s_lshr_b32 s64, s2, 20 +; GFX8-NEXT: s_lshr_b32 s66, s2, 21 +; GFX8-NEXT: s_lshr_b32 s42, s2, 18 +; GFX8-NEXT: s_lshr_b32 s40, s2, 19 +; GFX8-NEXT: s_lshr_b32 s38, s2, 16 +; GFX8-NEXT: s_lshr_b32 s36, s2, 17 +; GFX8-NEXT: s_lshr_b32 s34, s2, 14 +; GFX8-NEXT: s_lshr_b32 s30, s2, 15 +; GFX8-NEXT: s_lshr_b32 s28, s2, 12 +; GFX8-NEXT: s_lshr_b32 s26, s2, 13 +; GFX8-NEXT: s_lshr_b32 s24, s2, 10 +; GFX8-NEXT: s_lshr_b32 s22, s2, 11 +; GFX8-NEXT: s_lshr_b32 s20, s2, 8 +; GFX8-NEXT: s_lshr_b32 s18, s2, 9 +; GFX8-NEXT: s_lshr_b32 s16, s2, 6 +; GFX8-NEXT: s_lshr_b32 s14, s2, 7 +; GFX8-NEXT: s_lshr_b32 s12, s2, 4 +; GFX8-NEXT: s_lshr_b32 
s10, s2, 5 +; GFX8-NEXT: s_lshr_b32 s8, s2, 2 +; GFX8-NEXT: s_lshr_b32 s6, s2, 3 +; GFX8-NEXT: s_lshr_b32 s68, s2, 1 +; GFX8-NEXT: s_bfe_i64 s[4:5], s[2:3], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[2:3], s[68:69], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000 -; GFX8-NEXT: v_mov_b32_e32 v21, s6 -; GFX8-NEXT: s_add_u32 s6, s0, 0xb0 -; GFX8-NEXT: v_mov_b32_e32 v22, s7 -; GFX8-NEXT: s_addc_u32 s7, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v26, s7 -; GFX8-NEXT: v_mov_b32_e32 v25, s6 -; GFX8-NEXT: s_add_u32 s6, s0, 0xa0 -; GFX8-NEXT: v_mov_b32_e32 v23, s8 -; GFX8-NEXT: v_mov_b32_e32 v24, s9 -; GFX8-NEXT: s_addc_u32 s7, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[25:26], v[21:24] -; GFX8-NEXT: v_mov_b32_e32 v26, s7 -; GFX8-NEXT: v_mov_b32_e32 v25, s6 -; GFX8-NEXT: s_add_u32 s6, s0, 0x90 -; GFX8-NEXT: v_mov_b32_e32 v21, s10 -; GFX8-NEXT: v_mov_b32_e32 v22, s11 -; GFX8-NEXT: v_mov_b32_e32 v23, s12 -; GFX8-NEXT: v_mov_b32_e32 v24, s13 -; GFX8-NEXT: s_addc_u32 s7, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[25:26], v[21:24] -; GFX8-NEXT: v_mov_b32_e32 v26, s7 -; GFX8-NEXT: v_mov_b32_e32 v25, s6 -; GFX8-NEXT: s_add_u32 s6, s0, 0x80 -; GFX8-NEXT: v_mov_b32_e32 v21, s14 -; GFX8-NEXT: v_mov_b32_e32 v22, s15 -; GFX8-NEXT: v_mov_b32_e32 v23, s16 -; GFX8-NEXT: v_mov_b32_e32 v24, s17 -; GFX8-NEXT: s_addc_u32 s7, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[25:26], v[21:24] -; GFX8-NEXT: v_mov_b32_e32 v26, s7 -; GFX8-NEXT: v_mov_b32_e32 v21, s18 -; GFX8-NEXT: v_mov_b32_e32 v22, s19 -; GFX8-NEXT: v_mov_b32_e32 v23, s20 -; GFX8-NEXT: v_mov_b32_e32 v24, s21 -; GFX8-NEXT: v_mov_b32_e32 v25, s6 -; GFX8-NEXT: s_add_u32 s6, s0, 0x70 -; GFX8-NEXT: flat_store_dwordx4 v[25:26], v[21:24] -; GFX8-NEXT: s_addc_u32 s7, s1, 0 -; GFX8-NEXT: v_bfe_i32 v23, v3, 0, 1 -; GFX8-NEXT: v_bfe_i32 v21, v2, 0, 1 +; GFX8-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[66:67], s[66:67], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[64:65], s[64:65], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[62:63], s[62:63], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 +; GFX8-NEXT: v_mov_b32_e32 v0, s44 +; GFX8-NEXT: s_add_u32 s44, s0, 0xf0 +; GFX8-NEXT: v_mov_b32_e32 v1, s45 +; GFX8-NEXT: s_addc_u32 s45, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s44 +; GFX8-NEXT: v_mov_b32_e32 v2, s46 +; GFX8-NEXT: v_mov_b32_e32 v3, s47 +; GFX8-NEXT: 
v_mov_b32_e32 v5, s45 +; GFX8-NEXT: s_add_u32 s44, s0, 0xe0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_addc_u32 s45, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s44 +; GFX8-NEXT: v_mov_b32_e32 v0, s48 +; GFX8-NEXT: v_mov_b32_e32 v1, s49 +; GFX8-NEXT: v_mov_b32_e32 v2, s50 +; GFX8-NEXT: v_mov_b32_e32 v3, s51 +; GFX8-NEXT: v_mov_b32_e32 v5, s45 +; GFX8-NEXT: s_add_u32 s44, s0, 0xd0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_addc_u32 s45, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s44 +; GFX8-NEXT: v_mov_b32_e32 v0, s52 +; GFX8-NEXT: v_mov_b32_e32 v1, s53 +; GFX8-NEXT: v_mov_b32_e32 v2, s54 +; GFX8-NEXT: v_mov_b32_e32 v3, s55 +; GFX8-NEXT: v_mov_b32_e32 v5, s45 +; GFX8-NEXT: s_add_u32 s44, s0, 0xc0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_addc_u32 s45, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s44 +; GFX8-NEXT: v_mov_b32_e32 v0, s56 +; GFX8-NEXT: v_mov_b32_e32 v1, s57 +; GFX8-NEXT: v_mov_b32_e32 v2, s58 +; GFX8-NEXT: v_mov_b32_e32 v3, s59 +; GFX8-NEXT: v_mov_b32_e32 v5, s45 +; GFX8-NEXT: s_add_u32 s44, s0, 0xb0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_addc_u32 s45, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s44 +; GFX8-NEXT: v_mov_b32_e32 v0, s60 +; GFX8-NEXT: v_mov_b32_e32 v1, s61 +; GFX8-NEXT: v_mov_b32_e32 v2, s62 +; GFX8-NEXT: v_mov_b32_e32 v3, s63 +; GFX8-NEXT: v_mov_b32_e32 v5, s45 +; GFX8-NEXT: s_add_u32 s44, s0, 0xa0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_addc_u32 s45, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s44 +; GFX8-NEXT: v_mov_b32_e32 v0, s64 +; GFX8-NEXT: v_mov_b32_e32 v1, s65 +; GFX8-NEXT: v_mov_b32_e32 v2, s66 +; GFX8-NEXT: v_mov_b32_e32 v3, s67 +; GFX8-NEXT: v_mov_b32_e32 v5, s45 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s40 +; GFX8-NEXT: s_add_u32 s40, s0, 0x90 +; GFX8-NEXT: v_mov_b32_e32 v3, s41 +; GFX8-NEXT: s_addc_u32 s41, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s40 +; GFX8-NEXT: v_mov_b32_e32 v0, s42 +; GFX8-NEXT: v_mov_b32_e32 v1, s43 +; GFX8-NEXT: v_mov_b32_e32 v5, s41 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s36 +; GFX8-NEXT: s_add_u32 s36, s0, 0x80 +; GFX8-NEXT: v_mov_b32_e32 v3, s37 +; GFX8-NEXT: s_addc_u32 s37, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s36 +; GFX8-NEXT: v_mov_b32_e32 v0, s38 +; GFX8-NEXT: v_mov_b32_e32 v1, s39 +; GFX8-NEXT: v_mov_b32_e32 v5, s37 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s30 +; GFX8-NEXT: s_add_u32 s30, s0, 0x70 +; GFX8-NEXT: v_mov_b32_e32 v3, s31 +; GFX8-NEXT: s_addc_u32 s31, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s30 +; GFX8-NEXT: v_mov_b32_e32 v0, s34 +; GFX8-NEXT: v_mov_b32_e32 v1, s35 +; GFX8-NEXT: v_mov_b32_e32 v5, s31 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s26 +; GFX8-NEXT: s_add_u32 s26, s0, 0x60 +; GFX8-NEXT: v_mov_b32_e32 v3, s27 +; GFX8-NEXT: s_addc_u32 s27, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s26 +; GFX8-NEXT: v_mov_b32_e32 v0, s28 +; GFX8-NEXT: v_mov_b32_e32 v1, s29 +; GFX8-NEXT: v_mov_b32_e32 v5, s27 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s22 +; GFX8-NEXT: s_add_u32 s22, s0, 0x50 +; GFX8-NEXT: v_mov_b32_e32 v3, s23 +; GFX8-NEXT: s_addc_u32 s23, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s22 +; GFX8-NEXT: v_mov_b32_e32 v0, s24 +; GFX8-NEXT: v_mov_b32_e32 v1, s25 +; GFX8-NEXT: v_mov_b32_e32 v5, s23 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], 
v[0:3] +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s18 +; GFX8-NEXT: s_add_u32 s18, s0, 64 +; GFX8-NEXT: v_mov_b32_e32 v3, s19 +; GFX8-NEXT: s_addc_u32 s19, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s18 +; GFX8-NEXT: v_mov_b32_e32 v0, s20 +; GFX8-NEXT: v_mov_b32_e32 v1, s21 +; GFX8-NEXT: v_mov_b32_e32 v5, s19 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s14 +; GFX8-NEXT: s_add_u32 s14, s0, 48 +; GFX8-NEXT: v_mov_b32_e32 v3, s15 +; GFX8-NEXT: s_addc_u32 s15, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s14 +; GFX8-NEXT: v_mov_b32_e32 v0, s16 +; GFX8-NEXT: v_mov_b32_e32 v1, s17 +; GFX8-NEXT: v_mov_b32_e32 v5, s15 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s10 +; GFX8-NEXT: s_add_u32 s10, s0, 32 +; GFX8-NEXT: v_mov_b32_e32 v3, s11 +; GFX8-NEXT: s_addc_u32 s11, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s10 +; GFX8-NEXT: v_mov_b32_e32 v0, s12 +; GFX8-NEXT: v_mov_b32_e32 v1, s13 +; GFX8-NEXT: v_mov_b32_e32 v5, s11 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: s_add_u32 s6, s0, 16 ; GFX8-NEXT: v_mov_b32_e32 v3, s7 -; GFX8-NEXT: s_add_u32 s6, s0, 0x60 -; GFX8-NEXT: s_addc_u32 s7, s1, 0 -; GFX8-NEXT: v_ashrrev_i32_e32 v24, 31, v23 -; GFX8-NEXT: v_ashrrev_i32_e32 v22, 31, v21 -; GFX8-NEXT: v_mov_b32_e32 v26, s7 -; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[21:24] -; GFX8-NEXT: v_mov_b32_e32 v25, s6 -; GFX8-NEXT: v_bfe_i32 v23, v17, 0, 1 -; GFX8-NEXT: v_bfe_i32 v21, v15, 0, 1 -; GFX8-NEXT: s_add_u32 s6, s0, 0x50 -; GFX8-NEXT: v_ashrrev_i32_e32 v24, 31, v23 -; GFX8-NEXT: v_ashrrev_i32_e32 v22, 31, v21 -; GFX8-NEXT: s_addc_u32 s7, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[25:26], v[21:24] -; GFX8-NEXT: v_bfe_i32 v25, v14, 0, 1 -; GFX8-NEXT: v_bfe_i32 v23, v13, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v14, s7 -; GFX8-NEXT: v_mov_b32_e32 v13, s6 -; GFX8-NEXT: s_add_u32 s6, s0, 64 -; GFX8-NEXT: v_ashrrev_i32_e32 v26, 31, v25 -; GFX8-NEXT: v_ashrrev_i32_e32 v24, 31, v23 ; GFX8-NEXT: s_addc_u32 s7, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[13:14], v[23:26] -; GFX8-NEXT: v_bfe_i32 v12, v12, 0, 1 -; GFX8-NEXT: v_bfe_i32 v25, v10, 0, 1 -; GFX8-NEXT: v_bfe_i32 v23, v9, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v10, s7 -; GFX8-NEXT: v_mov_b32_e32 v9, s6 -; GFX8-NEXT: s_add_u32 s6, s0, 48 -; GFX8-NEXT: v_ashrrev_i32_e32 v26, 31, v25 -; GFX8-NEXT: v_ashrrev_i32_e32 v24, 31, v23 -; GFX8-NEXT: s_addc_u32 s7, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[9:10], v[23:26] -; GFX8-NEXT: v_bfe_i32 v10, v11, 0, 1 -; GFX8-NEXT: v_bfe_i32 v25, v8, 0, 1 -; GFX8-NEXT: v_bfe_i32 v23, v6, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v9, s7 -; GFX8-NEXT: v_ashrrev_i32_e32 v26, 31, v25 -; GFX8-NEXT: v_ashrrev_i32_e32 v24, 31, v23 -; GFX8-NEXT: v_mov_b32_e32 v8, s6 -; GFX8-NEXT: s_add_u32 s6, s0, 32 -; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[23:26] -; GFX8-NEXT: s_addc_u32 s7, s1, 0 -; GFX8-NEXT: v_bfe_i32 v25, v5, 0, 1 -; GFX8-NEXT: v_bfe_i32 v23, v4, 0, 1 ; GFX8-NEXT: v_mov_b32_e32 v4, s6 -; GFX8-NEXT: v_ashrrev_i32_e32 v26, 31, v25 -; GFX8-NEXT: v_ashrrev_i32_e32 v24, 31, v23 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: v_mov_b32_e32 v1, s9 ; GFX8-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NEXT: s_add_u32 s6, s0, 16 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[23:26] -; GFX8-NEXT: s_addc_u32 s7, s1, 0 -; GFX8-NEXT: v_bfe_i32 v25, v1, 0, 1 -; GFX8-NEXT: v_bfe_i32 v23, v0, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_ashrrev_i32_e32 v26, 31, v25 -; GFX8-NEXT: v_ashrrev_i32_e32 v24, 
31, v23 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_bfe_i32 v6, v7, 0, 1 -; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[23:26] -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: s_add_u32 s4, s0, 0xf0 -; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7] -; GFX8-NEXT: s_addc_u32 s5, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_ashrrev_i32_e32 v13, 31, v12 -; GFX8-NEXT: v_ashrrev_i32_e32 v11, 31, v10 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: s_add_u32 s4, s0, 0xe0 -; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[10:13] -; GFX8-NEXT: s_addc_u32 s5, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: s_add_u32 s4, s0, 0xd0 -; GFX8-NEXT: v_bfe_i32 v17, v18, 0, 1 -; GFX8-NEXT: v_bfe_i32 v15, v16, 0, 1 -; GFX8-NEXT: s_addc_u32 s5, s1, 0 -; GFX8-NEXT: v_ashrrev_i32_e32 v18, 31, v17 -; GFX8-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; GFX8-NEXT: s_add_u32 s0, s0, 0xc0 -; GFX8-NEXT: v_bfe_i32 v21, v20, 0, 1 -; GFX8-NEXT: v_bfe_i32 v19, v19, 0, 1 -; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[15:18] +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_bfe_i32 v2, v27, 0, 1 -; GFX8-NEXT: v_ashrrev_i32_e32 v22, 31, v21 -; GFX8-NEXT: v_ashrrev_i32_e32 v20, 31, v19 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[19:22] -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm @@ -7465,121 +7268,123 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_lshrrev_b16 v26, 6, s2 -; GFX12-NEXT: v_lshrrev_b16 v28, 7, s2 -; GFX12-NEXT: v_lshrrev_b16 v4, 2, s2 -; GFX12-NEXT: v_lshrrev_b16 v5, 3, s2 -; GFX12-NEXT: s_lshr_b32 s22, s2, 24 -; GFX12-NEXT: v_lshrrev_b16 v8, 4, s2 -; GFX12-NEXT: v_lshrrev_b16 v3, 5, s2 -; GFX12-NEXT: v_lshrrev_b16 v2, 1, s2 -; GFX12-NEXT: v_lshrrev_b16 v7, 6, s22 -; GFX12-NEXT: v_lshrrev_b16 v11, 7, s22 -; GFX12-NEXT: v_lshrrev_b16 v13, 4, s22 -; GFX12-NEXT: v_lshrrev_b16 v15, 5, s22 -; GFX12-NEXT: v_lshrrev_b16 v0, 14, s2 -; GFX12-NEXT: v_lshrrev_b16 v1, 15, s2 -; GFX12-NEXT: v_lshrrev_b16 v14, 2, s22 -; GFX12-NEXT: v_lshrrev_b16 v16, 3, s22 -; GFX12-NEXT: v_lshrrev_b16 v35, 12, s2 -; GFX12-NEXT: v_lshrrev_b16 v37, 13, s2 -; GFX12-NEXT: v_lshrrev_b16 v34, 10, s2 -; GFX12-NEXT: v_lshrrev_b16 v36, 11, s2 -; GFX12-NEXT: v_lshrrev_b16 v9, 8, s2 -; GFX12-NEXT: v_lshrrev_b16 v27, 9, s2 -; GFX12-NEXT: v_lshrrev_b16 v12, 1, s22 -; GFX12-NEXT: v_bfe_i32 v6, v5, 0, 1 -; GFX12-NEXT: v_bfe_i32 v4, v4, 0, 1 -; GFX12-NEXT: v_bfe_i32 v28, v28, 0, 1 -; GFX12-NEXT: v_bfe_i32 v26, v26, 0, 1 -; GFX12-NEXT: s_lshr_b32 s4, s2, 22 -; GFX12-NEXT: s_lshr_b32 s8, s2, 23 -; GFX12-NEXT: v_bfe_i32 v2, v2, 0, 1 -; GFX12-NEXT: v_bfe_i32 v10, v3, 0, 1 -; GFX12-NEXT: v_bfe_i32 v8, v8, 0, 1 -; GFX12-NEXT: s_lshr_b32 s10, s2, 20 -; GFX12-NEXT: s_lshr_b32 s12, s2, 21 -; GFX12-NEXT: s_lshr_b32 s20, s2, 17 -; GFX12-NEXT: v_bfe_i32 v24, v11, 0, 1 -; GFX12-NEXT: v_bfe_i32 v22, v7, 0, 1 -; 
GFX12-NEXT: s_lshr_b32 s14, s2, 18 -; GFX12-NEXT: s_lshr_b32 s16, s2, 19 -; GFX12-NEXT: v_bfe_i32 v20, v15, 0, 1 -; GFX12-NEXT: v_bfe_i32 v18, v13, 0, 1 -; GFX12-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000 -; GFX12-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 -; GFX12-NEXT: s_lshr_b32 s18, s2, 16 -; GFX12-NEXT: s_bfe_i64 s[6:7], s[2:3], 0x10000 -; GFX12-NEXT: v_bfe_i32 v16, v16, 0, 1 -; GFX12-NEXT: v_bfe_i32 v14, v14, 0, 1 -; GFX12-NEXT: v_bfe_i32 v44, v1, 0, 1 -; GFX12-NEXT: v_bfe_i32 v42, v0, 0, 1 -; GFX12-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 -; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000 -; GFX12-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000 -; GFX12-NEXT: v_dual_mov_b32 v62, 0 :: v_dual_mov_b32 v47, s5 -; GFX12-NEXT: v_bfe_i32 v12, v12, 0, 1 -; GFX12-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; GFX12-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; GFX12-NEXT: v_bfe_i32 v32, v27, 0, 1 -; GFX12-NEXT: v_bfe_i32 v30, v9, 0, 1 -; GFX12-NEXT: v_ashrrev_i32_e32 v29, 31, v28 -; GFX12-NEXT: v_ashrrev_i32_e32 v27, 31, v26 -; GFX12-NEXT: v_bfe_i32 v36, v36, 0, 1 -; GFX12-NEXT: v_bfe_i32 v34, v34, 0, 1 -; GFX12-NEXT: v_bfe_i32 v40, v37, 0, 1 -; GFX12-NEXT: v_bfe_i32 v38, v35, 0, 1 -; GFX12-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 -; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 -; GFX12-NEXT: v_dual_mov_b32 v46, s4 :: v_dual_mov_b32 v49, s9 -; GFX12-NEXT: v_dual_mov_b32 v48, s8 :: v_dual_mov_b32 v51, s11 -; GFX12-NEXT: s_bfe_i64 s[2:3], s[22:23], 0x10000 -; GFX12-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GFX12-NEXT: v_ashrrev_i32_e32 v11, 31, v10 -; GFX12-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GFX12-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 -; GFX12-NEXT: v_dual_mov_b32 v50, s10 :: v_dual_mov_b32 v53, s13 -; GFX12-NEXT: v_dual_mov_b32 v52, s12 :: v_dual_mov_b32 v55, s15 -; GFX12-NEXT: v_dual_mov_b32 v60, s20 :: v_dual_mov_b32 v1, s7 -; GFX12-NEXT: v_mov_b32_e32 v0, s6 -; GFX12-NEXT: v_ashrrev_i32_e32 v25, 31, v24 -; GFX12-NEXT: v_ashrrev_i32_e32 v23, 31, v22 -; GFX12-NEXT: v_dual_mov_b32 v54, s14 :: v_dual_mov_b32 v57, s17 -; GFX12-NEXT: v_dual_mov_b32 v56, s16 :: v_dual_mov_b32 v59, s19 -; GFX12-NEXT: v_ashrrev_i32_e32 v21, 31, v20 -; GFX12-NEXT: v_ashrrev_i32_e32 v19, 31, v18 -; GFX12-NEXT: v_dual_mov_b32 v58, s18 :: v_dual_mov_b32 v61, s21 -; GFX12-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GFX12-NEXT: v_ashrrev_i32_e32 v15, 31, v14 -; GFX12-NEXT: v_ashrrev_i32_e32 v45, 31, v44 -; GFX12-NEXT: v_ashrrev_i32_e32 v43, 31, v42 -; GFX12-NEXT: v_ashrrev_i32_e32 v13, 31, v12 -; GFX12-NEXT: v_ashrrev_i32_e32 v33, 31, v32 -; GFX12-NEXT: v_ashrrev_i32_e32 v31, 31, v30 -; GFX12-NEXT: v_ashrrev_i32_e32 v37, 31, v36 -; GFX12-NEXT: v_ashrrev_i32_e32 v35, 31, v34 -; GFX12-NEXT: v_ashrrev_i32_e32 v41, 31, v40 -; GFX12-NEXT: v_ashrrev_i32_e32 v39, 31, v38 -; GFX12-NEXT: s_clause 0x9 -; GFX12-NEXT: global_store_b128 v62, v[46:49], s[0:1] offset:176 -; GFX12-NEXT: global_store_b128 v62, v[50:53], s[0:1] offset:160 -; GFX12-NEXT: global_store_b128 v62, v[54:57], s[0:1] offset:144 -; GFX12-NEXT: global_store_b128 v62, v[58:61], s[0:1] offset:128 -; GFX12-NEXT: global_store_b128 v62, v[42:45], s[0:1] offset:112 -; GFX12-NEXT: global_store_b128 v62, v[38:41], s[0:1] offset:96 -; GFX12-NEXT: global_store_b128 v62, v[34:37], s[0:1] offset:80 -; GFX12-NEXT: global_store_b128 v62, v[30:33], s[0:1] offset:64 -; GFX12-NEXT: global_store_b128 v62, v[26:29], s[0:1] offset:48 -; GFX12-NEXT: global_store_b128 v62, v[8:11], s[0:1] offset:32 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v10, s2 :: v_dual_mov_b32 v11, s3 +; 
GFX12-NEXT: s_lshr_b32 s34, s2, 30 +; GFX12-NEXT: s_lshr_b32 s36, s2, 31 +; GFX12-NEXT: s_lshr_b32 s38, s2, 28 +; GFX12-NEXT: s_lshr_b32 s40, s2, 29 +; GFX12-NEXT: s_lshr_b32 s42, s2, 26 +; GFX12-NEXT: s_lshr_b32 s44, s2, 27 +; GFX12-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 +; GFX12-NEXT: s_lshr_b32 s46, s2, 24 +; GFX12-NEXT: s_lshr_b32 s48, s2, 25 +; GFX12-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 +; GFX12-NEXT: v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v1, s35 +; GFX12-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 +; GFX12-NEXT: v_dual_mov_b32 v0, s34 :: v_dual_mov_b32 v3, s37 +; GFX12-NEXT: v_dual_mov_b32 v2, s36 :: v_dual_mov_b32 v5, s39 +; GFX12-NEXT: s_lshr_b32 s26, s2, 22 +; GFX12-NEXT: s_lshr_b32 s50, s2, 23 +; GFX12-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x10000 +; GFX12-NEXT: v_dual_mov_b32 v4, s38 :: v_dual_mov_b32 v7, s41 +; GFX12-NEXT: v_dual_mov_b32 v6, s40 :: v_dual_mov_b32 v9, s43 +; GFX12-NEXT: s_lshr_b32 s52, s2, 20 +; GFX12-NEXT: s_lshr_b32 s54, s2, 21 +; GFX12-NEXT: v_dual_mov_b32 v8, s42 :: v_dual_mov_b32 v11, s45 +; GFX12-NEXT: v_dual_mov_b32 v10, s44 :: v_dual_mov_b32 v13, s47 +; GFX12-NEXT: s_lshr_b32 s56, s2, 18 +; GFX12-NEXT: s_lshr_b32 s58, s2, 19 +; GFX12-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x10000 +; GFX12-NEXT: v_dual_mov_b32 v12, s46 :: v_dual_mov_b32 v15, s49 +; GFX12-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 +; GFX12-NEXT: v_mov_b32_e32 v14, s48 +; GFX12-NEXT: s_lshr_b32 s60, s2, 16 +; GFX12-NEXT: s_lshr_b32 s62, s2, 17 +; GFX12-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x10000 +; GFX12-NEXT: s_lshr_b32 s64, s2, 14 +; GFX12-NEXT: s_lshr_b32 s66, s2, 15 +; GFX12-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x10000 +; GFX12-NEXT: s_clause 0x3 +; GFX12-NEXT: global_store_b128 v24, v[0:3], s[0:1] offset:240 +; GFX12-NEXT: global_store_b128 v24, v[4:7], s[0:1] offset:224 +; GFX12-NEXT: global_store_b128 v24, v[8:11], s[0:1] offset:208 +; GFX12-NEXT: global_store_b128 v24, v[12:15], s[0:1] offset:192 +; GFX12-NEXT: v_dual_mov_b32 v1, s27 :: v_dual_mov_b32 v0, s26 +; GFX12-NEXT: v_dual_mov_b32 v3, s51 :: v_dual_mov_b32 v2, s50 +; GFX12-NEXT: v_mov_b32_e32 v5, s53 +; GFX12-NEXT: s_lshr_b32 s30, s2, 12 +; GFX12-NEXT: s_lshr_b32 s28, s2, 13 +; GFX12-NEXT: s_lshr_b32 s24, s2, 10 +; GFX12-NEXT: s_lshr_b32 s22, s2, 11 +; GFX12-NEXT: s_bfe_i64 s[62:63], s[62:63], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x10000 +; GFX12-NEXT: v_dual_mov_b32 v4, s52 :: v_dual_mov_b32 v7, s55 +; GFX12-NEXT: v_dual_mov_b32 v6, s54 :: v_dual_mov_b32 v9, s57 +; GFX12-NEXT: s_lshr_b32 s20, s2, 8 +; GFX12-NEXT: s_lshr_b32 s18, s2, 9 +; GFX12-NEXT: s_bfe_i64 s[66:67], s[66:67], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[64:65], s[64:65], 0x10000 +; GFX12-NEXT: v_dual_mov_b32 v8, s56 :: v_dual_mov_b32 v11, s59 +; GFX12-NEXT: v_dual_mov_b32 v10, s58 :: v_dual_mov_b32 v13, s61 +; GFX12-NEXT: s_lshr_b32 s16, s2, 6 +; GFX12-NEXT: s_lshr_b32 s14, s2, 7 +; GFX12-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 +; GFX12-NEXT: v_dual_mov_b32 v12, s60 :: v_dual_mov_b32 v15, s63 +; GFX12-NEXT: v_dual_mov_b32 v14, s62 :: v_dual_mov_b32 v17, s65 +; 
GFX12-NEXT: s_lshr_b32 s12, s2, 4 +; GFX12-NEXT: s_lshr_b32 s10, s2, 5 +; GFX12-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 +; GFX12-NEXT: v_dual_mov_b32 v16, s64 :: v_dual_mov_b32 v19, s67 +; GFX12-NEXT: v_dual_mov_b32 v18, s66 :: v_dual_mov_b32 v21, s31 +; GFX12-NEXT: s_lshr_b32 s8, s2, 2 +; GFX12-NEXT: s_lshr_b32 s6, s2, 3 +; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 +; GFX12-NEXT: v_dual_mov_b32 v20, s30 :: v_dual_mov_b32 v23, s29 +; GFX12-NEXT: v_mov_b32_e32 v22, s28 ; GFX12-NEXT: s_clause 0x5 -; GFX12-NEXT: global_store_b128 v62, v[4:7], s[0:1] offset:16 -; GFX12-NEXT: global_store_b128 v62, v[0:3], s[0:1] -; GFX12-NEXT: global_store_b128 v62, v[22:25], s[0:1] offset:240 -; GFX12-NEXT: global_store_b128 v62, v[18:21], s[0:1] offset:224 -; GFX12-NEXT: global_store_b128 v62, v[14:17], s[0:1] offset:208 -; GFX12-NEXT: global_store_b128 v62, v[10:13], s[0:1] offset:192 +; GFX12-NEXT: global_store_b128 v24, v[0:3], s[0:1] offset:176 +; GFX12-NEXT: global_store_b128 v24, v[4:7], s[0:1] offset:160 +; GFX12-NEXT: global_store_b128 v24, v[8:11], s[0:1] offset:144 +; GFX12-NEXT: global_store_b128 v24, v[12:15], s[0:1] offset:128 +; GFX12-NEXT: global_store_b128 v24, v[16:19], s[0:1] offset:112 +; GFX12-NEXT: global_store_b128 v24, v[20:23], s[0:1] offset:96 +; GFX12-NEXT: v_dual_mov_b32 v1, s25 :: v_dual_mov_b32 v0, s24 +; GFX12-NEXT: v_dual_mov_b32 v3, s23 :: v_dual_mov_b32 v2, s22 +; GFX12-NEXT: v_mov_b32_e32 v5, s21 +; GFX12-NEXT: s_lshr_b32 s68, s2, 1 +; GFX12-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000 +; GFX12-NEXT: v_dual_mov_b32 v4, s20 :: v_dual_mov_b32 v7, s19 +; GFX12-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s17 +; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 +; GFX12-NEXT: v_dual_mov_b32 v8, s16 :: v_dual_mov_b32 v11, s15 +; GFX12-NEXT: v_dual_mov_b32 v10, s14 :: v_dual_mov_b32 v13, s13 +; GFX12-NEXT: s_bfe_i64 s[4:5], s[2:3], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[2:3], s[68:69], 0x10000 +; GFX12-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v15, s11 +; GFX12-NEXT: v_dual_mov_b32 v14, s10 :: v_dual_mov_b32 v17, s9 +; GFX12-NEXT: v_dual_mov_b32 v16, s8 :: v_dual_mov_b32 v19, s7 +; GFX12-NEXT: v_dual_mov_b32 v18, s6 :: v_dual_mov_b32 v21, s5 +; GFX12-NEXT: v_dual_mov_b32 v20, s4 :: v_dual_mov_b32 v23, s3 +; GFX12-NEXT: v_mov_b32_e32 v22, s2 +; GFX12-NEXT: s_clause 0x5 +; GFX12-NEXT: global_store_b128 v24, v[0:3], s[0:1] offset:80 +; GFX12-NEXT: global_store_b128 v24, v[4:7], s[0:1] offset:64 +; GFX12-NEXT: global_store_b128 v24, v[8:11], s[0:1] offset:48 +; GFX12-NEXT: global_store_b128 v24, v[12:15], s[0:1] offset:32 +; GFX12-NEXT: global_store_b128 v24, v[16:19], s[0:1] offset:16 +; GFX12-NEXT: global_store_b128 v24, v[20:23], s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -7796,345 +7601,311 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-LABEL: constant_zextload_v64i1_to_v64i64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b16_e64 v0, 13, s2 -; GFX8-NEXT: v_and_b32_e32 v18, 1, v0 -; GFX8-NEXT: v_lshrrev_b16_e64 v0, 11, s2 -; GFX8-NEXT: v_and_b32_e32 v16, 1, v0 -; GFX8-NEXT: v_lshrrev_b16_e64 v0, 
9, s2 -; GFX8-NEXT: v_and_b32_e32 v15, 1, v0 -; GFX8-NEXT: v_lshrrev_b16_e64 v0, 7, s2 -; GFX8-NEXT: v_and_b32_e32 v13, 1, v0 -; GFX8-NEXT: v_lshrrev_b16_e64 v0, 5, s2 -; GFX8-NEXT: v_and_b32_e32 v10, 1, v0 -; GFX8-NEXT: v_lshrrev_b16_e64 v0, 3, s2 -; GFX8-NEXT: v_mov_b32_e32 v12, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v2, 14, s2 -; GFX8-NEXT: s_lshr_b32 s33, s3, 24 -; GFX8-NEXT: s_lshr_b32 s24, s2, 24 -; GFX8-NEXT: v_lshrrev_b16_e64 v19, 12, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v17, 10, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v14, 6, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v11, 4, s2 -; GFX8-NEXT: v_and_b32_e32 v8, 1, v0 -; GFX8-NEXT: v_lshrrev_b16_e64 v9, 2, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v0, 1, s2 -; GFX8-NEXT: s_bfe_u32 s20, s2, 0x10018 -; GFX8-NEXT: s_bfe_u32 s21, s3, 0x10018 -; GFX8-NEXT: s_and_b32 s22, s3, 1 -; GFX8-NEXT: s_and_b32 s23, s2, 1 -; GFX8-NEXT: v_lshrrev_b16_e64 v4, 15, s2 -; GFX8-NEXT: s_bfe_u32 s25, s2, 0x10011 -; GFX8-NEXT: s_bfe_u32 s26, s2, 0x10010 -; GFX8-NEXT: s_bfe_u32 s27, s2, 0x10012 -; GFX8-NEXT: s_bfe_u32 s28, s2, 0x10013 -; GFX8-NEXT: s_bfe_u32 s29, s2, 0x10014 -; GFX8-NEXT: s_bfe_u32 s30, s2, 0x10015 -; GFX8-NEXT: s_bfe_u32 s31, s2, 0x10016 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x10017 -; GFX8-NEXT: s_bfe_u32 s34, s3, 0x10011 -; GFX8-NEXT: s_bfe_u32 s35, s3, 0x10010 -; GFX8-NEXT: s_bfe_u32 s36, s3, 0x10012 -; GFX8-NEXT: s_bfe_u32 s37, s3, 0x10013 -; GFX8-NEXT: s_bfe_u32 s38, s3, 0x10016 -; GFX8-NEXT: s_bfe_u32 s39, s3, 0x10017 -; GFX8-NEXT: s_bfe_u32 s40, s3, 0x10015 -; GFX8-NEXT: s_bfe_u32 s41, s3, 0x10014 -; GFX8-NEXT: s_add_u32 s4, s0, 0x1a0 -; GFX8-NEXT: s_addc_u32 s5, s1, 0 -; GFX8-NEXT: s_add_u32 s6, s0, 0x1b0 -; GFX8-NEXT: s_addc_u32 s7, s1, 0 -; GFX8-NEXT: s_add_u32 s8, s0, 0x190 -; GFX8-NEXT: s_addc_u32 s9, s1, 0 -; GFX8-NEXT: s_add_u32 s10, s0, 0x180 -; GFX8-NEXT: s_addc_u32 s11, s1, 0 -; GFX8-NEXT: s_add_u32 s12, s0, 0xb0 -; GFX8-NEXT: s_addc_u32 s13, s1, 0 -; GFX8-NEXT: s_add_u32 s14, s0, 0xa0 -; GFX8-NEXT: s_addc_u32 s15, s1, 0 -; GFX8-NEXT: s_add_u32 s16, s0, 0x90 -; GFX8-NEXT: s_addc_u32 s17, s1, 0 -; GFX8-NEXT: s_add_u32 s18, s0, 0x80 -; GFX8-NEXT: s_addc_u32 s19, s1, 0 -; GFX8-NEXT: v_lshrrev_b16_e64 v1, 13, s3 -; GFX8-NEXT: s_add_u32 s42, s0, 0x70 -; GFX8-NEXT: v_and_b32_e32 v7, 1, v1 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_addc_u32 s43, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v23, s42 -; GFX8-NEXT: v_and_b32_e32 v2, 1, v2 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v5, v1 -; GFX8-NEXT: v_mov_b32_e32 v24, s43 -; GFX8-NEXT: s_add_u32 s42, s0, 0x170 -; GFX8-NEXT: v_lshrrev_b16_e64 v22, 14, s3 -; GFX8-NEXT: flat_store_dwordx4 v[23:24], v[2:5] -; GFX8-NEXT: s_addc_u32 s43, s1, 0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshr_b32 s44, s43, 31 +; GFX8-NEXT: s_bfe_u32 s45, s43, 0x1001d +; GFX8-NEXT: s_bfe_u32 s46, s43, 0x1001b +; GFX8-NEXT: s_bfe_u32 s47, s43, 0x10019 +; GFX8-NEXT: s_bfe_u32 s48, s43, 0x10017 +; GFX8-NEXT: s_bfe_u32 s49, s43, 0x10013 +; GFX8-NEXT: s_bfe_u32 s50, s43, 0x10011 +; GFX8-NEXT: s_bfe_u32 s51, s43, 0x1000f +; GFX8-NEXT: s_bfe_u32 s52, s43, 0x1000d +; GFX8-NEXT: s_bfe_u32 s53, s43, 0x1000b +; GFX8-NEXT: s_bfe_u32 s40, s43, 0x10009 +; GFX8-NEXT: s_bfe_u32 s38, s43, 0x10007 +; GFX8-NEXT: s_bfe_u32 s37, s43, 0x10005 +; GFX8-NEXT: s_bfe_u32 s35, s43, 0x10003 +; GFX8-NEXT: s_bfe_u32 s33, s43, 0x10001 +; GFX8-NEXT: s_lshr_b32 s30, s42, 31 +; GFX8-NEXT: s_bfe_u32 s28, s42, 0x1001d +; GFX8-NEXT: s_bfe_u32 s26, s42, 0x1001b +; 
GFX8-NEXT: s_bfe_u32 s25, s42, 0x10019 +; GFX8-NEXT: s_bfe_u32 s22, s42, 0x10017 +; GFX8-NEXT: s_bfe_u32 s19, s42, 0x10013 +; GFX8-NEXT: s_bfe_u32 s17, s42, 0x10011 +; GFX8-NEXT: s_bfe_u32 s15, s42, 0x1000f +; GFX8-NEXT: s_bfe_u32 s13, s42, 0x1000d +; GFX8-NEXT: s_bfe_u32 s12, s42, 0x1000b +; GFX8-NEXT: s_bfe_u32 s10, s42, 0x10009 +; GFX8-NEXT: s_bfe_u32 s8, s42, 0x10007 +; GFX8-NEXT: s_bfe_u32 s6, s42, 0x10005 +; GFX8-NEXT: s_bfe_u32 s4, s42, 0x10003 +; GFX8-NEXT: s_bfe_u32 s2, s42, 0x10001 +; GFX8-NEXT: s_and_b32 s3, s42, 1 +; GFX8-NEXT: s_bfe_u32 s5, s42, 0x10002 +; GFX8-NEXT: s_bfe_u32 s7, s42, 0x10004 +; GFX8-NEXT: s_bfe_u32 s9, s42, 0x10006 +; GFX8-NEXT: s_bfe_u32 s11, s42, 0x10008 +; GFX8-NEXT: s_bfe_u32 s14, s42, 0x1000a +; GFX8-NEXT: s_bfe_u32 s16, s42, 0x1000c +; GFX8-NEXT: s_bfe_u32 s18, s42, 0x1000e +; GFX8-NEXT: s_bfe_u32 s20, s42, 0x10010 +; GFX8-NEXT: s_bfe_u32 s21, s42, 0x10012 +; GFX8-NEXT: s_bfe_u32 s23, s42, 0x10014 +; GFX8-NEXT: s_bfe_u32 s24, s42, 0x10015 +; GFX8-NEXT: s_bfe_u32 s27, s42, 0x10016 +; GFX8-NEXT: s_bfe_u32 s29, s42, 0x10018 +; GFX8-NEXT: s_bfe_u32 s31, s42, 0x1001a +; GFX8-NEXT: s_bfe_u32 s34, s42, 0x1001c +; GFX8-NEXT: s_bfe_u32 s36, s42, 0x1001e +; GFX8-NEXT: s_and_b32 s39, s43, 1 +; GFX8-NEXT: s_bfe_u32 s41, s43, 0x10002 +; GFX8-NEXT: s_bfe_u32 s54, s43, 0x10004 +; GFX8-NEXT: s_bfe_u32 s55, s43, 0x10006 +; GFX8-NEXT: s_bfe_u32 s56, s43, 0x10008 +; GFX8-NEXT: s_bfe_u32 s57, s43, 0x1000a +; GFX8-NEXT: s_bfe_u32 s58, s43, 0x1000c +; GFX8-NEXT: s_bfe_u32 s59, s43, 0x1000e +; GFX8-NEXT: s_bfe_u32 s60, s43, 0x10010 +; GFX8-NEXT: s_bfe_u32 s61, s43, 0x10012 +; GFX8-NEXT: s_bfe_u32 s62, s43, 0x10016 +; GFX8-NEXT: s_bfe_u32 s63, s43, 0x10018 +; GFX8-NEXT: s_bfe_u32 s64, s43, 0x1001a +; GFX8-NEXT: s_bfe_u32 s65, s43, 0x1001c +; GFX8-NEXT: s_bfe_u32 s66, s43, 0x1001e +; GFX8-NEXT: s_bfe_u32 s42, s43, 0x10015 +; GFX8-NEXT: s_bfe_u32 s43, s43, 0x10014 ; GFX8-NEXT: v_mov_b32_e32 v2, s42 -; GFX8-NEXT: v_and_b32_e32 v22, 1, v22 -; GFX8-NEXT: v_lshrrev_b16_e64 v24, 15, s3 -; GFX8-NEXT: v_mov_b32_e32 v23, v1 -; GFX8-NEXT: v_mov_b32_e32 v25, v1 -; GFX8-NEXT: v_mov_b32_e32 v3, s43 -; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[22:25] -; GFX8-NEXT: v_lshrrev_b16_e64 v2, 11, s3 +; GFX8-NEXT: s_add_u32 s42, s0, 0x1a0 +; GFX8-NEXT: v_mov_b32_e32 v0, s43 +; GFX8-NEXT: s_addc_u32 s43, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s42 +; GFX8-NEXT: v_mov_b32_e32 v5, s43 ; GFX8-NEXT: s_add_u32 s42, s0, 0x1f0 -; GFX8-NEXT: v_lshrrev_b16_e64 v21, 6, s33 -; GFX8-NEXT: v_and_b32_e32 v4, 1, v2 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s43, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, s42 -; GFX8-NEXT: v_and_b32_e32 v21, 1, v21 -; GFX8-NEXT: v_lshrrev_b16_e64 v23, 7, s33 -; GFX8-NEXT: v_mov_b32_e32 v22, v1 -; GFX8-NEXT: v_mov_b32_e32 v24, v1 -; GFX8-NEXT: v_mov_b32_e32 v3, s43 -; GFX8-NEXT: s_add_u32 s42, s0, 0xf0 -; GFX8-NEXT: v_lshrrev_b16_e64 v20, 6, s24 -; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[21:24] +; GFX8-NEXT: v_mov_b32_e32 v4, s42 +; GFX8-NEXT: v_mov_b32_e32 v0, s66 +; GFX8-NEXT: v_mov_b32_e32 v2, s44 +; GFX8-NEXT: v_mov_b32_e32 v5, s43 +; GFX8-NEXT: s_add_u32 s42, s0, 0x1e0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s43, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, s42 -; GFX8-NEXT: v_and_b32_e32 v22, 1, v20 -; GFX8-NEXT: v_lshrrev_b16_e64 v24, 7, s24 -; GFX8-NEXT: v_mov_b32_e32 v23, v1 -; GFX8-NEXT: v_mov_b32_e32 v3, s43 -; GFX8-NEXT: s_add_u32 s42, s0, 0x60 -; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[22:25] +; GFX8-NEXT: v_mov_b32_e32 v4, 
s42 +; GFX8-NEXT: v_mov_b32_e32 v0, s65 +; GFX8-NEXT: v_mov_b32_e32 v2, s45 +; GFX8-NEXT: v_mov_b32_e32 v5, s43 +; GFX8-NEXT: s_add_u32 s42, s0, 0x1d0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s43, s1, 0 -; GFX8-NEXT: v_and_b32_e32 v22, 1, v19 -; GFX8-NEXT: v_and_b32_e32 v24, 0xffff, v18 -; GFX8-NEXT: v_mov_b32_e32 v18, s42 -; GFX8-NEXT: v_mov_b32_e32 v19, s43 -; GFX8-NEXT: s_add_u32 s42, s0, 0x50 -; GFX8-NEXT: flat_store_dwordx4 v[18:19], v[22:25] +; GFX8-NEXT: v_mov_b32_e32 v4, s42 +; GFX8-NEXT: v_mov_b32_e32 v0, s64 +; GFX8-NEXT: v_mov_b32_e32 v2, s46 +; GFX8-NEXT: v_mov_b32_e32 v5, s43 +; GFX8-NEXT: s_add_u32 s42, s0, 0x1c0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s43, s1, 0 -; GFX8-NEXT: v_and_b32_e32 v22, 1, v17 -; GFX8-NEXT: v_and_b32_e32 v24, 0xffff, v16 -; GFX8-NEXT: v_mov_b32_e32 v16, s42 -; GFX8-NEXT: v_mov_b32_e32 v17, s43 -; GFX8-NEXT: s_add_u32 s42, s0, 64 -; GFX8-NEXT: flat_store_dwordx4 v[16:17], v[22:25] -; GFX8-NEXT: v_mov_b32_e32 v17, 1 +; GFX8-NEXT: v_mov_b32_e32 v4, s42 +; GFX8-NEXT: v_mov_b32_e32 v0, s63 +; GFX8-NEXT: v_mov_b32_e32 v2, s47 +; GFX8-NEXT: v_mov_b32_e32 v5, s43 +; GFX8-NEXT: s_add_u32 s42, s0, 0x1b0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s43, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v26, s42 -; GFX8-NEXT: v_and_b32_sdwa v22, v12, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_e32 v24, 0xffff, v15 -; GFX8-NEXT: v_mov_b32_e32 v27, s43 -; GFX8-NEXT: s_add_u32 s42, s0, 48 -; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25] +; GFX8-NEXT: v_mov_b32_e32 v4, s42 +; GFX8-NEXT: v_mov_b32_e32 v0, s62 +; GFX8-NEXT: v_mov_b32_e32 v2, s48 +; GFX8-NEXT: v_mov_b32_e32 v5, s43 +; GFX8-NEXT: s_add_u32 s42, s0, 0x190 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s43, s1, 0 -; GFX8-NEXT: v_and_b32_e32 v22, 1, v14 -; GFX8-NEXT: v_and_b32_e32 v24, 0xffff, v13 -; GFX8-NEXT: v_mov_b32_e32 v13, s42 -; GFX8-NEXT: v_mov_b32_e32 v14, s43 -; GFX8-NEXT: s_add_u32 s42, s0, 32 -; GFX8-NEXT: flat_store_dwordx4 v[13:14], v[22:25] +; GFX8-NEXT: v_mov_b32_e32 v4, s42 +; GFX8-NEXT: v_mov_b32_e32 v0, s61 +; GFX8-NEXT: v_mov_b32_e32 v2, s49 +; GFX8-NEXT: v_mov_b32_e32 v5, s43 +; GFX8-NEXT: s_add_u32 s42, s0, 0x180 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s43, s1, 0 -; GFX8-NEXT: v_and_b32_e32 v22, 1, v11 -; GFX8-NEXT: v_and_b32_e32 v24, 0xffff, v10 -; GFX8-NEXT: v_mov_b32_e32 v10, s42 -; GFX8-NEXT: v_mov_b32_e32 v11, s43 -; GFX8-NEXT: s_add_u32 s42, s0, 16 -; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[22:25] +; GFX8-NEXT: v_mov_b32_e32 v4, s42 +; GFX8-NEXT: v_mov_b32_e32 v0, s60 +; GFX8-NEXT: v_mov_b32_e32 v2, s50 +; GFX8-NEXT: v_mov_b32_e32 v5, s43 +; GFX8-NEXT: s_add_u32 s42, s0, 0x170 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s43, s1, 0 -; GFX8-NEXT: v_and_b32_e32 v22, 1, v9 -; GFX8-NEXT: v_and_b32_e32 v24, 0xffff, v8 -; GFX8-NEXT: v_mov_b32_e32 v8, s42 -; GFX8-NEXT: v_mov_b32_e32 v9, s43 +; GFX8-NEXT: v_mov_b32_e32 v4, s42 +; GFX8-NEXT: v_mov_b32_e32 v0, s59 +; GFX8-NEXT: v_mov_b32_e32 v2, s51 +; GFX8-NEXT: v_mov_b32_e32 v5, s43 ; GFX8-NEXT: s_add_u32 s42, s0, 0x160 -; GFX8-NEXT: v_lshrrev_b16_e64 v5, 12, s3 -; GFX8-NEXT: v_lshrrev_b16_e64 v10, 3, s33 -; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[22:25] -; GFX8-NEXT: v_lshrrev_b16_e64 v8, 1, s33 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s43, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v22, s42 -; 
GFX8-NEXT: v_and_b32_e32 v28, 1, v10 -; GFX8-NEXT: v_and_b32_e32 v19, 1, v8 -; GFX8-NEXT: v_and_b32_e32 v8, 1, v5 -; GFX8-NEXT: v_and_b32_e32 v10, 0xffff, v7 -; GFX8-NEXT: v_mov_b32_e32 v9, v1 -; GFX8-NEXT: v_mov_b32_e32 v11, v1 -; GFX8-NEXT: v_mov_b32_e32 v23, s43 -; GFX8-NEXT: v_lshrrev_b16_e64 v5, 5, s24 +; GFX8-NEXT: v_mov_b32_e32 v4, s42 +; GFX8-NEXT: v_mov_b32_e32 v0, s58 +; GFX8-NEXT: v_mov_b32_e32 v2, s52 +; GFX8-NEXT: v_mov_b32_e32 v5, s43 ; GFX8-NEXT: s_add_u32 s42, s0, 0x150 -; GFX8-NEXT: v_lshrrev_b16_e64 v21, 10, s3 -; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[8:11] -; GFX8-NEXT: v_and_b32_e32 v22, 1, v5 -; GFX8-NEXT: v_and_b32_e32 v9, 0xffff, v4 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s43, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v4, s42 -; GFX8-NEXT: v_and_b32_e32 v7, 1, v21 -; GFX8-NEXT: v_mov_b32_e32 v8, v1 -; GFX8-NEXT: v_mov_b32_e32 v10, v1 +; GFX8-NEXT: v_mov_b32_e32 v0, s57 +; GFX8-NEXT: v_mov_b32_e32 v2, s53 ; GFX8-NEXT: v_mov_b32_e32 v5, s43 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[7:10] -; GFX8-NEXT: v_lshrrev_b16_e64 v4, 3, s24 -; GFX8-NEXT: v_lshrrev_b16_e64 v2, 9, s3 -; GFX8-NEXT: v_and_b32_e32 v10, 1, v4 -; GFX8-NEXT: v_lshrrev_b16_e64 v4, 1, s24 ; GFX8-NEXT: s_add_u32 s42, s0, 0x140 -; GFX8-NEXT: v_mov_b32_e32 v6, s3 -; GFX8-NEXT: v_and_b32_e32 v20, 1, v2 -; GFX8-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s43, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v8, s42 -; GFX8-NEXT: v_lshrrev_b16_e64 v2, 7, s3 -; GFX8-NEXT: v_lshrrev_b16_e64 v18, 6, s3 -; GFX8-NEXT: v_and_b32_e32 v11, 0xffff, v4 -; GFX8-NEXT: v_and_b32_sdwa v4, v6, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff, v20 -; GFX8-NEXT: v_mov_b32_e32 v5, v1 -; GFX8-NEXT: v_mov_b32_e32 v7, v1 -; GFX8-NEXT: v_mov_b32_e32 v9, s43 +; GFX8-NEXT: v_mov_b32_e32 v4, s42 +; GFX8-NEXT: v_mov_b32_e32 v0, s56 +; GFX8-NEXT: v_mov_b32_e32 v2, s40 +; GFX8-NEXT: v_mov_b32_e32 v5, s43 ; GFX8-NEXT: s_add_u32 s42, s0, 0x130 -; GFX8-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX8-NEXT: v_lshrrev_b16_e64 v3, 5, s3 -; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s43, s1, 0 -; GFX8-NEXT: v_and_b32_e32 v7, 1, v18 -; GFX8-NEXT: v_mov_b32_e32 v17, s42 -; GFX8-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff, v10 -; GFX8-NEXT: v_and_b32_e32 v9, 0xffff, v2 -; GFX8-NEXT: v_mov_b32_e32 v8, v1 -; GFX8-NEXT: v_mov_b32_e32 v10, v1 -; GFX8-NEXT: v_mov_b32_e32 v18, s43 +; GFX8-NEXT: v_mov_b32_e32 v4, s42 +; GFX8-NEXT: v_mov_b32_e32 v0, s55 +; GFX8-NEXT: v_mov_b32_e32 v2, s38 +; GFX8-NEXT: v_mov_b32_e32 v5, s43 ; GFX8-NEXT: s_add_u32 s42, s0, 0x120 -; GFX8-NEXT: v_lshrrev_b16_e64 v16, 4, s3 -; GFX8-NEXT: flat_store_dwordx4 v[17:18], v[7:10] -; GFX8-NEXT: v_and_b32_e32 v18, 0xffff, v3 -; GFX8-NEXT: s_addc_u32 s43, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, s42 -; GFX8-NEXT: v_lshrrev_b16_e64 v12, 3, s3 -; GFX8-NEXT: v_lshrrev_b16_e64 v13, 1, s3 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff, v19 -; GFX8-NEXT: v_and_b32_e32 v16, 1, v16 -; GFX8-NEXT: v_mov_b32_e32 v17, v1 -; GFX8-NEXT: v_mov_b32_e32 v19, v1 -; GFX8-NEXT: v_mov_b32_e32 v3, s43 -; GFX8-NEXT: s_add_u32 s42, s0, 0x110 -; GFX8-NEXT: v_and_b32_e32 v12, 1, v12 -; GFX8-NEXT: v_lshrrev_b16_e64 v15, 2, s3 -; GFX8-NEXT: v_and_b32_e32 v13, 1, v13 -; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[16:19] +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s43, s1, 0 -; 
GFX8-NEXT: v_mov_b32_e32 v2, s42 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: v_and_b32_e32 v17, 1, v15 -; GFX8-NEXT: v_and_b32_e32 v19, 0xffff, v12 -; GFX8-NEXT: v_mov_b32_e32 v18, v1 -; GFX8-NEXT: v_mov_b32_e32 v20, v1 -; GFX8-NEXT: v_mov_b32_e32 v3, s43 -; GFX8-NEXT: v_and_b32_e32 v8, 0xffff, v13 -; GFX8-NEXT: v_mov_b32_e32 v13, s5 -; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[17:20] -; GFX8-NEXT: v_and_b32_e32 v10, 0xffff, v0 +; GFX8-NEXT: v_mov_b32_e32 v4, s42 +; GFX8-NEXT: v_mov_b32_e32 v0, s54 +; GFX8-NEXT: v_mov_b32_e32 v2, s37 +; GFX8-NEXT: v_mov_b32_e32 v5, s43 +; GFX8-NEXT: s_add_u32 s40, s0, 0x110 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s41 -; GFX8-NEXT: v_mov_b32_e32 v2, s40 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v12, s4 -; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v13, s7 -; GFX8-NEXT: v_mov_b32_e32 v0, s38 -; GFX8-NEXT: v_mov_b32_e32 v2, s39 -; GFX8-NEXT: v_mov_b32_e32 v12, s6 -; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v13, s9 +; GFX8-NEXT: s_addc_u32 s41, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s40 +; GFX8-NEXT: v_mov_b32_e32 v2, s35 +; GFX8-NEXT: v_mov_b32_e32 v5, s41 +; GFX8-NEXT: s_add_u32 s38, s0, 0x100 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s39 +; GFX8-NEXT: s_addc_u32 s39, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s38 +; GFX8-NEXT: v_mov_b32_e32 v2, s33 +; GFX8-NEXT: v_mov_b32_e32 v5, s39 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s36 -; GFX8-NEXT: v_mov_b32_e32 v2, s37 -; GFX8-NEXT: v_mov_b32_e32 v12, s8 -; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v13, s11 -; GFX8-NEXT: v_mov_b32_e32 v0, s35 -; GFX8-NEXT: v_mov_b32_e32 v2, s34 -; GFX8-NEXT: v_mov_b32_e32 v12, s10 -; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v12, s12 -; GFX8-NEXT: v_mov_b32_e32 v0, s31 -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v13, s13 -; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v12, s14 -; GFX8-NEXT: v_mov_b32_e32 v0, s29 +; GFX8-NEXT: s_add_u32 s36, s0, 0xf0 +; GFX8-NEXT: s_addc_u32 s37, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s36 ; GFX8-NEXT: v_mov_b32_e32 v2, s30 -; GFX8-NEXT: v_mov_b32_e32 v13, s15 -; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v12, s16 -; GFX8-NEXT: v_mov_b32_e32 v0, s27 +; GFX8-NEXT: v_mov_b32_e32 v5, s37 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s34 +; GFX8-NEXT: s_add_u32 s34, s0, 0xe0 +; GFX8-NEXT: s_addc_u32 s35, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s34 ; GFX8-NEXT: v_mov_b32_e32 v2, s28 -; GFX8-NEXT: v_mov_b32_e32 v13, s17 -; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v12, s18 -; GFX8-NEXT: v_mov_b32_e32 v0, s26 +; GFX8-NEXT: v_mov_b32_e32 v5, s35 +; GFX8-NEXT: s_add_u32 s30, s0, 0xd0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s31 +; GFX8-NEXT: s_addc_u32 s31, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s30 +; GFX8-NEXT: v_mov_b32_e32 v2, s26 +; GFX8-NEXT: v_mov_b32_e32 v5, s31 +; GFX8-NEXT: s_add_u32 s28, s0, 0xc0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s29 +; GFX8-NEXT: s_addc_u32 s29, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s28 ; GFX8-NEXT: 
v_mov_b32_e32 v2, s25 -; GFX8-NEXT: v_mov_b32_e32 v13, s19 -; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v13, s1 -; GFX8-NEXT: s_add_u32 s2, s0, 0x100 +; GFX8-NEXT: v_mov_b32_e32 v5, s29 +; GFX8-NEXT: s_add_u32 s26, s0, 0xb0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s27 +; GFX8-NEXT: s_addc_u32 s27, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s26 +; GFX8-NEXT: v_mov_b32_e32 v2, s22 +; GFX8-NEXT: v_mov_b32_e32 v5, s27 +; GFX8-NEXT: s_add_u32 s22, s0, 0xa0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s23 -; GFX8-NEXT: v_mov_b32_e32 v2, v10 -; GFX8-NEXT: v_mov_b32_e32 v12, s0 -; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v13, s3 -; GFX8-NEXT: v_lshrrev_b16_e64 v14, 5, s33 -; GFX8-NEXT: v_mov_b32_e32 v0, s22 -; GFX8-NEXT: v_mov_b32_e32 v2, v8 -; GFX8-NEXT: v_mov_b32_e32 v12, s2 -; GFX8-NEXT: s_add_u32 s2, s0, 0x1e0 -; GFX8-NEXT: v_and_b32_e32 v26, 1, v14 -; GFX8-NEXT: v_lshrrev_b16_e64 v27, 4, s33 -; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[0:3] -; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_and_b32_e32 v17, 1, v27 -; GFX8-NEXT: v_and_b32_e32 v19, 0xffff, v26 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: s_add_u32 s2, s0, 0x1d0 -; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[17:20] -; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: s_add_u32 s2, s0, 0x1c0 -; GFX8-NEXT: v_lshrrev_b16_e64 v14, 2, s33 -; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_and_b32_e32 v14, 1, v14 -; GFX8-NEXT: v_and_b32_e32 v16, 0xffff, v28 -; GFX8-NEXT: v_mov_b32_e32 v15, v1 -; GFX8-NEXT: v_mov_b32_e32 v17, v1 -; GFX8-NEXT: v_mov_b32_e32 v13, s3 -; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[14:17] +; GFX8-NEXT: s_addc_u32 s23, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s22 +; GFX8-NEXT: v_mov_b32_e32 v2, s24 +; GFX8-NEXT: v_mov_b32_e32 v5, s23 +; GFX8-NEXT: s_add_u32 s22, s0, 0x90 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_addc_u32 s23, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s22 ; GFX8-NEXT: v_mov_b32_e32 v0, s21 -; GFX8-NEXT: v_mov_b32_e32 v2, v5 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v12, s2 -; GFX8-NEXT: s_add_u32 s2, s0, 0xe0 -; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[0:3] -; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v23, 4, s24 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: s_add_u32 s2, s0, 0xd0 -; GFX8-NEXT: v_and_b32_e32 v7, 1, v23 -; GFX8-NEXT: v_and_b32_e32 v9, 0xffff, v22 -; GFX8-NEXT: v_mov_b32_e32 v8, v1 -; GFX8-NEXT: v_mov_b32_e32 v10, v1 -; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_lshrrev_b16_e64 v21, 2, s24 -; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[7:10] -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: s_add_u32 s0, s0, 0xc0 -; GFX8-NEXT: v_and_b32_e32 v4, 1, v21 -; GFX8-NEXT: v_mov_b32_e32 v5, v1 -; GFX8-NEXT: v_mov_b32_e32 v7, v1 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[4:7] +; GFX8-NEXT: v_mov_b32_e32 v2, s19 +; GFX8-NEXT: v_mov_b32_e32 v5, s23 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s20 +; GFX8-NEXT: s_add_u32 s20, s0, 0x80 +; GFX8-NEXT: s_addc_u32 s21, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s20 +; GFX8-NEXT: v_mov_b32_e32 v2, s17 +; GFX8-NEXT: 
v_mov_b32_e32 v5, s21 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s18 +; GFX8-NEXT: s_add_u32 s18, s0, 0x70 +; GFX8-NEXT: s_addc_u32 s19, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s18 +; GFX8-NEXT: v_mov_b32_e32 v2, s15 +; GFX8-NEXT: v_mov_b32_e32 v5, s19 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s16 +; GFX8-NEXT: s_add_u32 s16, s0, 0x60 +; GFX8-NEXT: s_addc_u32 s17, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s16 +; GFX8-NEXT: v_mov_b32_e32 v2, s13 +; GFX8-NEXT: v_mov_b32_e32 v5, s17 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s12 +; GFX8-NEXT: s_add_u32 s12, s0, 0x50 +; GFX8-NEXT: s_addc_u32 s13, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s12 +; GFX8-NEXT: v_mov_b32_e32 v0, s14 +; GFX8-NEXT: v_mov_b32_e32 v5, s13 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s10 +; GFX8-NEXT: s_add_u32 s10, s0, 64 +; GFX8-NEXT: v_mov_b32_e32 v0, s11 +; GFX8-NEXT: s_addc_u32 s11, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s10 +; GFX8-NEXT: v_mov_b32_e32 v5, s11 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s8 +; GFX8-NEXT: s_add_u32 s8, s0, 48 +; GFX8-NEXT: v_mov_b32_e32 v0, s9 +; GFX8-NEXT: s_addc_u32 s9, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s8 +; GFX8-NEXT: v_mov_b32_e32 v5, s9 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: s_add_u32 s6, s0, 32 +; GFX8-NEXT: v_mov_b32_e32 v0, s7 +; GFX8-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: s_add_u32 s4, s0, 16 +; GFX8-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NEXT: s_addc_u32 s5, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, v11 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v0, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm @@ -8450,217 +8221,200 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_lshrrev_b16 v0, 13, s2 -; GFX12-NEXT: v_lshrrev_b16 v3, 11, s2 -; GFX12-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-NEXT: v_lshrrev_b16 v2, 12, s2 -; GFX12-NEXT: v_lshrrev_b16 v4, 9, s2 -; GFX12-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX12-NEXT: v_lshrrev_b16 v10, 3, s2 -; GFX12-NEXT: s_bfe_u32 s8, s3, 0x10014 -; GFX12-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX12-NEXT: v_lshrrev_b16 v6, 7, s2 -; GFX12-NEXT: s_bfe_u32 s9, s3, 0x10015 -; GFX12-NEXT: v_lshrrev_b16 v12, 1, s2 -; GFX12-NEXT: v_lshrrev_b16 v16, 11, s3 -; GFX12-NEXT: v_and_b32_e32 v36, 1, v4 -; GFX12-NEXT: v_and_b32_e32 v43, 1, v10 -; GFX12-NEXT: v_dual_mov_b32 v68, v1 :: v_dual_and_b32 v69, 1, v2 -; GFX12-NEXT: v_dual_mov_b32 v62, v1 :: v_dual_and_b32 v71, 0xffff, v0 +; GFX12-NEXT: s_bfe_u32 s4, s3, 0x10014 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_and_b32 v67, 0xffff, v3 -; GFX12-NEXT: v_mov_b32_e32 v66, v1 -; GFX12-NEXT: v_dual_mov_b32 v2, s9 :: v_dual_mov_b32 v3, v1 -; GFX12-NEXT: s_lshr_b32 
s4, s3, 24 -; GFX12-NEXT: v_lshrrev_b16 v8, 5, s2 -; GFX12-NEXT: v_lshrrev_b16 v14, 13, s3 -; GFX12-NEXT: v_lshrrev_b16 v18, 9, s3 -; GFX12-NEXT: v_dual_mov_b32 v47, v1 :: v_dual_and_b32 v38, 1, v6 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 +; GFX12-NEXT: s_bfe_u32 s5, s3, 0x10015 +; GFX12-NEXT: s_lshr_b32 s4, s3, 31 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_lshrrev_b16 v4, 5, s4 -; GFX12-NEXT: v_lshrrev_b16 v6, 3, s4 -; GFX12-NEXT: s_bfe_u32 s8, s3, 0x10016 -; GFX12-NEXT: s_bfe_u32 s9, s3, 0x10017 -; GFX12-NEXT: v_lshrrev_b16 v20, 7, s3 -; GFX12-NEXT: v_and_b32_e32 v45, 1, v12 -; GFX12-NEXT: v_and_b32_e32 v41, 1, v16 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, v1 +; GFX12-NEXT: s_bfe_u32 s5, s3, 0x1001e ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:416 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v2, s9 -; GFX12-NEXT: v_mov_b32_e32 v0, s8 -; GFX12-NEXT: s_lshr_b32 s5, s2, 24 -; GFX12-NEXT: v_lshrrev_b16 v22, 5, s3 -; GFX12-NEXT: v_lshrrev_b16 v24, 3, s3 -; GFX12-NEXT: v_dual_mov_b32 v49, v1 :: v_dual_and_b32 v40, 1, v8 -; GFX12-NEXT: v_and_b32_e32 v44, 1, v14 -; GFX12-NEXT: v_and_b32_e32 v14, 1, v6 +; GFX12-NEXT: v_mov_b32_e32 v0, s5 +; GFX12-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-NEXT: s_bfe_u32 s4, s3, 0x1001d +; GFX12-NEXT: s_bfe_u32 s5, s3, 0x1001c +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:496 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_lshrrev_b16 v6, 5, s5 -; GFX12-NEXT: v_lshrrev_b16 v8, 1, s5 -; GFX12-NEXT: v_lshrrev_b16 v10, 3, s5 -; GFX12-NEXT: s_bfe_u32 s8, s3, 0x10012 -; GFX12-NEXT: v_and_b32_e32 v37, 1, v18 -; GFX12-NEXT: v_and_b32_e32 v18, 1, v4 -; GFX12-NEXT: v_lshrrev_b16 v4, 1, s4 -; GFX12-NEXT: s_bfe_u32 s9, s3, 0x10013 -; GFX12-NEXT: v_and_b32_e32 v33, 1, v20 -; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:432 +; GFX12-NEXT: v_mov_b32_e32 v0, s5 +; GFX12-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-NEXT: s_bfe_u32 s4, s3, 0x1001b +; GFX12-NEXT: s_bfe_u32 s5, s3, 0x1001a +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:480 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v2, s9 -; GFX12-NEXT: v_mov_b32_e32 v0, s8 -; GFX12-NEXT: v_lshrrev_b16 v9, 15, s3 -; GFX12-NEXT: v_lshrrev_b16 v11, 14, s3 -; GFX12-NEXT: v_lshrrev_b16 v23, 12, s3 -; GFX12-NEXT: v_lshrrev_b16 v25, 10, s3 -; GFX12-NEXT: v_lshrrev_b16 v27, 8, s3 -; GFX12-NEXT: v_lshrrev_b16 v29, 6, s3 -; GFX12-NEXT: v_lshrrev_b16 v28, 4, s3 -; GFX12-NEXT: v_lshrrev_b16 v26, 1, s3 -; GFX12-NEXT: v_and_b32_e32 v30, 1, v22 -; GFX12-NEXT: v_and_b32_e32 v31, 1, v24 -; GFX12-NEXT: v_lshrrev_b16 v24, 2, s3 -; GFX12-NEXT: v_lshrrev_b16 v22, 7, s5 -; GFX12-NEXT: v_lshrrev_b16 v20, 6, s5 -; GFX12-NEXT: v_and_b32_e32 v39, 1, v6 -; GFX12-NEXT: v_lshrrev_b16 v6, 2, s5 -; GFX12-NEXT: s_and_b32 s6, s3, 1 -; GFX12-NEXT: s_bfe_u32 s8, s3, 0x10011 -; GFX12-NEXT: v_and_b32_e32 v35, 1, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 1, v8 -; GFX12-NEXT: v_and_b32_e32 v8, 1, v10 -; GFX12-NEXT: v_lshrrev_b16 v10, 4, s5 +; GFX12-NEXT: v_mov_b32_e32 v0, s5 +; GFX12-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-NEXT: s_bfe_u32 s4, s3, 0x10019 ; GFX12-NEXT: s_bfe_u32 s5, s3, 0x10018 -; GFX12-NEXT: s_bfe_u32 s3, s3, 0x10010 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:464 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v0, s5 +; GFX12-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-NEXT: s_bfe_u32 s4, s3, 0x10017 +; GFX12-NEXT: s_bfe_u32 s5, s3, 0x10016 +; GFX12-NEXT: global_store_b128 v1, v[0:3], 
s[0:1] offset:448 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v0, s5 +; GFX12-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-NEXT: s_bfe_u32 s4, s3, 0x10013 +; GFX12-NEXT: s_bfe_u32 s5, s3, 0x10012 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:432 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v0, s5 +; GFX12-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-NEXT: s_bfe_u32 s4, s3, 0x10011 +; GFX12-NEXT: s_bfe_u32 s5, s3, 0x10010 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:400 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v0, s3 -; GFX12-NEXT: v_mov_b32_e32 v2, s8 -; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10016 -; GFX12-NEXT: s_bfe_u32 s8, s2, 0x10017 -; GFX12-NEXT: v_lshrrev_b16 v13, 10, s2 -; GFX12-NEXT: v_and_b32_e32 v26, 1, v26 -; GFX12-NEXT: v_and_b32_e32 v82, 0xffff, v35 -; GFX12-NEXT: v_and_b32_e32 v35, 1, v27 +; GFX12-NEXT: v_mov_b32_e32 v0, s5 +; GFX12-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-NEXT: s_bfe_u32 s4, s3, 0x1000f +; GFX12-NEXT: s_bfe_u32 s5, s3, 0x1000e ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:384 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v0, s3 -; GFX12-NEXT: v_dual_mov_b32 v2, s8 :: v_dual_mov_b32 v27, v1 -; GFX12-NEXT: v_and_b32_e32 v81, 0xffff, v4 -; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10014 -; GFX12-NEXT: s_bfe_u32 s8, s2, 0x10015 -; GFX12-NEXT: v_dual_mov_b32 v72, v1 :: v_dual_and_b32 v65, 1, v13 -; GFX12-NEXT: v_mov_b32_e32 v13, v1 -; GFX12-NEXT: v_and_b32_e32 v83, 0xffff, v26 -; GFX12-NEXT: v_and_b32_e32 v26, 0xffff, v31 -; GFX12-NEXT: v_and_b32_e32 v31, 1, v29 -; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:176 +; GFX12-NEXT: v_mov_b32_e32 v0, s5 +; GFX12-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-NEXT: s_bfe_u32 s4, s3, 0x1000d +; GFX12-NEXT: s_bfe_u32 s5, s3, 0x1000c +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:368 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v0, s5 +; GFX12-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-NEXT: s_bfe_u32 s4, s3, 0x1000b +; GFX12-NEXT: s_bfe_u32 s5, s3, 0x1000a +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:352 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v0, s5 +; GFX12-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-NEXT: s_bfe_u32 s4, s3, 0x10009 +; GFX12-NEXT: s_bfe_u32 s5, s3, 0x10008 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:336 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v0, s5 +; GFX12-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-NEXT: s_bfe_u32 s4, s3, 0x10007 +; GFX12-NEXT: s_bfe_u32 s5, s3, 0x10006 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:320 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v0, s5 +; GFX12-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-NEXT: s_bfe_u32 s4, s3, 0x10005 +; GFX12-NEXT: s_bfe_u32 s5, s3, 0x10004 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:304 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v0, s5 +; GFX12-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-NEXT: s_bfe_u32 s4, s3, 0x10003 +; GFX12-NEXT: s_bfe_u32 s5, s3, 0x10002 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:288 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v0, s5 +; GFX12-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-NEXT: s_bfe_u32 s4, s3, 0x10001 +; GFX12-NEXT: s_and_b32 s3, s3, 1 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:272 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v0, s3 +; GFX12-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-NEXT: s_lshr_b32 s3, s2, 31 +; GFX12-NEXT: s_bfe_u32 s4, s2, 0x1001e +; 
GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:256 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-NEXT: s_bfe_u32 s3, s2, 0x1001d +; GFX12-NEXT: s_bfe_u32 s4, s2, 0x1001c +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:240 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-NEXT: s_bfe_u32 s3, s2, 0x1001b +; GFX12-NEXT: s_bfe_u32 s4, s2, 0x1001a +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:224 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10019 +; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10018 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:208 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10017 +; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10016 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:192 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10014 +; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10015 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:176 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v0, s3 +; GFX12-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10013 +; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10012 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:160 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10011 +; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10010 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:144 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-NEXT: s_bfe_u32 s3, s2, 0x1000f +; GFX12-NEXT: s_bfe_u32 s4, s2, 0x1000e +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:128 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-NEXT: s_bfe_u32 s3, s2, 0x1000d +; GFX12-NEXT: s_bfe_u32 s4, s2, 0x1000c +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:112 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-NEXT: s_bfe_u32 s3, s2, 0x1000b +; GFX12-NEXT: s_bfe_u32 s4, s2, 0x1000a +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:96 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10009 +; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10008 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:80 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10007 +; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10006 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:64 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10005 +; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10004 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:48 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v2, s8 -; GFX12-NEXT: v_mov_b32_e32 v0, s3 -; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10012 -; GFX12-NEXT: s_bfe_u32 s8, s2, 0x10013 -; GFX12-NEXT: v_lshrrev_b16 v5, 15, s2 -; GFX12-NEXT: 
v_lshrrev_b16 v7, 14, s2 -; GFX12-NEXT: v_lshrrev_b16 v21, 2, s2 -; GFX12-NEXT: v_and_b32_e32 v33, 0xffff, v33 -; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:160 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10003 +; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10002 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:32 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v0, s3 -; GFX12-NEXT: v_mov_b32_e32 v2, s8 -; GFX12-NEXT: v_lshrrev_b16 v15, 8, s2 -; GFX12-NEXT: v_lshrrev_b16 v17, 6, s2 -; GFX12-NEXT: v_lshrrev_b16 v19, 4, s2 -; GFX12-NEXT: v_lshrrev_b16 v32, 7, s4 -; GFX12-NEXT: v_lshrrev_b16 v34, 6, s4 -; GFX12-NEXT: v_lshrrev_b16 v16, 4, s4 -; GFX12-NEXT: v_lshrrev_b16 v12, 2, s4 -; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10018 -; GFX12-NEXT: s_and_b32 s7, s2, 1 -; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10011 -; GFX12-NEXT: s_bfe_u32 s2, s2, 0x10010 -; GFX12-NEXT: v_and_b32_e32 v4, 1, v6 -; GFX12-NEXT: v_and_b32_e32 v37, 0xffff, v37 -; GFX12-NEXT: v_dual_mov_b32 v78, v1 :: v_dual_and_b32 v41, 0xffff, v41 -; GFX12-NEXT: v_dual_mov_b32 v80, v1 :: v_dual_and_b32 v29, 0xffff, v45 -; GFX12-NEXT: v_dual_mov_b32 v21, v1 :: v_dual_and_b32 v50, 1, v21 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff, v8 -; GFX12-NEXT: v_and_b32_e32 v8, 1, v10 -; GFX12-NEXT: v_and_b32_e32 v10, 0xffff, v39 -; GFX12-NEXT: v_and_b32_e32 v39, 1, v25 -; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:144 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10001 +; GFX12-NEXT: s_and_b32 s2, s2, 1 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16 ; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: v_mov_b32_e32 v2, s3 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_and_b32 v77, 1, v7 -; GFX12-NEXT: v_and_b32_e32 v79, 0xffff, v5 -; GFX12-NEXT: v_dual_mov_b32 v70, v1 :: v_dual_and_b32 v63, 0xffff, v36 -; GFX12-NEXT: v_dual_mov_b32 v74, v1 :: v_dual_and_b32 v61, 1, v15 -; GFX12-NEXT: v_dual_mov_b32 v64, v1 :: v_dual_and_b32 v73, 1, v11 -; GFX12-NEXT: v_dual_mov_b32 v59, v1 :: v_dual_and_b32 v12, 1, v12 -; GFX12-NEXT: v_dual_mov_b32 v51, v1 :: v_dual_and_b32 v20, 1, v20 -; GFX12-NEXT: v_dual_mov_b32 v53, v1 :: v_dual_and_b32 v22, 0xffff, v22 -; GFX12-NEXT: v_and_b32_e32 v52, 0xffff, v43 -; GFX12-NEXT: v_and_b32_e32 v58, 1, v17 -; GFX12-NEXT: v_dual_mov_b32 v25, v1 :: v_dual_and_b32 v60, 0xffff, v38 -; GFX12-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:128 -; GFX12-NEXT: v_mov_b32_e32 v0, s7 -; GFX12-NEXT: v_mov_b32_e32 v2, v29 -; GFX12-NEXT: s_clause 0x3 -; GFX12-NEXT: global_store_b128 v1, v[77:80], s[0:1] offset:112 -; GFX12-NEXT: global_store_b128 v1, v[69:72], s[0:1] offset:96 -; GFX12-NEXT: global_store_b128 v1, v[65:68], s[0:1] offset:80 -; GFX12-NEXT: global_store_b128 v1, v[61:64], s[0:1] offset:64 -; GFX12-NEXT: v_dual_mov_b32 v61, v1 :: v_dual_and_b32 v16, 1, v16 -; GFX12-NEXT: v_dual_mov_b32 v55, v1 :: v_dual_and_b32 v54, 1, v19 -; GFX12-NEXT: v_dual_mov_b32 v15, v1 :: v_dual_and_b32 v56, 0xffff, v40 -; GFX12-NEXT: v_dual_mov_b32 v57, v1 :: v_dual_and_b32 v18, 0xffff, v18 -; GFX12-NEXT: v_and_b32_e32 v46, 1, v34 -; GFX12-NEXT: v_and_b32_e32 v48, 0xffff, v32 -; GFX12-NEXT: v_dual_mov_b32 v76, v1 :: v_dual_and_b32 v75, 0xffff, v9 -; GFX12-NEXT: v_dual_mov_b32 v43, v1 :: v_dual_and_b32 v24, 1, v24 -; GFX12-NEXT: v_dual_mov_b32 v17, v1 :: v_dual_and_b32 v42, 1, v23 -; GFX12-NEXT: v_dual_mov_b32 v45, v1 :: 
v_dual_and_b32 v44, 0xffff, v44 -; GFX12-NEXT: s_clause 0x6 -; GFX12-NEXT: global_store_b128 v1, v[58:61], s[0:1] offset:48 -; GFX12-NEXT: global_store_b128 v1, v[54:57], s[0:1] offset:32 -; GFX12-NEXT: global_store_b128 v1, v[50:53], s[0:1] offset:16 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] -; GFX12-NEXT: global_store_b128 v1, v[46:49], s[0:1] offset:496 -; GFX12-NEXT: global_store_b128 v1, v[73:76], s[0:1] offset:368 -; GFX12-NEXT: global_store_b128 v1, v[42:45], s[0:1] offset:352 -; GFX12-NEXT: v_mov_b32_e32 v40, v1 -; GFX12-NEXT: v_mov_b32_e32 v42, v1 -; GFX12-NEXT: v_mov_b32_e32 v32, v1 -; GFX12-NEXT: v_mov_b32_e32 v34, v1 -; GFX12-NEXT: v_mov_b32_e32 v0, s6 -; GFX12-NEXT: v_mov_b32_e32 v2, v83 -; GFX12-NEXT: v_mov_b32_e32 v36, v1 -; GFX12-NEXT: v_dual_mov_b32 v38, v1 :: v_dual_mov_b32 v7, v1 -; GFX12-NEXT: s_clause 0x3 -; GFX12-NEXT: global_store_b128 v1, v[39:42], s[0:1] offset:336 -; GFX12-NEXT: global_store_b128 v1, v[35:38], s[0:1] offset:320 -; GFX12-NEXT: global_store_b128 v1, v[31:34], s[0:1] offset:304 -; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:256 -; GFX12-NEXT: v_mov_b32_e32 v0, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, v82 :: v_dual_mov_b32 v23, v1 -; GFX12-NEXT: v_mov_b32_e32 v19, v1 -; GFX12-NEXT: s_clause 0x3 -; GFX12-NEXT: global_store_b128 v1, v[24:27], s[0:1] offset:272 -; GFX12-NEXT: global_store_b128 v1, v[20:23], s[0:1] offset:240 -; GFX12-NEXT: global_store_b128 v1, v[16:19], s[0:1] offset:480 -; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:448 -; GFX12-NEXT: v_mov_b32_e32 v0, s4 -; GFX12-NEXT: v_dual_mov_b32 v2, v81 :: v_dual_mov_b32 v9, v1 -; GFX12-NEXT: v_mov_b32_e32 v11, v1 -; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_and_b32 v28, 1, v28 -; GFX12-NEXT: v_dual_mov_b32 v29, v1 :: v_dual_and_b32 v30, 0xffff, v30 -; GFX12-NEXT: v_mov_b32_e32 v31, v1 -; GFX12-NEXT: s_clause 0x4 -; GFX12-NEXT: global_store_b128 v1, v[12:15], s[0:1] offset:464 -; GFX12-NEXT: global_store_b128 v1, v[8:11], s[0:1] offset:224 -; GFX12-NEXT: global_store_b128 v1, v[4:7], s[0:1] offset:208 -; GFX12-NEXT: global_store_b128 v1, v[28:31], s[0:1] offset:288 -; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:192 ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -8996,381 +8750,445 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; ; GFX8-LABEL: constant_sextload_v64i1_to_v64i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX8-NEXT: s_mov_b32 s7, 0 -; GFX8-NEXT: s_mov_b32 s13, s7 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GFX8-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GFX8-NEXT: s_mov_b32 s90, -1 +; GFX8-NEXT: s_mov_b32 s91, 0xe80000 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dwordx2 s[10:11], s[2:3], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v29, s1 -; GFX8-NEXT: v_mov_b32_e32 v28, s0 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 +; GFX8-NEXT: s_add_u32 s88, s88, s9 +; GFX8-NEXT: s_addc_u32 s89, s89, 0 +; GFX8-NEXT: ; implicit-def: $vgpr44 : SGPR spill to VGPR lane ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshr_b32 s16, s11, 22 -; GFX8-NEXT: s_lshr_b32 s18, s11, 23 -; GFX8-NEXT: s_lshr_b32 s20, s11, 20 -; GFX8-NEXT: s_lshr_b32 s22, s11, 21 -; GFX8-NEXT: s_lshr_b32 s24, s11, 18 -; GFX8-NEXT: s_lshr_b32 s26, s11, 19 -; GFX8-NEXT: s_lshr_b32 s28, s11, 16 -; GFX8-NEXT: s_lshr_b32 s30, s11, 17 -; GFX8-NEXT: s_lshr_b32 s34, s10, 22 -; GFX8-NEXT: s_lshr_b32 s36, s10, 23 -; 
GFX8-NEXT: s_lshr_b32 s38, s10, 20 -; GFX8-NEXT: s_lshr_b32 s40, s10, 21 -; GFX8-NEXT: s_lshr_b32 s42, s10, 18 -; GFX8-NEXT: s_lshr_b32 s44, s10, 19 -; GFX8-NEXT: s_lshr_b32 s46, s10, 16 -; GFX8-NEXT: s_lshr_b32 s48, s10, 17 -; GFX8-NEXT: s_mov_b32 s6, s11 -; GFX8-NEXT: s_lshr_b32 s12, s11, 24 -; GFX8-NEXT: s_lshr_b32 s8, s10, 24 -; GFX8-NEXT: s_bfe_i64 s[2:3], s[8:9], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[4:5], s[12:13], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[14:15], s[10:11], 0x10000 +; GFX8-NEXT: s_lshr_b32 s0, s3, 8 +; GFX8-NEXT: v_writelane_b32 v44, s0, 0 +; GFX8-NEXT: v_writelane_b32 v44, s1, 1 +; GFX8-NEXT: s_lshr_b32 s0, s2, 1 +; GFX8-NEXT: s_lshr_b32 s36, s3, 21 +; GFX8-NEXT: s_lshr_b32 s30, s3, 19 +; GFX8-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x10000 +; GFX8-NEXT: s_lshr_b32 s74, s3, 30 +; GFX8-NEXT: s_lshr_b32 s50, s3, 31 +; GFX8-NEXT: s_lshr_b32 s72, s3, 28 +; GFX8-NEXT: s_lshr_b32 s48, s3, 29 +; GFX8-NEXT: s_lshr_b32 s70, s3, 26 +; GFX8-NEXT: s_lshr_b32 s46, s3, 27 +; GFX8-NEXT: s_lshr_b32 s68, s3, 24 +; GFX8-NEXT: s_lshr_b32 s42, s3, 25 +; GFX8-NEXT: s_lshr_b32 s66, s3, 22 +; GFX8-NEXT: s_lshr_b32 s40, s3, 23 +; GFX8-NEXT: s_lshr_b32 s64, s3, 20 +; GFX8-NEXT: s_lshr_b32 s62, s3, 18 +; GFX8-NEXT: s_lshr_b32 s56, s3, 16 +; GFX8-NEXT: s_lshr_b32 s18, s3, 17 +; GFX8-NEXT: s_lshr_b32 s58, s3, 14 +; GFX8-NEXT: s_lshr_b32 s38, s3, 15 +; GFX8-NEXT: s_lshr_b32 s60, s3, 12 +; GFX8-NEXT: s_lshr_b32 s44, s3, 13 +; GFX8-NEXT: s_lshr_b32 s54, s3, 10 +; GFX8-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 +; GFX8-NEXT: v_writelane_b32 v44, s0, 2 +; GFX8-NEXT: s_lshr_b32 s52, s3, 11 +; GFX8-NEXT: s_bfe_i64 s[74:75], s[74:75], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[72:73], s[72:73], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[70:71], s[70:71], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[68:69], s[68:69], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[66:67], s[66:67], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[64:65], s[64:65], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[62:63], s[62:63], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 +; GFX8-NEXT: v_mov_b32_e32 v18, s36 +; GFX8-NEXT: v_mov_b32_e32 v19, s37 +; GFX8-NEXT: v_mov_b32_e32 v26, s30 +; GFX8-NEXT: v_mov_b32_e32 v27, s31 +; GFX8-NEXT: s_bfe_i64 s[30:31], s[44:45], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[36:37], s[38:39], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 +; GFX8-NEXT: v_writelane_b32 v44, s1, 3 +; GFX8-NEXT: s_lshr_b32 s6, s3, 9 +; GFX8-NEXT: s_lshr_b32 s8, s3, 6 +; GFX8-NEXT: s_lshr_b32 s10, s3, 7 +; GFX8-NEXT: s_lshr_b32 s12, s3, 4 +; GFX8-NEXT: s_lshr_b32 s14, s3, 5 +; GFX8-NEXT: s_lshr_b32 s16, s3, 2 +; GFX8-NEXT: s_lshr_b32 s20, s3, 3 +; GFX8-NEXT: s_lshr_b32 s22, s3, 1 +; GFX8-NEXT: s_mov_b32 s24, s3 +; GFX8-NEXT: s_lshr_b32 s26, s2, 30 +; GFX8-NEXT: s_lshr_b32 s28, s2, 31 +; GFX8-NEXT: s_lshr_b32 s34, s2, 28 +; GFX8-NEXT: v_mov_b32_e32 v4, s74 +; GFX8-NEXT: v_mov_b32_e32 v8, s72 +; GFX8-NEXT: v_mov_b32_e32 v0, s70 +; GFX8-NEXT: v_mov_b32_e32 v55, s68 +; GFX8-NEXT: 
v_mov_b32_e32 v20, s66 +; GFX8-NEXT: v_mov_b32_e32 v16, s64 +; GFX8-NEXT: v_mov_b32_e32 v24, s62 +; GFX8-NEXT: v_mov_b32_e32 v28, s56 +; GFX8-NEXT: v_mov_b32_e32 v32, s58 +; GFX8-NEXT: v_mov_b32_e32 v36, s60 +; GFX8-NEXT: s_lshr_b32 s86, s2, 29 +; GFX8-NEXT: v_mov_b32_e32 v40, s54 +; GFX8-NEXT: s_lshr_b32 s84, s2, 26 +; GFX8-NEXT: s_lshr_b32 s82, s2, 27 +; GFX8-NEXT: s_lshr_b32 s80, s2, 24 +; GFX8-NEXT: v_mov_b32_e32 v6, s50 +; GFX8-NEXT: s_lshr_b32 s78, s2, 25 +; GFX8-NEXT: s_lshr_b32 s76, s2, 22 +; GFX8-NEXT: v_mov_b32_e32 v10, s48 +; GFX8-NEXT: s_lshr_b32 s74, s2, 23 +; GFX8-NEXT: s_lshr_b32 s72, s2, 20 +; GFX8-NEXT: v_mov_b32_e32 v2, s46 +; GFX8-NEXT: s_lshr_b32 s70, s2, 21 +; GFX8-NEXT: s_lshr_b32 s68, s2, 18 +; GFX8-NEXT: v_mov_b32_e32 v57, s42 +; GFX8-NEXT: s_lshr_b32 s66, s2, 19 +; GFX8-NEXT: s_lshr_b32 s64, s2, 16 +; GFX8-NEXT: v_mov_b32_e32 v22, s40 +; GFX8-NEXT: s_lshr_b32 s62, s2, 17 +; GFX8-NEXT: s_lshr_b32 s60, s2, 14 +; GFX8-NEXT: s_lshr_b32 s58, s2, 15 +; GFX8-NEXT: s_lshr_b32 s56, s2, 12 +; GFX8-NEXT: s_lshr_b32 s54, s2, 13 +; GFX8-NEXT: s_bfe_i64 vcc, s[52:53], 0x10000 +; GFX8-NEXT: s_lshr_b32 s52, s2, 10 +; GFX8-NEXT: v_mov_b32_e32 v30, s18 +; GFX8-NEXT: v_mov_b32_e32 v31, s19 +; GFX8-NEXT: s_lshr_b32 s50, s2, 11 +; GFX8-NEXT: s_lshr_b32 s48, s2, 8 +; GFX8-NEXT: v_mov_b32_e32 v34, s36 +; GFX8-NEXT: s_lshr_b32 s46, s2, 9 +; GFX8-NEXT: s_lshr_b32 s44, s2, 6 +; GFX8-NEXT: v_mov_b32_e32 v38, s30 +; GFX8-NEXT: s_lshr_b32 s42, s2, 7 +; GFX8-NEXT: s_lshr_b32 s40, s2, 4 +; GFX8-NEXT: s_lshr_b32 s38, s2, 5 +; GFX8-NEXT: s_lshr_b32 s36, s2, 2 +; GFX8-NEXT: s_lshr_b32 s30, s2, 3 +; GFX8-NEXT: s_bfe_i64 s[18:19], s[2:3], 0x10000 +; GFX8-NEXT: v_readlane_b32 s2, v44, 0 +; GFX8-NEXT: v_readlane_b32 s3, v44, 1 +; GFX8-NEXT: v_mov_b32_e32 v5, s75 +; GFX8-NEXT: v_mov_b32_e32 v7, s51 +; GFX8-NEXT: v_mov_b32_e32 v9, s73 +; GFX8-NEXT: v_mov_b32_e32 v11, s49 +; GFX8-NEXT: v_mov_b32_e32 v1, s71 +; GFX8-NEXT: v_mov_b32_e32 v3, s47 +; GFX8-NEXT: v_mov_b32_e32 v56, s69 +; GFX8-NEXT: v_mov_b32_e32 v58, s43 +; GFX8-NEXT: v_mov_b32_e32 v21, s67 +; GFX8-NEXT: v_mov_b32_e32 v23, s41 +; GFX8-NEXT: v_mov_b32_e32 v17, s65 +; GFX8-NEXT: v_mov_b32_e32 v25, s63 +; GFX8-NEXT: v_mov_b32_e32 v29, s57 +; GFX8-NEXT: v_mov_b32_e32 v33, s59 +; GFX8-NEXT: v_mov_b32_e32 v35, s37 +; GFX8-NEXT: v_mov_b32_e32 v37, s61 +; GFX8-NEXT: v_mov_b32_e32 v39, s31 +; GFX8-NEXT: v_mov_b32_e32 v41, s55 +; GFX8-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[62:63], s[62:63], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[64:65], s[64:65], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[66:67], s[66:67], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[68:69], s[68:69], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[70:71], s[70:71], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[72:73], s[72:73], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[74:75], s[74:75], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[76:77], s[76:77], 0x10000 +; 
GFX8-NEXT: s_bfe_i64 s[78:79], s[78:79], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[80:81], s[80:81], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[82:83], s[82:83], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[84:85], s[84:85], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[86:87], s[86:87], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 -; GFX8-NEXT: v_mov_b32_e32 v22, s16 -; GFX8-NEXT: s_add_u32 s16, s0, 0x1b0 -; GFX8-NEXT: v_mov_b32_e32 v23, s17 -; GFX8-NEXT: s_addc_u32 s17, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v27, s17 -; GFX8-NEXT: v_mov_b32_e32 v26, s16 -; GFX8-NEXT: s_add_u32 s16, s0, 0x1a0 -; GFX8-NEXT: v_mov_b32_e32 v24, s18 -; GFX8-NEXT: v_mov_b32_e32 v25, s19 -; GFX8-NEXT: s_addc_u32 s17, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25] -; GFX8-NEXT: v_mov_b32_e32 v27, s17 -; GFX8-NEXT: v_mov_b32_e32 v26, s16 -; GFX8-NEXT: s_add_u32 s16, s0, 0x190 -; GFX8-NEXT: v_mov_b32_e32 v22, s20 -; GFX8-NEXT: v_mov_b32_e32 v23, s21 -; GFX8-NEXT: v_mov_b32_e32 v24, s22 -; GFX8-NEXT: v_mov_b32_e32 v25, s23 -; GFX8-NEXT: s_addc_u32 s17, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25] -; GFX8-NEXT: v_mov_b32_e32 v27, s17 -; GFX8-NEXT: v_mov_b32_e32 v26, s16 -; GFX8-NEXT: s_add_u32 s16, s0, 0x180 -; GFX8-NEXT: v_mov_b32_e32 v22, s24 -; GFX8-NEXT: v_mov_b32_e32 v23, s25 -; GFX8-NEXT: v_mov_b32_e32 v24, s26 -; GFX8-NEXT: v_mov_b32_e32 v25, s27 -; GFX8-NEXT: s_addc_u32 s17, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25] -; GFX8-NEXT: v_mov_b32_e32 v27, s17 -; GFX8-NEXT: v_mov_b32_e32 v26, s16 -; GFX8-NEXT: s_add_u32 s16, s0, 0xb0 -; GFX8-NEXT: v_mov_b32_e32 v22, s28 -; GFX8-NEXT: v_mov_b32_e32 v23, s29 -; GFX8-NEXT: v_mov_b32_e32 v24, s30 -; GFX8-NEXT: v_mov_b32_e32 v25, s31 -; GFX8-NEXT: s_addc_u32 s17, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25] -; GFX8-NEXT: v_mov_b32_e32 v27, s17 -; GFX8-NEXT: v_mov_b32_e32 v26, s16 -; GFX8-NEXT: s_add_u32 s16, s0, 0xa0 -; GFX8-NEXT: v_mov_b32_e32 v22, s34 -; GFX8-NEXT: v_mov_b32_e32 v23, s35 -; GFX8-NEXT: v_mov_b32_e32 v24, s36 -; GFX8-NEXT: v_mov_b32_e32 v25, s37 -; GFX8-NEXT: s_addc_u32 s17, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25] -; GFX8-NEXT: v_mov_b32_e32 v27, s17 -; GFX8-NEXT: v_mov_b32_e32 v26, s16 -; GFX8-NEXT: s_add_u32 s16, s0, 0x90 -; GFX8-NEXT: v_mov_b32_e32 v22, s38 -; GFX8-NEXT: v_mov_b32_e32 v23, s39 -; GFX8-NEXT: v_mov_b32_e32 v24, s40 -; GFX8-NEXT: v_mov_b32_e32 v25, s41 -; GFX8-NEXT: s_addc_u32 s17, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25] -; GFX8-NEXT: v_mov_b32_e32 v27, s17 -; GFX8-NEXT: v_mov_b32_e32 v26, s16 -; GFX8-NEXT: s_add_u32 s16, s0, 0x80 -; GFX8-NEXT: v_mov_b32_e32 v22, s42 -; GFX8-NEXT: v_mov_b32_e32 v23, s43 -; GFX8-NEXT: v_mov_b32_e32 v24, s44 -; GFX8-NEXT: v_mov_b32_e32 v25, s45 -; GFX8-NEXT: s_addc_u32 s17, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25] -; GFX8-NEXT: v_mov_b32_e32 v27, s17 -; GFX8-NEXT: v_mov_b32_e32 v26, s16 -; GFX8-NEXT: s_add_u32 s16, s0, 0x70 -; GFX8-NEXT: v_lshrrev_b16_e64 v20, 14, s10 -; GFX8-NEXT: v_lshrrev_b16_e64 v21, 15, s10 -; GFX8-NEXT: v_mov_b32_e32 v22, s46 -; GFX8-NEXT: v_mov_b32_e32 v23, s47 -; GFX8-NEXT: v_mov_b32_e32 v24, s48 -; 
GFX8-NEXT: v_mov_b32_e32 v25, s49 -; GFX8-NEXT: s_addc_u32 s17, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25] -; GFX8-NEXT: v_bfe_i32 v26, v21, 0, 1 -; GFX8-NEXT: v_bfe_i32 v24, v20, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v21, s17 -; GFX8-NEXT: v_mov_b32_e32 v20, s16 -; GFX8-NEXT: s_add_u32 s16, s0, 0x60 -; GFX8-NEXT: v_lshrrev_b16_e64 v18, 12, s10 -; GFX8-NEXT: v_lshrrev_b16_e64 v19, 13, s10 -; GFX8-NEXT: v_ashrrev_i32_e32 v27, 31, v26 -; GFX8-NEXT: v_ashrrev_i32_e32 v25, 31, v24 -; GFX8-NEXT: s_addc_u32 s17, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[20:21], v[24:27] -; GFX8-NEXT: v_lshrrev_b16_e64 v16, 10, s10 -; GFX8-NEXT: v_bfe_i32 v26, v19, 0, 1 -; GFX8-NEXT: v_bfe_i32 v24, v18, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v19, s17 -; GFX8-NEXT: v_lshrrev_b16_e64 v17, 11, s10 -; GFX8-NEXT: v_ashrrev_i32_e32 v27, 31, v26 -; GFX8-NEXT: v_ashrrev_i32_e32 v25, 31, v24 -; GFX8-NEXT: v_mov_b32_e32 v18, s16 -; GFX8-NEXT: s_add_u32 s16, s0, 0x50 -; GFX8-NEXT: flat_store_dwordx4 v[18:19], v[24:27] -; GFX8-NEXT: s_addc_u32 s17, s1, 0 -; GFX8-NEXT: v_bfe_i32 v26, v17, 0, 1 -; GFX8-NEXT: v_bfe_i32 v24, v16, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v16, s16 -; GFX8-NEXT: v_lshrrev_b16_e64 v14, 8, s10 -; GFX8-NEXT: v_lshrrev_b16_e64 v15, 9, s10 -; GFX8-NEXT: v_lshrrev_b16_e64 v12, 6, s10 -; GFX8-NEXT: v_lshrrev_b16_e64 v13, 7, s10 -; GFX8-NEXT: v_lshrrev_b16_e64 v10, 4, s10 -; GFX8-NEXT: v_lshrrev_b16_e64 v11, 5, s10 -; GFX8-NEXT: v_lshrrev_b16_e64 v8, 2, s10 -; GFX8-NEXT: v_lshrrev_b16_e64 v9, 3, s10 -; GFX8-NEXT: v_lshrrev_b16_e64 v7, 1, s10 -; GFX8-NEXT: v_ashrrev_i32_e32 v27, 31, v26 -; GFX8-NEXT: v_ashrrev_i32_e32 v25, 31, v24 -; GFX8-NEXT: v_mov_b32_e32 v17, s17 -; GFX8-NEXT: s_add_u32 s10, s0, 64 -; GFX8-NEXT: v_lshrrev_b16_e64 v5, 14, s11 -; GFX8-NEXT: v_lshrrev_b16_e64 v6, 15, s11 -; GFX8-NEXT: v_lshrrev_b16_e64 v3, 12, s11 -; GFX8-NEXT: v_lshrrev_b16_e64 v4, 13, s11 -; GFX8-NEXT: v_lshrrev_b16_e64 v1, 10, s11 -; GFX8-NEXT: v_lshrrev_b16_e64 v2, 11, s11 -; GFX8-NEXT: v_lshrrev_b16_e64 v0, 8, s11 -; GFX8-NEXT: v_lshrrev_b16_e64 v23, 9, s11 -; GFX8-NEXT: v_lshrrev_b16_e64 v22, 6, s11 -; GFX8-NEXT: v_lshrrev_b16_e64 v21, 7, s11 -; GFX8-NEXT: v_lshrrev_b16_e64 v20, 4, s11 -; GFX8-NEXT: v_lshrrev_b16_e64 v19, 5, s11 -; GFX8-NEXT: v_lshrrev_b16_e64 v18, 2, s11 -; GFX8-NEXT: flat_store_dwordx4 v[16:17], v[24:27] -; GFX8-NEXT: v_lshrrev_b16_e64 v17, 3, s11 -; GFX8-NEXT: v_lshrrev_b16_e64 v16, 1, s11 -; GFX8-NEXT: s_addc_u32 s11, s1, 0 -; GFX8-NEXT: v_bfe_i32 v26, v15, 0, 1 -; GFX8-NEXT: v_bfe_i32 v24, v14, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v15, s11 -; GFX8-NEXT: v_mov_b32_e32 v14, s10 -; GFX8-NEXT: s_add_u32 s10, s0, 48 -; GFX8-NEXT: v_ashrrev_i32_e32 v27, 31, v26 -; GFX8-NEXT: v_ashrrev_i32_e32 v25, 31, v24 -; GFX8-NEXT: s_addc_u32 s11, s1, 0 +; GFX8-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[0:1], s[6:7], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[6:7], s[2:3], 0x10000 +; GFX8-NEXT: s_add_u32 s2, s4, 0x1f0 +; GFX8-NEXT: s_addc_u32 s3, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v43, s3 +; GFX8-NEXT: v_mov_b32_e32 v42, s2 +; GFX8-NEXT: s_add_u32 s2, s4, 0x1e0 +; GFX8-NEXT: s_addc_u32 s3, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v46, s3 +; GFX8-NEXT: v_mov_b32_e32 v45, s2 +; GFX8-NEXT: s_add_u32 s2, s4, 0x1d0 +; GFX8-NEXT: s_addc_u32 s3, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v48, s3 +; GFX8-NEXT: v_mov_b32_e32 v47, s2 +; GFX8-NEXT: s_add_u32 s2, s4, 0x1c0 +; 
GFX8-NEXT: s_addc_u32 s3, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v50, s3 +; GFX8-NEXT: v_mov_b32_e32 v49, s2 +; GFX8-NEXT: s_add_u32 s2, s4, 0x1b0 +; GFX8-NEXT: s_addc_u32 s3, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v52, s3 +; GFX8-NEXT: v_mov_b32_e32 v51, s2 +; GFX8-NEXT: s_add_u32 s2, s4, 0x1a0 +; GFX8-NEXT: s_addc_u32 s3, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v54, s3 +; GFX8-NEXT: v_mov_b32_e32 v53, s2 +; GFX8-NEXT: s_add_u32 s2, s4, 0x190 +; GFX8-NEXT: s_addc_u32 s3, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v15, s3 +; GFX8-NEXT: v_mov_b32_e32 v14, s2 +; GFX8-NEXT: s_add_u32 s2, s4, 0x180 +; GFX8-NEXT: s_addc_u32 s3, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v13, s3 +; GFX8-NEXT: v_mov_b32_e32 v12, s2 +; GFX8-NEXT: buffer_store_dword v12, off, s[88:91], 0 ; 4-byte Folded Spill +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_store_dword v13, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill +; GFX8-NEXT: flat_store_dwordx4 v[42:43], v[4:7] +; GFX8-NEXT: flat_store_dwordx4 v[45:46], v[8:11] +; GFX8-NEXT: flat_store_dwordx4 v[47:48], v[0:3] +; GFX8-NEXT: flat_store_dwordx4 v[49:50], v[55:58] +; GFX8-NEXT: flat_store_dwordx4 v[51:52], v[20:23] +; GFX8-NEXT: flat_store_dwordx4 v[53:54], v[16:19] ; GFX8-NEXT: flat_store_dwordx4 v[14:15], v[24:27] -; GFX8-NEXT: v_bfe_i32 v3, v3, 0, 1 -; GFX8-NEXT: v_bfe_i32 v26, v13, 0, 1 -; GFX8-NEXT: v_bfe_i32 v24, v12, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v13, s11 -; GFX8-NEXT: v_ashrrev_i32_e32 v27, 31, v26 -; GFX8-NEXT: v_ashrrev_i32_e32 v25, 31, v24 -; GFX8-NEXT: v_mov_b32_e32 v12, s10 -; GFX8-NEXT: s_add_u32 s10, s0, 32 -; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[24:27] -; GFX8-NEXT: s_addc_u32 s11, s1, 0 -; GFX8-NEXT: v_bfe_i32 v26, v11, 0, 1 -; GFX8-NEXT: v_bfe_i32 v24, v10, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v10, s10 -; GFX8-NEXT: v_ashrrev_i32_e32 v27, 31, v26 -; GFX8-NEXT: v_ashrrev_i32_e32 v25, 31, v24 -; GFX8-NEXT: v_mov_b32_e32 v11, s11 -; GFX8-NEXT: s_add_u32 s10, s0, 16 -; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[24:27] -; GFX8-NEXT: s_addc_u32 s11, s1, 0 -; GFX8-NEXT: v_bfe_i32 v26, v9, 0, 1 -; GFX8-NEXT: v_bfe_i32 v24, v8, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v8, s10 -; GFX8-NEXT: v_ashrrev_i32_e32 v27, 31, v26 -; GFX8-NEXT: v_ashrrev_i32_e32 v25, 31, v24 -; GFX8-NEXT: v_mov_b32_e32 v9, s11 -; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[24:27] -; GFX8-NEXT: s_add_u32 s10, s0, 0x170 -; GFX8-NEXT: v_bfe_i32 v26, v7, 0, 1 -; GFX8-NEXT: v_ashrrev_i32_e32 v27, 31, v26 -; GFX8-NEXT: v_mov_b32_e32 v24, s14 -; GFX8-NEXT: v_mov_b32_e32 v25, s15 -; GFX8-NEXT: flat_store_dwordx4 v[28:29], v[24:27] -; GFX8-NEXT: s_addc_u32 s11, s1, 0 -; GFX8-NEXT: v_bfe_i32 v26, v6, 0, 1 -; GFX8-NEXT: v_bfe_i32 v24, v5, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v5, s10 -; GFX8-NEXT: v_mov_b32_e32 v6, s11 -; GFX8-NEXT: s_add_u32 s10, s0, 0x160 -; GFX8-NEXT: v_ashrrev_i32_e32 v27, 31, v26 -; GFX8-NEXT: v_ashrrev_i32_e32 v25, 31, v24 -; GFX8-NEXT: s_addc_u32 s11, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[5:6], v[24:27] -; GFX8-NEXT: v_bfe_i32 v5, v4, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v25, s11 -; GFX8-NEXT: v_ashrrev_i32_e32 v6, 31, v5 -; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GFX8-NEXT: v_mov_b32_e32 v24, s10 -; GFX8-NEXT: s_add_u32 s10, s0, 0x150 -; GFX8-NEXT: flat_store_dwordx4 v[24:25], v[3:6] -; GFX8-NEXT: v_bfe_i32 v1, v1, 0, 1 -; GFX8-NEXT: v_bfe_i32 v3, v2, 0, 1 -; GFX8-NEXT: s_addc_u32 s11, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v5, s10 -; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v1 -; GFX8-NEXT: v_mov_b32_e32 v6, s11 -; GFX8-NEXT: v_lshrrev_b16_e64 v9, 6, s8 -; GFX8-NEXT: 
v_lshrrev_b16_e64 v28, 7, s8 -; GFX8-NEXT: v_lshrrev_b16_e64 v26, 4, s8 -; GFX8-NEXT: v_lshrrev_b16_e64 v27, 5, s8 -; GFX8-NEXT: v_lshrrev_b16_e64 v24, 2, s8 -; GFX8-NEXT: v_lshrrev_b16_e64 v25, 3, s8 -; GFX8-NEXT: flat_store_dwordx4 v[5:6], v[1:4] -; GFX8-NEXT: v_lshrrev_b16_e64 v6, 1, s8 -; GFX8-NEXT: s_add_u32 s8, s0, 0x140 -; GFX8-NEXT: v_bfe_i32 v2, v23, 0, 1 -; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 1 -; GFX8-NEXT: s_addc_u32 s9, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s8 -; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX8-NEXT: v_mov_b32_e32 v5, s9 -; GFX8-NEXT: s_add_u32 s8, s0, 0x130 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_bfe_i32 v4, v22, 0, 1 -; GFX8-NEXT: v_bfe_i32 v2, v6, 0, 1 -; GFX8-NEXT: v_bfe_i32 v6, v21, 0, 1 -; GFX8-NEXT: s_addc_u32 s9, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v0, s8 -; GFX8-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; GFX8-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, s9 -; GFX8-NEXT: s_add_u32 s8, s0, 0x120 -; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7] -; GFX8-NEXT: v_bfe_i32 v21, v19, 0, 1 -; GFX8-NEXT: v_bfe_i32 v19, v20, 0, 1 -; GFX8-NEXT: s_addc_u32 s9, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v0, s8 -; GFX8-NEXT: v_ashrrev_i32_e32 v22, 31, v21 -; GFX8-NEXT: v_ashrrev_i32_e32 v20, 31, v19 -; GFX8-NEXT: v_mov_b32_e32 v1, s9 -; GFX8-NEXT: s_add_u32 s8, s0, 0x110 -; GFX8-NEXT: v_bfe_i32 v6, v25, 0, 1 -; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[19:22] -; GFX8-NEXT: v_bfe_i32 v25, v17, 0, 1 -; GFX8-NEXT: v_bfe_i32 v23, v18, 0, 1 -; GFX8-NEXT: s_addc_u32 s9, s1, 0 +; GFX8-NEXT: buffer_load_dword v18, off, s[88:91], 0 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v19, off, s[88:91], 0 offset:4 ; 4-byte Folded Reload +; GFX8-NEXT: s_add_u32 s2, s4, 0x170 +; GFX8-NEXT: s_addc_u32 s3, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v60, s3 +; GFX8-NEXT: v_mov_b32_e32 v59, s2 +; GFX8-NEXT: s_add_u32 s2, s4, 0x160 +; GFX8-NEXT: s_addc_u32 s3, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v62, s3 +; GFX8-NEXT: v_mov_b32_e32 v61, s2 +; GFX8-NEXT: s_add_u32 s2, s4, 0x150 +; GFX8-NEXT: s_addc_u32 s3, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v46, s3 +; GFX8-NEXT: v_mov_b32_e32 v45, s2 +; GFX8-NEXT: s_add_u32 s2, s4, 0x140 +; GFX8-NEXT: s_addc_u32 s3, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s0 +; GFX8-NEXT: s_add_u32 s0, s4, 0x130 +; GFX8-NEXT: v_mov_b32_e32 v7, s1 +; GFX8-NEXT: s_addc_u32 s1, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v17, s1 +; GFX8-NEXT: v_mov_b32_e32 v16, s0 +; GFX8-NEXT: s_add_u32 s0, s4, 0x120 +; GFX8-NEXT: s_addc_u32 s1, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v15, s1 +; GFX8-NEXT: v_mov_b32_e32 v14, s0 +; GFX8-NEXT: s_add_u32 s0, s4, 0x110 +; GFX8-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NEXT: v_mov_b32_e32 v13, s3 +; GFX8-NEXT: s_addc_u32 s1, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v42, vcc_lo +; GFX8-NEXT: v_mov_b32_e32 v43, vcc_hi +; GFX8-NEXT: v_mov_b32_e32 v12, s2 ; GFX8-NEXT: v_mov_b32_e32 v0, s8 -; GFX8-NEXT: v_bfe_i32 v4, v24, 0, 1 -; GFX8-NEXT: v_bfe_i32 v19, v26, 0, 1 -; GFX8-NEXT: v_ashrrev_i32_e32 v26, 31, v25 -; GFX8-NEXT: v_ashrrev_i32_e32 v24, 31, v23 ; GFX8-NEXT: v_mov_b32_e32 v1, s9 -; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[23:26] -; GFX8-NEXT: v_lshrrev_b16_e64 v14, 6, s12 -; GFX8-NEXT: v_mov_b32_e32 v23, s6 -; GFX8-NEXT: s_add_u32 s6, s0, 0x100 -; GFX8-NEXT: v_bfe_i32 v25, v16, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v24, s7 -; GFX8-NEXT: s_addc_u32 s7, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_lshrrev_b16_e64 v15, 7, s12 -; GFX8-NEXT: v_ashrrev_i32_e32 v26, 31, v25 -; 
GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: s_add_u32 s6, s0, 0x1f0 -; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[23:26] -; GFX8-NEXT: v_bfe_i32 v16, v15, 0, 1 -; GFX8-NEXT: v_bfe_i32 v14, v14, 0, 1 -; GFX8-NEXT: s_addc_u32 s7, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_lshrrev_b16_e64 v12, 4, s12 -; GFX8-NEXT: v_lshrrev_b16_e64 v13, 5, s12 -; GFX8-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GFX8-NEXT: v_ashrrev_i32_e32 v15, 31, v14 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: s_add_u32 s6, s0, 0x1e0 -; GFX8-NEXT: v_bfe_i32 v21, v27, 0, 1 -; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[14:17] -; GFX8-NEXT: v_bfe_i32 v29, v13, 0, 1 -; GFX8-NEXT: v_bfe_i32 v27, v12, 0, 1 -; GFX8-NEXT: s_addc_u32 s7, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_lshrrev_b16_e64 v10, 2, s12 -; GFX8-NEXT: v_lshrrev_b16_e64 v11, 3, s12 -; GFX8-NEXT: v_bfe_i32 v25, v28, 0, 1 -; GFX8-NEXT: v_ashrrev_i32_e32 v30, 31, v29 -; GFX8-NEXT: v_ashrrev_i32_e32 v28, 31, v27 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: s_add_u32 s6, s0, 0x1d0 -; GFX8-NEXT: v_bfe_i32 v23, v9, 0, 1 -; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[27:30] -; GFX8-NEXT: v_bfe_i32 v11, v11, 0, 1 -; GFX8-NEXT: v_bfe_i32 v9, v10, 0, 1 -; GFX8-NEXT: s_addc_u32 s7, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_ashrrev_i32_e32 v12, 31, v11 -; GFX8-NEXT: v_ashrrev_i32_e32 v10, 31, v9 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_lshrrev_b16_e64 v8, 1, s12 -; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[9:12] -; GFX8-NEXT: v_bfe_i32 v14, v8, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v12, s4 -; GFX8-NEXT: s_add_u32 s4, s0, 0x1c0 -; GFX8-NEXT: v_mov_b32_e32 v13, s5 -; GFX8-NEXT: s_addc_u32 s5, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_ashrrev_i32_e32 v15, 31, v14 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: s_add_u32 s4, s0, 0xf0 -; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[12:15] -; GFX8-NEXT: s_addc_u32 s5, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_ashrrev_i32_e32 v26, 31, v25 -; GFX8-NEXT: v_ashrrev_i32_e32 v24, 31, v23 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: s_add_u32 s4, s0, 0xe0 -; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[23:26] -; GFX8-NEXT: s_addc_u32 s5, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: s_add_u32 s4, s0, 0xd0 -; GFX8-NEXT: v_ashrrev_i32_e32 v22, 31, v21 -; GFX8-NEXT: v_ashrrev_i32_e32 v20, 31, v19 -; GFX8-NEXT: s_addc_u32 s5, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[19:22] -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: s_add_u32 s0, s0, 0xc0 -; GFX8-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; GFX8-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7] -; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX8-NEXT: v_mov_b32_e32 v8, s12 +; GFX8-NEXT: v_mov_b32_e32 v2, s10 +; GFX8-NEXT: v_mov_b32_e32 v3, s11 +; GFX8-NEXT: v_mov_b32_e32 v9, s13 +; GFX8-NEXT: v_mov_b32_e32 v10, s14 +; GFX8-NEXT: v_mov_b32_e32 v11, s15 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_store_dwordx4 v[18:19], v[28:31] +; GFX8-NEXT: flat_store_dwordx4 v[59:60], v[32:35] +; GFX8-NEXT: flat_store_dwordx4 v[61:62], v[36:39] +; GFX8-NEXT: flat_store_dwordx4 v[45:46], v[40:43] +; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[4:7] +; GFX8-NEXT: flat_store_dwordx4 v[16:17], v[0:3] +; GFX8-NEXT: flat_store_dwordx4 v[14:15], v[8:11] ; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; 
GFX8-NEXT: s_add_u32 s0, s4, 0x100 +; GFX8-NEXT: v_mov_b32_e32 v0, s16 +; GFX8-NEXT: v_mov_b32_e32 v1, s17 +; GFX8-NEXT: v_mov_b32_e32 v2, s20 +; GFX8-NEXT: v_mov_b32_e32 v3, s21 +; GFX8-NEXT: s_addc_u32 s1, s5, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: s_add_u32 s0, s4, 0xf0 +; GFX8-NEXT: v_mov_b32_e32 v0, s24 +; GFX8-NEXT: v_mov_b32_e32 v1, s25 +; GFX8-NEXT: v_mov_b32_e32 v2, s22 +; GFX8-NEXT: v_mov_b32_e32 v3, s23 +; GFX8-NEXT: s_addc_u32 s1, s5, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: s_add_u32 s0, s4, 0xe0 +; GFX8-NEXT: v_mov_b32_e32 v0, s26 +; GFX8-NEXT: v_mov_b32_e32 v1, s27 +; GFX8-NEXT: v_mov_b32_e32 v2, s28 +; GFX8-NEXT: v_mov_b32_e32 v3, s29 +; GFX8-NEXT: s_addc_u32 s1, s5, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: s_add_u32 s0, s4, 0xd0 +; GFX8-NEXT: v_mov_b32_e32 v0, s34 +; GFX8-NEXT: v_mov_b32_e32 v1, s35 +; GFX8-NEXT: v_mov_b32_e32 v2, s86 +; GFX8-NEXT: v_mov_b32_e32 v3, s87 +; GFX8-NEXT: s_addc_u32 s1, s5, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: s_add_u32 s0, s4, 0xc0 +; GFX8-NEXT: v_mov_b32_e32 v0, s84 +; GFX8-NEXT: v_mov_b32_e32 v1, s85 +; GFX8-NEXT: v_mov_b32_e32 v2, s82 +; GFX8-NEXT: v_mov_b32_e32 v3, s83 +; GFX8-NEXT: s_addc_u32 s1, s5, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: s_add_u32 s0, s4, 0xb0 +; GFX8-NEXT: v_mov_b32_e32 v0, s80 +; GFX8-NEXT: v_mov_b32_e32 v1, s81 +; GFX8-NEXT: v_mov_b32_e32 v2, s78 +; GFX8-NEXT: v_mov_b32_e32 v3, s79 +; GFX8-NEXT: s_addc_u32 s1, s5, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: s_add_u32 s0, s4, 0xa0 +; GFX8-NEXT: v_mov_b32_e32 v0, s76 +; GFX8-NEXT: v_mov_b32_e32 v1, s77 +; GFX8-NEXT: v_mov_b32_e32 v2, s74 +; GFX8-NEXT: v_mov_b32_e32 v3, s75 +; GFX8-NEXT: s_addc_u32 s1, s5, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: s_add_u32 s0, s4, 0x90 +; GFX8-NEXT: v_mov_b32_e32 v0, s72 +; GFX8-NEXT: v_mov_b32_e32 v1, s73 +; GFX8-NEXT: v_mov_b32_e32 v2, s70 +; GFX8-NEXT: v_mov_b32_e32 v3, s71 +; GFX8-NEXT: s_addc_u32 s1, s5, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: s_add_u32 s0, s4, 0x80 +; GFX8-NEXT: v_mov_b32_e32 v0, s68 +; GFX8-NEXT: v_mov_b32_e32 v1, s69 +; GFX8-NEXT: v_mov_b32_e32 v2, s66 +; GFX8-NEXT: v_mov_b32_e32 v3, s67 +; GFX8-NEXT: s_addc_u32 s1, s5, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: s_add_u32 s0, s4, 0x70 +; GFX8-NEXT: v_mov_b32_e32 v0, s64 +; GFX8-NEXT: v_mov_b32_e32 v1, s65 +; GFX8-NEXT: v_mov_b32_e32 v2, s62 +; GFX8-NEXT: v_mov_b32_e32 v3, s63 +; GFX8-NEXT: s_addc_u32 s1, s5, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: s_add_u32 s0, s4, 0x60 +; GFX8-NEXT: v_mov_b32_e32 v0, s60 +; GFX8-NEXT: v_mov_b32_e32 v1, s61 +; GFX8-NEXT: v_mov_b32_e32 v2, s58 +; GFX8-NEXT: v_mov_b32_e32 v3, s59 +; GFX8-NEXT: s_addc_u32 s1, s5, 0 
+; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: s_add_u32 s0, s4, 0x50 +; GFX8-NEXT: v_mov_b32_e32 v0, s56 +; GFX8-NEXT: v_mov_b32_e32 v1, s57 +; GFX8-NEXT: v_mov_b32_e32 v2, s54 +; GFX8-NEXT: v_mov_b32_e32 v3, s55 +; GFX8-NEXT: s_addc_u32 s1, s5, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: s_add_u32 s0, s4, 64 +; GFX8-NEXT: v_mov_b32_e32 v0, s52 +; GFX8-NEXT: v_mov_b32_e32 v1, s53 +; GFX8-NEXT: v_mov_b32_e32 v2, s50 +; GFX8-NEXT: v_mov_b32_e32 v3, s51 +; GFX8-NEXT: s_addc_u32 s1, s5, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: s_add_u32 s0, s4, 48 +; GFX8-NEXT: v_mov_b32_e32 v0, s48 +; GFX8-NEXT: v_mov_b32_e32 v1, s49 +; GFX8-NEXT: v_mov_b32_e32 v2, s46 +; GFX8-NEXT: v_mov_b32_e32 v3, s47 +; GFX8-NEXT: s_addc_u32 s1, s5, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: s_add_u32 s0, s4, 32 +; GFX8-NEXT: v_mov_b32_e32 v0, s44 +; GFX8-NEXT: v_mov_b32_e32 v1, s45 +; GFX8-NEXT: v_mov_b32_e32 v2, s42 +; GFX8-NEXT: v_mov_b32_e32 v3, s43 +; GFX8-NEXT: s_addc_u32 s1, s5, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: s_add_u32 s0, s4, 16 +; GFX8-NEXT: v_mov_b32_e32 v0, s40 +; GFX8-NEXT: v_mov_b32_e32 v1, s41 +; GFX8-NEXT: v_mov_b32_e32 v2, s38 +; GFX8-NEXT: v_mov_b32_e32 v3, s39 +; GFX8-NEXT: s_addc_u32 s1, s5, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s36 +; GFX8-NEXT: v_mov_b32_e32 v1, s37 +; GFX8-NEXT: v_mov_b32_e32 v2, s30 +; GFX8-NEXT: v_mov_b32_e32 v3, s31 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_readlane_b32 s0, v44, 2 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: v_readlane_b32 s1, v44, 3 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: v_mov_b32_e32 v0, s18 +; GFX8-NEXT: v_mov_b32_e32 v1, s19 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: ; kill: killed $vgpr44 ; GFX8-NEXT: s_endpgm ; ; EG-LABEL: constant_sextload_v64i1_to_v64i64: @@ -9744,248 +9562,251 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX12-LABEL: constant_sextload_v64i1_to_v64i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: s_mov_b32 s19, 0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_mov_b32 s5, s19 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_load_b64 s[40:41], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[12:13], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshr_b32 s26, s41, 22 -; GFX12-NEXT: s_lshr_b32 s28, s41, 23 -; GFX12-NEXT: s_lshr_b32 s30, s41, 20 -; GFX12-NEXT: s_lshr_b32 s34, s41, 21 -; GFX12-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 -; GFX12-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 -; GFX12-NEXT: s_lshr_b32 s20, s41, 18 -; GFX12-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 +; GFX12-NEXT: s_lshr_b32 s96, s13, 30 +; GFX12-NEXT: s_lshr_b32 s98, s13, 31 +; GFX12-NEXT: s_lshr_b32 s92, s13, 28 +; GFX12-NEXT: s_lshr_b32 s94, s13, 29 +; GFX12-NEXT: s_lshr_b32 s78, s13, 26 +; GFX12-NEXT: s_lshr_b32 s88, s13, 27 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: 
s_bfe_i64 s[96:97], s[96:97], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[100:101], s[98:99], 0x10000 +; GFX12-NEXT: s_lshr_b32 s66, s13, 24 +; GFX12-NEXT: s_lshr_b32 s74, s13, 25 +; GFX12-NEXT: s_bfe_i64 s[92:93], s[92:93], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[94:95], s[94:95], 0x10000 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s96 +; GFX12-NEXT: s_lshr_b32 s56, s13, 22 +; GFX12-NEXT: s_lshr_b32 s62, s13, 23 +; GFX12-NEXT: v_dual_mov_b32 v2, s97 :: v_dual_mov_b32 v3, s100 +; GFX12-NEXT: v_dual_mov_b32 v4, s101 :: v_dual_mov_b32 v5, s92 +; GFX12-NEXT: s_bfe_i64 s[78:79], s[78:79], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[88:89], s[88:89], 0x10000 +; GFX12-NEXT: s_lshr_b32 s44, s13, 20 +; GFX12-NEXT: s_lshr_b32 s52, s13, 21 +; GFX12-NEXT: s_lshr_b32 s30, s13, 18 +; GFX12-NEXT: s_lshr_b32 s40, s13, 19 +; GFX12-NEXT: s_lshr_b32 s18, s13, 16 +; GFX12-NEXT: s_lshr_b32 s26, s13, 17 +; GFX12-NEXT: s_lshr_b32 s2, s13, 14 +; GFX12-NEXT: s_lshr_b32 s4, s13, 15 +; GFX12-NEXT: v_dual_mov_b32 v6, s93 :: v_dual_mov_b32 v7, s94 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_dual_mov_b32 v8, s95 :: v_dual_mov_b32 v9, s78 +; GFX12-NEXT: s_bfe_i64 s[66:67], s[66:67], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[74:75], s[74:75], 0x10000 +; GFX12-NEXT: s_lshr_b32 s6, s13, 12 +; GFX12-NEXT: s_lshr_b32 s8, s13, 13 +; GFX12-NEXT: v_dual_mov_b32 v10, s79 :: v_dual_mov_b32 v11, s88 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_dual_mov_b32 v12, s89 :: v_dual_mov_b32 v13, s66 +; GFX12-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[62:63], s[62:63], 0x10000 +; GFX12-NEXT: s_lshr_b32 s10, s13, 10 +; GFX12-NEXT: s_lshr_b32 s14, s13, 11 +; GFX12-NEXT: v_dual_mov_b32 v14, s67 :: v_dual_mov_b32 v15, s74 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_dual_mov_b32 v16, s75 :: v_dual_mov_b32 v17, s56 +; GFX12-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 -; GFX12-NEXT: v_dual_mov_b32 v12, 0 :: v_dual_mov_b32 v69, s26 -; GFX12-NEXT: v_dual_mov_b32 v70, s27 :: v_dual_mov_b32 v71, s28 -; GFX12-NEXT: v_dual_mov_b32 v72, s29 :: v_dual_mov_b32 v73, s30 -; GFX12-NEXT: s_lshr_b32 s22, s41, 19 -; GFX12-NEXT: v_dual_mov_b32 v74, s31 :: v_dual_mov_b32 v75, s34 +; GFX12-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000 +; GFX12-NEXT: s_lshr_b32 s16, s13, 8 +; GFX12-NEXT: s_lshr_b32 s20, s13, 9 +; GFX12-NEXT: v_dual_mov_b32 v18, s57 :: v_dual_mov_b32 v19, s62 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_dual_mov_b32 v20, s63 :: v_dual_mov_b32 v21, s44 +; GFX12-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000 +; GFX12-NEXT: s_lshr_b32 s22, s13, 6 +; GFX12-NEXT: s_lshr_b32 s24, s13, 7 +; GFX12-NEXT: v_dual_mov_b32 v22, s45 :: v_dual_mov_b32 v23, s52 +; GFX12-NEXT: v_dual_mov_b32 v24, s53 :: v_dual_mov_b32 v25, s30 +; GFX12-NEXT: v_dual_mov_b32 v26, s31 :: v_dual_mov_b32 v27, s40 +; GFX12-NEXT: v_dual_mov_b32 v28, s41 :: v_dual_mov_b32 v29, s18 +; GFX12-NEXT: v_dual_mov_b32 v30, s19 :: v_dual_mov_b32 v31, s26 +; GFX12-NEXT: v_mov_b32_e32 v32, s27 +; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000 +; GFX12-NEXT: s_clause 0x7 +; GFX12-NEXT: global_store_b128 v0, v[1:4], s[0:1] offset:496 +; 
GFX12-NEXT: global_store_b128 v0, v[5:8], s[0:1] offset:480 +; GFX12-NEXT: global_store_b128 v0, v[9:12], s[0:1] offset:464 +; GFX12-NEXT: global_store_b128 v0, v[13:16], s[0:1] offset:448 +; GFX12-NEXT: global_store_b128 v0, v[17:20], s[0:1] offset:432 +; GFX12-NEXT: global_store_b128 v0, v[21:24], s[0:1] offset:416 +; GFX12-NEXT: global_store_b128 v0, v[25:28], s[0:1] offset:400 +; GFX12-NEXT: global_store_b128 v0, v[29:32], s[0:1] offset:384 +; GFX12-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s3 +; GFX12-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_mov_b32 v4, s5 +; GFX12-NEXT: v_mov_b32_e32 v5, s6 +; GFX12-NEXT: s_lshr_b32 s28, s13, 4 +; GFX12-NEXT: s_lshr_b32 s34, s13, 5 +; GFX12-NEXT: s_lshr_b32 s36, s13, 2 +; GFX12-NEXT: s_lshr_b32 s38, s13, 3 ; GFX12-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 -; GFX12-NEXT: v_mov_b32_e32 v76, s35 -; GFX12-NEXT: s_lshr_b32 s24, s41, 16 -; GFX12-NEXT: s_lshr_b32 s36, s41, 17 -; GFX12-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 -; GFX12-NEXT: s_lshr_b32 s12, s40, 22 -; GFX12-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 +; GFX12-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8 +; GFX12-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10 +; GFX12-NEXT: s_lshr_b32 s42, s13, 1 +; GFX12-NEXT: s_mov_b32 s46, s13 ; GFX12-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v12, v[69:72], s[0:1] offset:432 -; GFX12-NEXT: global_store_b128 v12, v[73:76], s[0:1] offset:416 -; GFX12-NEXT: v_dual_mov_b32 v69, s20 :: v_dual_mov_b32 v70, s21 -; GFX12-NEXT: v_dual_mov_b32 v71, s22 :: v_dual_mov_b32 v72, s23 -; GFX12-NEXT: v_mov_b32_e32 v73, s24 -; GFX12-NEXT: s_lshr_b32 s14, s40, 23 -; GFX12-NEXT: v_dual_mov_b32 v74, s25 :: v_dual_mov_b32 v75, s36 -; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000 -; GFX12-NEXT: v_mov_b32_e32 v76, s37 -; GFX12-NEXT: s_lshr_b32 s16, s40, 20 -; GFX12-NEXT: s_lshr_b32 s38, s40, 21 -; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 -; GFX12-NEXT: s_lshr_b32 s6, s40, 18 +; GFX12-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 +; GFX12-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s14 +; GFX12-NEXT: v_dual_mov_b32 v12, s15 :: v_dual_mov_b32 v13, s16 +; GFX12-NEXT: s_lshr_b32 s48, s12, 30 +; GFX12-NEXT: s_lshr_b32 s50, s12, 31 ; GFX12-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 -; GFX12-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v12, v[69:72], s[0:1] offset:400 -; GFX12-NEXT: global_store_b128 v12, v[73:76], s[0:1] offset:384 -; GFX12-NEXT: v_dual_mov_b32 v69, s12 :: v_dual_mov_b32 v70, s13 -; GFX12-NEXT: v_dual_mov_b32 v71, s14 :: v_dual_mov_b32 v72, s15 -; GFX12-NEXT: v_mov_b32_e32 v73, s16 -; GFX12-NEXT: s_lshr_b32 s8, s40, 19 -; GFX12-NEXT: s_lshr_b32 s10, s40, 16 -; GFX12-NEXT: s_lshr_b32 s42, s40, 17 -; GFX12-NEXT: v_dual_mov_b32 v74, s17 :: v_dual_mov_b32 v75, s38 -; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000 -; GFX12-NEXT: v_mov_b32_e32 v76, s39 +; GFX12-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 +; GFX12-NEXT: v_dual_mov_b32 v14, s17 :: v_dual_mov_b32 v15, s20 +; GFX12-NEXT: v_dual_mov_b32 v16, s21 :: v_dual_mov_b32 v17, s22 +; GFX12-NEXT: s_lshr_b32 s54, s12, 28 +; GFX12-NEXT: s_lshr_b32 s58, s12, 29 +; GFX12-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 -; GFX12-NEXT: s_bfe_i64 s[10:11], 
s[10:11], 0x10000 -; GFX12-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 -; GFX12-NEXT: v_lshrrev_b16 v0, 14, s40 -; GFX12-NEXT: v_lshrrev_b16 v1, 15, s40 -; GFX12-NEXT: v_lshrrev_b16 v17, 12, s40 -; GFX12-NEXT: v_lshrrev_b16 v18, 13, s40 -; GFX12-NEXT: v_lshrrev_b16 v33, 10, s40 -; GFX12-NEXT: v_lshrrev_b16 v34, 11, s40 -; GFX12-NEXT: v_lshrrev_b16 v65, 8, s40 -; GFX12-NEXT: v_lshrrev_b16 v66, 9, s40 -; GFX12-NEXT: v_lshrrev_b16 v86, 6, s40 -; GFX12-NEXT: v_lshrrev_b16 v82, 7, s40 -; GFX12-NEXT: v_lshrrev_b16 v81, 4, s40 -; GFX12-NEXT: v_lshrrev_b16 v83, 5, s40 -; GFX12-NEXT: v_lshrrev_b16 v77, 2, s40 -; GFX12-NEXT: v_lshrrev_b16 v78, 3, s40 -; GFX12-NEXT: v_lshrrev_b16 v58, 1, s40 -; GFX12-NEXT: v_lshrrev_b16 v60, 14, s41 -; GFX12-NEXT: v_lshrrev_b16 v61, 15, s41 -; GFX12-NEXT: v_lshrrev_b16 v57, 12, s41 -; GFX12-NEXT: v_lshrrev_b16 v54, 13, s41 -; GFX12-NEXT: v_lshrrev_b16 v50, 10, s41 -; GFX12-NEXT: v_lshrrev_b16 v46, 11, s41 -; GFX12-NEXT: v_lshrrev_b16 v49, 8, s41 -; GFX12-NEXT: v_lshrrev_b16 v51, 9, s41 -; GFX12-NEXT: v_lshrrev_b16 v45, 6, s41 -; GFX12-NEXT: v_lshrrev_b16 v38, 7, s41 -; GFX12-NEXT: v_lshrrev_b16 v40, 4, s41 -; GFX12-NEXT: v_lshrrev_b16 v41, 5, s41 -; GFX12-NEXT: v_lshrrev_b16 v37, 2, s41 -; GFX12-NEXT: v_lshrrev_b16 v36, 3, s41 -; GFX12-NEXT: v_lshrrev_b16 v30, 1, s41 -; GFX12-NEXT: s_lshr_b32 s4, s41, 24 -; GFX12-NEXT: s_mov_b32 s18, s41 -; GFX12-NEXT: s_lshr_b32 s2, s40, 24 -; GFX12-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v12, v[69:72], s[0:1] offset:176 -; GFX12-NEXT: global_store_b128 v12, v[73:76], s[0:1] offset:160 -; GFX12-NEXT: v_dual_mov_b32 v69, s6 :: v_dual_mov_b32 v70, s7 -; GFX12-NEXT: v_dual_mov_b32 v71, s8 :: v_dual_mov_b32 v72, s9 -; GFX12-NEXT: v_dual_mov_b32 v73, s10 :: v_dual_mov_b32 v74, s11 -; GFX12-NEXT: v_dual_mov_b32 v75, s42 :: v_dual_mov_b32 v76, s43 -; GFX12-NEXT: v_bfe_i32 v79, v1, 0, 1 -; GFX12-NEXT: v_bfe_i32 v85, v65, 0, 1 +; GFX12-NEXT: v_dual_mov_b32 v18, s23 :: v_dual_mov_b32 v19, s24 +; GFX12-NEXT: v_dual_mov_b32 v20, s25 :: v_dual_mov_b32 v21, s28 +; GFX12-NEXT: s_lshr_b32 s60, s12, 26 +; GFX12-NEXT: s_lshr_b32 s64, s12, 27 +; GFX12-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000 +; GFX12-NEXT: v_dual_mov_b32 v22, s29 :: v_dual_mov_b32 v23, s34 +; GFX12-NEXT: v_mov_b32_e32 v24, s35 +; GFX12-NEXT: s_clause 0x5 +; GFX12-NEXT: global_store_b128 v0, v[1:4], s[0:1] offset:368 +; GFX12-NEXT: global_store_b128 v0, v[5:8], s[0:1] offset:352 +; GFX12-NEXT: global_store_b128 v0, v[9:12], s[0:1] offset:336 +; GFX12-NEXT: global_store_b128 v0, v[13:16], s[0:1] offset:320 +; GFX12-NEXT: global_store_b128 v0, v[17:20], s[0:1] offset:304 +; GFX12-NEXT: global_store_b128 v0, v[21:24], s[0:1] offset:288 +; GFX12-NEXT: v_dual_mov_b32 v1, s36 :: v_dual_mov_b32 v2, s37 +; GFX12-NEXT: v_dual_mov_b32 v3, s38 :: v_dual_mov_b32 v4, s39 +; GFX12-NEXT: v_mov_b32_e32 v5, s46 +; GFX12-NEXT: s_lshr_b32 s68, s12, 24 +; GFX12-NEXT: s_lshr_b32 s70, s12, 25 +; GFX12-NEXT: s_lshr_b32 s72, s12, 22 +; GFX12-NEXT: s_lshr_b32 s76, s12, 23 +; GFX12-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x10000 +; GFX12-NEXT: v_dual_mov_b32 v6, s47 :: v_dual_mov_b32 v7, s42 +; GFX12-NEXT: v_dual_mov_b32 v8, s43 :: v_dual_mov_b32 v9, s48 +; GFX12-NEXT: s_lshr_b32 s80, s12, 20 +; GFX12-NEXT: s_lshr_b32 s82, s12, 21 +; GFX12-NEXT: s_bfe_i64 s[64:65], s[64:65], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x10000 +; 
GFX12-NEXT: v_dual_mov_b32 v10, s49 :: v_dual_mov_b32 v11, s50 +; GFX12-NEXT: v_dual_mov_b32 v12, s51 :: v_dual_mov_b32 v13, s54 +; GFX12-NEXT: s_lshr_b32 s84, s12, 18 +; GFX12-NEXT: s_lshr_b32 s86, s12, 19 +; GFX12-NEXT: s_bfe_i64 s[76:77], s[76:77], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[72:73], s[72:73], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[70:71], s[70:71], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[68:69], s[68:69], 0x10000 +; GFX12-NEXT: v_dual_mov_b32 v14, s55 :: v_dual_mov_b32 v15, s58 +; GFX12-NEXT: v_dual_mov_b32 v16, s59 :: v_dual_mov_b32 v17, s60 +; GFX12-NEXT: s_lshr_b32 s90, s12, 16 +; GFX12-NEXT: s_lshr_b32 s98, s12, 17 +; GFX12-NEXT: s_bfe_i64 s[82:83], s[82:83], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[80:81], s[80:81], 0x10000 +; GFX12-NEXT: v_dual_mov_b32 v18, s61 :: v_dual_mov_b32 v19, s64 +; GFX12-NEXT: v_dual_mov_b32 v20, s65 :: v_dual_mov_b32 v21, s68 +; GFX12-NEXT: s_lshr_b32 s96, s12, 14 +; GFX12-NEXT: s_lshr_b32 s100, s12, 15 +; GFX12-NEXT: s_lshr_b32 s94, s12, 13 +; GFX12-NEXT: s_lshr_b32 s88, s12, 11 +; GFX12-NEXT: s_lshr_b32 s74, s12, 9 +; GFX12-NEXT: s_lshr_b32 s62, s12, 7 +; GFX12-NEXT: s_lshr_b32 s52, s12, 5 +; GFX12-NEXT: s_lshr_b32 s40, s12, 3 +; GFX12-NEXT: s_lshr_b32 s26, s12, 1 +; GFX12-NEXT: s_bfe_i64 s[86:87], s[86:87], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[84:85], s[84:85], 0x10000 +; GFX12-NEXT: v_dual_mov_b32 v22, s69 :: v_dual_mov_b32 v23, s70 +; GFX12-NEXT: v_mov_b32_e32 v24, s71 +; GFX12-NEXT: s_clause 0x5 +; GFX12-NEXT: global_store_b128 v0, v[1:4], s[0:1] offset:272 +; GFX12-NEXT: global_store_b128 v0, v[5:8], s[0:1] offset:256 +; GFX12-NEXT: global_store_b128 v0, v[9:12], s[0:1] offset:240 +; GFX12-NEXT: global_store_b128 v0, v[13:16], s[0:1] offset:224 +; GFX12-NEXT: global_store_b128 v0, v[17:20], s[0:1] offset:208 +; GFX12-NEXT: global_store_b128 v0, v[21:24], s[0:1] offset:192 +; GFX12-NEXT: v_dual_mov_b32 v1, s72 :: v_dual_mov_b32 v2, s73 +; GFX12-NEXT: v_dual_mov_b32 v3, s76 :: v_dual_mov_b32 v4, s77 +; GFX12-NEXT: v_mov_b32_e32 v5, s80 +; GFX12-NEXT: s_lshr_b32 s92, s12, 12 +; GFX12-NEXT: s_lshr_b32 s78, s12, 10 +; GFX12-NEXT: s_bfe_i64 s[98:99], s[98:99], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[90:91], s[90:91], 0x10000 +; GFX12-NEXT: v_dual_mov_b32 v6, s81 :: v_dual_mov_b32 v7, s82 +; GFX12-NEXT: v_dual_mov_b32 v8, s83 :: v_dual_mov_b32 v9, s84 +; GFX12-NEXT: s_lshr_b32 s66, s12, 8 +; GFX12-NEXT: s_lshr_b32 s56, s12, 6 +; GFX12-NEXT: s_lshr_b32 s44, s12, 4 +; GFX12-NEXT: s_lshr_b32 s30, s12, 2 +; GFX12-NEXT: s_bfe_i64 s[18:19], s[12:13], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[12:13], s[26:27], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[26:27], s[40:41], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[40:41], s[52:53], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[52:53], s[62:63], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[62:63], s[74:75], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[74:75], s[88:89], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[88:89], s[94:95], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[94:95], s[100:101], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[96:97], s[96:97], 0x10000 +; GFX12-NEXT: v_dual_mov_b32 v10, s85 :: v_dual_mov_b32 v11, s86 +; GFX12-NEXT: v_dual_mov_b32 v12, s87 :: v_dual_mov_b32 v13, s90 +; GFX12-NEXT: s_bfe_i64 s[78:79], s[78:79], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[92:93], s[92:93], 0x10000 +; GFX12-NEXT: v_dual_mov_b32 v14, s91 :: v_dual_mov_b32 v15, s98 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v65, s40 -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v12, v[69:72], s[0:1] offset:144 -; GFX12-NEXT: global_store_b128 v12, v[73:76], s[0:1] offset:128 -; GFX12-NEXT: 
v_bfe_i32 v69, v77, 0, 1 -; GFX12-NEXT: v_bfe_i32 v77, v0, 0, 1 -; GFX12-NEXT: v_bfe_i32 v75, v18, 0, 1 -; GFX12-NEXT: v_bfe_i32 v73, v17, 0, 1 -; GFX12-NEXT: v_lshrrev_b16 v26, 6, s4 -; GFX12-NEXT: v_lshrrev_b16 v28, 7, s4 -; GFX12-NEXT: v_lshrrev_b16 v20, 2, s4 -; GFX12-NEXT: v_lshrrev_b16 v14, 3, s4 -; GFX12-NEXT: v_lshrrev_b16 v22, 4, s4 -; GFX12-NEXT: v_lshrrev_b16 v25, 5, s4 -; GFX12-NEXT: v_lshrrev_b16 v19, 1, s4 -; GFX12-NEXT: v_lshrrev_b16 v9, 6, s2 -; GFX12-NEXT: v_lshrrev_b16 v5, 7, s2 -; GFX12-NEXT: v_bfe_i32 v71, v78, 0, 1 -; GFX12-NEXT: v_ashrrev_i32_e32 v80, 31, v79 -; GFX12-NEXT: v_ashrrev_i32_e32 v78, 31, v77 -; GFX12-NEXT: v_ashrrev_i32_e32 v76, 31, v75 -; GFX12-NEXT: v_ashrrev_i32_e32 v74, 31, v73 -; GFX12-NEXT: v_lshrrev_b16 v8, 4, s2 -; GFX12-NEXT: v_lshrrev_b16 v7, 5, s2 -; GFX12-NEXT: v_lshrrev_b16 v3, 3, s2 -; GFX12-NEXT: v_lshrrev_b16 v4, 2, s2 -; GFX12-NEXT: v_lshrrev_b16 v2, 1, s2 -; GFX12-NEXT: v_bfe_i32 v23, v14, 0, 1 -; GFX12-NEXT: v_bfe_i32 v21, v20, 0, 1 -; GFX12-NEXT: v_bfe_i32 v31, v28, 0, 1 -; GFX12-NEXT: v_bfe_i32 v29, v26, 0, 1 -; GFX12-NEXT: v_bfe_i32 v55, v46, 0, 1 -; GFX12-NEXT: v_bfe_i32 v53, v50, 0, 1 -; GFX12-NEXT: v_bfe_i32 v91, v34, 0, 1 -; GFX12-NEXT: v_bfe_i32 v89, v33, 0, 1 -; GFX12-NEXT: v_bfe_i32 v19, v19, 0, 1 -; GFX12-NEXT: v_bfe_i32 v27, v25, 0, 1 -; GFX12-NEXT: v_bfe_i32 v25, v22, 0, 1 -; GFX12-NEXT: v_bfe_i32 v51, v51, 0, 1 -; GFX12-NEXT: v_bfe_i32 v49, v49, 0, 1 -; GFX12-NEXT: v_bfe_i32 v87, v66, 0, 1 -; GFX12-NEXT: v_bfe_i32 v15, v5, 0, 1 -; GFX12-NEXT: v_bfe_i32 v13, v9, 0, 1 -; GFX12-NEXT: v_bfe_i32 v47, v38, 0, 1 -; GFX12-NEXT: v_bfe_i32 v45, v45, 0, 1 -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v12, v[77:80], s[0:1] offset:112 -; GFX12-NEXT: global_store_b128 v12, v[73:76], s[0:1] offset:96 -; GFX12-NEXT: v_bfe_i32 v77, v82, 0, 1 -; GFX12-NEXT: v_bfe_i32 v75, v86, 0, 1 -; GFX12-NEXT: v_bfe_i32 v10, v7, 0, 1 -; GFX12-NEXT: v_bfe_i32 v8, v8, 0, 1 -; GFX12-NEXT: v_bfe_i32 v43, v41, 0, 1 -; GFX12-NEXT: v_bfe_i32 v41, v40, 0, 1 -; GFX12-NEXT: v_bfe_i32 v83, v83, 0, 1 -; GFX12-NEXT: v_bfe_i32 v81, v81, 0, 1 -; GFX12-NEXT: v_bfe_i32 v6, v3, 0, 1 -; GFX12-NEXT: v_bfe_i32 v4, v4, 0, 1 -; GFX12-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000 -; GFX12-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 -; GFX12-NEXT: v_bfe_i32 v39, v36, 0, 1 -; GFX12-NEXT: v_bfe_i32 v37, v37, 0, 1 -; GFX12-NEXT: v_bfe_i32 v2, v2, 0, 1 -; GFX12-NEXT: v_ashrrev_i32_e32 v24, 31, v23 -; GFX12-NEXT: v_ashrrev_i32_e32 v22, 31, v21 -; GFX12-NEXT: v_bfe_i32 v35, v30, 0, 1 -; GFX12-NEXT: v_ashrrev_i32_e32 v32, 31, v31 -; GFX12-NEXT: v_ashrrev_i32_e32 v30, 31, v29 -; GFX12-NEXT: v_ashrrev_i32_e32 v56, 31, v55 -; GFX12-NEXT: v_bfe_i32 v59, v54, 0, 1 -; GFX12-NEXT: v_ashrrev_i32_e32 v54, 31, v53 -; GFX12-NEXT: v_bfe_i32 v57, v57, 0, 1 -; GFX12-NEXT: v_bfe_i32 v63, v61, 0, 1 -; GFX12-NEXT: v_bfe_i32 v61, v60, 0, 1 -; GFX12-NEXT: v_bfe_i32 v67, v58, 0, 1 -; GFX12-NEXT: v_ashrrev_i32_e32 v92, 31, v91 -; GFX12-NEXT: v_ashrrev_i32_e32 v90, 31, v89 -; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000 -; GFX12-NEXT: v_ashrrev_i32_e32 v20, 31, v19 -; GFX12-NEXT: v_ashrrev_i32_e32 v28, 31, v27 -; GFX12-NEXT: v_ashrrev_i32_e32 v26, 31, v25 -; GFX12-NEXT: v_ashrrev_i32_e32 v52, 31, v51 -; GFX12-NEXT: v_ashrrev_i32_e32 v50, 31, v49 -; GFX12-NEXT: v_ashrrev_i32_e32 v88, 31, v87 -; GFX12-NEXT: v_ashrrev_i32_e32 v86, 31, v85 +; GFX12-NEXT: v_dual_mov_b32 v16, s99 :: v_dual_mov_b32 v17, s96 +; GFX12-NEXT: s_bfe_i64 s[66:67], s[66:67], 0x10000 +; GFX12-NEXT: v_dual_mov_b32 v18, s97 :: 
v_dual_mov_b32 v19, s94 +; GFX12-NEXT: v_dual_mov_b32 v20, s95 :: v_dual_mov_b32 v21, s92 +; GFX12-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x10000 +; GFX12-NEXT: v_dual_mov_b32 v22, s93 :: v_dual_mov_b32 v23, s88 +; GFX12-NEXT: v_mov_b32_e32 v24, s89 +; GFX12-NEXT: s_clause 0x5 +; GFX12-NEXT: global_store_b128 v0, v[1:4], s[0:1] offset:176 +; GFX12-NEXT: global_store_b128 v0, v[5:8], s[0:1] offset:160 +; GFX12-NEXT: global_store_b128 v0, v[9:12], s[0:1] offset:144 +; GFX12-NEXT: global_store_b128 v0, v[13:16], s[0:1] offset:128 +; GFX12-NEXT: global_store_b128 v0, v[17:20], s[0:1] offset:112 +; GFX12-NEXT: global_store_b128 v0, v[21:24], s[0:1] offset:96 +; GFX12-NEXT: v_dual_mov_b32 v1, s78 :: v_dual_mov_b32 v2, s79 +; GFX12-NEXT: v_dual_mov_b32 v3, s74 :: v_dual_mov_b32 v4, s75 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v34, s19 :: v_dual_mov_b32 v17, s4 -; GFX12-NEXT: v_dual_mov_b32 v18, s5 :: v_dual_mov_b32 v1, s3 -; GFX12-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; GFX12-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; GFX12-NEXT: v_ashrrev_i32_e32 v48, 31, v47 -; GFX12-NEXT: v_ashrrev_i32_e32 v46, 31, v45 -; GFX12-NEXT: v_ashrrev_i32_e32 v78, 31, v77 -; GFX12-NEXT: v_ashrrev_i32_e32 v76, 31, v75 -; GFX12-NEXT: v_ashrrev_i32_e32 v11, 31, v10 -; GFX12-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GFX12-NEXT: v_ashrrev_i32_e32 v44, 31, v43 -; GFX12-NEXT: v_ashrrev_i32_e32 v42, 31, v41 -; GFX12-NEXT: v_ashrrev_i32_e32 v84, 31, v83 -; GFX12-NEXT: v_ashrrev_i32_e32 v82, 31, v81 -; GFX12-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; GFX12-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; GFX12-NEXT: v_ashrrev_i32_e32 v40, 31, v39 -; GFX12-NEXT: v_ashrrev_i32_e32 v38, 31, v37 -; GFX12-NEXT: v_ashrrev_i32_e32 v72, 31, v71 -; GFX12-NEXT: v_ashrrev_i32_e32 v70, 31, v69 -; GFX12-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GFX12-NEXT: v_ashrrev_i32_e32 v36, 31, v35 -; GFX12-NEXT: v_ashrrev_i32_e32 v60, 31, v59 -; GFX12-NEXT: v_ashrrev_i32_e32 v58, 31, v57 -; GFX12-NEXT: v_ashrrev_i32_e32 v64, 31, v63 -; GFX12-NEXT: v_ashrrev_i32_e32 v62, 31, v61 -; GFX12-NEXT: v_ashrrev_i32_e32 v68, 31, v67 -; GFX12-NEXT: v_dual_mov_b32 v66, s41 :: v_dual_mov_b32 v33, s18 -; GFX12-NEXT: s_clause 0xf -; GFX12-NEXT: global_store_b128 v12, v[89:92], s[0:1] offset:80 -; GFX12-NEXT: global_store_b128 v12, v[85:88], s[0:1] offset:64 -; GFX12-NEXT: global_store_b128 v12, v[75:78], s[0:1] offset:48 -; GFX12-NEXT: global_store_b128 v12, v[81:84], s[0:1] offset:32 -; GFX12-NEXT: global_store_b128 v12, v[69:72], s[0:1] offset:16 -; GFX12-NEXT: global_store_b128 v12, v[65:68], s[0:1] -; GFX12-NEXT: global_store_b128 v12, v[61:64], s[0:1] offset:368 -; GFX12-NEXT: global_store_b128 v12, v[57:60], s[0:1] offset:352 -; GFX12-NEXT: global_store_b128 v12, v[53:56], s[0:1] offset:336 -; GFX12-NEXT: global_store_b128 v12, v[49:52], s[0:1] offset:320 -; GFX12-NEXT: global_store_b128 v12, v[45:48], s[0:1] offset:304 -; GFX12-NEXT: global_store_b128 v12, v[41:44], s[0:1] offset:288 -; GFX12-NEXT: global_store_b128 v12, v[37:40], s[0:1] offset:272 -; GFX12-NEXT: global_store_b128 v12, v[33:36], s[0:1] offset:256 -; GFX12-NEXT: global_store_b128 v12, v[29:32], s[0:1] offset:496 -; GFX12-NEXT: global_store_b128 v12, v[25:28], s[0:1] offset:480 -; GFX12-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-NEXT: v_mov_b32_e32 v5, s66 +; GFX12-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 +; GFX12-NEXT: v_dual_mov_b32 v6, s67 :: v_dual_mov_b32 v7, s62 +; GFX12-NEXT: v_dual_mov_b32 v8, s63 :: v_dual_mov_b32 v9, s56 +; GFX12-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 +; GFX12-NEXT: 
v_dual_mov_b32 v10, s57 :: v_dual_mov_b32 v11, s52 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_dual_mov_b32 v12, s53 :: v_dual_mov_b32 v13, s44 +; GFX12-NEXT: v_dual_mov_b32 v14, s45 :: v_dual_mov_b32 v15, s40 +; GFX12-NEXT: v_dual_mov_b32 v16, s41 :: v_dual_mov_b32 v17, s30 +; GFX12-NEXT: v_dual_mov_b32 v18, s31 :: v_dual_mov_b32 v19, s26 +; GFX12-NEXT: v_dual_mov_b32 v20, s27 :: v_dual_mov_b32 v21, s18 +; GFX12-NEXT: v_dual_mov_b32 v22, s19 :: v_dual_mov_b32 v23, s12 +; GFX12-NEXT: v_mov_b32_e32 v24, s13 ; GFX12-NEXT: s_clause 0x5 -; GFX12-NEXT: global_store_b128 v12, v[21:24], s[0:1] offset:464 -; GFX12-NEXT: global_store_b128 v12, v[17:20], s[0:1] offset:448 -; GFX12-NEXT: global_store_b128 v12, v[13:16], s[0:1] offset:240 -; GFX12-NEXT: global_store_b128 v12, v[8:11], s[0:1] offset:224 -; GFX12-NEXT: global_store_b128 v12, v[4:7], s[0:1] offset:208 -; GFX12-NEXT: global_store_b128 v12, v[0:3], s[0:1] offset:192 +; GFX12-NEXT: global_store_b128 v0, v[1:4], s[0:1] offset:80 +; GFX12-NEXT: global_store_b128 v0, v[5:8], s[0:1] offset:64 +; GFX12-NEXT: global_store_b128 v0, v[9:12], s[0:1] offset:48 +; GFX12-NEXT: global_store_b128 v0, v[13:16], s[0:1] offset:32 +; GFX12-NEXT: global_store_b128 v0, v[17:20], s[0:1] offset:16 +; GFX12-NEXT: global_store_b128 v0, v[21:24], s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll index 7f26ad7009e44a..ce17c81a24dd50 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll @@ -869,6 +869,7 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i32(ptr addrspace(1) %out ; GFX8-NOHSA-LABEL: constant_zextload_v2i8_to_v2i32: ; GFX8-NOHSA: ; %bb.0: ; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, 8 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 @@ -876,7 +877,7 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i32(ptr addrspace(1) %out ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NOHSA-NEXT: s_waitcnt vmcnt(0) -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e32 v3, 8, v2 +; GFX8-NOHSA-NEXT: v_lshrrev_b32_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NOHSA-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX8-NOHSA-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NOHSA-NEXT: s_endpgm @@ -916,10 +917,10 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i32(ptr addrspace(1) %out ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v0, v2, s[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b16 v1, 8, v0 +; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v0 ; GFX12-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX12-NEXT: v_lshrrev_b32_e32 v1, 8, v1 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -976,9 +977,8 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i32(ptr addrspace(1) %out ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NOHSA-NEXT: s_waitcnt vmcnt(0) -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e32 v3, 8, v2 +; GFX8-NOHSA-NEXT: v_bfe_i32 v3, v2, 8, 8 ; GFX8-NOHSA-NEXT: v_bfe_i32 v2, v2, 0, 8 -; GFX8-NOHSA-NEXT: v_bfe_i32 v3, v3, 0, 8 ; 
GFX8-NOHSA-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NOHSA-NEXT: s_endpgm ; @@ -1012,14 +1012,14 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i32(ptr addrspace(1) %out ; GFX12-LABEL: constant_sextload_v2i8_to_v2i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_u16 v0, v2, s[2:3] -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b16 v1, 8, v0 -; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 8 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 8 +; GFX12-NEXT: s_load_u16 s2, s[2:3], 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_sext_i32_i8 s3, s2 +; GFX12-NEXT: s_bfe_i32 s2, s2, 0x80008 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-NEXT: v_mov_b32_e32 v0, s3 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1076,11 +1076,12 @@ define amdgpu_kernel void @constant_zextload_v3i8_to_v3i32(ptr addrspace(1) %out ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s1 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NOHSA-NEXT: s_and_b32 s0, s2, 0xff -; GFX8-NOHSA-NEXT: s_bfe_u32 s1, s2, 0x80010 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v1, 8, s2 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s1 +; GFX8-NOHSA-NEXT: s_bfe_u32 s0, s2, 0x80008 +; GFX8-NOHSA-NEXT: s_and_b32 s1, s2, 0xff +; GFX8-NOHSA-NEXT: s_bfe_u32 s2, s2, 0x80010 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s1 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NOHSA-NEXT: flat_store_dwordx3 v[3:4], v[0:2] ; GFX8-NOHSA-NEXT: s_endpgm ; @@ -1116,12 +1117,11 @@ define amdgpu_kernel void @constant_zextload_v3i8_to_v3i32(ptr addrspace(1) %out ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_lshrrev_b16 v1, 8, s2 -; GFX12-NEXT: s_and_b32 s3, s2, 0xff +; GFX12-NEXT: s_bfe_u32 s3, s2, 0x80008 +; GFX12-NEXT: s_and_b32 s4, s2, 0xff ; GFX12-NEXT: s_bfe_u32 s2, s2, 0x80010 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s3 -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_and_b32 v1, 0xffff, v1 +; GFX12-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s4 +; GFX12-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v2, s2 ; GFX12-NEXT: global_store_b96 v3, v[0:2], s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1179,11 +1179,11 @@ define amdgpu_kernel void @constant_sextload_v3i8_to_v3i32(ptr addrspace(1) %out ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s1 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v0, 8, s2 ; GFX8-NOHSA-NEXT: s_bfe_i32 s0, s2, 0x80010 -; GFX8-NOHSA-NEXT: s_sext_i32_i8 s1, s2 -; GFX8-NOHSA-NEXT: v_bfe_i32 v1, v0, 0, 8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s1 +; GFX8-NOHSA-NEXT: s_bfe_i32 s1, s2, 0x80008 +; GFX8-NOHSA-NEXT: s_sext_i32_i8 s2, s2 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NOHSA-NEXT: flat_store_dwordx3 v[3:4], v[0:2] ; GFX8-NOHSA-NEXT: s_endpgm @@ -1220,13 +1220,11 @@ define amdgpu_kernel void @constant_sextload_v3i8_to_v3i32(ptr addrspace(1) %out ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: 
s_wait_kmcnt 0x0 -; GFX12-NEXT: v_lshrrev_b16 v1, 8, s2 -; GFX12-NEXT: s_sext_i32_i8 s3, s2 -; GFX12-NEXT: s_bfe_i32 s2, s2, 0x80010 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s3 -; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 8 -; GFX12-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-NEXT: s_bfe_i32 s3, s2, 0x80010 +; GFX12-NEXT: s_sext_i32_i8 s4, s2 +; GFX12-NEXT: s_bfe_i32 s2, s2, 0x80008 +; GFX12-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s4 +; GFX12-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s3 ; GFX12-NEXT: global_store_b96 v3, v[0:2], s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1287,10 +1285,11 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i32(ptr addrspace(1) %out ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_lshr_b32 s0, s2, 24 -; GFX8-NOHSA-NEXT: s_and_b32 s1, s2, 0xff -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v1, 8, s2 +; GFX8-NOHSA-NEXT: s_bfe_u32 s1, s2, 0x80008 +; GFX8-NOHSA-NEXT: s_and_b32 s3, s2, 0xff ; GFX8-NOHSA-NEXT: s_bfe_u32 s2, s2, 0x80010 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s1 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -1326,14 +1325,12 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i32(ptr addrspace(1) %out ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_lshrrev_b16 v1, 8, s2 ; GFX12-NEXT: s_lshr_b32 s3, s2, 24 -; GFX12-NEXT: s_and_b32 s4, s2, 0xff +; GFX12-NEXT: s_bfe_u32 s4, s2, 0x80008 +; GFX12-NEXT: s_and_b32 s5, s2, 0xff ; GFX12-NEXT: s_bfe_u32 s2, s2, 0x80010 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_and_b32 v1, 0xffff, v1 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s4 +; GFX12-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: v_mov_b32_e32 v2, s2 ; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX12-NEXT: s_nop 0 @@ -1393,12 +1390,12 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i32(ptr addrspace(1) %out ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v0, 8, s2 ; GFX8-NOHSA-NEXT: s_ashr_i32 s0, s2, 24 ; GFX8-NOHSA-NEXT: s_bfe_i32 s1, s2, 0x80010 +; GFX8-NOHSA-NEXT: s_bfe_i32 s3, s2, 0x80008 ; GFX8-NOHSA-NEXT: s_sext_i32_i8 s2, s2 -; GFX8-NOHSA-NEXT: v_bfe_i32 v1, v0, 0, 8 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s1 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -1436,15 +1433,14 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i32(ptr addrspace(1) %out ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_lshrrev_b16 v1, 8, s2 ; GFX12-NEXT: s_ashr_i32 s3, s2, 24 -; GFX12-NEXT: s_sext_i32_i8 s4, s2 -; GFX12-NEXT: s_bfe_i32 s2, s2, 0x80010 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: v_mov_b32_e32 v0, s4 -; GFX12-NEXT: v_mov_b32_e32 v2, s2 -; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 8 +; GFX12-NEXT: s_bfe_i32 s4, s2, 
0x80010 +; GFX12-NEXT: s_sext_i32_i8 s5, s2 +; GFX12-NEXT: s_bfe_i32 s2, s2, 0x80008 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: v_mov_b32_e32 v2, s4 ; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1523,26 +1519,28 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i32(ptr addrspace(1) %out ; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s1 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NOHSA-NEXT: s_lshr_b32 s4, s3, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s5, s2, 24 -; GFX8-NOHSA-NEXT: s_and_b32 s6, s3, 0xff -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v1, 8, s3 +; GFX8-NOHSA-NEXT: s_lshr_b32 s4, s2, 24 +; GFX8-NOHSA-NEXT: s_bfe_u32 s5, s2, 0x80008 +; GFX8-NOHSA-NEXT: s_lshr_b32 s6, s3, 24 +; GFX8-NOHSA-NEXT: s_bfe_u32 s7, s3, 0x80008 +; GFX8-NOHSA-NEXT: s_and_b32 s8, s2, 0xff +; GFX8-NOHSA-NEXT: s_bfe_u32 s9, s2, 0x80010 +; GFX8-NOHSA-NEXT: s_and_b32 s2, s3, 0xff ; GFX8-NOHSA-NEXT: s_bfe_u32 s3, s3, 0x80010 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v3, 8, s2 -; GFX8-NOHSA-NEXT: s_and_b32 s7, s2, 0xff -; GFX8-NOHSA-NEXT: s_bfe_u32 s2, s2, 0x80010 -; GFX8-NOHSA-NEXT: s_add_u32 s0, s0, 16 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s7 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s3 +; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5 -; GFX8-NOHSA-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[6:7], v[2:5] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s9 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s4 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -1589,23 +1587,22 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i32(ptr addrspace(1) %out ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_lshrrev_b16 v1, 8, s2 -; GFX12-NEXT: v_lshrrev_b16 v5, 8, s3 -; GFX12-NEXT: s_lshr_b32 s5, s2, 24 -; GFX12-NEXT: s_and_b32 s7, s2, 0xff -; GFX12-NEXT: s_bfe_u32 s2, s2, 0x80010 -; GFX12-NEXT: s_lshr_b32 s4, s3, 24 -; GFX12-NEXT: s_and_b32 s6, s3, 0xff +; GFX12-NEXT: s_lshr_b32 s6, s3, 24 +; GFX12-NEXT: s_bfe_u32 s7, s3, 0x80008 +; GFX12-NEXT: s_and_b32 s9, s3, 0xff ; GFX12-NEXT: s_bfe_u32 s3, s3, 0x80010 -; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v3, s5 -; GFX12-NEXT: v_dual_mov_b32 v0, s7 :: v_dual_mov_b32 v7, s4 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_and_b32 v1, 0xffff, v1 -; GFX12-NEXT: v_dual_mov_b32 v4, s6 :: v_dual_and_b32 v5, 0xffff, v5 -; GFX12-NEXT: v_mov_b32_e32 v6, s3 +; GFX12-NEXT: s_lshr_b32 s4, s2, 24 +; GFX12-NEXT: s_bfe_u32 s5, s2, 0x80008 +; GFX12-NEXT: s_and_b32 s8, s2, 0xff +; GFX12-NEXT: s_bfe_u32 s2, s2, 
0x80010 +; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: v_dual_mov_b32 v0, s9 :: v_dual_mov_b32 v3, s6 +; GFX12-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v5, s5 +; GFX12-NEXT: v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v7, s4 +; GFX12-NEXT: v_mov_b32_e32 v6, s2 ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1] -; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16 +; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16 +; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -1683,27 +1680,27 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i32(ptr addrspace(1) %out ; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s1 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NOHSA-NEXT: s_ashr_i32 s4, s3, 24 -; GFX8-NOHSA-NEXT: s_bfe_i32 s5, s3, 0x80010 -; GFX8-NOHSA-NEXT: s_ashr_i32 s6, s2, 24 -; GFX8-NOHSA-NEXT: s_bfe_i32 s7, s2, 0x80010 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v0, 8, s2 -; GFX8-NOHSA-NEXT: s_sext_i32_i8 s2, s2 -; GFX8-NOHSA-NEXT: s_add_u32 s0, s0, 16 -; GFX8-NOHSA-NEXT: v_bfe_i32 v3, v0, 0, 8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s7 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NOHSA-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v1, 8, s3 +; GFX8-NOHSA-NEXT: s_ashr_i32 s4, s2, 24 +; GFX8-NOHSA-NEXT: s_bfe_i32 s5, s2, 0x80010 +; GFX8-NOHSA-NEXT: s_bfe_i32 s6, s2, 0x80008 +; GFX8-NOHSA-NEXT: s_sext_i32_i8 s7, s2 +; GFX8-NOHSA-NEXT: s_ashr_i32 s2, s3, 24 +; GFX8-NOHSA-NEXT: s_bfe_i32 s8, s3, 0x80010 +; GFX8-NOHSA-NEXT: s_bfe_i32 s9, s3, 0x80008 ; GFX8-NOHSA-NEXT: s_sext_i32_i8 s3, s3 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[6:7], v[2:5] -; GFX8-NOHSA-NEXT: v_bfe_i32 v1, v1, 0, 8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s3 +; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s9 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s8 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s7 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s6 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s5 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s4 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 @@ -1754,25 +1751,22 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i32(ptr addrspace(1) %out ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_lshrrev_b16 v1, 8, s2 -; GFX12-NEXT: v_lshrrev_b16 v5, 8, s3 -; GFX12-NEXT: s_ashr_i32 s6, s2, 24 -; GFX12-NEXT: s_sext_i32_i8 s7, s2 -; GFX12-NEXT: s_bfe_i32 s2, s2, 0x80010 -; GFX12-NEXT: s_ashr_i32 s4, s3, 24 -; GFX12-NEXT: s_bfe_i32 s5, s3, 0x80010 -; GFX12-NEXT: s_sext_i32_i8 s3, s3 -; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v3, s6 -; GFX12-NEXT: v_dual_mov_b32 v0, s7 :: v_dual_mov_b32 v7, s4 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v2, s2 -; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 8 -; GFX12-NEXT: v_mov_b32_e32 v4, s3 +; GFX12-NEXT: s_ashr_i32 s7, s3, 24 +; GFX12-NEXT: s_bfe_i32 s8, s3, 0x80010 +; 
GFX12-NEXT: s_sext_i32_i8 s9, s3 +; GFX12-NEXT: s_bfe_i32 s3, s3, 0x80008 +; GFX12-NEXT: s_ashr_i32 s4, s2, 24 +; GFX12-NEXT: s_bfe_i32 s5, s2, 0x80010 +; GFX12-NEXT: s_bfe_i32 s6, s2, 0x80008 +; GFX12-NEXT: s_sext_i32_i8 s2, s2 +; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_dual_mov_b32 v0, s9 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: v_dual_mov_b32 v2, s8 :: v_dual_mov_b32 v5, s6 +; GFX12-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v7, s4 ; GFX12-NEXT: v_mov_b32_e32 v6, s5 -; GFX12-NEXT: v_bfe_i32 v5, v5, 0, 8 ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1] -; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16 +; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16 +; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -1898,47 +1892,51 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i32(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_lshr_b32 s8, s4, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s9, s5, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s10, s6, 24 +; GFX8-NOHSA-NEXT: s_bfe_u32 s9, s4, 0x80008 +; GFX8-NOHSA-NEXT: s_lshr_b32 s10, s5, 24 +; GFX8-NOHSA-NEXT: s_bfe_u32 s11, s5, 0x80008 +; GFX8-NOHSA-NEXT: s_lshr_b32 s12, s6, 24 +; GFX8-NOHSA-NEXT: s_bfe_u32 s13, s6, 0x80008 ; GFX8-NOHSA-NEXT: s_lshr_b32 s2, s7, 24 -; GFX8-NOHSA-NEXT: s_and_b32 s11, s4, 0xff -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v1, 8, s4 +; GFX8-NOHSA-NEXT: s_bfe_u32 s3, s7, 0x80008 +; GFX8-NOHSA-NEXT: s_and_b32 s14, s4, 0xff ; GFX8-NOHSA-NEXT: s_bfe_u32 s4, s4, 0x80010 -; GFX8-NOHSA-NEXT: s_and_b32 s12, s5, 0xff -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v3, 8, s5 +; GFX8-NOHSA-NEXT: s_and_b32 s15, s5, 0xff ; GFX8-NOHSA-NEXT: s_bfe_u32 s5, s5, 0x80010 -; GFX8-NOHSA-NEXT: s_and_b32 s13, s6, 0xff -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v5, 8, s6 +; GFX8-NOHSA-NEXT: s_and_b32 s16, s6, 0xff ; GFX8-NOHSA-NEXT: s_bfe_u32 s6, s6, 0x80010 -; GFX8-NOHSA-NEXT: s_and_b32 s3, s7, 0xff -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v7, 8, s7 +; GFX8-NOHSA-NEXT: s_and_b32 s17, s7, 0xff ; GFX8-NOHSA-NEXT: s_bfe_u32 s7, s7, 0x80010 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s2 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 48 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s3 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s2 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 32 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s7 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s17 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s7 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[10:11], v[6:9] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s13 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s3 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s2 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s6 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s10 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s16 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s13 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s12 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NOHSA-NEXT: 
flat_store_dwordx4 v[8:9], v[4:7] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s12 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s3 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s5 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s9 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s2 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[6:7], v[2:5] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s11 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s15 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s11 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s5 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s10 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s14 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s9 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s8 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 @@ -2007,32 +2005,30 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i32(ptr addrspace(1) %o ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_lshrrev_b16 v1, 8, s7 -; GFX12-NEXT: v_lshrrev_b16 v5, 8, s6 -; GFX12-NEXT: v_lshrrev_b16 v9, 8, s5 -; GFX12-NEXT: s_lshr_b32 s8, s6, 24 -; GFX12-NEXT: s_lshr_b32 s9, s7, 24 -; GFX12-NEXT: v_lshrrev_b16 v13, 8, s4 -; GFX12-NEXT: s_and_b32 s12, s6, 0xff -; GFX12-NEXT: s_bfe_u32 s6, s6, 0x80010 -; GFX12-NEXT: s_and_b32 s13, s7, 0xff +; GFX12-NEXT: s_lshr_b32 s12, s7, 24 +; GFX12-NEXT: s_bfe_u32 s13, s7, 0x80008 +; GFX12-NEXT: s_and_b32 s17, s7, 0xff ; GFX12-NEXT: s_bfe_u32 s7, s7, 0x80010 -; GFX12-NEXT: s_and_b32 s11, s5, 0xff -; GFX12-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v3, s9 -; GFX12-NEXT: s_lshr_b32 s3, s5, 24 +; GFX12-NEXT: s_lshr_b32 s10, s6, 24 +; GFX12-NEXT: s_bfe_u32 s11, s6, 0x80008 +; GFX12-NEXT: s_and_b32 s16, s6, 0xff +; GFX12-NEXT: s_bfe_u32 s6, s6, 0x80010 +; GFX12-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v1, s13 +; GFX12-NEXT: s_lshr_b32 s8, s5, 24 +; GFX12-NEXT: s_bfe_u32 s9, s5, 0x80008 +; GFX12-NEXT: s_and_b32 s15, s5, 0xff ; GFX12-NEXT: s_bfe_u32 s5, s5, 0x80010 -; GFX12-NEXT: v_dual_mov_b32 v0, s13 :: v_dual_mov_b32 v7, s8 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v11, s3 -; GFX12-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_and_b32 v1, 0xffff, v1 +; GFX12-NEXT: v_dual_mov_b32 v0, s17 :: v_dual_mov_b32 v3, s12 +; GFX12-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v5, s11 ; GFX12-NEXT: s_lshr_b32 s2, s4, 24 -; GFX12-NEXT: s_and_b32 s10, s4, 0xff +; GFX12-NEXT: s_bfe_u32 s3, s4, 0x80008 +; GFX12-NEXT: s_and_b32 s14, s4, 0xff ; GFX12-NEXT: s_bfe_u32 s4, s4, 0x80010 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v15, s2 -; GFX12-NEXT: v_dual_mov_b32 v8, s11 :: v_dual_and_b32 v5, 0xffff, v5 -; GFX12-NEXT: v_dual_mov_b32 v10, s5 :: v_dual_and_b32 v9, 0xffff, v9 -; GFX12-NEXT: v_dual_mov_b32 v12, s10 :: v_dual_and_b32 v13, 0xffff, v13 +; GFX12-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s10 +; GFX12-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v9, s9 +; GFX12-NEXT: v_dual_mov_b32 v8, s15 :: v_dual_mov_b32 v11, s8 +; GFX12-NEXT: v_dual_mov_b32 v10, s5 :: v_dual_mov_b32 v13, s3 +; GFX12-NEXT: v_dual_mov_b32 v12, s14 :: v_dual_mov_b32 v15, s2 ; GFX12-NEXT: v_mov_b32_e32 v14, s4 ; GFX12-NEXT: s_clause 0x3 ; GFX12-NEXT: global_store_b128 v16, v[0:3], s[0:1] offset:48 @@ -2165,50 +2161,50 @@ define amdgpu_kernel void 
@constant_sextload_v16i8_to_v16i32(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_ashr_i32 s8, s4, 24 ; GFX8-NOHSA-NEXT: s_bfe_i32 s9, s4, 0x80010 -; GFX8-NOHSA-NEXT: s_ashr_i32 s10, s5, 24 -; GFX8-NOHSA-NEXT: s_bfe_i32 s11, s5, 0x80010 -; GFX8-NOHSA-NEXT: s_ashr_i32 s12, s6, 24 -; GFX8-NOHSA-NEXT: s_bfe_i32 s13, s6, 0x80010 +; GFX8-NOHSA-NEXT: s_bfe_i32 s10, s4, 0x80008 +; GFX8-NOHSA-NEXT: s_ashr_i32 s11, s5, 24 +; GFX8-NOHSA-NEXT: s_bfe_i32 s12, s5, 0x80010 +; GFX8-NOHSA-NEXT: s_bfe_i32 s13, s5, 0x80008 +; GFX8-NOHSA-NEXT: s_ashr_i32 s14, s6, 24 +; GFX8-NOHSA-NEXT: s_bfe_i32 s15, s6, 0x80010 +; GFX8-NOHSA-NEXT: s_bfe_i32 s16, s6, 0x80008 ; GFX8-NOHSA-NEXT: s_ashr_i32 s2, s7, 24 ; GFX8-NOHSA-NEXT: s_bfe_i32 s3, s7, 0x80010 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s2 +; GFX8-NOHSA-NEXT: s_bfe_i32 s17, s7, 0x80008 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 48 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s3 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s3 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v0, 8, s7 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: s_sext_i32_i8 s7, s7 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s2 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 32 -; GFX8-NOHSA-NEXT: v_bfe_i32 v7, v0, 0, 8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s7 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s7 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s17 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[10:11], v[6:9] -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v2, 8, s6 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s3 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: s_sext_i32_i8 s6, s6 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s2 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16 -; GFX8-NOHSA-NEXT: v_bfe_i32 v5, v2, 0, 8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s13 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s12 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s16 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s15 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s14 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v3, 8, s5 ; GFX8-NOHSA-NEXT: s_sext_i32_i8 s5, s5 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] -; GFX8-NOHSA-NEXT: v_bfe_i32 v3, v3, 0, 8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s3 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s5 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s11 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s10 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s2 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v1, 8, s4 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s13 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s12 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s11 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: s_sext_i32_i8 s4, s4 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[6:7], v[2:5] -; GFX8-NOHSA-NEXT: v_bfe_i32 v1, v1, 0, 8 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s10 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s9 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s8 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 @@ -2285,37 +2281,31 @@ define 
amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(ptr addrspace(1) %o ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_lshrrev_b16 v1, 8, s7 -; GFX12-NEXT: v_lshrrev_b16 v5, 8, s6 -; GFX12-NEXT: v_lshrrev_b16 v9, 8, s5 -; GFX12-NEXT: v_lshrrev_b16 v13, 8, s4 -; GFX12-NEXT: s_ashr_i32 s12, s7, 24 -; GFX12-NEXT: s_sext_i32_i8 s13, s7 -; GFX12-NEXT: s_bfe_i32 s7, s7, 0x80010 -; GFX12-NEXT: s_ashr_i32 s10, s6, 24 -; GFX12-NEXT: s_bfe_i32 s11, s6, 0x80010 +; GFX12-NEXT: s_ashr_i32 s15, s7, 24 +; GFX12-NEXT: s_bfe_i32 s16, s7, 0x80010 +; GFX12-NEXT: s_sext_i32_i8 s17, s7 +; GFX12-NEXT: s_bfe_i32 s7, s7, 0x80008 +; GFX12-NEXT: s_ashr_i32 s12, s6, 24 +; GFX12-NEXT: s_bfe_i32 s13, s6, 0x80010 +; GFX12-NEXT: s_bfe_i32 s14, s6, 0x80008 ; GFX12-NEXT: s_sext_i32_i8 s6, s6 -; GFX12-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v3, s12 -; GFX12-NEXT: s_ashr_i32 s8, s5, 24 -; GFX12-NEXT: s_bfe_i32 s9, s5, 0x80010 +; GFX12-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: s_ashr_i32 s9, s5, 24 +; GFX12-NEXT: s_bfe_i32 s10, s5, 0x80010 +; GFX12-NEXT: s_bfe_i32 s11, s5, 0x80008 ; GFX12-NEXT: s_sext_i32_i8 s5, s5 -; GFX12-NEXT: v_dual_mov_b32 v0, s13 :: v_dual_mov_b32 v7, s10 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v11, s8 -; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 8 +; GFX12-NEXT: v_dual_mov_b32 v0, s17 :: v_dual_mov_b32 v3, s15 +; GFX12-NEXT: v_dual_mov_b32 v2, s16 :: v_dual_mov_b32 v5, s14 ; GFX12-NEXT: s_ashr_i32 s2, s4, 24 ; GFX12-NEXT: s_bfe_i32 s3, s4, 0x80010 +; GFX12-NEXT: s_bfe_i32 s8, s4, 0x80008 ; GFX12-NEXT: s_sext_i32_i8 s4, s4 -; GFX12-NEXT: v_dual_mov_b32 v4, s6 :: v_dual_mov_b32 v15, s2 -; GFX12-NEXT: v_mov_b32_e32 v6, s11 -; GFX12-NEXT: v_bfe_i32 v5, v5, 0, 8 -; GFX12-NEXT: v_mov_b32_e32 v8, s5 -; GFX12-NEXT: v_mov_b32_e32 v10, s9 -; GFX12-NEXT: v_bfe_i32 v9, v9, 0, 8 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v12, s4 +; GFX12-NEXT: v_dual_mov_b32 v4, s6 :: v_dual_mov_b32 v7, s12 +; GFX12-NEXT: v_dual_mov_b32 v6, s13 :: v_dual_mov_b32 v9, s11 +; GFX12-NEXT: v_dual_mov_b32 v8, s5 :: v_dual_mov_b32 v11, s9 +; GFX12-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v13, s8 +; GFX12-NEXT: v_dual_mov_b32 v12, s4 :: v_dual_mov_b32 v15, s2 ; GFX12-NEXT: v_mov_b32_e32 v14, s3 -; GFX12-NEXT: v_bfe_i32 v13, v13, 0, 8 ; GFX12-NEXT: s_clause 0x3 ; GFX12-NEXT: global_store_b128 v16, v[0:3], s[0:1] offset:48 ; GFX12-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:32 @@ -2533,139 +2523,147 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(ptr addrspace(1) %o ; ; GFX8-NOHSA-LABEL: constant_zextload_v32i8_to_v32i32: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 +; GFX8-NOHSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NOHSA-NEXT: s_lshr_b32 s10, s0, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s11, s1, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s12, s2, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s13, s3, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s14, s4, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s15, s5, 24 +; GFX8-NOHSA-NEXT: s_lshr_b32 s12, s4, 24 +; GFX8-NOHSA-NEXT: s_bfe_u32 s13, s4, 0x80008 +; GFX8-NOHSA-NEXT: s_lshr_b32 s14, s5, 24 +; GFX8-NOHSA-NEXT: s_bfe_u32 s15, s5, 0x80008 ; GFX8-NOHSA-NEXT: s_lshr_b32 s16, s6, 24 -; GFX8-NOHSA-NEXT: 
s_lshr_b32 s17, s7, 24 -; GFX8-NOHSA-NEXT: s_and_b32 s18, s0, 0xff -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v1, 8, s0 -; GFX8-NOHSA-NEXT: s_bfe_u32 s19, s0, 0x80010 -; GFX8-NOHSA-NEXT: s_and_b32 s20, s1, 0xff -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v3, 8, s1 -; GFX8-NOHSA-NEXT: s_bfe_u32 s21, s1, 0x80010 -; GFX8-NOHSA-NEXT: s_and_b32 s22, s2, 0xff -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v5, 8, s2 -; GFX8-NOHSA-NEXT: s_bfe_u32 s2, s2, 0x80010 -; GFX8-NOHSA-NEXT: s_and_b32 s23, s3, 0xff -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v7, 8, s3 -; GFX8-NOHSA-NEXT: s_bfe_u32 s3, s3, 0x80010 -; GFX8-NOHSA-NEXT: s_and_b32 s24, s4, 0xff -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v9, 8, s4 +; GFX8-NOHSA-NEXT: s_bfe_u32 s17, s6, 0x80008 +; GFX8-NOHSA-NEXT: s_lshr_b32 s18, s7, 24 +; GFX8-NOHSA-NEXT: s_bfe_u32 s19, s7, 0x80008 +; GFX8-NOHSA-NEXT: s_lshr_b32 s20, s8, 24 +; GFX8-NOHSA-NEXT: s_bfe_u32 s21, s8, 0x80008 +; GFX8-NOHSA-NEXT: s_lshr_b32 s22, s9, 24 +; GFX8-NOHSA-NEXT: s_bfe_u32 s23, s9, 0x80008 +; GFX8-NOHSA-NEXT: s_lshr_b32 s24, s10, 24 +; GFX8-NOHSA-NEXT: s_bfe_u32 s25, s10, 0x80008 +; GFX8-NOHSA-NEXT: s_lshr_b32 s2, s11, 24 +; GFX8-NOHSA-NEXT: s_bfe_u32 s3, s11, 0x80008 +; GFX8-NOHSA-NEXT: s_and_b32 s26, s4, 0xff ; GFX8-NOHSA-NEXT: s_bfe_u32 s4, s4, 0x80010 -; GFX8-NOHSA-NEXT: s_and_b32 s25, s5, 0xff -; GFX8-NOHSA-NEXT: s_bfe_u32 s26, s5, 0x80010 -; GFX8-NOHSA-NEXT: s_and_b32 s27, s6, 0xff -; GFX8-NOHSA-NEXT: s_bfe_u32 s28, s6, 0x80010 -; GFX8-NOHSA-NEXT: s_and_b32 s0, s7, 0xff -; GFX8-NOHSA-NEXT: s_bfe_u32 s1, s7, 0x80010 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s0 -; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 0x70 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s1 -; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s1 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s0 -; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 0x60 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v11, 8, s7 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s17 -; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s1 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s0 -; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 0x50 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v11, 8, s6 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s27 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s28 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s16 -; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s1 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s0 -; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 64 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v11, 8, s5 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s25 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s26 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s15 -; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s24 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s1 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s0 -; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 48 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s4 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s14 -; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s23 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s1 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s0 -; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 32 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s3 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s13 -; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[10:11], v[6:9] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, 
s22 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s1 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s0 -; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 16 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s2 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s12 -; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s20 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s1 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s21 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s11 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[6:7], v[2:5] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s18 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s19 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s10 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s9 +; GFX8-NOHSA-NEXT: s_and_b32 s27, s5, 0xff +; GFX8-NOHSA-NEXT: s_bfe_u32 s5, s5, 0x80010 +; GFX8-NOHSA-NEXT: s_and_b32 s28, s6, 0xff +; GFX8-NOHSA-NEXT: s_bfe_u32 s6, s6, 0x80010 +; GFX8-NOHSA-NEXT: s_and_b32 s29, s7, 0xff +; GFX8-NOHSA-NEXT: s_bfe_u32 s7, s7, 0x80010 +; GFX8-NOHSA-NEXT: s_and_b32 s30, s8, 0xff +; GFX8-NOHSA-NEXT: s_bfe_u32 s8, s8, 0x80010 +; GFX8-NOHSA-NEXT: s_and_b32 s31, s9, 0xff +; GFX8-NOHSA-NEXT: s_bfe_u32 s9, s9, 0x80010 +; GFX8-NOHSA-NEXT: s_and_b32 s33, s10, 0xff +; GFX8-NOHSA-NEXT: s_bfe_u32 s10, s10, 0x80010 +; GFX8-NOHSA-NEXT: s_and_b32 s34, s11, 0xff +; GFX8-NOHSA-NEXT: s_bfe_u32 s11, s11, 0x80010 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 0x70 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 0x60 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s34 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s11 +; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: s_endpgm -; -; EG-LABEL: constant_zextload_v32i8_to_v32i32: -; EG: ; %bb.0: -; EG-NEXT: ALU 0, @16, KC0[CB0:0-32], KC1[] -; EG-NEXT: TEX 1 @12 -; EG-NEXT: ALU 75, @17, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T24.XYZW, T26.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T25.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T23.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T12.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T17.XYZW, T20.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T15.XYZW, T18.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T16.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T13.XYZW, T11.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: Fetch clause starting at 12: -; EG-NEXT: VTX_READ_128 T12.XYZW, T11.X, 16, #1 -; EG-NEXT: VTX_READ_128 T11.XYZW, T11.X, 0, #1 -; EG-NEXT: ALU clause starting at 16: -; EG-NEXT: MOV * T11.X, KC0[2].Z, -; EG-NEXT: ALU clause starting at 17: -; EG-NEXT: MOV * T0.W, literal.x, -; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT * T13.Z, T11.X, literal.x, PV.W, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T13.Y, T11.X, literal.x, T0.W, -; EG-NEXT: BFE_UINT T14.Z, T11.Y, literal.y, T0.W, -; EG-NEXT: LSHR * T13.W, T11.X, literal.z, -; EG-NEXT: 8(1.121039e-44), 16(2.242078e-44) -; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T13.X, T11.X, literal.x, -; EG-NEXT: BFE_UINT T14.Y, T11.Y, literal.y, T0.W, -; EG-NEXT: LSHR * T11.X, KC0[2].Y, literal.z, -; EG-NEXT: 255(3.573311e-43), 8(1.121039e-44) +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NOHSA-NEXT: s_add_u32 
s2, s0, 0x50 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s33 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s25 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s10 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s24 +; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 64 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s31 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s23 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s9 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s22 +; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 48 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s30 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s21 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s8 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s20 +; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 32 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s29 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s19 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s7 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s18 +; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s28 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s17 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s16 +; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s27 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s15 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s5 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s14 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s26 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s13 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s12 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: s_endpgm +; +; EG-LABEL: constant_zextload_v32i8_to_v32i32: +; EG: ; %bb.0: +; EG-NEXT: ALU 0, @16, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 1 @12 +; EG-NEXT: ALU 75, @17, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T24.XYZW, T26.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T25.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T23.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T12.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T17.XYZW, T20.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T15.XYZW, T18.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T16.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T13.XYZW, T11.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: Fetch clause starting at 12: +; EG-NEXT: VTX_READ_128 T12.XYZW, T11.X, 16, #1 +; EG-NEXT: VTX_READ_128 T11.XYZW, T11.X, 0, #1 +; EG-NEXT: ALU clause starting at 16: +; EG-NEXT: MOV * T11.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 17: +; EG-NEXT: MOV * T0.W, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: BFE_UINT * T13.Z, T11.X, literal.x, PV.W, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 
+; EG-NEXT: BFE_UINT T13.Y, T11.X, literal.x, T0.W, +; EG-NEXT: BFE_UINT T14.Z, T11.Y, literal.y, T0.W, +; EG-NEXT: LSHR * T13.W, T11.X, literal.z, +; EG-NEXT: 8(1.121039e-44), 16(2.242078e-44) +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T13.X, T11.X, literal.x, +; EG-NEXT: BFE_UINT T14.Y, T11.Y, literal.y, T0.W, +; EG-NEXT: LSHR * T11.X, KC0[2].Y, literal.z, +; EG-NEXT: 255(3.573311e-43), 8(1.121039e-44) ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; EG-NEXT: BFE_UINT T15.Z, T11.Z, literal.x, T0.W, ; EG-NEXT: LSHR * T14.W, T11.Y, literal.y, @@ -2736,69 +2734,67 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(ptr addrspace(1) %o ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[4:11], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_lshrrev_b16 v1, 8, s11 -; GFX12-NEXT: v_lshrrev_b16 v5, 8, s10 -; GFX12-NEXT: v_lshrrev_b16 v9, 8, s9 -; GFX12-NEXT: v_lshrrev_b16 v10, 8, s8 -; GFX12-NEXT: v_lshrrev_b16 v11, 8, s7 -; GFX12-NEXT: s_lshr_b32 s15, s9, 24 -; GFX12-NEXT: s_lshr_b32 s17, s11, 24 -; GFX12-NEXT: v_lshrrev_b16 v12, 8, s6 -; GFX12-NEXT: s_and_b32 s23, s9, 0xff -; GFX12-NEXT: s_bfe_u32 s9, s9, 0x80010 -; GFX12-NEXT: s_and_b32 s25, s11, 0xff +; GFX12-NEXT: s_lshr_b32 s24, s11, 24 +; GFX12-NEXT: s_bfe_u32 s25, s11, 0x80008 +; GFX12-NEXT: s_and_b32 s34, s11, 0xff ; GFX12-NEXT: s_bfe_u32 s11, s11, 0x80010 -; GFX12-NEXT: s_lshr_b32 s14, s8, 24 -; GFX12-NEXT: s_lshr_b32 s16, s10, 24 -; GFX12-NEXT: v_lshrrev_b16 v14, 8, s5 -; GFX12-NEXT: s_and_b32 s22, s8, 0xff -; GFX12-NEXT: s_bfe_u32 s8, s8, 0x80010 -; GFX12-NEXT: s_and_b32 s24, s10, 0xff +; GFX12-NEXT: s_lshr_b32 s22, s10, 24 +; GFX12-NEXT: s_bfe_u32 s23, s10, 0x80008 +; GFX12-NEXT: s_and_b32 s33, s10, 0xff ; GFX12-NEXT: s_bfe_u32 s10, s10, 0x80010 -; GFX12-NEXT: v_dual_mov_b32 v32, 0 :: v_dual_mov_b32 v3, s17 -; GFX12-NEXT: s_lshr_b32 s13, s7, 24 -; GFX12-NEXT: v_lshrrev_b16 v13, 8, s4 -; GFX12-NEXT: s_and_b32 s21, s7, 0xff +; GFX12-NEXT: v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v1, s25 +; GFX12-NEXT: v_dual_mov_b32 v0, s34 :: v_dual_mov_b32 v3, s24 +; GFX12-NEXT: v_dual_mov_b32 v2, s11 :: v_dual_mov_b32 v5, s23 +; GFX12-NEXT: s_bfe_u32 s21, s9, 0x80008 +; GFX12-NEXT: v_dual_mov_b32 v4, s33 :: v_dual_mov_b32 v7, s22 +; GFX12-NEXT: v_dual_mov_b32 v6, s10 :: v_dual_mov_b32 v9, s21 +; GFX12-NEXT: s_lshr_b32 s20, s9, 24 +; GFX12-NEXT: s_and_b32 s31, s9, 0xff +; GFX12-NEXT: s_bfe_u32 s9, s9, 0x80010 +; GFX12-NEXT: s_lshr_b32 s18, s8, 24 +; GFX12-NEXT: s_bfe_u32 s19, s8, 0x80008 +; GFX12-NEXT: s_and_b32 s30, s8, 0xff +; GFX12-NEXT: s_bfe_u32 s8, s8, 0x80010 +; GFX12-NEXT: s_lshr_b32 s16, s7, 24 +; GFX12-NEXT: s_bfe_u32 s17, s7, 0x80008 +; GFX12-NEXT: s_and_b32 s29, s7, 0xff ; GFX12-NEXT: s_bfe_u32 s7, s7, 0x80010 -; GFX12-NEXT: v_dual_mov_b32 v0, s25 :: v_dual_mov_b32 v7, s16 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v2, s11 :: v_dual_and_b32 v13, 0xffff, v13 -; GFX12-NEXT: v_dual_mov_b32 v8, s23 :: v_dual_and_b32 v1, 0xffff, v1 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX12-NEXT: v_dual_mov_b32 v28, s22 :: v_dual_and_b32 v25, 0xffff, v11 -; GFX12-NEXT: v_dual_mov_b32 v30, s8 :: v_dual_and_b32 v29, 0xffff, v10 -; GFX12-NEXT: v_dual_mov_b32 v24, s21 :: v_dual_and_b32 v9, 0xffff, v9 -; GFX12-NEXT: v_dual_mov_b32 v10, s9 :: v_dual_mov_b32 v11, s15 -; GFX12-NEXT: v_mov_b32_e32 v26, s7 -; GFX12-NEXT: s_lshr_b32 s12, s6, 24 -; GFX12-NEXT: s_and_b32 s20, s6, 0xff +; GFX12-NEXT: v_dual_mov_b32 v8, s31 :: v_dual_mov_b32 v11, s20 +; GFX12-NEXT: v_mov_b32_e32 v10, s9 +; 
GFX12-NEXT: s_lshr_b32 s14, s6, 24 +; GFX12-NEXT: s_bfe_u32 s15, s6, 0x80008 +; GFX12-NEXT: s_and_b32 s28, s6, 0xff ; GFX12-NEXT: s_bfe_u32 s6, s6, 0x80010 -; GFX12-NEXT: v_dual_mov_b32 v4, s24 :: v_dual_and_b32 v17, 0xffff, v14 -; GFX12-NEXT: v_dual_mov_b32 v6, s10 :: v_dual_and_b32 v21, 0xffff, v12 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v31, s14 :: v_dual_mov_b32 v20, s20 -; GFX12-NEXT: s_lshr_b32 s3, s5, 24 -; GFX12-NEXT: s_and_b32 s19, s5, 0xff +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v24, v[0:3], s[0:1] offset:112 +; GFX12-NEXT: global_store_b128 v24, v[4:7], s[0:1] offset:96 +; GFX12-NEXT: v_dual_mov_b32 v1, s19 :: v_dual_mov_b32 v0, s30 +; GFX12-NEXT: v_dual_mov_b32 v3, s18 :: v_dual_mov_b32 v2, s8 +; GFX12-NEXT: v_mov_b32_e32 v5, s17 +; GFX12-NEXT: s_lshr_b32 s12, s5, 24 +; GFX12-NEXT: s_bfe_u32 s13, s5, 0x80008 +; GFX12-NEXT: s_and_b32 s27, s5, 0xff ; GFX12-NEXT: s_bfe_u32 s5, s5, 0x80010 -; GFX12-NEXT: v_dual_mov_b32 v27, s13 :: v_dual_mov_b32 v22, s6 +; GFX12-NEXT: v_dual_mov_b32 v4, s29 :: v_dual_mov_b32 v7, s16 +; GFX12-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v13, s15 ; GFX12-NEXT: s_lshr_b32 s2, s4, 24 -; GFX12-NEXT: s_and_b32 s18, s4, 0xff +; GFX12-NEXT: s_bfe_u32 s3, s4, 0x80008 +; GFX12-NEXT: s_and_b32 s26, s4, 0xff ; GFX12-NEXT: s_bfe_u32 s4, s4, 0x80010 -; GFX12-NEXT: v_dual_mov_b32 v23, s12 :: v_dual_mov_b32 v16, s19 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v18, s5 :: v_dual_mov_b32 v19, s3 -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v32, v[0:3], s[0:1] offset:112 -; GFX12-NEXT: global_store_b128 v32, v[4:7], s[0:1] offset:96 -; GFX12-NEXT: v_dual_mov_b32 v12, s18 :: v_dual_mov_b32 v15, s2 -; GFX12-NEXT: v_mov_b32_e32 v14, s4 +; GFX12-NEXT: v_dual_mov_b32 v12, s28 :: v_dual_mov_b32 v15, s14 +; GFX12-NEXT: v_dual_mov_b32 v14, s6 :: v_dual_mov_b32 v17, s13 +; GFX12-NEXT: v_dual_mov_b32 v16, s27 :: v_dual_mov_b32 v19, s12 +; GFX12-NEXT: v_dual_mov_b32 v18, s5 :: v_dual_mov_b32 v21, s3 +; GFX12-NEXT: v_dual_mov_b32 v20, s26 :: v_dual_mov_b32 v23, s2 +; GFX12-NEXT: v_mov_b32_e32 v22, s4 ; GFX12-NEXT: s_clause 0x5 -; GFX12-NEXT: global_store_b128 v32, v[8:11], s[0:1] offset:80 -; GFX12-NEXT: global_store_b128 v32, v[28:31], s[0:1] offset:64 -; GFX12-NEXT: global_store_b128 v32, v[24:27], s[0:1] offset:48 -; GFX12-NEXT: global_store_b128 v32, v[20:23], s[0:1] offset:32 -; GFX12-NEXT: global_store_b128 v32, v[16:19], s[0:1] offset:16 -; GFX12-NEXT: global_store_b128 v32, v[12:15], s[0:1] +; GFX12-NEXT: global_store_b128 v24, v[8:11], s[0:1] offset:80 +; GFX12-NEXT: global_store_b128 v24, v[0:3], s[0:1] offset:64 +; GFX12-NEXT: global_store_b128 v24, v[4:7], s[0:1] offset:48 +; GFX12-NEXT: global_store_b128 v24, v[12:15], s[0:1] offset:32 +; GFX12-NEXT: global_store_b128 v24, v[16:19], s[0:1] offset:16 +; GFX12-NEXT: global_store_b128 v24, v[20:23], s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -3011,111 +3007,111 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %o ; ; GFX8-NOHSA-LABEL: constant_sextload_v32i8_to_v32i32: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 +; GFX8-NOHSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v3, 8, s1 -; 
GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v1, 8, s0 -; GFX8-NOHSA-NEXT: s_ashr_i32 s10, s0, 24 -; GFX8-NOHSA-NEXT: s_bfe_i32 s11, s0, 0x80010 -; GFX8-NOHSA-NEXT: s_sext_i32_i8 s12, s0 -; GFX8-NOHSA-NEXT: s_ashr_i32 s13, s1, 24 -; GFX8-NOHSA-NEXT: s_bfe_i32 s14, s1, 0x80010 -; GFX8-NOHSA-NEXT: s_sext_i32_i8 s15, s1 -; GFX8-NOHSA-NEXT: s_ashr_i32 s16, s2, 24 -; GFX8-NOHSA-NEXT: s_bfe_i32 s17, s2, 0x80010 -; GFX8-NOHSA-NEXT: s_ashr_i32 s18, s3, 24 -; GFX8-NOHSA-NEXT: s_bfe_i32 s19, s3, 0x80010 -; GFX8-NOHSA-NEXT: s_ashr_i32 s20, s4, 24 -; GFX8-NOHSA-NEXT: s_bfe_i32 s21, s4, 0x80010 -; GFX8-NOHSA-NEXT: s_ashr_i32 s22, s5, 24 -; GFX8-NOHSA-NEXT: s_bfe_i32 s23, s5, 0x80010 -; GFX8-NOHSA-NEXT: s_ashr_i32 s24, s6, 24 -; GFX8-NOHSA-NEXT: s_bfe_i32 s25, s6, 0x80010 -; GFX8-NOHSA-NEXT: s_ashr_i32 s0, s7, 24 -; GFX8-NOHSA-NEXT: s_bfe_i32 s1, s7, 0x80010 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s0 -; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 0x70 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s1 -; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s1 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v0, 8, s7 +; GFX8-NOHSA-NEXT: s_ashr_i32 s12, s4, 24 +; GFX8-NOHSA-NEXT: s_bfe_i32 s13, s4, 0x80010 +; GFX8-NOHSA-NEXT: s_bfe_i32 s14, s4, 0x80008 +; GFX8-NOHSA-NEXT: s_ashr_i32 s15, s5, 24 +; GFX8-NOHSA-NEXT: s_bfe_i32 s16, s5, 0x80010 +; GFX8-NOHSA-NEXT: s_bfe_i32 s17, s5, 0x80008 +; GFX8-NOHSA-NEXT: s_ashr_i32 s18, s6, 24 +; GFX8-NOHSA-NEXT: s_bfe_i32 s19, s6, 0x80010 +; GFX8-NOHSA-NEXT: s_bfe_i32 s20, s6, 0x80008 +; GFX8-NOHSA-NEXT: s_ashr_i32 s21, s7, 24 +; GFX8-NOHSA-NEXT: s_bfe_i32 s22, s7, 0x80010 +; GFX8-NOHSA-NEXT: s_bfe_i32 s23, s7, 0x80008 +; GFX8-NOHSA-NEXT: s_ashr_i32 s24, s8, 24 +; GFX8-NOHSA-NEXT: s_bfe_i32 s25, s8, 0x80010 +; GFX8-NOHSA-NEXT: s_bfe_i32 s26, s8, 0x80008 +; GFX8-NOHSA-NEXT: s_ashr_i32 s27, s9, 24 +; GFX8-NOHSA-NEXT: s_bfe_i32 s28, s9, 0x80010 +; GFX8-NOHSA-NEXT: s_bfe_i32 s29, s9, 0x80008 +; GFX8-NOHSA-NEXT: s_ashr_i32 s30, s10, 24 +; GFX8-NOHSA-NEXT: s_bfe_i32 s31, s10, 0x80010 +; GFX8-NOHSA-NEXT: s_bfe_i32 s33, s10, 0x80008 +; GFX8-NOHSA-NEXT: s_ashr_i32 s2, s11, 24 +; GFX8-NOHSA-NEXT: s_bfe_i32 s3, s11, 0x80010 +; GFX8-NOHSA-NEXT: s_bfe_i32 s34, s11, 0x80008 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 0x70 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s3 +; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NOHSA-NEXT: s_sext_i32_i8 s11, s11 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 0x60 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s11 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s34 +; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NOHSA-NEXT: s_sext_i32_i8 s10, s10 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 0x50 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s10 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s33 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s31 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s30 +; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NOHSA-NEXT: s_sext_i32_i8 s9, s9 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 64 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s9 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s29 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s28 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s27 +; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NOHSA-NEXT: 
flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NOHSA-NEXT: s_sext_i32_i8 s8, s8 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 48 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s26 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s25 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s24 +; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: s_sext_i32_i8 s7, s7 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s0 -; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 0x60 -; GFX8-NOHSA-NEXT: v_bfe_i32 v11, v0, 0, 8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s7 -; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s1 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v2, 8, s6 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 32 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s7 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s23 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s22 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s21 +; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: s_sext_i32_i8 s6, s6 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s0 -; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 0x50 -; GFX8-NOHSA-NEXT: v_bfe_i32 v11, v2, 0, 8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s6 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s25 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s24 -; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s1 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v4, 8, s5 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s20 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s19 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s18 +; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NOHSA-NEXT: s_sext_i32_i8 s5, s5 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s0 -; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 64 -; GFX8-NOHSA-NEXT: v_bfe_i32 v11, v4, 0, 8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s5 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s23 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s22 -; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v6, 8, s4 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s1 -; GFX8-NOHSA-NEXT: s_sext_i32_i8 s4, s4 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s0 -; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 48 -; GFX8-NOHSA-NEXT: v_bfe_i32 v9, v6, 0, 8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s4 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s21 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s20 -; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11] -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v7, 8, s3 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s1 -; GFX8-NOHSA-NEXT: s_sext_i32_i8 s3, s3 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s0 -; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 32 -; GFX8-NOHSA-NEXT: v_bfe_i32 v7, v7, 0, 8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s3 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s19 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s18 -; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[10:11], v[6:9] -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v5, 8, s2 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s1 -; GFX8-NOHSA-NEXT: s_sext_i32_i8 s2, s2 -; 
GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s0 -; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 16 -; GFX8-NOHSA-NEXT: v_bfe_i32 v5, v5, 0, 8 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s17 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s16 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s15 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s17 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s16 -; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] -; GFX8-NOHSA-NEXT: v_bfe_i32 v3, v3, 0, 8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s1 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s15 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s14 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s13 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[6:7], v[2:5] -; GFX8-NOHSA-NEXT: v_bfe_i32 v1, v1, 0, 8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s12 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s11 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s10 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s9 +; GFX8-NOHSA-NEXT: s_sext_i32_i8 s4, s4 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s14 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s13 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s12 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_endpgm ; @@ -3245,77 +3241,67 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %o ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[4:11], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_lshrrev_b16 v1, 8, s11 -; GFX12-NEXT: v_lshrrev_b16 v9, 8, s9 -; GFX12-NEXT: v_lshrrev_b16 v10, 8, s8 -; GFX12-NEXT: v_lshrrev_b16 v11, 8, s7 -; GFX12-NEXT: v_lshrrev_b16 v5, 8, s10 -; GFX12-NEXT: v_lshrrev_b16 v12, 8, s6 -; GFX12-NEXT: s_ashr_i32 s20, s9, 24 -; GFX12-NEXT: s_bfe_i32 s21, s9, 0x80010 +; GFX12-NEXT: s_ashr_i32 s31, s11, 24 +; GFX12-NEXT: s_bfe_i32 s33, s11, 0x80010 +; GFX12-NEXT: s_sext_i32_i8 s34, s11 +; GFX12-NEXT: s_bfe_i32 s11, s11, 0x80008 +; GFX12-NEXT: s_ashr_i32 s28, s10, 24 +; GFX12-NEXT: s_bfe_i32 s29, s10, 0x80010 +; GFX12-NEXT: s_bfe_i32 s30, s10, 0x80008 +; GFX12-NEXT: s_sext_i32_i8 s10, s10 +; GFX12-NEXT: v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v1, s11 +; GFX12-NEXT: v_dual_mov_b32 v0, s34 :: v_dual_mov_b32 v3, s31 +; GFX12-NEXT: v_dual_mov_b32 v2, s33 :: v_dual_mov_b32 v5, s30 +; GFX12-NEXT: s_bfe_i32 s27, s9, 0x80008 +; GFX12-NEXT: v_dual_mov_b32 v4, s10 :: v_dual_mov_b32 v7, s28 +; GFX12-NEXT: v_dual_mov_b32 v6, s29 :: v_dual_mov_b32 v9, s27 +; GFX12-NEXT: s_ashr_i32 s25, s9, 24 +; GFX12-NEXT: s_bfe_i32 s26, s9, 0x80010 ; GFX12-NEXT: s_sext_i32_i8 s9, s9 -; GFX12-NEXT: s_ashr_i32 s24, s11, 24 -; GFX12-NEXT: s_sext_i32_i8 s25, s11 -; GFX12-NEXT: s_bfe_i32 s11, s11, 0x80010 -; GFX12-NEXT: v_lshrrev_b16 v14, 8, s5 -; GFX12-NEXT: s_ashr_i32 s18, s8, 24 -; GFX12-NEXT: s_bfe_i32 s19, s8, 0x80010 +; GFX12-NEXT: s_ashr_i32 s22, s8, 24 +; GFX12-NEXT: s_bfe_i32 s23, s8, 0x80010 +; GFX12-NEXT: s_bfe_i32 s24, s8, 0x80008 ; GFX12-NEXT: s_sext_i32_i8 s8, s8 -; GFX12-NEXT: s_ashr_i32 s22, s10, 24 -; GFX12-NEXT: s_bfe_i32 s23, s10, 0x80010 -; GFX12-NEXT: s_sext_i32_i8 s10, s10 -; GFX12-NEXT: v_dual_mov_b32 v32, 0 :: v_dual_mov_b32 v3, s24 -; GFX12-NEXT: v_lshrrev_b16 v13, 8, s4 -; 
GFX12-NEXT: s_ashr_i32 s12, s5, 24 -; GFX12-NEXT: s_ashr_i32 s14, s6, 24 -; GFX12-NEXT: s_ashr_i32 s16, s7, 24 -; GFX12-NEXT: s_bfe_i32 s17, s7, 0x80010 +; GFX12-NEXT: s_ashr_i32 s19, s7, 24 +; GFX12-NEXT: s_bfe_i32 s20, s7, 0x80010 +; GFX12-NEXT: s_bfe_i32 s21, s7, 0x80008 ; GFX12-NEXT: s_sext_i32_i8 s7, s7 -; GFX12-NEXT: v_dual_mov_b32 v0, s25 :: v_dual_mov_b32 v7, s22 -; GFX12-NEXT: v_mov_b32_e32 v2, s11 +; GFX12-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v11, s25 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v23, s14 -; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 8 -; GFX12-NEXT: v_bfe_i32 v25, v11, 0, 8 -; GFX12-NEXT: v_bfe_i32 v29, v10, 0, 8 -; GFX12-NEXT: v_bfe_i32 v9, v9, 0, 8 -; GFX12-NEXT: v_dual_mov_b32 v10, s21 :: v_dual_mov_b32 v19, s12 -; GFX12-NEXT: v_mov_b32_e32 v11, s20 -; GFX12-NEXT: s_ashr_i32 s2, s4, 24 -; GFX12-NEXT: s_bfe_i32 s15, s6, 0x80010 +; GFX12-NEXT: v_mov_b32_e32 v10, s26 +; GFX12-NEXT: s_ashr_i32 s16, s6, 24 +; GFX12-NEXT: s_bfe_i32 s17, s6, 0x80010 +; GFX12-NEXT: s_bfe_i32 s18, s6, 0x80008 ; GFX12-NEXT: s_sext_i32_i8 s6, s6 -; GFX12-NEXT: v_dual_mov_b32 v4, s10 :: v_dual_mov_b32 v31, s18 -; GFX12-NEXT: v_dual_mov_b32 v6, s23 :: v_dual_mov_b32 v27, s16 -; GFX12-NEXT: v_bfe_i32 v5, v5, 0, 8 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v28, s8 :: v_dual_mov_b32 v15, s2 -; GFX12-NEXT: v_mov_b32_e32 v30, s19 -; GFX12-NEXT: s_bfe_i32 s13, s5, 0x80010 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v24, v[0:3], s[0:1] offset:112 +; GFX12-NEXT: global_store_b128 v24, v[4:7], s[0:1] offset:96 +; GFX12-NEXT: v_dual_mov_b32 v1, s24 :: v_dual_mov_b32 v0, s8 +; GFX12-NEXT: v_dual_mov_b32 v3, s22 :: v_dual_mov_b32 v2, s23 +; GFX12-NEXT: v_mov_b32_e32 v5, s21 +; GFX12-NEXT: s_ashr_i32 s13, s5, 24 +; GFX12-NEXT: s_bfe_i32 s14, s5, 0x80010 +; GFX12-NEXT: s_bfe_i32 s15, s5, 0x80008 ; GFX12-NEXT: s_sext_i32_i8 s5, s5 -; GFX12-NEXT: v_mov_b32_e32 v24, s7 -; GFX12-NEXT: v_mov_b32_e32 v26, s17 +; GFX12-NEXT: v_dual_mov_b32 v4, s7 :: v_dual_mov_b32 v7, s19 +; GFX12-NEXT: v_dual_mov_b32 v6, s20 :: v_dual_mov_b32 v13, s18 +; GFX12-NEXT: s_ashr_i32 s2, s4, 24 ; GFX12-NEXT: s_bfe_i32 s3, s4, 0x80010 +; GFX12-NEXT: s_bfe_i32 s12, s4, 0x80008 ; GFX12-NEXT: s_sext_i32_i8 s4, s4 -; GFX12-NEXT: v_bfe_i32 v21, v12, 0, 8 -; GFX12-NEXT: v_mov_b32_e32 v20, s6 -; GFX12-NEXT: v_mov_b32_e32 v22, s15 -; GFX12-NEXT: v_bfe_i32 v17, v14, 0, 8 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v16, s5 -; GFX12-NEXT: v_mov_b32_e32 v18, s13 -; GFX12-NEXT: v_bfe_i32 v13, v13, 0, 8 -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v32, v[0:3], s[0:1] offset:112 -; GFX12-NEXT: global_store_b128 v32, v[4:7], s[0:1] offset:96 -; GFX12-NEXT: v_mov_b32_e32 v12, s4 -; GFX12-NEXT: v_mov_b32_e32 v14, s3 +; GFX12-NEXT: v_dual_mov_b32 v12, s6 :: v_dual_mov_b32 v15, s16 +; GFX12-NEXT: v_dual_mov_b32 v14, s17 :: v_dual_mov_b32 v17, s15 +; GFX12-NEXT: v_dual_mov_b32 v16, s5 :: v_dual_mov_b32 v19, s13 +; GFX12-NEXT: v_dual_mov_b32 v18, s14 :: v_dual_mov_b32 v21, s12 +; GFX12-NEXT: v_dual_mov_b32 v20, s4 :: v_dual_mov_b32 v23, s2 +; GFX12-NEXT: v_mov_b32_e32 v22, s3 ; GFX12-NEXT: s_clause 0x5 -; GFX12-NEXT: global_store_b128 v32, v[8:11], s[0:1] offset:80 -; GFX12-NEXT: global_store_b128 v32, v[28:31], s[0:1] offset:64 -; GFX12-NEXT: global_store_b128 v32, v[24:27], s[0:1] offset:48 -; GFX12-NEXT: global_store_b128 v32, v[20:23], s[0:1] offset:32 -; GFX12-NEXT: global_store_b128 v32, v[16:19], s[0:1] offset:16 -; GFX12-NEXT: 
global_store_b128 v32, v[12:15], s[0:1] +; GFX12-NEXT: global_store_b128 v24, v[8:11], s[0:1] offset:80 +; GFX12-NEXT: global_store_b128 v24, v[0:3], s[0:1] offset:64 +; GFX12-NEXT: global_store_b128 v24, v[4:7], s[0:1] offset:48 +; GFX12-NEXT: global_store_b128 v24, v[12:15], s[0:1] offset:32 +; GFX12-NEXT: global_store_b128 v24, v[16:19], s[0:1] offset:16 +; GFX12-NEXT: global_store_b128 v24, v[20:23], s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -3717,196 +3703,209 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_lshr_b32 s18, s0, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s19, s1, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s21, s2, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s23, s3, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s25, s4, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s27, s5, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s29, s6, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s33, s7, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s34, s8, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s35, s9, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s36, s10, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s37, s11, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s38, s12, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s39, s13, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s40, s14, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s30, s15, 24 -; GFX8-NOHSA-NEXT: s_and_b32 s20, s0, 0xff -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v1, 8, s0 +; GFX8-NOHSA-NEXT: s_bfe_u32 s19, s0, 0x80008 +; GFX8-NOHSA-NEXT: s_lshr_b32 s20, s1, 24 +; GFX8-NOHSA-NEXT: s_bfe_u32 s21, s1, 0x80008 +; GFX8-NOHSA-NEXT: s_lshr_b32 s22, s2, 24 +; GFX8-NOHSA-NEXT: s_bfe_u32 s23, s2, 0x80008 +; GFX8-NOHSA-NEXT: s_lshr_b32 s25, s3, 24 +; GFX8-NOHSA-NEXT: s_bfe_u32 s26, s3, 0x80008 +; GFX8-NOHSA-NEXT: s_lshr_b32 s28, s4, 24 +; GFX8-NOHSA-NEXT: s_bfe_u32 s29, s4, 0x80008 +; GFX8-NOHSA-NEXT: s_lshr_b32 s31, s5, 24 +; GFX8-NOHSA-NEXT: s_bfe_u32 s33, s5, 0x80008 +; GFX8-NOHSA-NEXT: s_lshr_b32 s35, s6, 24 +; GFX8-NOHSA-NEXT: s_bfe_u32 s36, s6, 0x80008 +; GFX8-NOHSA-NEXT: s_lshr_b32 s37, s7, 24 +; GFX8-NOHSA-NEXT: s_bfe_u32 s38, s7, 0x80008 +; GFX8-NOHSA-NEXT: s_lshr_b32 s39, s8, 24 +; GFX8-NOHSA-NEXT: s_bfe_u32 s40, s8, 0x80008 +; GFX8-NOHSA-NEXT: s_lshr_b32 s41, s9, 24 +; GFX8-NOHSA-NEXT: s_bfe_u32 s42, s9, 0x80008 +; GFX8-NOHSA-NEXT: s_lshr_b32 s43, s10, 24 +; GFX8-NOHSA-NEXT: s_bfe_u32 s44, s10, 0x80008 +; GFX8-NOHSA-NEXT: s_lshr_b32 s45, s11, 24 +; GFX8-NOHSA-NEXT: s_bfe_u32 s46, s11, 0x80008 +; GFX8-NOHSA-NEXT: s_lshr_b32 s47, s12, 24 +; GFX8-NOHSA-NEXT: s_bfe_u32 s48, s12, 0x80008 +; GFX8-NOHSA-NEXT: s_lshr_b32 s49, s13, 24 +; GFX8-NOHSA-NEXT: s_bfe_u32 s50, s13, 0x80008 +; GFX8-NOHSA-NEXT: s_lshr_b32 s51, s14, 24 +; GFX8-NOHSA-NEXT: s_bfe_u32 s52, s14, 0x80008 +; GFX8-NOHSA-NEXT: s_lshr_b32 s53, s15, 24 +; GFX8-NOHSA-NEXT: s_bfe_u32 s54, s15, 0x80008 +; GFX8-NOHSA-NEXT: s_and_b32 s24, s0, 0xff ; GFX8-NOHSA-NEXT: s_bfe_u32 s0, s0, 0x80010 -; GFX8-NOHSA-NEXT: s_and_b32 s22, s1, 0xff -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v3, 8, s1 +; GFX8-NOHSA-NEXT: s_and_b32 s27, s1, 0xff ; GFX8-NOHSA-NEXT: s_bfe_u32 s1, s1, 0x80010 -; GFX8-NOHSA-NEXT: s_and_b32 s24, s2, 0xff -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v5, 8, s2 +; GFX8-NOHSA-NEXT: s_and_b32 s30, s2, 0xff ; GFX8-NOHSA-NEXT: s_bfe_u32 s2, s2, 0x80010 -; GFX8-NOHSA-NEXT: s_and_b32 s26, s3, 0xff -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v7, 8, s3 +; GFX8-NOHSA-NEXT: s_and_b32 s34, s3, 0xff ; GFX8-NOHSA-NEXT: s_bfe_u32 s3, s3, 0x80010 -; 
GFX8-NOHSA-NEXT: s_and_b32 s28, s4, 0xff -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v9, 8, s4 +; GFX8-NOHSA-NEXT: s_and_b32 s55, s4, 0xff ; GFX8-NOHSA-NEXT: s_bfe_u32 s4, s4, 0x80010 -; GFX8-NOHSA-NEXT: s_and_b32 s41, s5, 0xff -; GFX8-NOHSA-NEXT: s_bfe_u32 s42, s5, 0x80010 -; GFX8-NOHSA-NEXT: s_and_b32 s43, s6, 0xff -; GFX8-NOHSA-NEXT: s_bfe_u32 s44, s6, 0x80010 -; GFX8-NOHSA-NEXT: s_and_b32 s45, s7, 0xff -; GFX8-NOHSA-NEXT: s_bfe_u32 s46, s7, 0x80010 -; GFX8-NOHSA-NEXT: s_and_b32 s47, s8, 0xff -; GFX8-NOHSA-NEXT: s_bfe_u32 s48, s8, 0x80010 -; GFX8-NOHSA-NEXT: s_and_b32 s49, s9, 0xff -; GFX8-NOHSA-NEXT: s_bfe_u32 s50, s9, 0x80010 -; GFX8-NOHSA-NEXT: s_and_b32 s51, s10, 0xff -; GFX8-NOHSA-NEXT: s_bfe_u32 s52, s10, 0x80010 -; GFX8-NOHSA-NEXT: s_and_b32 s53, s11, 0xff -; GFX8-NOHSA-NEXT: s_bfe_u32 s54, s11, 0x80010 -; GFX8-NOHSA-NEXT: s_and_b32 s55, s12, 0xff -; GFX8-NOHSA-NEXT: s_bfe_u32 s56, s12, 0x80010 -; GFX8-NOHSA-NEXT: s_and_b32 s57, s13, 0xff -; GFX8-NOHSA-NEXT: s_bfe_u32 s58, s13, 0x80010 -; GFX8-NOHSA-NEXT: s_and_b32 s59, s14, 0xff -; GFX8-NOHSA-NEXT: s_bfe_u32 s60, s14, 0x80010 -; GFX8-NOHSA-NEXT: s_and_b32 s31, s15, 0xff -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v11, 8, s15 -; GFX8-NOHSA-NEXT: s_bfe_u32 s15, s15, 0x80010 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s30 -; GFX8-NOHSA-NEXT: s_add_u32 s30, s16, 0xf0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s31 -; GFX8-NOHSA-NEXT: s_addc_u32 s31, s17, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s30 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s15 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s31 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s28 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v11, 8, s14 -; GFX8-NOHSA-NEXT: s_add_u32 s14, s16, 0xe0 -; GFX8-NOHSA-NEXT: s_addc_u32 s15, s17, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s14 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s59 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s60 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s40 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s15 -; GFX8-NOHSA-NEXT: s_add_u32 s14, s16, 0xd0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] -; GFX8-NOHSA-NEXT: s_addc_u32 s15, s17, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s14 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v11, 8, s13 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s57 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s58 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s39 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s15 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s26 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v11, 8, s12 -; GFX8-NOHSA-NEXT: s_add_u32 s12, s16, 0xc0 -; GFX8-NOHSA-NEXT: s_addc_u32 s13, s17, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s13 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s12 -; GFX8-NOHSA-NEXT: s_add_u32 s12, s16, 0xb0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s55 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s56 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s38 -; GFX8-NOHSA-NEXT: s_addc_u32 s13, s17, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s13 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v11, 8, s11 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s53 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s54 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s37 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s12 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s24 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v11, 8, s10 -; GFX8-NOHSA-NEXT: s_add_u32 s10, s16, 0xa0 -; GFX8-NOHSA-NEXT: s_addc_u32 s11, s17, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s11 
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s10 -; GFX8-NOHSA-NEXT: s_add_u32 s10, s16, 0x90 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s51 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s52 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s36 -; GFX8-NOHSA-NEXT: s_addc_u32 s11, s17, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s11 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v11, 8, s9 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s49 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s50 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s35 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s10 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s22 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v11, 8, s8 -; GFX8-NOHSA-NEXT: s_add_u32 s8, s16, 0x80 -; GFX8-NOHSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s9 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s8 -; GFX8-NOHSA-NEXT: s_add_u32 s8, s16, 0x70 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s47 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s48 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s34 -; GFX8-NOHSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s9 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v11, 8, s7 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s45 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s46 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s33 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s8 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s20 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v11, 8, s6 +; GFX8-NOHSA-NEXT: s_and_b32 s56, s5, 0xff +; GFX8-NOHSA-NEXT: s_bfe_u32 s5, s5, 0x80010 +; GFX8-NOHSA-NEXT: s_and_b32 s57, s6, 0xff +; GFX8-NOHSA-NEXT: s_bfe_u32 s58, s6, 0x80010 +; GFX8-NOHSA-NEXT: s_and_b32 s59, s7, 0xff +; GFX8-NOHSA-NEXT: s_bfe_u32 s60, s7, 0x80010 +; GFX8-NOHSA-NEXT: s_and_b32 s61, s8, 0xff +; GFX8-NOHSA-NEXT: s_bfe_u32 s8, s8, 0x80010 +; GFX8-NOHSA-NEXT: s_and_b32 s62, s9, 0xff +; GFX8-NOHSA-NEXT: s_bfe_u32 s9, s9, 0x80010 +; GFX8-NOHSA-NEXT: s_and_b32 s63, s10, 0xff +; GFX8-NOHSA-NEXT: s_bfe_u32 s10, s10, 0x80010 +; GFX8-NOHSA-NEXT: s_and_b32 s64, s11, 0xff +; GFX8-NOHSA-NEXT: s_bfe_u32 s11, s11, 0x80010 +; GFX8-NOHSA-NEXT: s_and_b32 s65, s12, 0xff +; GFX8-NOHSA-NEXT: s_bfe_u32 s12, s12, 0x80010 +; GFX8-NOHSA-NEXT: s_and_b32 s66, s13, 0xff +; GFX8-NOHSA-NEXT: s_bfe_u32 s13, s13, 0x80010 +; GFX8-NOHSA-NEXT: s_and_b32 s67, s14, 0xff +; GFX8-NOHSA-NEXT: s_bfe_u32 s14, s14, 0x80010 +; GFX8-NOHSA-NEXT: s_and_b32 s6, s15, 0xff +; GFX8-NOHSA-NEXT: s_bfe_u32 s7, s15, 0x80010 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0xf0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s7 +; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s54 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s53 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0xe0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s67 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s52 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s14 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s51 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0xd0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s66 +; 
GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s50 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s13 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s49 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0xc0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s65 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s48 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s12 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s47 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0xb0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s64 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s46 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s11 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s45 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0xa0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s63 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s44 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s10 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s43 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0x90 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s62 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s42 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s9 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s41 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0x80 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s61 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s40 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s8 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s39 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0x70 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s59 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s38 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s60 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s37 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 ; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0x60 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s7 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s6 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s57 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s36 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s58 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 ; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0x50 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s43 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s44 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s29 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s7 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v11, 8, s5 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s41 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s42 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s27 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 
v14, s6 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s56 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s33 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s5 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s31 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_nop 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s4 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NOHSA-NEXT: s_add_u32 s4, s16, 64 ; GFX8-NOHSA-NEXT: s_addc_u32 s5, s17, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s5 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s4 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s55 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s29 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s28 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NOHSA-NEXT: s_add_u32 s4, s16, 48 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s25 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_addc_u32 s5, s17, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11] -; GFX8-NOHSA-NEXT: s_nop 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s5 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s3 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s23 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s4 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[10:11], v[6:9] +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s34 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s26 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s25 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_nop 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s2 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s16, 32 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s17, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s3 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s2 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s16, 16 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s21 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s30 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s23 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s22 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s17, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] -; GFX8-NOHSA-NEXT: s_nop 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s3 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s1 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s19 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s2 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[6:7], v[2:5] -; GFX8-NOHSA-NEXT: s_nop 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s27 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s21 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s1 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s20 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s16 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s24 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s19 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s18 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s17 @@ -4103,136 +4102,125 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b512 s[0:15], s[18:19], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_lshrrev_b16 v0, 8, s15 -; GFX12-NEXT: v_lshrrev_b16 v9, 8, s2 -; GFX12-NEXT: v_lshrrev_b16 v5, 8, s1 -; GFX12-NEXT: v_lshrrev_b16 v2, 8, 
s14 -; GFX12-NEXT: v_lshrrev_b16 v15, 8, s4 -; GFX12-NEXT: v_lshrrev_b16 v13, 8, s3 -; GFX12-NEXT: v_lshrrev_b16 v3, 8, s13 -; GFX12-NEXT: v_lshrrev_b16 v12, 8, s6 -; GFX12-NEXT: v_lshrrev_b16 v14, 8, s5 -; GFX12-NEXT: s_lshr_b32 s34, s15, 24 -; GFX12-NEXT: v_lshrrev_b16 v4, 8, s12 -; GFX12-NEXT: v_lshrrev_b16 v10, 8, s8 -; GFX12-NEXT: v_lshrrev_b16 v11, 8, s7 -; GFX12-NEXT: s_and_b32 s50, s15, 0xff +; GFX12-NEXT: s_lshr_b32 s49, s15, 24 +; GFX12-NEXT: s_bfe_u32 s50, s15, 0x80008 +; GFX12-NEXT: s_and_b32 s66, s15, 0xff ; GFX12-NEXT: s_bfe_u32 s15, s15, 0x80010 -; GFX12-NEXT: s_lshr_b32 s33, s14, 24 -; GFX12-NEXT: s_and_b32 s49, s14, 0xff +; GFX12-NEXT: s_lshr_b32 s47, s14, 24 +; GFX12-NEXT: s_bfe_u32 s48, s14, 0x80008 +; GFX12-NEXT: s_and_b32 s65, s14, 0xff ; GFX12-NEXT: s_bfe_u32 s14, s14, 0x80010 -; GFX12-NEXT: s_lshr_b32 s26, s8, 24 -; GFX12-NEXT: s_lshr_b32 s31, s13, 24 -; GFX12-NEXT: v_lshrrev_b16 v6, 8, s11 -; GFX12-NEXT: v_lshrrev_b16 v7, 8, s10 -; GFX12-NEXT: v_lshrrev_b16 v8, 8, s9 -; GFX12-NEXT: v_lshrrev_b16 v1, 8, s0 -; GFX12-NEXT: v_dual_mov_b32 v60, 0 :: v_dual_and_b32 v5, 0xffff, v5 -; GFX12-NEXT: v_dual_mov_b32 v56, s50 :: v_dual_and_b32 v9, 0xffff, v9 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v58, s15 -; GFX12-NEXT: s_and_b32 s43, s8, 0xff -; GFX12-NEXT: s_bfe_u32 s8, s8, 0x80010 -; GFX12-NEXT: s_and_b32 s48, s13, 0xff +; GFX12-NEXT: v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v1, s50 +; GFX12-NEXT: s_lshr_b32 s45, s13, 24 +; GFX12-NEXT: s_bfe_u32 s46, s13, 0x80008 +; GFX12-NEXT: s_and_b32 s64, s13, 0xff ; GFX12-NEXT: s_bfe_u32 s13, s13, 0x80010 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v30, s43 :: v_dual_and_b32 v57, 0xffff, v0 -; GFX12-NEXT: v_dual_mov_b32 v59, s34 :: v_dual_mov_b32 v32, s8 -; GFX12-NEXT: s_lshr_b32 s27, s9, 24 -; GFX12-NEXT: s_lshr_b32 s30, s12, 24 -; GFX12-NEXT: v_dual_mov_b32 v52, s49 :: v_dual_and_b32 v13, 0xffff, v13 -; GFX12-NEXT: v_dual_mov_b32 v54, s14 :: v_dual_and_b32 v17, 0xffff, v15 -; GFX12-NEXT: s_and_b32 s42, s7, 0xff -; GFX12-NEXT: s_and_b32 s44, s9, 0xff -; GFX12-NEXT: s_bfe_u32 s9, s9, 0x80010 -; GFX12-NEXT: s_and_b32 s47, s12, 0xff +; GFX12-NEXT: v_dual_mov_b32 v0, s66 :: v_dual_mov_b32 v3, s49 +; GFX12-NEXT: v_dual_mov_b32 v2, s15 :: v_dual_mov_b32 v5, s48 +; GFX12-NEXT: s_lshr_b32 s43, s12, 24 +; GFX12-NEXT: s_bfe_u32 s44, s12, 0x80008 +; GFX12-NEXT: s_and_b32 s63, s12, 0xff ; GFX12-NEXT: s_bfe_u32 s12, s12, 0x80010 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v36, s9 :: v_dual_and_b32 v53, 0xffff, v2 -; GFX12-NEXT: v_dual_mov_b32 v55, s33 :: v_dual_mov_b32 v26, s42 -; GFX12-NEXT: s_lshr_b32 s25, s7, 24 -; GFX12-NEXT: v_dual_mov_b32 v48, s48 :: v_dual_and_b32 v21, 0xffff, v14 -; GFX12-NEXT: v_dual_mov_b32 v50, s13 :: v_dual_and_b32 v23, 0xffff, v12 -; GFX12-NEXT: s_bfe_u32 s7, s7, 0x80010 -; GFX12-NEXT: v_dual_mov_b32 v34, s44 :: v_dual_and_b32 v49, 0xffff, v3 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v51, s31 :: v_dual_mov_b32 v28, s7 -; GFX12-NEXT: s_lshr_b32 s28, s10, 24 -; GFX12-NEXT: s_lshr_b32 s29, s11, 24 -; GFX12-NEXT: s_and_b32 s41, s6, 0xff -; GFX12-NEXT: v_dual_mov_b32 v44, s47 :: v_dual_and_b32 v27, 0xffff, v11 -; GFX12-NEXT: v_dual_mov_b32 v46, s12 :: v_dual_and_b32 v31, 0xffff, v10 -; GFX12-NEXT: s_and_b32 s45, s10, 0xff -; GFX12-NEXT: s_bfe_u32 s10, s10, 0x80010 -; GFX12-NEXT: s_and_b32 s46, s11, 0xff -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v40, s10 :: v_dual_and_b32 v45, 0xffff, v4 -; GFX12-NEXT: v_dual_mov_b32 v47, s30 
:: v_dual_mov_b32 v22, s41 +; GFX12-NEXT: v_dual_mov_b32 v4, s65 :: v_dual_mov_b32 v7, s47 +; GFX12-NEXT: v_dual_mov_b32 v6, s14 :: v_dual_mov_b32 v9, s46 +; GFX12-NEXT: v_dual_mov_b32 v8, s64 :: v_dual_mov_b32 v11, s45 +; GFX12-NEXT: v_dual_mov_b32 v10, s13 :: v_dual_mov_b32 v13, s44 +; GFX12-NEXT: s_lshr_b32 s41, s11, 24 +; GFX12-NEXT: s_bfe_u32 s42, s11, 0x80008 +; GFX12-NEXT: s_and_b32 s62, s11, 0xff +; GFX12-NEXT: v_dual_mov_b32 v12, s63 :: v_dual_mov_b32 v15, s43 +; GFX12-NEXT: v_mov_b32_e32 v14, s12 ; GFX12-NEXT: s_bfe_u32 s11, s11, 0x80010 -; GFX12-NEXT: s_lshr_b32 s24, s6, 24 -; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX12-NEXT: s_bfe_u32 s6, s6, 0x80010 -; GFX12-NEXT: v_dual_mov_b32 v42, s46 :: v_dual_and_b32 v35, 0xffff, v8 -; GFX12-NEXT: v_and_b32_e32 v39, 0xffff, v7 -; GFX12-NEXT: v_dual_mov_b32 v38, s45 :: v_dual_and_b32 v43, 0xffff, v6 +; GFX12-NEXT: s_lshr_b32 s39, s10, 24 +; GFX12-NEXT: s_bfe_u32 s40, s10, 0x80008 +; GFX12-NEXT: s_and_b32 s61, s10, 0xff +; GFX12-NEXT: s_bfe_u32 s10, s10, 0x80010 +; GFX12-NEXT: s_lshr_b32 s37, s9, 24 +; GFX12-NEXT: s_bfe_u32 s38, s9, 0x80008 +; GFX12-NEXT: s_and_b32 s60, s9, 0xff +; GFX12-NEXT: s_bfe_u32 s9, s9, 0x80010 ; GFX12-NEXT: s_clause 0x3 -; GFX12-NEXT: global_store_b128 v60, v[56:59], s[16:17] offset:240 -; GFX12-NEXT: global_store_b128 v60, v[52:55], s[16:17] offset:224 -; GFX12-NEXT: global_store_b128 v60, v[48:51], s[16:17] offset:208 -; GFX12-NEXT: global_store_b128 v60, v[44:47], s[16:17] offset:192 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v44, s11 :: v_dual_mov_b32 v45, s29 -; GFX12-NEXT: v_mov_b32_e32 v24, s6 -; GFX12-NEXT: s_and_b32 s40, s5, 0xff -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v41, s28 :: v_dual_mov_b32 v20, s40 -; GFX12-NEXT: s_lshr_b32 s23, s5, 24 +; GFX12-NEXT: global_store_b128 v24, v[0:3], s[16:17] offset:240 +; GFX12-NEXT: global_store_b128 v24, v[4:7], s[16:17] offset:224 +; GFX12-NEXT: global_store_b128 v24, v[8:11], s[16:17] offset:208 +; GFX12-NEXT: global_store_b128 v24, v[12:15], s[16:17] offset:192 +; GFX12-NEXT: v_dual_mov_b32 v1, s42 :: v_dual_mov_b32 v0, s62 +; GFX12-NEXT: v_dual_mov_b32 v3, s41 :: v_dual_mov_b32 v2, s11 +; GFX12-NEXT: v_mov_b32_e32 v5, s40 +; GFX12-NEXT: s_lshr_b32 s35, s8, 24 +; GFX12-NEXT: s_bfe_u32 s36, s8, 0x80008 +; GFX12-NEXT: s_and_b32 s59, s8, 0xff +; GFX12-NEXT: s_bfe_u32 s8, s8, 0x80010 +; GFX12-NEXT: v_dual_mov_b32 v4, s61 :: v_dual_mov_b32 v7, s39 +; GFX12-NEXT: v_dual_mov_b32 v6, s10 :: v_dual_mov_b32 v9, s38 +; GFX12-NEXT: s_lshr_b32 s33, s7, 24 +; GFX12-NEXT: s_bfe_u32 s34, s7, 0x80008 +; GFX12-NEXT: s_and_b32 s58, s7, 0xff +; GFX12-NEXT: s_bfe_u32 s7, s7, 0x80010 +; GFX12-NEXT: v_dual_mov_b32 v8, s60 :: v_dual_mov_b32 v11, s37 +; GFX12-NEXT: v_dual_mov_b32 v10, s9 :: v_dual_mov_b32 v13, s36 +; GFX12-NEXT: s_lshr_b32 s28, s5, 24 +; GFX12-NEXT: s_bfe_u32 s29, s5, 0x80008 +; GFX12-NEXT: s_lshr_b32 s30, s6, 24 +; GFX12-NEXT: s_bfe_u32 s31, s6, 0x80008 +; GFX12-NEXT: s_and_b32 s56, s5, 0xff ; GFX12-NEXT: s_bfe_u32 s5, s5, 0x80010 -; GFX12-NEXT: v_mov_b32_e32 v37, s27 -; GFX12-NEXT: s_lshr_b32 s22, s4, 24 -; GFX12-NEXT: s_and_b32 s38, s3, 0xff -; GFX12-NEXT: s_and_b32 s39, s4, 0xff +; GFX12-NEXT: s_and_b32 s57, s6, 0xff +; GFX12-NEXT: s_bfe_u32 s6, s6, 0x80010 +; GFX12-NEXT: v_dual_mov_b32 v12, s59 :: v_dual_mov_b32 v15, s35 +; GFX12-NEXT: v_dual_mov_b32 v14, s8 :: v_dual_mov_b32 v17, s34 +; GFX12-NEXT: s_lshr_b32 s26, s4, 24 +; GFX12-NEXT: s_bfe_u32 s27, s4, 0x80008 +; GFX12-NEXT: s_and_b32 s55, s4, 0xff ; GFX12-NEXT: 
s_bfe_u32 s4, s4, 0x80010 -; GFX12-NEXT: v_dual_mov_b32 v33, s26 :: v_dual_mov_b32 v16, s39 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v29, s25 :: v_dual_mov_b32 v18, s4 -; GFX12-NEXT: s_lshr_b32 s21, s3, 24 +; GFX12-NEXT: v_dual_mov_b32 v16, s58 :: v_dual_mov_b32 v19, s33 +; GFX12-NEXT: v_dual_mov_b32 v18, s7 :: v_dual_mov_b32 v21, s31 +; GFX12-NEXT: s_lshr_b32 s24, s3, 24 +; GFX12-NEXT: s_bfe_u32 s25, s3, 0x80008 +; GFX12-NEXT: s_and_b32 s54, s3, 0xff ; GFX12-NEXT: s_bfe_u32 s3, s3, 0x80010 -; GFX12-NEXT: v_dual_mov_b32 v25, s24 :: v_dual_mov_b32 v12, s38 +; GFX12-NEXT: v_dual_mov_b32 v20, s57 :: v_dual_mov_b32 v23, s30 +; GFX12-NEXT: v_mov_b32_e32 v22, s6 ; GFX12-NEXT: s_clause 0x5 -; GFX12-NEXT: global_store_b128 v60, v[42:45], s[16:17] offset:176 -; GFX12-NEXT: global_store_b128 v60, v[38:41], s[16:17] offset:160 -; GFX12-NEXT: global_store_b128 v60, v[34:37], s[16:17] offset:144 -; GFX12-NEXT: global_store_b128 v60, v[30:33], s[16:17] offset:128 -; GFX12-NEXT: global_store_b128 v60, v[26:29], s[16:17] offset:112 -; GFX12-NEXT: global_store_b128 v60, v[22:25], s[16:17] offset:96 -; GFX12-NEXT: v_dual_mov_b32 v22, s5 :: v_dual_mov_b32 v23, s23 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v14, s3 -; GFX12-NEXT: s_lshr_b32 s20, s2, 24 -; GFX12-NEXT: s_and_b32 s37, s2, 0xff +; GFX12-NEXT: global_store_b128 v24, v[0:3], s[16:17] offset:176 +; GFX12-NEXT: global_store_b128 v24, v[4:7], s[16:17] offset:160 +; GFX12-NEXT: global_store_b128 v24, v[8:11], s[16:17] offset:144 +; GFX12-NEXT: global_store_b128 v24, v[12:15], s[16:17] offset:128 +; GFX12-NEXT: global_store_b128 v24, v[16:19], s[16:17] offset:112 +; GFX12-NEXT: global_store_b128 v24, v[20:23], s[16:17] offset:96 +; GFX12-NEXT: v_dual_mov_b32 v1, s29 :: v_dual_mov_b32 v0, s56 +; GFX12-NEXT: v_dual_mov_b32 v3, s28 :: v_dual_mov_b32 v2, s5 +; GFX12-NEXT: v_mov_b32_e32 v5, s27 +; GFX12-NEXT: s_lshr_b32 s22, s2, 24 +; GFX12-NEXT: s_bfe_u32 s23, s2, 0x80008 +; GFX12-NEXT: s_and_b32 s53, s2, 0xff ; GFX12-NEXT: s_bfe_u32 s2, s2, 0x80010 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v19, s22 :: v_dual_mov_b32 v8, s37 -; GFX12-NEXT: s_lshr_b32 s19, s1, 24 -; GFX12-NEXT: s_and_b32 s36, s1, 0xff +; GFX12-NEXT: v_dual_mov_b32 v4, s55 :: v_dual_mov_b32 v7, s26 +; GFX12-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v9, s25 +; GFX12-NEXT: s_lshr_b32 s20, s1, 24 +; GFX12-NEXT: s_bfe_u32 s21, s1, 0x80008 +; GFX12-NEXT: s_and_b32 s52, s1, 0xff ; GFX12-NEXT: s_bfe_u32 s1, s1, 0x80010 -; GFX12-NEXT: v_dual_mov_b32 v15, s21 :: v_dual_mov_b32 v10, s2 +; GFX12-NEXT: v_dual_mov_b32 v8, s54 :: v_dual_mov_b32 v11, s24 +; GFX12-NEXT: v_dual_mov_b32 v10, s3 :: v_dual_mov_b32 v13, s23 ; GFX12-NEXT: s_lshr_b32 s18, s0, 24 -; GFX12-NEXT: s_and_b32 s35, s0, 0xff +; GFX12-NEXT: s_bfe_u32 s19, s0, 0x80008 +; GFX12-NEXT: s_and_b32 s51, s0, 0xff ; GFX12-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX12-NEXT: v_dual_mov_b32 v12, s53 :: v_dual_mov_b32 v15, s22 +; GFX12-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v17, s21 +; GFX12-NEXT: v_dual_mov_b32 v16, s52 :: v_dual_mov_b32 v19, s20 +; GFX12-NEXT: v_dual_mov_b32 v18, s1 :: v_dual_mov_b32 v21, s19 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v11, s20 :: v_dual_mov_b32 v4, s36 -; GFX12-NEXT: v_dual_mov_b32 v6, s1 :: v_dual_mov_b32 v7, s19 -; GFX12-NEXT: v_dual_mov_b32 v0, s35 :: v_dual_mov_b32 v3, s18 -; GFX12-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-NEXT: v_dual_mov_b32 v20, s51 :: v_dual_mov_b32 v23, s18 +; GFX12-NEXT: v_mov_b32_e32 v22, s0 ; GFX12-NEXT: 
s_clause 0x5 -; GFX12-NEXT: global_store_b128 v60, v[20:23], s[16:17] offset:80 -; GFX12-NEXT: global_store_b128 v60, v[16:19], s[16:17] offset:64 -; GFX12-NEXT: global_store_b128 v60, v[12:15], s[16:17] offset:48 -; GFX12-NEXT: global_store_b128 v60, v[8:11], s[16:17] offset:32 -; GFX12-NEXT: global_store_b128 v60, v[4:7], s[16:17] offset:16 -; GFX12-NEXT: global_store_b128 v60, v[0:3], s[16:17] +; GFX12-NEXT: global_store_b128 v24, v[0:3], s[16:17] offset:80 +; GFX12-NEXT: global_store_b128 v24, v[4:7], s[16:17] offset:64 +; GFX12-NEXT: global_store_b128 v24, v[8:11], s[16:17] offset:48 +; GFX12-NEXT: global_store_b128 v24, v[12:15], s[16:17] offset:32 +; GFX12-NEXT: global_store_b128 v24, v[16:19], s[16:17] offset:16 +; GFX12-NEXT: global_store_b128 v24, v[20:23], s[16:17] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -4631,208 +4619,208 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v18, 8, s14 ; GFX8-NOHSA-NEXT: s_ashr_i32 s18, s0, 24 ; GFX8-NOHSA-NEXT: s_bfe_i32 s19, s0, 0x80010 -; GFX8-NOHSA-NEXT: s_ashr_i32 s20, s1, 24 -; GFX8-NOHSA-NEXT: s_bfe_i32 s21, s1, 0x80010 -; GFX8-NOHSA-NEXT: s_ashr_i32 s22, s2, 24 -; GFX8-NOHSA-NEXT: s_bfe_i32 s23, s2, 0x80010 -; GFX8-NOHSA-NEXT: s_ashr_i32 s24, s3, 24 -; GFX8-NOHSA-NEXT: s_bfe_i32 s25, s3, 0x80010 -; GFX8-NOHSA-NEXT: s_ashr_i32 s26, s4, 24 -; GFX8-NOHSA-NEXT: s_bfe_i32 s27, s4, 0x80010 -; GFX8-NOHSA-NEXT: s_ashr_i32 s28, s5, 24 -; GFX8-NOHSA-NEXT: s_bfe_i32 s29, s5, 0x80010 -; GFX8-NOHSA-NEXT: s_ashr_i32 s30, s6, 24 -; GFX8-NOHSA-NEXT: s_bfe_i32 s31, s6, 0x80010 -; GFX8-NOHSA-NEXT: s_ashr_i32 s33, s7, 24 -; GFX8-NOHSA-NEXT: s_bfe_i32 s34, s7, 0x80010 -; GFX8-NOHSA-NEXT: s_ashr_i32 s35, s8, 24 -; GFX8-NOHSA-NEXT: s_bfe_i32 s36, s8, 0x80010 -; GFX8-NOHSA-NEXT: s_ashr_i32 s37, s9, 24 -; GFX8-NOHSA-NEXT: s_bfe_i32 s38, s9, 0x80010 -; GFX8-NOHSA-NEXT: s_ashr_i32 s39, s10, 24 -; GFX8-NOHSA-NEXT: s_bfe_i32 s40, s10, 0x80010 -; GFX8-NOHSA-NEXT: s_ashr_i32 s41, s11, 24 -; GFX8-NOHSA-NEXT: s_bfe_i32 s42, s11, 0x80010 -; GFX8-NOHSA-NEXT: s_ashr_i32 s43, s12, 24 -; GFX8-NOHSA-NEXT: s_bfe_i32 s44, s12, 0x80010 -; GFX8-NOHSA-NEXT: s_ashr_i32 s45, s13, 24 -; GFX8-NOHSA-NEXT: s_bfe_i32 s46, s13, 0x80010 -; GFX8-NOHSA-NEXT: s_ashr_i32 s47, s14, 24 -; GFX8-NOHSA-NEXT: s_bfe_i32 s48, s14, 0x80010 -; GFX8-NOHSA-NEXT: s_sext_i32_i8 s49, s14 -; GFX8-NOHSA-NEXT: s_ashr_i32 s14, s15, 24 -; GFX8-NOHSA-NEXT: s_bfe_i32 s50, s15, 0x80010 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v5, 8, s15 +; GFX8-NOHSA-NEXT: s_bfe_i32 s20, s0, 0x80008 +; GFX8-NOHSA-NEXT: s_ashr_i32 s21, s1, 24 +; GFX8-NOHSA-NEXT: s_bfe_i32 s22, s1, 0x80010 +; GFX8-NOHSA-NEXT: s_bfe_i32 s23, s1, 0x80008 +; GFX8-NOHSA-NEXT: s_ashr_i32 s24, s2, 24 +; GFX8-NOHSA-NEXT: s_bfe_i32 s25, s2, 0x80010 +; GFX8-NOHSA-NEXT: s_bfe_i32 s26, s2, 0x80008 +; GFX8-NOHSA-NEXT: s_ashr_i32 s27, s3, 24 +; GFX8-NOHSA-NEXT: s_bfe_i32 s28, s3, 0x80010 +; GFX8-NOHSA-NEXT: s_bfe_i32 s29, s3, 0x80008 +; GFX8-NOHSA-NEXT: s_ashr_i32 s30, s4, 24 +; GFX8-NOHSA-NEXT: s_bfe_i32 s31, s4, 0x80010 +; GFX8-NOHSA-NEXT: s_bfe_i32 s33, s4, 0x80008 +; GFX8-NOHSA-NEXT: s_ashr_i32 s34, s5, 24 +; GFX8-NOHSA-NEXT: s_bfe_i32 s35, s5, 0x80010 +; GFX8-NOHSA-NEXT: s_bfe_i32 s36, s5, 0x80008 +; GFX8-NOHSA-NEXT: s_ashr_i32 s37, s6, 24 +; GFX8-NOHSA-NEXT: s_bfe_i32 s38, s6, 0x80010 +; 
GFX8-NOHSA-NEXT: s_bfe_i32 s39, s6, 0x80008 +; GFX8-NOHSA-NEXT: s_sext_i32_i8 s40, s6 +; GFX8-NOHSA-NEXT: s_ashr_i32 s41, s7, 24 +; GFX8-NOHSA-NEXT: s_bfe_i32 s42, s7, 0x80010 +; GFX8-NOHSA-NEXT: s_bfe_i32 s43, s7, 0x80008 +; GFX8-NOHSA-NEXT: s_sext_i32_i8 s44, s7 +; GFX8-NOHSA-NEXT: s_ashr_i32 s45, s8, 24 +; GFX8-NOHSA-NEXT: s_bfe_i32 s46, s8, 0x80010 +; GFX8-NOHSA-NEXT: s_bfe_i32 s47, s8, 0x80008 +; GFX8-NOHSA-NEXT: s_ashr_i32 s48, s9, 24 +; GFX8-NOHSA-NEXT: s_bfe_i32 s49, s9, 0x80010 +; GFX8-NOHSA-NEXT: s_bfe_i32 s50, s9, 0x80008 +; GFX8-NOHSA-NEXT: s_ashr_i32 s51, s10, 24 +; GFX8-NOHSA-NEXT: s_bfe_i32 s52, s10, 0x80010 +; GFX8-NOHSA-NEXT: s_bfe_i32 s53, s10, 0x80008 +; GFX8-NOHSA-NEXT: s_ashr_i32 s54, s11, 24 +; GFX8-NOHSA-NEXT: s_bfe_i32 s55, s11, 0x80010 +; GFX8-NOHSA-NEXT: s_bfe_i32 s56, s11, 0x80008 +; GFX8-NOHSA-NEXT: s_ashr_i32 s57, s12, 24 +; GFX8-NOHSA-NEXT: s_bfe_i32 s58, s12, 0x80010 +; GFX8-NOHSA-NEXT: s_bfe_i32 s59, s12, 0x80008 +; GFX8-NOHSA-NEXT: s_ashr_i32 s60, s13, 24 +; GFX8-NOHSA-NEXT: s_bfe_i32 s61, s13, 0x80010 +; GFX8-NOHSA-NEXT: s_bfe_i32 s62, s13, 0x80008 +; GFX8-NOHSA-NEXT: s_ashr_i32 s63, s14, 24 +; GFX8-NOHSA-NEXT: s_bfe_i32 s64, s14, 0x80010 +; GFX8-NOHSA-NEXT: s_bfe_i32 s65, s14, 0x80008 +; GFX8-NOHSA-NEXT: s_ashr_i32 s6, s15, 24 +; GFX8-NOHSA-NEXT: s_bfe_i32 s7, s15, 0x80010 +; GFX8-NOHSA-NEXT: s_bfe_i32 s66, s15, 0x80008 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s6 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0xf0 ; GFX8-NOHSA-NEXT: s_sext_i32_i8 s15, s15 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s14 -; GFX8-NOHSA-NEXT: s_add_u32 s14, s16, 0xf0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s15 -; GFX8-NOHSA-NEXT: s_addc_u32 s15, s17, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v17, s15 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v16, s14 -; GFX8-NOHSA-NEXT: s_add_u32 s14, s16, 0xe0 -; GFX8-NOHSA-NEXT: v_bfe_i32 v13, v5, 0, 8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s50 -; GFX8-NOHSA-NEXT: s_addc_u32 s15, s17, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v17, s15 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v20, 8, s12 -; GFX8-NOHSA-NEXT: v_bfe_i32 v13, v18, 0, 8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s49 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s48 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s47 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v16, s14 -; GFX8-NOHSA-NEXT: s_sext_i32_i8 s14, s12 -; GFX8-NOHSA-NEXT: s_sext_i32_i8 s12, s13 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15] -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v19, 8, s13 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s12 -; GFX8-NOHSA-NEXT: s_add_u32 s12, s16, 0xd0 -; GFX8-NOHSA-NEXT: s_addc_u32 s13, s17, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v17, s13 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v16, s12 -; GFX8-NOHSA-NEXT: s_add_u32 s12, s16, 0xc0 -; GFX8-NOHSA-NEXT: v_bfe_i32 v13, v19, 0, 8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s46 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s45 -; GFX8-NOHSA-NEXT: s_addc_u32 s13, s17, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v17, s13 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v16, s12 -; GFX8-NOHSA-NEXT: s_add_u32 s12, s16, 0xb0 -; GFX8-NOHSA-NEXT: v_bfe_i32 v13, v20, 0, 8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s14 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s44 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s43 -; GFX8-NOHSA-NEXT: s_addc_u32 s13, s17, 0 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v11, 8, s11 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s7 +; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, 
s15 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s66 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0xe0 +; GFX8-NOHSA-NEXT: s_sext_i32_i8 s14, s14 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s14 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s65 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s64 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s63 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0xd0 +; GFX8-NOHSA-NEXT: s_sext_i32_i8 s13, s13 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s13 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s62 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s61 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s60 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0xc0 +; GFX8-NOHSA-NEXT: s_sext_i32_i8 s12, s12 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s12 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s59 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s58 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s57 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0xb0 ; GFX8-NOHSA-NEXT: s_sext_i32_i8 s11, s11 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v16, s13 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v7, 8, s8 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v6, 8, s4 -; GFX8-NOHSA-NEXT: v_bfe_i32 v12, v11, 0, 8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s11 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s42 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s41 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s12 -; GFX8-NOHSA-NEXT: s_sext_i32_i8 s11, s4 -; GFX8-NOHSA-NEXT: s_sext_i32_i8 s4, s8 -; GFX8-NOHSA-NEXT: s_sext_i32_i8 s8, s10 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[15:16], v[11:14] -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v8, 8, s9 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s8 -; GFX8-NOHSA-NEXT: s_add_u32 s8, s16, 0xa0 -; GFX8-NOHSA-NEXT: s_sext_i32_i8 s12, s9 -; GFX8-NOHSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v9, 8, s10 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v16, s9 -; GFX8-NOHSA-NEXT: v_bfe_i32 v12, v9, 0, 8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s40 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s39 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s8 -; GFX8-NOHSA-NEXT: s_add_u32 s8, s16, 0x90 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[15:16], v[11:14] -; GFX8-NOHSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX8-NOHSA-NEXT: v_bfe_i32 v12, v8, 0, 8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s12 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s38 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s37 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s9 -; GFX8-NOHSA-NEXT: s_add_u32 s8, s16, 0x80 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[8:9], v[11:14] -; GFX8-NOHSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX8-NOHSA-NEXT: v_bfe_i32 v12, v7, 0, 8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s4 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s36 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s35 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s9 -; GFX8-NOHSA-NEXT: s_sext_i32_i8 s4, s7 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[7:8], v[11:14] -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v0, 8, s5 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s4 -; GFX8-NOHSA-NEXT: 
s_add_u32 s4, s16, 0x70 -; GFX8-NOHSA-NEXT: s_sext_i32_i8 s8, s5 -; GFX8-NOHSA-NEXT: s_addc_u32 s5, s17, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s5 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v4, 8, s7 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s4 -; GFX8-NOHSA-NEXT: s_add_u32 s4, s16, 0x60 -; GFX8-NOHSA-NEXT: v_bfe_i32 v12, v4, 0, 8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s34 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s33 -; GFX8-NOHSA-NEXT: s_addc_u32 s5, s17, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[8:9], v[11:14] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s5 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v2, 8, s6 -; GFX8-NOHSA-NEXT: s_sext_i32_i8 s6, s6 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s4 -; GFX8-NOHSA-NEXT: s_add_u32 s4, s16, 0x50 -; GFX8-NOHSA-NEXT: v_bfe_i32 v12, v2, 0, 8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s6 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s31 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s30 -; GFX8-NOHSA-NEXT: s_addc_u32 s5, s17, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[8:9], v[11:14] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s5 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s4 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s11 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s56 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s55 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s54 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0xa0 +; GFX8-NOHSA-NEXT: s_sext_i32_i8 s10, s10 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s10 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s53 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s52 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s51 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0x90 +; GFX8-NOHSA-NEXT: s_sext_i32_i8 s9, s9 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s9 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s50 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s49 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s48 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0x80 +; GFX8-NOHSA-NEXT: s_sext_i32_i8 s8, s8 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s47 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s46 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s45 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0x70 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s44 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s43 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s42 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s41 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0x60 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s40 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s39 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s38 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s37 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0x50 +; 
GFX8-NOHSA-NEXT: s_sext_i32_i8 s5, s5 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: s_sext_i32_i8 s4, s4 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s36 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s35 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s34 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: s_sext_i32_i8 s3, s3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NOHSA-NEXT: s_add_u32 s4, s16, 64 -; GFX8-NOHSA-NEXT: v_bfe_i32 v12, v0, 0, 8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s29 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s28 ; GFX8-NOHSA-NEXT: s_addc_u32 s5, s17, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[8:9], v[11:14] -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v21, 8, s2 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s5 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s4 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s33 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s31 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s30 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NOHSA-NEXT: s_add_u32 s4, s16, 48 -; GFX8-NOHSA-NEXT: s_sext_i32_i8 s2, s2 -; GFX8-NOHSA-NEXT: v_bfe_i32 v7, v6, 0, 8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s11 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s27 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s26 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_addc_u32 s5, s17, 0 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v10, 8, s3 -; GFX8-NOHSA-NEXT: s_sext_i32_i8 s3, s3 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[11:12], v[6:9] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s5 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NOHSA-NEXT: s_sext_i32_i8 s2, s2 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s29 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s28 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s27 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: s_sext_i32_i8 s1, s1 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s16, 32 -; GFX8-NOHSA-NEXT: v_bfe_i32 v10, v10, 0, 8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s3 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s25 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s24 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s4 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s17, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[6:7], v[9:12] -; GFX8-NOHSA-NEXT: v_bfe_i32 v5, v21, 0, 8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s3 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s2 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s16, 16 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s23 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s22 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s26 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s25 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s24 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s17, 0 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v3, 8, s1 -; GFX8-NOHSA-NEXT: s_sext_i32_i8 s1, s1 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] -; GFX8-NOHSA-NEXT: v_bfe_i32 v3, v3, 0, 8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s3 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s1 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s21 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s20 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s2 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v1, 8, s0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], 
v[0:3] +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s1 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s23 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s22 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s21 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: s_sext_i32_i8 s0, s0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[6:7], v[2:5] -; GFX8-NOHSA-NEXT: v_bfe_i32 v1, v1, 0, 8 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s20 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s19 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s18 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s17 @@ -5074,145 +5062,124 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b512 s[0:15], s[18:19], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_lshrrev_b16 v0, 8, s15 -; GFX12-NEXT: v_lshrrev_b16 v2, 8, s14 -; GFX12-NEXT: v_lshrrev_b16 v3, 8, s13 -; GFX12-NEXT: v_lshrrev_b16 v4, 8, s12 -; GFX12-NEXT: s_ashr_i32 s49, s15, 24 -; GFX12-NEXT: s_bfe_i32 s50, s15, 0x80010 -; GFX12-NEXT: s_sext_i32_i8 s15, s15 -; GFX12-NEXT: s_ashr_i32 s47, s14, 24 -; GFX12-NEXT: s_bfe_i32 s48, s14, 0x80010 +; GFX12-NEXT: s_ashr_i32 s64, s15, 24 +; GFX12-NEXT: s_bfe_i32 s65, s15, 0x80010 +; GFX12-NEXT: s_sext_i32_i8 s66, s15 +; GFX12-NEXT: s_bfe_i32 s15, s15, 0x80008 +; GFX12-NEXT: s_ashr_i32 s61, s14, 24 +; GFX12-NEXT: s_bfe_i32 s62, s14, 0x80010 +; GFX12-NEXT: s_bfe_i32 s63, s14, 0x80008 ; GFX12-NEXT: s_sext_i32_i8 s14, s14 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v59, 0 :: v_dual_mov_b32 v52, s15 -; GFX12-NEXT: v_lshrrev_b16 v6, 8, s11 -; GFX12-NEXT: s_ashr_i32 s45, s13, 24 -; GFX12-NEXT: s_bfe_i32 s46, s13, 0x80010 +; GFX12-NEXT: v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v1, s15 +; GFX12-NEXT: s_ashr_i32 s58, s13, 24 +; GFX12-NEXT: s_bfe_i32 s59, s13, 0x80010 +; GFX12-NEXT: s_bfe_i32 s60, s13, 0x80008 ; GFX12-NEXT: s_sext_i32_i8 s13, s13 -; GFX12-NEXT: v_bfe_i32 v53, v0, 0, 8 -; GFX12-NEXT: v_dual_mov_b32 v54, s50 :: v_dual_mov_b32 v55, s49 -; GFX12-NEXT: v_lshrrev_b16 v7, 8, s10 -; GFX12-NEXT: s_ashr_i32 s43, s12, 24 -; GFX12-NEXT: s_bfe_i32 s44, s12, 0x80010 +; GFX12-NEXT: v_dual_mov_b32 v0, s66 :: v_dual_mov_b32 v3, s64 +; GFX12-NEXT: v_dual_mov_b32 v2, s65 :: v_dual_mov_b32 v5, s63 +; GFX12-NEXT: s_ashr_i32 s55, s12, 24 +; GFX12-NEXT: s_bfe_i32 s56, s12, 0x80010 +; GFX12-NEXT: s_bfe_i32 s57, s12, 0x80008 ; GFX12-NEXT: s_sext_i32_i8 s12, s12 -; GFX12-NEXT: v_bfe_i32 v49, v2, 0, 8 -; GFX12-NEXT: v_dual_mov_b32 v48, s14 :: v_dual_mov_b32 v51, s47 -; GFX12-NEXT: v_dual_mov_b32 v50, s48 :: v_dual_mov_b32 v47, s45 -; GFX12-NEXT: v_lshrrev_b16 v8, 8, s9 -; GFX12-NEXT: s_ashr_i32 s41, s11, 24 -; GFX12-NEXT: s_bfe_i32 s42, s11, 0x80010 +; GFX12-NEXT: v_dual_mov_b32 v4, s14 :: v_dual_mov_b32 v7, s61 +; GFX12-NEXT: v_dual_mov_b32 v6, s62 :: v_dual_mov_b32 v9, s60 +; GFX12-NEXT: v_dual_mov_b32 v8, s13 :: v_dual_mov_b32 v11, s58 +; GFX12-NEXT: v_dual_mov_b32 v10, s59 :: v_dual_mov_b32 v13, s57 +; GFX12-NEXT: s_ashr_i32 s52, s11, 24 +; GFX12-NEXT: s_bfe_i32 s53, s11, 0x80010 +; GFX12-NEXT: s_bfe_i32 s54, s11, 0x80008 +; GFX12-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v15, s55 +; GFX12-NEXT: v_mov_b32_e32 v14, s56 ; GFX12-NEXT: s_sext_i32_i8 s11, s11 -; GFX12-NEXT: v_bfe_i32 v45, v3, 0, 8 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v44, s13 :: v_dual_mov_b32 v43, s43 -; GFX12-NEXT: v_mov_b32_e32 v46, s46 -; 
GFX12-NEXT: v_lshrrev_b16 v10, 8, s8 -; GFX12-NEXT: s_ashr_i32 s39, s10, 24 -; GFX12-NEXT: v_bfe_i32 v41, v4, 0, 8 -; GFX12-NEXT: v_dual_mov_b32 v40, s12 :: v_dual_mov_b32 v57, s42 -; GFX12-NEXT: v_mov_b32_e32 v42, s44 -; GFX12-NEXT: v_lshrrev_b16 v11, 8, s7 -; GFX12-NEXT: v_lshrrev_b16 v12, 8, s6 -; GFX12-NEXT: s_bfe_i32 s40, s10, 0x80010 +; GFX12-NEXT: s_ashr_i32 s49, s10, 24 +; GFX12-NEXT: s_bfe_i32 s50, s10, 0x80010 +; GFX12-NEXT: s_bfe_i32 s51, s10, 0x80008 ; GFX12-NEXT: s_sext_i32_i8 s10, s10 -; GFX12-NEXT: v_lshrrev_b16 v14, 8, s5 -; GFX12-NEXT: v_lshrrev_b16 v15, 8, s4 -; GFX12-NEXT: s_ashr_i32 s37, s9, 24 -; GFX12-NEXT: s_bfe_i32 s38, s9, 0x80010 +; GFX12-NEXT: s_ashr_i32 s46, s9, 24 +; GFX12-NEXT: s_bfe_i32 s47, s9, 0x80010 +; GFX12-NEXT: s_bfe_i32 s48, s9, 0x80008 ; GFX12-NEXT: s_sext_i32_i8 s9, s9 -; GFX12-NEXT: v_bfe_i32 v56, v6, 0, 8 ; GFX12-NEXT: s_clause 0x3 -; GFX12-NEXT: global_store_b128 v59, v[52:55], s[16:17] offset:240 -; GFX12-NEXT: global_store_b128 v59, v[48:51], s[16:17] offset:224 -; GFX12-NEXT: global_store_b128 v59, v[44:47], s[16:17] offset:208 -; GFX12-NEXT: global_store_b128 v59, v[40:43], s[16:17] offset:192 -; GFX12-NEXT: v_mov_b32_e32 v41, s39 -; GFX12-NEXT: v_dual_mov_b32 v55, s11 :: v_dual_mov_b32 v58, s41 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v37, s37 -; GFX12-NEXT: s_ashr_i32 s33, s7, 24 -; GFX12-NEXT: s_ashr_i32 s35, s8, 24 -; GFX12-NEXT: s_bfe_i32 s36, s8, 0x80010 +; GFX12-NEXT: global_store_b128 v24, v[0:3], s[16:17] offset:240 +; GFX12-NEXT: global_store_b128 v24, v[4:7], s[16:17] offset:224 +; GFX12-NEXT: global_store_b128 v24, v[8:11], s[16:17] offset:208 +; GFX12-NEXT: global_store_b128 v24, v[12:15], s[16:17] offset:192 +; GFX12-NEXT: v_dual_mov_b32 v1, s54 :: v_dual_mov_b32 v0, s11 +; GFX12-NEXT: v_dual_mov_b32 v3, s52 :: v_dual_mov_b32 v2, s53 +; GFX12-NEXT: v_mov_b32_e32 v5, s51 +; GFX12-NEXT: s_ashr_i32 s43, s8, 24 +; GFX12-NEXT: s_bfe_i32 s44, s8, 0x80010 +; GFX12-NEXT: s_bfe_i32 s45, s8, 0x80008 ; GFX12-NEXT: s_sext_i32_i8 s8, s8 -; GFX12-NEXT: v_bfe_i32 v39, v7, 0, 8 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v38, s10 :: v_dual_mov_b32 v33, s35 -; GFX12-NEXT: v_dual_mov_b32 v40, s40 :: v_dual_mov_b32 v29, s33 -; GFX12-NEXT: v_lshrrev_b16 v13, 8, s3 -; GFX12-NEXT: s_ashr_i32 s28, s5, 24 -; GFX12-NEXT: s_ashr_i32 s30, s6, 24 -; GFX12-NEXT: s_bfe_i32 s31, s6, 0x80010 -; GFX12-NEXT: s_sext_i32_i8 s6, s6 -; GFX12-NEXT: s_bfe_i32 s34, s7, 0x80010 +; GFX12-NEXT: v_dual_mov_b32 v4, s10 :: v_dual_mov_b32 v7, s49 +; GFX12-NEXT: v_dual_mov_b32 v6, s50 :: v_dual_mov_b32 v9, s48 +; GFX12-NEXT: s_ashr_i32 s40, s7, 24 +; GFX12-NEXT: s_bfe_i32 s41, s7, 0x80010 +; GFX12-NEXT: s_bfe_i32 s42, s7, 0x80008 ; GFX12-NEXT: s_sext_i32_i8 s7, s7 -; GFX12-NEXT: v_bfe_i32 v35, v8, 0, 8 -; GFX12-NEXT: v_dual_mov_b32 v34, s9 :: v_dual_mov_b32 v25, s30 -; GFX12-NEXT: v_mov_b32_e32 v36, s38 -; GFX12-NEXT: v_lshrrev_b16 v9, 8, s2 -; GFX12-NEXT: s_ashr_i32 s18, s0, 24 -; GFX12-NEXT: s_ashr_i32 s20, s1, 24 -; GFX12-NEXT: s_ashr_i32 s22, s2, 24 -; GFX12-NEXT: s_ashr_i32 s24, s3, 24 -; GFX12-NEXT: s_ashr_i32 s26, s4, 24 -; GFX12-NEXT: s_bfe_i32 s29, s5, 0x80010 +; GFX12-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v11, s46 +; GFX12-NEXT: v_dual_mov_b32 v10, s47 :: v_dual_mov_b32 v13, s45 +; GFX12-NEXT: s_ashr_i32 s34, s5, 24 +; GFX12-NEXT: s_bfe_i32 s35, s5, 0x80010 +; GFX12-NEXT: s_bfe_i32 s36, s5, 0x80008 ; GFX12-NEXT: s_sext_i32_i8 s5, s5 -; GFX12-NEXT: v_bfe_i32 v31, v10, 0, 8 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: 
v_dual_mov_b32 v30, s8 :: v_dual_mov_b32 v19, s26 -; GFX12-NEXT: v_mov_b32_e32 v32, s36 -; GFX12-NEXT: v_lshrrev_b16 v5, 8, s1 -; GFX12-NEXT: s_bfe_i32 s27, s4, 0x80010 +; GFX12-NEXT: s_ashr_i32 s37, s6, 24 +; GFX12-NEXT: s_bfe_i32 s38, s6, 0x80010 +; GFX12-NEXT: s_bfe_i32 s39, s6, 0x80008 +; GFX12-NEXT: s_sext_i32_i8 s6, s6 +; GFX12-NEXT: v_dual_mov_b32 v12, s8 :: v_dual_mov_b32 v15, s43 +; GFX12-NEXT: v_dual_mov_b32 v14, s44 :: v_dual_mov_b32 v17, s42 +; GFX12-NEXT: s_ashr_i32 s30, s4, 24 +; GFX12-NEXT: s_bfe_i32 s31, s4, 0x80010 +; GFX12-NEXT: s_bfe_i32 s33, s4, 0x80008 ; GFX12-NEXT: s_sext_i32_i8 s4, s4 -; GFX12-NEXT: v_bfe_i32 v23, v12, 0, 8 -; GFX12-NEXT: v_bfe_i32 v27, v11, 0, 8 -; GFX12-NEXT: v_dual_mov_b32 v26, s7 :: v_dual_mov_b32 v11, s22 -; GFX12-NEXT: v_dual_mov_b32 v28, s34 :: v_dual_mov_b32 v7, s20 -; GFX12-NEXT: v_dual_mov_b32 v22, s6 :: v_dual_mov_b32 v3, s18 -; GFX12-NEXT: v_lshrrev_b16 v1, 8, s0 -; GFX12-NEXT: s_bfe_i32 s25, s3, 0x80010 +; GFX12-NEXT: v_dual_mov_b32 v16, s7 :: v_dual_mov_b32 v19, s40 +; GFX12-NEXT: v_dual_mov_b32 v18, s41 :: v_dual_mov_b32 v21, s39 +; GFX12-NEXT: s_ashr_i32 s27, s3, 24 +; GFX12-NEXT: s_bfe_i32 s28, s3, 0x80010 +; GFX12-NEXT: s_bfe_i32 s29, s3, 0x80008 ; GFX12-NEXT: s_sext_i32_i8 s3, s3 -; GFX12-NEXT: v_bfe_i32 v17, v15, 0, 8 -; GFX12-NEXT: v_bfe_i32 v21, v14, 0, 8 -; GFX12-NEXT: v_mov_b32_e32 v24, s31 -; GFX12-NEXT: v_dual_mov_b32 v20, s5 :: v_dual_mov_b32 v15, s24 +; GFX12-NEXT: v_dual_mov_b32 v20, s6 :: v_dual_mov_b32 v23, s37 +; GFX12-NEXT: v_mov_b32_e32 v22, s38 ; GFX12-NEXT: s_clause 0x5 -; GFX12-NEXT: global_store_b128 v59, v[55:58], s[16:17] offset:176 -; GFX12-NEXT: global_store_b128 v59, v[38:41], s[16:17] offset:160 -; GFX12-NEXT: global_store_b128 v59, v[34:37], s[16:17] offset:144 -; GFX12-NEXT: global_store_b128 v59, v[30:33], s[16:17] offset:128 -; GFX12-NEXT: global_store_b128 v59, v[26:29], s[16:17] offset:112 -; GFX12-NEXT: global_store_b128 v59, v[22:25], s[16:17] offset:96 -; GFX12-NEXT: v_dual_mov_b32 v22, s29 :: v_dual_mov_b32 v23, s28 -; GFX12-NEXT: s_bfe_i32 s23, s2, 0x80010 +; GFX12-NEXT: global_store_b128 v24, v[0:3], s[16:17] offset:176 +; GFX12-NEXT: global_store_b128 v24, v[4:7], s[16:17] offset:160 +; GFX12-NEXT: global_store_b128 v24, v[8:11], s[16:17] offset:144 +; GFX12-NEXT: global_store_b128 v24, v[12:15], s[16:17] offset:128 +; GFX12-NEXT: global_store_b128 v24, v[16:19], s[16:17] offset:112 +; GFX12-NEXT: global_store_b128 v24, v[20:23], s[16:17] offset:96 +; GFX12-NEXT: v_dual_mov_b32 v1, s36 :: v_dual_mov_b32 v0, s5 +; GFX12-NEXT: v_dual_mov_b32 v3, s34 :: v_dual_mov_b32 v2, s35 +; GFX12-NEXT: v_mov_b32_e32 v5, s33 +; GFX12-NEXT: s_ashr_i32 s24, s2, 24 +; GFX12-NEXT: s_bfe_i32 s25, s2, 0x80010 +; GFX12-NEXT: s_bfe_i32 s26, s2, 0x80008 ; GFX12-NEXT: s_sext_i32_i8 s2, s2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v16, s4 -; GFX12-NEXT: v_mov_b32_e32 v18, s27 -; GFX12-NEXT: s_bfe_i32 s21, s1, 0x80010 +; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v7, s30 +; GFX12-NEXT: v_dual_mov_b32 v6, s31 :: v_dual_mov_b32 v9, s29 +; GFX12-NEXT: s_ashr_i32 s21, s1, 24 +; GFX12-NEXT: s_bfe_i32 s22, s1, 0x80010 +; GFX12-NEXT: s_bfe_i32 s23, s1, 0x80008 ; GFX12-NEXT: s_sext_i32_i8 s1, s1 -; GFX12-NEXT: v_bfe_i32 v13, v13, 0, 8 -; GFX12-NEXT: v_mov_b32_e32 v12, s3 -; GFX12-NEXT: v_mov_b32_e32 v14, s25 +; GFX12-NEXT: v_dual_mov_b32 v8, s3 :: v_dual_mov_b32 v11, s27 +; GFX12-NEXT: v_dual_mov_b32 v10, s28 :: v_dual_mov_b32 v13, s26 +; GFX12-NEXT: s_ashr_i32 s18, s0, 24 ; GFX12-NEXT: s_bfe_i32 s19, 
s0, 0x80010 +; GFX12-NEXT: s_bfe_i32 s20, s0, 0x80008 ; GFX12-NEXT: s_sext_i32_i8 s0, s0 -; GFX12-NEXT: v_bfe_i32 v9, v9, 0, 8 -; GFX12-NEXT: v_mov_b32_e32 v8, s2 -; GFX12-NEXT: v_mov_b32_e32 v10, s23 -; GFX12-NEXT: v_bfe_i32 v5, v5, 0, 8 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v4, s1 -; GFX12-NEXT: v_mov_b32_e32 v6, s21 -; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 8 -; GFX12-NEXT: v_mov_b32_e32 v0, s0 -; GFX12-NEXT: v_mov_b32_e32 v2, s19 +; GFX12-NEXT: v_dual_mov_b32 v12, s2 :: v_dual_mov_b32 v15, s24 +; GFX12-NEXT: v_dual_mov_b32 v14, s25 :: v_dual_mov_b32 v17, s23 +; GFX12-NEXT: v_dual_mov_b32 v16, s1 :: v_dual_mov_b32 v19, s21 +; GFX12-NEXT: v_dual_mov_b32 v18, s22 :: v_dual_mov_b32 v21, s20 +; GFX12-NEXT: v_dual_mov_b32 v20, s0 :: v_dual_mov_b32 v23, s18 +; GFX12-NEXT: v_mov_b32_e32 v22, s19 ; GFX12-NEXT: s_clause 0x5 -; GFX12-NEXT: global_store_b128 v59, v[20:23], s[16:17] offset:80 -; GFX12-NEXT: global_store_b128 v59, v[16:19], s[16:17] offset:64 -; GFX12-NEXT: global_store_b128 v59, v[12:15], s[16:17] offset:48 -; GFX12-NEXT: global_store_b128 v59, v[8:11], s[16:17] offset:32 -; GFX12-NEXT: global_store_b128 v59, v[4:7], s[16:17] offset:16 -; GFX12-NEXT: global_store_b128 v59, v[0:3], s[16:17] +; GFX12-NEXT: global_store_b128 v24, v[0:3], s[16:17] offset:80 +; GFX12-NEXT: global_store_b128 v24, v[4:7], s[16:17] offset:64 +; GFX12-NEXT: global_store_b128 v24, v[8:11], s[16:17] offset:48 +; GFX12-NEXT: global_store_b128 v24, v[12:15], s[16:17] offset:32 +; GFX12-NEXT: global_store_b128 v24, v[16:19], s[16:17] offset:16 +; GFX12-NEXT: global_store_b128 v24, v[20:23], s[16:17] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -5604,17 +5571,18 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i64(ptr addrspace(1) %out ; GFX8-NOHSA-LABEL: constant_zextload_v2i8_to_v2i64: ; GFX8-NOHSA: ; %bb.0: ; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, 8 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NOHSA-NEXT: flat_load_ushort v2, v[0:1] +; GFX8-NOHSA-NEXT: flat_load_ushort v0, v[0:1] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NOHSA-NEXT: s_waitcnt vmcnt(0) -; GFX8-NOHSA-NEXT: v_and_b32_e32 v0, 0xff, v2 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e32 v2, 8, v2 +; GFX8-NOHSA-NEXT: v_lshrrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NOHSA-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_endpgm ; @@ -5655,10 +5623,10 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i64(ptr addrspace(1) %out ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v0, v1, s[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b16 v2, 8, v0 +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v0 ; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v0, 0xff, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-NEXT: v_lshrrev_b32_e32 v2, 8, v2 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -5720,7 +5688,7 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i64(ptr addrspace(1) %out ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 ; 
GFX8-NOHSA-NEXT: s_waitcnt vmcnt(0) -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e32 v2, 8, v0 +; GFX8-NOHSA-NEXT: v_lshrrev_b32_e32 v2, 8, v0 ; GFX8-NOHSA-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GFX8-NOHSA-NEXT: v_bfe_i32 v2, v2, 0, 8 ; GFX8-NOHSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 @@ -5766,7 +5734,7 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i64(ptr addrspace(1) %out ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v0, v4, s[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b16 v1, 8, v0 +; GFX12-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_bfe_i32 v2, v1, 0, 8 @@ -5842,8 +5810,8 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i64(ptr addrspace(1) %out ; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_lshr_b32 s3, s2, 24 -; GFX8-NOHSA-NEXT: s_and_b32 s4, s2, 0xff -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v6, 8, s2 +; GFX8-NOHSA-NEXT: s_bfe_u32 s4, s2, 0x80008 +; GFX8-NOHSA-NEXT: s_and_b32 s5, s2, 0xff ; GFX8-NOHSA-NEXT: s_bfe_u32 s2, s2, 0x80010 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16 @@ -5853,8 +5821,8 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i64(ptr addrspace(1) %out ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, v6 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_endpgm @@ -5897,19 +5865,17 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i64(ptr addrspace(1) %out ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_lshrrev_b16 v4, 8, s2 ; GFX12-NEXT: s_bfe_u32 s3, s2, 0x80010 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s3 ; GFX12-NEXT: s_lshr_b32 s4, s2, 24 -; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: s_bfe_u32 s3, s2, 0x80008 ; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, v1 ; GFX12-NEXT: s_and_b32 s2, s2, 0xff ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-NEXT: v_mov_b32_e32 v2, s3 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -5988,24 +5954,25 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i64(ptr addrspace(1) %out ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_lshr_b32 s4, s2, 16 ; GFX8-NOHSA-NEXT: s_lshr_b32 s6, s2, 24 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v0, 8, s2 +; GFX8-NOHSA-NEXT: s_lshr_b32 s8, s2, 8 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 16 -; GFX8-NOHSA-NEXT: v_bfe_i32 v2, v0, 0, 8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5 -; GFX8-NOHSA-NEXT: 
s_addc_u32 s5, s1, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s6 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s7 +; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] -; GFX8-NOHSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s8 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s9 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_endpgm @@ -6049,20 +6016,18 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i64(ptr addrspace(1) %out ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_lshrrev_b16 v1, 8, s2 ; GFX12-NEXT: s_lshr_b32 s4, s2, 16 ; GFX12-NEXT: s_lshr_b32 s6, s2, 24 +; GFX12-NEXT: s_lshr_b32 s8, s2, 8 ; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000 -; GFX12-NEXT: v_bfe_i32 v2, v1, 0, 8 -; GFX12-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v5, s5 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v7, s7 -; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v1, s3 -; GFX12-NEXT: v_mov_b32_e32 v6, s6 -; GFX12-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v5, s5 +; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v7, s7 +; GFX12-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v3, s9 +; GFX12-NEXT: v_mov_b32_e32 v2, s8 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16 ; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1] @@ -6164,37 +6129,37 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i64(ptr addrspace(1) %out ; GFX8-NOHSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_lshr_b32 s4, s3, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s5, s2, 24 -; GFX8-NOHSA-NEXT: s_and_b32 s6, s3, 0xff -; GFX8-NOHSA-NEXT: s_and_b32 s7, s2, 0xff -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v7, 8, s2 -; GFX8-NOHSA-NEXT: s_bfe_u32 s8, s2, 0x80010 +; GFX8-NOHSA-NEXT: s_bfe_u32 s5, s3, 0x80008 +; GFX8-NOHSA-NEXT: s_lshr_b32 s6, s2, 24 +; GFX8-NOHSA-NEXT: s_bfe_u32 s7, s2, 0x80008 +; GFX8-NOHSA-NEXT: s_and_b32 s8, s2, 0xff +; GFX8-NOHSA-NEXT: s_bfe_u32 s9, s2, 0x80010 +; GFX8-NOHSA-NEXT: s_and_b32 s10, s3, 0xff ; GFX8-NOHSA-NEXT: s_bfe_u32 s2, s3, 0x80010 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 48 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v6, 8, s3 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 -; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16 +; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 32 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 ; 
GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s5 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s10 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s5 +; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NOHSA-NEXT: s_add_u32 s0, s0, 32 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s7 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, v7 -; GFX8-NOHSA-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s9 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, v6 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s7 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_endpgm @@ -6257,32 +6222,29 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i64(ptr addrspace(1) %out ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_bfe_u32 s4, s3, 0x80010 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 ; GFX12-NEXT: s_lshr_b32 s5, s3, 24 -; GFX12-NEXT: v_lshrrev_b16 v4, 8, s2 +; GFX12-NEXT: s_bfe_u32 s4, s3, 0x80008 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, v1 -; GFX12-NEXT: s_lshr_b32 s4, s2, 24 -; GFX12-NEXT: s_bfe_u32 s5, s2, 0x80010 -; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX12-NEXT: v_lshrrev_b16 v5, 8, s3 +; GFX12-NEXT: s_and_b32 s3, s3, 0xff ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:48 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v0, s5 +; GFX12-NEXT: v_mov_b32_e32 v0, s3 ; GFX12-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-NEXT: s_lshr_b32 s3, s2, 24 +; GFX12-NEXT: s_bfe_u32 s4, s2, 0x80010 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:32 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-NEXT: s_bfe_u32 s3, s2, 0x80008 ; GFX12-NEXT: s_and_b32 s2, s2, 0xff ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-NEXT: v_mov_b32_e32 v2, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v5 -; GFX12-NEXT: s_and_b32 s2, s3, 0xff +; GFX12-NEXT: v_mov_b32_e32 v2, s3 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-NEXT: v_mov_b32_e32 v2, v4 -; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:32 ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -6397,54 +6359,55 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out ; GFX8-NOHSA-LABEL: constant_sextload_v8i8_to_v8i64: ; GFX8-NOHSA: ; %bb.0: ; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX8-NOHSA-NEXT: s_mov_b32 s5, 0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx2 
s[2:3], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NOHSA-NEXT: s_lshr_b32 s6, s3, 16 -; GFX8-NOHSA-NEXT: s_lshr_b32 s8, s2, 16 -; GFX8-NOHSA-NEXT: s_lshr_b32 s10, s2, 24 -; GFX8-NOHSA-NEXT: s_mov_b32 s4, s3 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v0, 8, s2 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v1, 8, s3 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[12:13], s[2:3], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000 +; GFX8-NOHSA-NEXT: s_lshr_b32 s4, s3, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s6, s3, 8 +; GFX8-NOHSA-NEXT: s_mov_b32 s8, s3 +; GFX8-NOHSA-NEXT: s_lshr_b32 s10, s2, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s12, s2, 24 +; GFX8-NOHSA-NEXT: s_lshr_b32 s14, s2, 8 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[16:17], s[2:3], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 -; GFX8-NOHSA-NEXT: s_ashr_i64 s[2:3], s[2:3], 56 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s2 +; GFX8-NOHSA-NEXT: s_ashr_i64 s[2:3], s[2:3], 56 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 48 -; GFX8-NOHSA-NEXT: v_bfe_i32 v2, v1, 0, 8 -; GFX8-NOHSA-NEXT: v_bfe_i32 v6, v0, 0, 8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s6 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s7 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 32 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11] +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s9 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s7 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s9 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s10 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s11 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NOHSA-NEXT: s_add_u32 s0, s0, 32 -; GFX8-NOHSA-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s12 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s13 -; GFX8-NOHSA-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] -; GFX8-NOHSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s10 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s11 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s12 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s13 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NOHSA-NEXT: 
v_mov_b32_e32 v0, s16 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s17 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s14 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s15 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_endpgm @@ -6508,39 +6471,37 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out ; GFX12-LABEL: constant_sextload_v8i8_to_v8i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_lshrrev_b16 v6, 8, s2 -; GFX12-NEXT: v_lshrrev_b16 v7, 8, s3 -; GFX12-NEXT: s_lshr_b32 s6, s3, 16 -; GFX12-NEXT: s_lshr_b32 s8, s2, 16 -; GFX12-NEXT: s_lshr_b32 s10, s2, 24 -; GFX12-NEXT: v_bfe_i32 v6, v6, 0, 8 -; GFX12-NEXT: s_mov_b32 s4, s3 -; GFX12-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 +; GFX12-NEXT: s_lshr_b32 s4, s3, 16 +; GFX12-NEXT: s_lshr_b32 s6, s3, 8 +; GFX12-NEXT: s_mov_b32 s8, s3 +; GFX12-NEXT: s_lshr_b32 s10, s2, 16 +; GFX12-NEXT: s_lshr_b32 s12, s2, 24 +; GFX12-NEXT: s_lshr_b32 s14, s2, 8 +; GFX12-NEXT: s_bfe_i64 s[16:17], s[2:3], 0x80000 +; GFX12-NEXT: s_ashr_i64 s[2:3], s[2:3], 56 +; GFX12-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000 ; GFX12-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 ; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000 -; GFX12-NEXT: v_bfe_i32 v14, v7, 0, 8 -; GFX12-NEXT: s_bfe_i64 s[12:13], s[2:3], 0x80000 -; GFX12-NEXT: s_ashr_i64 s[2:3], s[2:3], 56 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v9, s9 -; GFX12-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v11, s11 -; GFX12-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v13, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s13 -; GFX12-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v1, s7 -; GFX12-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; GFX12-NEXT: v_mov_b32_e32 v12, s4 -; GFX12-NEXT: v_ashrrev_i32_e32 v15, 31, v14 +; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX12-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v9, s9 +; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v11, s7 +; GFX12-NEXT: v_dual_mov_b32 v10, s6 :: v_dual_mov_b32 v13, s11 +; GFX12-NEXT: v_dual_mov_b32 v12, s10 :: v_dual_mov_b32 v15, s13 +; GFX12-NEXT: v_dual_mov_b32 v14, s12 :: v_dual_mov_b32 v7, s15 +; GFX12-NEXT: v_mov_b32_e32 v6, s14 ; GFX12-NEXT: s_clause 0x3 -; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:16 -; GFX12-NEXT: global_store_b128 v16, v[4:7], s[0:1] ; GFX12-NEXT: global_store_b128 v16, v[0:3], s[0:1] offset:48 -; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1] offset:32 +; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:32 +; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1] offset:16 +; GFX12-NEXT: global_store_b128 v16, v[4:7], s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -6701,15 +6662,16 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: s_lshr_b32 s8, s5, 24 ; GFX8-NOHSA-NEXT: s_lshr_b32 s2, s7, 24 ; GFX8-NOHSA-NEXT: s_lshr_b32 s9, s6, 24 -; 
GFX8-NOHSA-NEXT: s_lshr_b32 s10, s4, 24 -; GFX8-NOHSA-NEXT: s_and_b32 s11, s4, 0xff -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v6, 8, s4 -; GFX8-NOHSA-NEXT: s_and_b32 s12, s5, 0xff -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v7, 8, s5 -; GFX8-NOHSA-NEXT: s_and_b32 s13, s7, 0xff -; GFX8-NOHSA-NEXT: s_and_b32 s14, s6, 0xff -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v9, 8, s6 +; GFX8-NOHSA-NEXT: s_bfe_u32 s10, s6, 0x80008 +; GFX8-NOHSA-NEXT: s_bfe_u32 s11, s7, 0x80008 +; GFX8-NOHSA-NEXT: s_bfe_u32 s12, s5, 0x80008 +; GFX8-NOHSA-NEXT: s_lshr_b32 s13, s4, 24 +; GFX8-NOHSA-NEXT: s_bfe_u32 s14, s4, 0x80008 +; GFX8-NOHSA-NEXT: s_and_b32 s15, s4, 0xff ; GFX8-NOHSA-NEXT: s_bfe_u32 s4, s4, 0x80010 +; GFX8-NOHSA-NEXT: s_and_b32 s16, s5, 0xff +; GFX8-NOHSA-NEXT: s_and_b32 s17, s7, 0xff +; GFX8-NOHSA-NEXT: s_and_b32 s18, s6, 0xff ; GFX8-NOHSA-NEXT: s_bfe_u32 s6, s6, 0x80010 ; GFX8-NOHSA-NEXT: s_bfe_u32 s5, s5, 0x80010 ; GFX8-NOHSA-NEXT: s_bfe_u32 s3, s7, 0x80010 @@ -6731,41 +6693,40 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 -; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16 +; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 64 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s9 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 -; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 64 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 0x60 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s18 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s10 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 -; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 0x60 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s14 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, v9 +; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 32 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s17 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s11 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v8, 8, s7 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 -; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 32 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s13 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, v8 +; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s16 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s12 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s12 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, v7 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s13 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s11 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, v6 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s15 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s14 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_endpgm @@ -6873,50 +6834,46 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v3, v1 ; GFX12-NEXT: 
s_bfe_u32 s3, s5, 0x80010 -; GFX12-NEXT: v_lshrrev_b16 v4, 8, s6 -; GFX12-NEXT: v_lshrrev_b16 v5, 8, s7 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:112 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s3 ; GFX12-NEXT: v_mov_b32_e32 v2, s2 ; GFX12-NEXT: s_lshr_b32 s2, s6, 24 ; GFX12-NEXT: s_bfe_u32 s3, s6, 0x80010 -; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:48 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s3 ; GFX12-NEXT: v_mov_b32_e32 v2, s2 -; GFX12-NEXT: s_lshr_b32 s2, s4, 24 -; GFX12-NEXT: s_bfe_u32 s3, s4, 0x80010 +; GFX12-NEXT: s_bfe_u32 s2, s6, 0x80008 +; GFX12-NEXT: s_and_b32 s3, s6, 0xff ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:80 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s3 ; GFX12-NEXT: v_mov_b32_e32 v2, s2 -; GFX12-NEXT: s_and_b32 s2, s6, 0xff -; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-NEXT: v_mov_b32_e32 v2, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v5 -; GFX12-NEXT: v_lshrrev_b16 v5, 8, s5 -; GFX12-NEXT: s_and_b32 s2, s7, 0xff +; GFX12-NEXT: s_bfe_u32 s2, s7, 0x80008 +; GFX12-NEXT: s_and_b32 s3, s7, 0xff ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:64 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-NEXT: v_mov_b32_e32 v2, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v5 -; GFX12-NEXT: v_lshrrev_b16 v5, 8, s4 -; GFX12-NEXT: s_and_b32 s2, s5, 0xff +; GFX12-NEXT: v_mov_b32_e32 v0, s3 +; GFX12-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-NEXT: s_bfe_u32 s2, s5, 0x80008 +; GFX12-NEXT: s_and_b32 s3, s5, 0xff ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:96 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-NEXT: v_mov_b32_e32 v2, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v5 -; GFX12-NEXT: s_and_b32 s2, s4, 0xff +; GFX12-NEXT: v_mov_b32_e32 v0, s3 +; GFX12-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-NEXT: s_lshr_b32 s2, s4, 24 +; GFX12-NEXT: s_bfe_u32 s3, s4, 0x80010 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:32 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-NEXT: v_mov_b32_e32 v0, s3 +; GFX12-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-NEXT: s_bfe_u32 s2, s4, 0x80008 +; GFX12-NEXT: s_and_b32 s3, s4, 0xff +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v0, s3 +; GFX12-NEXT: v_mov_b32_e32 v2, s2 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -7119,25 +7076,28 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX8-NOHSA: ; %bb.0: ; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NOHSA-NEXT: s_lshr_b32 s12, s11, 16 -; GFX8-NOHSA-NEXT: s_lshr_b32 s14, s10, 16 -; GFX8-NOHSA-NEXT: s_lshr_b32 s16, s10, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s18, s9, 16 -; GFX8-NOHSA-NEXT: s_lshr_b32 s20, s8, 16 -; GFX8-NOHSA-NEXT: s_lshr_b32 s22, s8, 24 -; GFX8-NOHSA-NEXT: s_mov_b32 s24, s11 -; GFX8-NOHSA-NEXT: s_mov_b32 s4, s9 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v0, 8, s11 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v1, 8, s10 -; GFX8-NOHSA-NEXT: 
v_lshrrev_b16_e64 v4, 8, s9 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v2, 8, s8 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[2:3], s[8:9], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[6:7], s[10:11], 0x80000 -; GFX8-NOHSA-NEXT: s_ashr_i64 s[8:9], s[8:9], 56 -; GFX8-NOHSA-NEXT: s_ashr_i64 s[10:11], s[10:11], 56 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000 +; GFX8-NOHSA-NEXT: s_lshr_b32 s10, s7, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s12, s7, 8 +; GFX8-NOHSA-NEXT: s_mov_b32 s14, s7 +; GFX8-NOHSA-NEXT: s_lshr_b32 s16, s6, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s18, s6, 24 +; GFX8-NOHSA-NEXT: s_lshr_b32 s20, s6, 8 +; GFX8-NOHSA-NEXT: s_lshr_b32 s22, s5, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s24, s5, 8 +; GFX8-NOHSA-NEXT: s_mov_b32 s26, s5 +; GFX8-NOHSA-NEXT: s_lshr_b32 s8, s4, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s28, s4, 24 +; GFX8-NOHSA-NEXT: s_lshr_b32 s30, s4, 8 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x80000 +; GFX8-NOHSA-NEXT: s_ashr_i64 s[34:35], s[4:5], 56 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[36:37], s[6:7], 0x80000 +; GFX8-NOHSA-NEXT: s_ashr_i64 s[38:39], s[6:7], 56 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[4:5], s[30:31], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[6:7], s[28:29], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 @@ -7145,74 +7105,76 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v18, s10 -; GFX8-NOHSA-NEXT: s_add_u32 s10, s0, 0x70 -; GFX8-NOHSA-NEXT: v_bfe_i32 v10, v1, 0, 8 -; GFX8-NOHSA-NEXT: v_bfe_i32 v14, v0, 0, 8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v19, s11 -; GFX8-NOHSA-NEXT: s_addc_u32 s11, s1, 0 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s10 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v16, s12 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v17, s13 +; GFX8-NOHSA-NEXT: s_add_u32 s10, s0, 0x70 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s11 +; GFX8-NOHSA-NEXT: s_addc_u32 s11, s1, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s10 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s38 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s39 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s11 +; GFX8-NOHSA-NEXT: s_add_u32 s10, s0, 0x60 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: s_addc_u32 s11, s1, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s10 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s14 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s15 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s12 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s13 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s11 ; GFX8-NOHSA-NEXT: s_add_u32 s10, s0, 0x50 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[16:19] +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_addc_u32 s11, s1, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s10 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v16, s14 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v17, s15 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v18, s16 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v19, s17 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s11 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[16:19] -; GFX8-NOHSA-NEXT: v_ashrrev_i32_e32 v15, 31, v14 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v18, s8 -; GFX8-NOHSA-NEXT: s_add_u32 s8, s0, 48 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v19, s9 -; 
GFX8-NOHSA-NEXT: s_addc_u32 s9, s1, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v16, s18 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v17, s19 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s9 -; GFX8-NOHSA-NEXT: s_add_u32 s8, s0, 16 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[16:19] -; GFX8-NOHSA-NEXT: s_addc_u32 s9, s1, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v16, s20 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v17, s21 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v18, s22 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v19, s23 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s9 -; GFX8-NOHSA-NEXT: s_add_u32 s8, s0, 0x60 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[16:19] -; GFX8-NOHSA-NEXT: s_addc_u32 s9, s1, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s10 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s16 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s17 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s18 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s19 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s11 +; GFX8-NOHSA-NEXT: s_add_u32 s10, s0, 64 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: s_addc_u32 s11, s1, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s10 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s36 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s37 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s20 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s21 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s11 +; GFX8-NOHSA-NEXT: s_add_u32 s10, s0, 48 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: s_addc_u32 s11, s1, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s10 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s22 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s23 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s11 +; GFX8-NOHSA-NEXT: s_add_u32 s10, s0, 32 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: s_addc_u32 s11, s1, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s10 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s26 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s27 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s24 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s25 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s11 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: s_nop 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 16 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s24 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s25 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s9 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s6 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 64 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[12:15] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s7 -; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NOHSA-NEXT: v_bfe_i32 v6, v4, 0, 8 -; GFX8-NOHSA-NEXT: v_ashrrev_i32_e32 v11, 31, v10 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 32 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5 -; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NOHSA-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NOHSA-NEXT: v_bfe_i32 v2, v2, 0, 8 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] -; GFX8-NOHSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 +; 
GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_endpgm @@ -7322,63 +7284,61 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_lshrrev_b16 v10, 8, s7 -; GFX12-NEXT: v_lshrrev_b16 v11, 8, s6 -; GFX12-NEXT: v_lshrrev_b16 v21, 8, s5 -; GFX12-NEXT: v_lshrrev_b16 v23, 8, s4 -; GFX12-NEXT: s_lshr_b32 s8, s7, 16 -; GFX12-NEXT: s_lshr_b32 s10, s6, 16 -; GFX12-NEXT: s_lshr_b32 s12, s6, 24 -; GFX12-NEXT: v_bfe_i32 v22, v10, 0, 8 -; GFX12-NEXT: v_bfe_i32 v10, v11, 0, 8 -; GFX12-NEXT: s_lshr_b32 s18, s4, 24 -; GFX12-NEXT: s_mov_b32 s20, s7 -; GFX12-NEXT: s_lshr_b32 s14, s5, 16 -; GFX12-NEXT: s_bfe_i64 s[24:25], s[6:7], 0x80000 +; GFX12-NEXT: s_lshr_b32 s2, s7, 16 +; GFX12-NEXT: s_lshr_b32 s8, s7, 8 +; GFX12-NEXT: s_mov_b32 s10, s7 +; GFX12-NEXT: s_lshr_b32 s12, s6, 16 +; GFX12-NEXT: s_lshr_b32 s14, s6, 24 +; GFX12-NEXT: s_lshr_b32 s16, s6, 8 +; GFX12-NEXT: s_bfe_i64 s[34:35], s[6:7], 0x80000 ; GFX12-NEXT: s_ashr_i64 s[6:7], s[6:7], 56 -; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000 ; GFX12-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 ; GFX12-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 -; GFX12-NEXT: v_bfe_i32 v28, v21, 0, 8 -; GFX12-NEXT: s_lshr_b32 s16, s4, 16 +; GFX12-NEXT: v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: s_lshr_b32 s18, s5, 16 +; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v5, s35 +; GFX12-NEXT: v_dual_mov_b32 v4, s34 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v9, s11 +; GFX12-NEXT: s_lshr_b32 s20, s5, 8 ; GFX12-NEXT: s_mov_b32 s22, s5 -; GFX12-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v8, s10 :: v_dual_mov_b32 v11, s9 +; GFX12-NEXT: v_dual_mov_b32 v10, s8 :: v_dual_mov_b32 v13, s13 +; GFX12-NEXT: s_lshr_b32 s24, s4, 16 +; GFX12-NEXT: s_lshr_b32 s26, s4, 24 +; GFX12-NEXT: s_lshr_b32 s28, s4, 8 +; GFX12-NEXT: s_bfe_i64 s[30:31], s[4:5], 0x80000 ; GFX12-NEXT: s_ashr_i64 s[4:5], s[4:5], 56 -; GFX12-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 ; GFX12-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v30, 0 :: v_dual_mov_b32 v3, s7 -; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v7, s5 -; GFX12-NEXT: v_dual_mov_b32 v8, s24 :: v_dual_mov_b32 v1, s9 -; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v13, s11 -; GFX12-NEXT: v_dual_mov_b32 v12, s10 :: v_dual_mov_b32 v15, s13 -; GFX12-NEXT: v_dual_mov_b32 v14, s12 :: v_dual_mov_b32 v5, s15 -; GFX12-NEXT: v_bfe_i32 v24, v23, 0, 8 +; GFX12-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v15, s15 +; GFX12-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v7, s17 ; GFX12-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v9, s25 -; GFX12-NEXT: v_dual_mov_b32 v18, s18 :: v_dual_mov_b32 v21, s21 -; GFX12-NEXT: v_dual_mov_b32 
v20, s20 :: v_dual_mov_b32 v27, s23 -; GFX12-NEXT: v_ashrrev_i32_e32 v23, 31, v22 -; GFX12-NEXT: v_ashrrev_i32_e32 v11, 31, v10 -; GFX12-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v4, s14 :: v_dual_mov_b32 v17, s17 -; GFX12-NEXT: v_ashrrev_i32_e32 v29, 31, v28 -; GFX12-NEXT: v_mov_b32_e32 v26, s22 -; GFX12-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v19, s19 -; GFX12-NEXT: v_ashrrev_i32_e32 v25, 31, v24 +; GFX12-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v6, s16 :: v_dual_mov_b32 v17, s19 +; GFX12-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v16, s18 :: v_dual_mov_b32 v19, s5 +; GFX12-NEXT: v_mov_b32_e32 v18, s4 +; GFX12-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000 ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v30, v[0:3], s[0:1] offset:112 -; GFX12-NEXT: global_store_b128 v30, v[20:23], s[0:1] offset:96 -; GFX12-NEXT: v_dual_mov_b32 v22, s2 :: v_dual_mov_b32 v23, s3 +; GFX12-NEXT: global_store_b128 v24, v[0:3], s[0:1] offset:112 +; GFX12-NEXT: global_store_b128 v24, v[8:11], s[0:1] offset:96 +; GFX12-NEXT: v_dual_mov_b32 v1, s23 :: v_dual_mov_b32 v0, s22 +; GFX12-NEXT: v_dual_mov_b32 v3, s21 :: v_dual_mov_b32 v2, s20 +; GFX12-NEXT: v_dual_mov_b32 v9, s25 :: v_dual_mov_b32 v8, s24 +; GFX12-NEXT: v_dual_mov_b32 v11, s27 :: v_dual_mov_b32 v10, s26 +; GFX12-NEXT: v_dual_mov_b32 v21, s31 :: v_dual_mov_b32 v20, s30 +; GFX12-NEXT: v_dual_mov_b32 v23, s29 :: v_dual_mov_b32 v22, s28 ; GFX12-NEXT: s_clause 0x5 -; GFX12-NEXT: global_store_b128 v30, v[12:15], s[0:1] offset:80 -; GFX12-NEXT: global_store_b128 v30, v[8:11], s[0:1] offset:64 -; GFX12-NEXT: global_store_b128 v30, v[4:7], s[0:1] offset:48 -; GFX12-NEXT: global_store_b128 v30, v[26:29], s[0:1] offset:32 -; GFX12-NEXT: global_store_b128 v30, v[16:19], s[0:1] offset:16 -; GFX12-NEXT: global_store_b128 v30, v[22:25], s[0:1] +; GFX12-NEXT: global_store_b128 v24, v[12:15], s[0:1] offset:80 +; GFX12-NEXT: global_store_b128 v24, v[4:7], s[0:1] offset:64 +; GFX12-NEXT: global_store_b128 v24, v[16:19], s[0:1] offset:48 +; GFX12-NEXT: global_store_b128 v24, v[0:3], s[0:1] offset:32 +; GFX12-NEXT: global_store_b128 v24, v[8:11], s[0:1] offset:16 +; GFX12-NEXT: global_store_b128 v24, v[20:23], s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -7654,154 +7614,149 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NOHSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 +; GFX8-NOHSA-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NOHSA-NEXT: s_lshr_b32 s12, s5, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s13, s7, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s14, s9, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s15, s11, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s16, s10, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s17, s8, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s18, s6, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s19, s4, 24 -; GFX8-NOHSA-NEXT: s_and_b32 s2, s4, 0xff -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v4, 8, s4 -; GFX8-NOHSA-NEXT: s_and_b32 s3, s5, 0xff -; GFX8-NOHSA-NEXT: s_and_b32 s20, s6, 0xff -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v6, 8, s6 -; GFX8-NOHSA-NEXT: s_and_b32 s21, s7, 0xff -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v7, 8, s7 -; GFX8-NOHSA-NEXT: s_and_b32 
s22, s8, 0xff -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v8, 8, s8 -; GFX8-NOHSA-NEXT: s_and_b32 s23, s9, 0xff -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v9, 8, s9 -; GFX8-NOHSA-NEXT: s_and_b32 s24, s10, 0xff -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v12, 8, s10 -; GFX8-NOHSA-NEXT: s_and_b32 s25, s11, 0xff -; GFX8-NOHSA-NEXT: s_bfe_u32 s26, s4, 0x80010 -; GFX8-NOHSA-NEXT: s_bfe_u32 s6, s6, 0x80010 -; GFX8-NOHSA-NEXT: s_bfe_u32 s8, s8, 0x80010 +; GFX8-NOHSA-NEXT: s_lshr_b32 s16, s9, 24 +; GFX8-NOHSA-NEXT: s_lshr_b32 s17, s11, 24 +; GFX8-NOHSA-NEXT: s_lshr_b32 s18, s13, 24 +; GFX8-NOHSA-NEXT: s_lshr_b32 s6, s15, 24 +; GFX8-NOHSA-NEXT: s_bfe_u32 s19, s15, 0x80008 +; GFX8-NOHSA-NEXT: s_lshr_b32 s20, s14, 24 +; GFX8-NOHSA-NEXT: s_bfe_u32 s21, s14, 0x80008 +; GFX8-NOHSA-NEXT: s_bfe_u32 s22, s13, 0x80008 +; GFX8-NOHSA-NEXT: s_lshr_b32 s23, s12, 24 +; GFX8-NOHSA-NEXT: s_bfe_u32 s24, s12, 0x80008 +; GFX8-NOHSA-NEXT: s_bfe_u32 s25, s11, 0x80008 +; GFX8-NOHSA-NEXT: s_lshr_b32 s26, s10, 24 +; GFX8-NOHSA-NEXT: s_bfe_u32 s27, s10, 0x80008 +; GFX8-NOHSA-NEXT: s_bfe_u32 s28, s9, 0x80008 +; GFX8-NOHSA-NEXT: s_lshr_b32 s4, s8, 24 +; GFX8-NOHSA-NEXT: s_bfe_u32 s2, s8, 0x80008 +; GFX8-NOHSA-NEXT: s_and_b32 s3, s8, 0xff +; GFX8-NOHSA-NEXT: s_bfe_u32 s5, s8, 0x80010 +; GFX8-NOHSA-NEXT: s_and_b32 s8, s9, 0xff +; GFX8-NOHSA-NEXT: s_and_b32 s29, s10, 0xff ; GFX8-NOHSA-NEXT: s_bfe_u32 s10, s10, 0x80010 -; GFX8-NOHSA-NEXT: s_bfe_u32 s27, s5, 0x80010 -; GFX8-NOHSA-NEXT: s_bfe_u32 s7, s7, 0x80010 +; GFX8-NOHSA-NEXT: s_and_b32 s30, s11, 0xff +; GFX8-NOHSA-NEXT: s_and_b32 s31, s12, 0xff +; GFX8-NOHSA-NEXT: s_bfe_u32 s12, s12, 0x80010 +; GFX8-NOHSA-NEXT: s_and_b32 s33, s13, 0xff +; GFX8-NOHSA-NEXT: s_and_b32 s34, s14, 0xff +; GFX8-NOHSA-NEXT: s_bfe_u32 s14, s14, 0x80010 +; GFX8-NOHSA-NEXT: s_and_b32 s35, s15, 0xff ; GFX8-NOHSA-NEXT: s_bfe_u32 s9, s9, 0x80010 -; GFX8-NOHSA-NEXT: s_bfe_u32 s4, s11, 0x80010 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 0xf0 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v5, 8, s5 -; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s5 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s4 -; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 0xb0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s15 -; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s5 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s4 -; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 0x70 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s9 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s14 -; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s5 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s4 -; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 48 +; GFX8-NOHSA-NEXT: s_bfe_u32 s11, s11, 0x80010 +; GFX8-NOHSA-NEXT: s_bfe_u32 s13, s13, 0x80010 +; GFX8-NOHSA-NEXT: s_bfe_u32 s7, s15, 0x80010 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 0xf0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s7 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s13 -; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s5 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s4 -; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 0xd0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s27 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s12 -; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s5 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 
v10, s4 -; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 0x90 +; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 0xb0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s13 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s18 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 0x70 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s11 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s17 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 48 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s9 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s16 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 0xe0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s35 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s19 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 0xd0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s14 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s20 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 0xc0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s34 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s21 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 0xa0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s33 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s22 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 0x90 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s12 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s23 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 0x80 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s31 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s24 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 0x60 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s30 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s25 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 0x50 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s10 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s16 -; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 
v[10:11], v[0:3] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s5 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s4 -; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 0x50 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s26 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 64 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s29 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s27 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 32 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s17 -; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s5 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s4 -; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 16 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s18 -; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s5 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s4 -; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 0xe0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s26 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s19 -; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s5 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v13, 8, s11 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s4 -; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 0xc0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s25 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, v13 -; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s5 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s4 -; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 0xa0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s24 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, v12 -; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3] -; GFX8-NOHSA-NEXT: s_nop 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, v9 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s5 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s4 -; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 0x80 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s23 -; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[9:10], v[0:3] -; GFX8-NOHSA-NEXT: s_nop 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, v8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s5 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s4 -; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 0x60 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s22 -; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] -; GFX8-NOHSA-NEXT: s_nop 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, v7 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s5 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s4 -; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 64 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s21 -; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[7:8], v[0:3] +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s28 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_nop 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, v6 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s5 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s4 -; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 32 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s20 +; GFX8-NOHSA-NEXT: 
v_mov_b32_e32 v2, s4 +; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 16 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s5 ; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3] -; GFX8-NOHSA-NEXT: s_nop 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, v5 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s5 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s3 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s4 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[5:6], v[0:3] -; GFX8-NOHSA-NEXT: s_nop 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, v4 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_endpgm @@ -7990,16 +7945,12 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v2, s11 :: v_dual_mov_b32 v3, v1 ; GFX12-NEXT: s_bfe_u32 s11, s5, 0x80010 -; GFX12-NEXT: v_lshrrev_b16 v4, 8, s7 -; GFX12-NEXT: v_lshrrev_b16 v5, 8, s6 -; GFX12-NEXT: s_and_b32 s7, s7, 0xff ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:240 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s11 ; GFX12-NEXT: v_mov_b32_e32 v2, s10 ; GFX12-NEXT: s_lshr_b32 s10, s3, 24 ; GFX12-NEXT: s_bfe_u32 s11, s3, 0x80010 -; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:176 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s11 @@ -8010,77 +7961,78 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s11 ; GFX12-NEXT: v_mov_b32_e32 v2, s10 -; GFX12-NEXT: s_lshr_b32 s10, s6, 24 -; GFX12-NEXT: s_bfe_u32 s11, s6, 0x80010 -; GFX12-NEXT: s_and_b32 s6, s6, 0xff +; GFX12-NEXT: s_bfe_u32 s10, s7, 0x80008 +; GFX12-NEXT: s_and_b32 s7, s7, 0xff ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:48 +; GFX12-NEXT: v_mov_b32_e32 v0, s7 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v0, s11 ; GFX12-NEXT: v_mov_b32_e32 v2, s10 -; GFX12-NEXT: s_lshr_b32 s10, s4, 24 -; GFX12-NEXT: s_bfe_u32 s11, s4, 0x80010 +; GFX12-NEXT: s_lshr_b32 s7, s6, 24 +; GFX12-NEXT: s_bfe_u32 s10, s6, 0x80010 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:224 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v0, s10 +; GFX12-NEXT: v_mov_b32_e32 v2, s7 +; GFX12-NEXT: s_bfe_u32 s7, s6, 0x80008 +; GFX12-NEXT: s_and_b32 s6, s6, 0xff ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:208 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v0, s11 -; GFX12-NEXT: v_mov_b32_e32 v2, s10 -; GFX12-NEXT: s_lshr_b32 s10, s2, 24 -; GFX12-NEXT: s_bfe_u32 s11, s2, 0x80010 -; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:144 +; GFX12-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-NEXT: v_mov_b32_e32 v2, s7 +; GFX12-NEXT: s_bfe_u32 s6, s5, 0x80008 +; GFX12-NEXT: s_and_b32 s5, s5, 0xff +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:192 +; GFX12-NEXT: v_mov_b32_e32 v0, s5 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v0, s11 -; GFX12-NEXT: v_mov_b32_e32 v2, s10 -; GFX12-NEXT: s_lshr_b32 s10, s0, 24 -; GFX12-NEXT: s_bfe_u32 s11, s0, 0x80010 -; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:80 +; GFX12-NEXT: 
v_mov_b32_e32 v2, s6 +; GFX12-NEXT: s_lshr_b32 s5, s4, 24 +; GFX12-NEXT: s_bfe_u32 s6, s4, 0x80010 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:160 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v0, s11 -; GFX12-NEXT: v_mov_b32_e32 v2, s10 -; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:16 -; GFX12-NEXT: v_mov_b32_e32 v0, s7 -; GFX12-NEXT: v_mov_b32_e32 v2, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v5 -; GFX12-NEXT: v_lshrrev_b16 v5, 8, s5 -; GFX12-NEXT: s_and_b32 s5, s5, 0xff -; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:224 ; GFX12-NEXT: v_mov_b32_e32 v0, s6 -; GFX12-NEXT: v_mov_b32_e32 v2, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v5 -; GFX12-NEXT: v_lshrrev_b16 v5, 8, s4 +; GFX12-NEXT: v_mov_b32_e32 v2, s5 +; GFX12-NEXT: s_bfe_u32 s5, s4, 0x80008 ; GFX12-NEXT: s_and_b32 s4, s4, 0xff -; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:192 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:144 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v0, s5 -; GFX12-NEXT: v_mov_b32_e32 v2, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v5 -; GFX12-NEXT: v_lshrrev_b16 v5, 8, s3 -; GFX12-NEXT: s_and_b32 s3, s3, 0xff -; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:160 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 -; GFX12-NEXT: v_mov_b32_e32 v2, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v5 -; GFX12-NEXT: v_lshrrev_b16 v5, 8, s2 -; GFX12-NEXT: s_and_b32 s2, s2, 0xff +; GFX12-NEXT: v_mov_b32_e32 v2, s5 +; GFX12-NEXT: s_bfe_u32 s4, s3, 0x80008 +; GFX12-NEXT: s_and_b32 s3, s3, 0xff ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:128 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s3 -; GFX12-NEXT: v_mov_b32_e32 v2, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v5 -; GFX12-NEXT: v_lshrrev_b16 v5, 8, s1 -; GFX12-NEXT: s_and_b32 s1, s1, 0xff +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-NEXT: s_lshr_b32 s3, s2, 24 +; GFX12-NEXT: s_bfe_u32 s4, s2, 0x80010 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:96 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-NEXT: s_bfe_u32 s3, s2, 0x80008 +; GFX12-NEXT: s_and_b32 s2, s2, 0xff +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:80 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-NEXT: v_mov_b32_e32 v2, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v5 -; GFX12-NEXT: v_lshrrev_b16 v5, 8, s0 -; GFX12-NEXT: s_and_b32 s0, s0, 0xff +; GFX12-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-NEXT: s_bfe_u32 s2, s1, 0x80008 +; GFX12-NEXT: s_and_b32 s1, s1, 0xff ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:64 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s1 -; GFX12-NEXT: v_mov_b32_e32 v2, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v5 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-NEXT: s_lshr_b32 s1, s0, 24 +; GFX12-NEXT: s_bfe_u32 s2, s0, 0x80010 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:32 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-NEXT: s_bfe_u32 s1, s0, 0x80008 +; GFX12-NEXT: s_and_b32 s0, s0, 0xff +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:16 +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, s0 -; GFX12-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-NEXT: v_mov_b32_e32 v2, s1 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: 
s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -8465,194 +8417,211 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NOHSA-NEXT: s_lshr_b32 s28, s7, 16 -; GFX8-NOHSA-NEXT: s_lshr_b32 s30, s6, 16 -; GFX8-NOHSA-NEXT: s_lshr_b32 s34, s6, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s36, s5, 16 -; GFX8-NOHSA-NEXT: s_lshr_b32 s38, s4, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s46, s7, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s48, s7, 8 +; GFX8-NOHSA-NEXT: s_mov_b32 s50, s7 +; GFX8-NOHSA-NEXT: s_lshr_b32 s52, s6, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s54, s6, 24 +; GFX8-NOHSA-NEXT: s_lshr_b32 s56, s6, 8 +; GFX8-NOHSA-NEXT: s_lshr_b32 s58, s5, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s60, s5, 8 +; GFX8-NOHSA-NEXT: s_mov_b32 s62, s5 +; GFX8-NOHSA-NEXT: s_lshr_b32 s44, s4, 16 ; GFX8-NOHSA-NEXT: s_lshr_b32 s40, s4, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s42, s3, 16 -; GFX8-NOHSA-NEXT: s_lshr_b32 s44, s2, 16 -; GFX8-NOHSA-NEXT: s_lshr_b32 s46, s2, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s26, s1, 16 -; GFX8-NOHSA-NEXT: s_lshr_b32 s22, s0, 16 -; GFX8-NOHSA-NEXT: s_lshr_b32 s20, s0, 24 -; GFX8-NOHSA-NEXT: s_mov_b32 s48, s7 -; GFX8-NOHSA-NEXT: s_mov_b32 s50, s5 -; GFX8-NOHSA-NEXT: s_mov_b32 s52, s3 -; GFX8-NOHSA-NEXT: s_mov_b32 s54, s1 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v4, 8, s7 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v1, 8, s6 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v0, 8, s5 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v5, 8, s4 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v8, 8, s3 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v9, 8, s2 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v6, 8, s1 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v2, 8, s0 +; GFX8-NOHSA-NEXT: s_lshr_b32 s38, s4, 8 +; GFX8-NOHSA-NEXT: s_lshr_b32 s36, s3, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s30, s3, 8 +; GFX8-NOHSA-NEXT: s_mov_b32 s28, s3 +; GFX8-NOHSA-NEXT: s_lshr_b32 s24, s2, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s22, s2, 24 +; GFX8-NOHSA-NEXT: s_lshr_b32 s20, s2, 8 +; GFX8-NOHSA-NEXT: s_lshr_b32 s18, s1, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s14, s1, 8 +; GFX8-NOHSA-NEXT: s_mov_b32 s64, s1 +; GFX8-NOHSA-NEXT: s_lshr_b32 s66, s0, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s68, s0, 24 +; GFX8-NOHSA-NEXT: s_lshr_b32 s70, s0, 8 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[10:11], s[0:1], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[12:13], s[2:3], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[14:15], s[4:5], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[16:17], s[6:7], 0x80000 -; GFX8-NOHSA-NEXT: s_ashr_i64 s[18:19], s[0:1], 56 -; GFX8-NOHSA-NEXT: s_ashr_i64 s[24:25], s[2:3], 56 -; GFX8-NOHSA-NEXT: s_ashr_i64 s[56:57], s[4:5], 56 -; GFX8-NOHSA-NEXT: s_ashr_i64 s[58:59], s[6:7], 56 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[0:1], s[54:55], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[2:3], s[52:53], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[4:5], s[50:51], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[6:7], s[48:49], 0x80000 +; GFX8-NOHSA-NEXT: s_ashr_i64 s[12:13], s[0:1], 56 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[16:17], s[2:3], 0x80000 +; GFX8-NOHSA-NEXT: s_ashr_i64 s[26:27], s[2:3], 56 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[34:35], s[4:5], 0x80000 +; GFX8-NOHSA-NEXT: s_ashr_i64 s[42:43], s[4:5], 56 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[72:73], s[6:7], 0x80000 +; GFX8-NOHSA-NEXT: s_ashr_i64 s[74:75], s[6:7], 56 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[0:1], s[70:71], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[2:3], s[68:69], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[4:5], s[66:67], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[6:7], s[64:65], 0x80000 +; 
GFX8-NOHSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v16, s28 -; GFX8-NOHSA-NEXT: s_add_u32 s28, s8, 0xf0 -; GFX8-NOHSA-NEXT: v_bfe_i32 v10, v9, 0, 8 -; GFX8-NOHSA-NEXT: v_bfe_i32 v14, v8, 0, 8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v17, s29 -; GFX8-NOHSA-NEXT: s_addc_u32 s29, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s28 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v18, s58 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v19, s59 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s29 -; GFX8-NOHSA-NEXT: s_add_u32 s28, s8, 0xd0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[8:9], v[16:19] -; GFX8-NOHSA-NEXT: s_addc_u32 s29, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s28 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v16, s30 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v17, s31 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v18, s34 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v19, s35 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s29 -; GFX8-NOHSA-NEXT: s_add_u32 s28, s8, 0xb0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[8:9], v[16:19] -; GFX8-NOHSA-NEXT: s_addc_u32 s29, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s28 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v16, s36 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v17, s37 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v18, s56 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v19, s57 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s29 -; GFX8-NOHSA-NEXT: s_add_u32 s28, s8, 0x90 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[8:9], v[16:19] -; GFX8-NOHSA-NEXT: s_addc_u32 s29, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s28 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v16, s38 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v17, s39 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v18, s40 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v19, s41 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s29 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[8:9], v[16:19] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v20, s6 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v18, s24 -; GFX8-NOHSA-NEXT: s_add_u32 s24, s8, 0x70 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v19, s25 -; GFX8-NOHSA-NEXT: s_addc_u32 s25, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s24 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v16, s42 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v17, s43 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s25 -; GFX8-NOHSA-NEXT: s_add_u32 s24, s8, 0x50 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[8:9], v[16:19] -; GFX8-NOHSA-NEXT: s_addc_u32 s25, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s24 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v16, s44 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v17, s45 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v18, s46 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v19, s47 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s25 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[8:9], v[16:19] -; GFX8-NOHSA-NEXT: v_bfe_i32 v22, v4, 0, 8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v18, s18 -; GFX8-NOHSA-NEXT: s_add_u32 s18, s8, 48 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v19, s19 
-; GFX8-NOHSA-NEXT: s_addc_u32 s19, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s18 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v16, s26 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v17, s27 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s19 -; GFX8-NOHSA-NEXT: s_add_u32 s18, s8, 16 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[8:9], v[16:19] -; GFX8-NOHSA-NEXT: s_addc_u32 s19, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s18 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v16, s22 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v17, s23 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v18, s20 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v19, s21 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s19 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s8, 0xe0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[8:9], v[16:19] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v21, s7 -; GFX8-NOHSA-NEXT: v_bfe_i32 v18, v5, 0, 8 -; GFX8-NOHSA-NEXT: s_addc_u32 s7, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 -; GFX8-NOHSA-NEXT: v_ashrrev_i32_e32 v23, 31, v22 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s8, 0xc0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[20:23] +; GFX8-NOHSA-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[62:63], s[62:63], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x80000 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s46 +; GFX8-NOHSA-NEXT: s_add_u32 s46, s8, 0xf0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s47 +; GFX8-NOHSA-NEXT: s_addc_u32 s47, s9, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s46 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s74 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s75 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s47 +; GFX8-NOHSA-NEXT: s_add_u32 s46, s8, 0xe0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: s_addc_u32 s47, s9, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s46 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s50 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s51 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s48 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s49 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s47 +; GFX8-NOHSA-NEXT: s_add_u32 s46, s8, 0xd0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: s_addc_u32 s47, s9, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s46 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s52 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s53 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s54 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s55 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s47 +; GFX8-NOHSA-NEXT: s_add_u32 s46, s8, 0xc0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: s_addc_u32 s47, s9, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s46 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s72 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s73 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s56 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s57 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s47 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: s_nop 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s42 +; GFX8-NOHSA-NEXT: s_add_u32 s42, s8, 
0xb0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s43 +; GFX8-NOHSA-NEXT: s_addc_u32 s43, s9, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s42 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s58 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s59 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s43 +; GFX8-NOHSA-NEXT: s_add_u32 s42, s8, 0xa0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: s_addc_u32 s43, s9, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s42 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s62 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s63 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s60 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s61 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s43 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: s_nop 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s40 +; GFX8-NOHSA-NEXT: s_add_u32 s40, s8, 0x90 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s41 +; GFX8-NOHSA-NEXT: s_addc_u32 s41, s9, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s40 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s44 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s45 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s41 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: s_nop 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s34 +; GFX8-NOHSA-NEXT: s_add_u32 s34, s8, 0x80 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s35 +; GFX8-NOHSA-NEXT: s_addc_u32 s35, s9, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s34 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s38 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s39 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s35 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: s_nop 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s26 +; GFX8-NOHSA-NEXT: s_add_u32 s26, s8, 0x70 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s27 +; GFX8-NOHSA-NEXT: s_addc_u32 s27, s9, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s26 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s36 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s37 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s27 +; GFX8-NOHSA-NEXT: s_add_u32 s26, s8, 0x60 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: s_addc_u32 s27, s9, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s26 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s28 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s29 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s30 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s31 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s27 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: s_nop 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s22 +; GFX8-NOHSA-NEXT: s_add_u32 s22, s8, 0x50 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s23 +; GFX8-NOHSA-NEXT: s_addc_u32 s23, s9, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s22 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s24 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s25 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s23 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: s_nop 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s16 +; GFX8-NOHSA-NEXT: s_add_u32 s16, s8, 64 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s17 +; GFX8-NOHSA-NEXT: s_addc_u32 s17, s9, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s16 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s20 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s21 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s17 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: s_nop 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s12 +; GFX8-NOHSA-NEXT: s_add_u32 s12, s8, 48 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s13 +; GFX8-NOHSA-NEXT: s_addc_u32 s13, s9, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s12 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s18 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s19 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, 
s13 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: s_nop 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s8, 32 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NOHSA-NEXT: s_addc_u32 s7, s9, 0 -; GFX8-NOHSA-NEXT: v_bfe_i32 v22, v1, 0, 8 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 -; GFX8-NOHSA-NEXT: v_ashrrev_i32_e32 v23, 31, v22 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v20, s16 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v21, s17 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s14 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s15 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[20:23] -; GFX8-NOHSA-NEXT: v_ashrrev_i32_e32 v19, 31, v18 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v20, s4 -; GFX8-NOHSA-NEXT: s_add_u32 s4, s8, 0xa0 -; GFX8-NOHSA-NEXT: v_bfe_i32 v22, v0, 0, 8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v21, s5 -; GFX8-NOHSA-NEXT: s_addc_u32 s5, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NOHSA-NEXT: v_ashrrev_i32_e32 v23, 31, v22 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NOHSA-NEXT: s_add_u32 s4, s8, 0x80 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[20:23] -; GFX8-NOHSA-NEXT: s_addc_u32 s5, s9, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: s_nop 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NOHSA-NEXT: s_add_u32 s2, s8, 16 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NOHSA-NEXT: s_addc_u32 s3, s9, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v16, s14 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v17, s15 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s2 -; GFX8-NOHSA-NEXT: s_add_u32 s2, s8, 0x60 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[16:19] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s3 -; GFX8-NOHSA-NEXT: s_addc_u32 s3, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NOHSA-NEXT: v_ashrrev_i32_e32 v15, 31, v14 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NOHSA-NEXT: s_add_u32 s2, s8, 64 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[12:15] -; GFX8-NOHSA-NEXT: s_addc_u32 s3, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NOHSA-NEXT: v_ashrrev_i32_e32 v11, 31, v10 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s12 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s13 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 32 -; GFX8-NOHSA-NEXT: v_bfe_i32 v6, v6, 0, 8 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NOHSA-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NOHSA-NEXT: v_bfe_i32 v2, v2, 0, 8 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] -; GFX8-NOHSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s8 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s10 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s11 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s9 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_endpgm @@ -8852,114 +8821,118 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: 
v_lshrrev_b16 v0, 8, s7 -; GFX12-NEXT: v_lshrrev_b16 v3, 8, s5 -; GFX12-NEXT: v_lshrrev_b16 v7, 8, s2 -; GFX12-NEXT: v_lshrrev_b16 v1, 8, s6 -; GFX12-NEXT: v_lshrrev_b16 v4, 8, s4 -; GFX12-NEXT: v_lshrrev_b16 v6, 8, s1 -; GFX12-NEXT: v_lshrrev_b16 v5, 8, s3 -; GFX12-NEXT: v_lshrrev_b16 v2, 8, s0 -; GFX12-NEXT: s_lshr_b32 s20, s7, 16 -; GFX12-NEXT: s_lshr_b32 s24, s6, 24 -; GFX12-NEXT: s_lshr_b32 s26, s5, 16 -; GFX12-NEXT: s_lshr_b32 s36, s2, 16 -; GFX12-NEXT: s_lshr_b32 s38, s2, 24 -; GFX12-NEXT: v_bfe_i32 v10, v7, 0, 8 -; GFX12-NEXT: v_bfe_i32 v22, v3, 0, 8 -; GFX12-NEXT: v_bfe_i32 v30, v0, 0, 8 -; GFX12-NEXT: s_lshr_b32 s42, s0, 16 -; GFX12-NEXT: s_mov_b32 s46, s7 -; GFX12-NEXT: s_mov_b32 s48, s5 -; GFX12-NEXT: s_mov_b32 s50, s3 -; GFX12-NEXT: s_lshr_b32 s22, s6, 16 -; GFX12-NEXT: s_lshr_b32 s28, s4, 16 -; GFX12-NEXT: s_lshr_b32 s30, s4, 24 -; GFX12-NEXT: s_lshr_b32 s40, s1, 16 -; GFX12-NEXT: s_bfe_i64 s[16:17], s[6:7], 0x80000 -; GFX12-NEXT: s_ashr_i64 s[54:55], s[2:3], 56 -; GFX12-NEXT: s_ashr_i64 s[56:57], s[4:5], 56 -; GFX12-NEXT: s_ashr_i64 s[6:7], s[6:7], 56 -; GFX12-NEXT: v_bfe_i32 v6, v6, 0, 8 -; GFX12-NEXT: v_bfe_i32 v18, v4, 0, 8 -; GFX12-NEXT: v_bfe_i32 v26, v1, 0, 8 -; GFX12-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000 +; GFX12-NEXT: s_lshr_b32 s36, s7, 16 +; GFX12-NEXT: s_lshr_b32 s38, s7, 8 +; GFX12-NEXT: s_mov_b32 s40, s7 +; GFX12-NEXT: s_lshr_b32 s42, s6, 16 +; GFX12-NEXT: s_lshr_b32 s44, s6, 24 +; GFX12-NEXT: s_ashr_i64 s[74:75], s[6:7], 56 ; GFX12-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 -; GFX12-NEXT: s_lshr_b32 s34, s3, 16 -; GFX12-NEXT: s_lshr_b32 s44, s0, 24 -; GFX12-NEXT: s_mov_b32 s52, s1 -; GFX12-NEXT: s_bfe_i64 s[12:13], s[2:3], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[14:15], s[4:5], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[2:3], s[50:51], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[4:5], s[48:49], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v64, 0 :: v_dual_mov_b32 v33, s21 -; GFX12-NEXT: s_ashr_i64 s[18:19], s[0:1], 56 -; GFX12-NEXT: v_bfe_i32 v2, v2, 0, 8 -; GFX12-NEXT: v_bfe_i32 v14, v5, 0, 8 +; GFX12-NEXT: s_lshr_b32 s46, s6, 8 ; GFX12-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v32, s20 :: v_dual_mov_b32 v35, s7 -; GFX12-NEXT: v_dual_mov_b32 v34, s6 :: v_dual_mov_b32 v37, s23 -; GFX12-NEXT: v_dual_mov_b32 v38, s24 :: v_dual_mov_b32 v41, s27 -; GFX12-NEXT: v_dual_mov_b32 v40, s26 :: v_dual_mov_b32 v43, s57 -; GFX12-NEXT: v_dual_mov_b32 v42, s56 :: v_dual_mov_b32 v45, s29 -; GFX12-NEXT: v_dual_mov_b32 v50, s54 :: v_dual_mov_b32 v53, s37 -; GFX12-NEXT: v_dual_mov_b32 v52, s36 :: v_dual_mov_b32 v55, s39 -; GFX12-NEXT: v_dual_mov_b32 v54, s38 :: v_dual_mov_b32 v57, s41 -; GFX12-NEXT: s_bfe_i64 s[10:11], s[0:1], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[0:1], s[52:53], 0x80000 -; GFX12-NEXT: v_ashrrev_i32_e32 v11, 31, v10 -; GFX12-NEXT: v_ashrrev_i32_e32 v23, 31, v22 -; GFX12-NEXT: v_ashrrev_i32_e32 v31, 31, v30 +; GFX12-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v1, s37 ; GFX12-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v60, s42 :: v_dual_mov_b32 v29, s47 -; 
GFX12-NEXT: v_dual_mov_b32 v28, s46 :: v_dual_mov_b32 v63, s45 +; GFX12-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_dual_mov_b32 v0, s36 :: v_dual_mov_b32 v3, s75 +; GFX12-NEXT: v_dual_mov_b32 v2, s74 :: v_dual_mov_b32 v5, s41 +; GFX12-NEXT: s_lshr_b32 s48, s5, 16 +; GFX12-NEXT: s_bfe_i64 s[72:73], s[6:7], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v4, s40 :: v_dual_mov_b32 v7, s39 +; GFX12-NEXT: v_dual_mov_b32 v6, s38 :: v_dual_mov_b32 v9, s43 +; GFX12-NEXT: s_lshr_b32 s50, s5, 8 +; GFX12-NEXT: s_mov_b32 s52, s5 +; GFX12-NEXT: v_dual_mov_b32 v8, s42 :: v_dual_mov_b32 v11, s45 +; GFX12-NEXT: v_dual_mov_b32 v10, s44 :: v_dual_mov_b32 v13, s73 +; GFX12-NEXT: s_lshr_b32 s54, s4, 16 +; GFX12-NEXT: s_lshr_b32 s56, s4, 24 +; GFX12-NEXT: s_ashr_i64 s[70:71], s[4:5], 56 +; GFX12-NEXT: v_dual_mov_b32 v12, s72 :: v_dual_mov_b32 v15, s47 +; GFX12-NEXT: s_bfe_i64 s[36:37], s[48:49], 0x80000 +; GFX12-NEXT: v_mov_b32_e32 v14, s46 +; GFX12-NEXT: s_lshr_b32 s58, s4, 8 +; GFX12-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x80000 +; GFX12-NEXT: s_lshr_b32 s60, s3, 16 +; GFX12-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x80000 +; GFX12-NEXT: s_clause 0x3 +; GFX12-NEXT: global_store_b128 v24, v[0:3], s[8:9] offset:240 +; GFX12-NEXT: global_store_b128 v24, v[4:7], s[8:9] offset:224 +; GFX12-NEXT: global_store_b128 v24, v[8:11], s[8:9] offset:208 +; GFX12-NEXT: global_store_b128 v24, v[12:15], s[8:9] offset:192 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v24, s16 :: v_dual_mov_b32 v21, s5 -; GFX12-NEXT: v_dual_mov_b32 v20, s4 :: v_dual_mov_b32 v17, s15 -; GFX12-NEXT: v_dual_mov_b32 v12, s2 :: v_dual_mov_b32 v9, s13 -; GFX12-NEXT: v_dual_mov_b32 v8, s12 :: v_dual_mov_b32 v5, s1 +; GFX12-NEXT: v_dual_mov_b32 v1, s37 :: v_dual_mov_b32 v0, s36 +; GFX12-NEXT: v_dual_mov_b32 v3, s71 :: v_dual_mov_b32 v2, s70 +; GFX12-NEXT: v_mov_b32_e32 v5, s53 +; GFX12-NEXT: s_lshr_b32 s34, s3, 8 +; GFX12-NEXT: s_mov_b32 s30, s3 +; GFX12-NEXT: s_lshr_b32 s24, s2, 16 +; GFX12-NEXT: s_lshr_b32 s22, s2, 24 +; GFX12-NEXT: s_bfe_i64 s[28:29], s[4:5], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v4, s52 :: v_dual_mov_b32 v7, s51 +; GFX12-NEXT: v_dual_mov_b32 v6, s50 :: v_dual_mov_b32 v9, s55 +; GFX12-NEXT: s_lshr_b32 s20, s2, 8 +; GFX12-NEXT: s_ashr_i64 s[26:27], s[2:3], 56 +; GFX12-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v8, s54 :: v_dual_mov_b32 v11, s57 +; GFX12-NEXT: v_dual_mov_b32 v10, s56 :: v_dual_mov_b32 v13, s29 +; GFX12-NEXT: s_lshr_b32 s18, s1, 16 +; GFX12-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000 ; GFX12-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v36, s22 :: v_dual_mov_b32 v39, s25 -; GFX12-NEXT: v_dual_mov_b32 v44, s28 :: v_dual_mov_b32 v47, s31 -; GFX12-NEXT: v_dual_mov_b32 v46, s30 :: v_dual_mov_b32 v49, s35 -; GFX12-NEXT: v_dual_mov_b32 v56, s40 :: v_dual_mov_b32 v59, s19 -; GFX12-NEXT: v_dual_mov_b32 v58, s18 :: v_dual_mov_b32 v61, s43 -; GFX12-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; GFX12-NEXT: v_ashrrev_i32_e32 v19, 31, v18 -; GFX12-NEXT: v_ashrrev_i32_e32 v27, 31, v26 -; GFX12-NEXT: v_dual_mov_b32 v62, s44 :: v_dual_mov_b32 v25, s17 -; GFX12-NEXT: v_dual_mov_b32 v16, s14 :: v_dual_mov_b32 v13, s3 -; 
GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v1, s11 -; GFX12-NEXT: v_dual_mov_b32 v48, s34 :: v_dual_mov_b32 v51, s55 -; GFX12-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GFX12-NEXT: v_ashrrev_i32_e32 v15, 31, v14 -; GFX12-NEXT: s_clause 0x9 -; GFX12-NEXT: global_store_b128 v64, v[32:35], s[8:9] offset:240 -; GFX12-NEXT: global_store_b128 v64, v[28:31], s[8:9] offset:224 -; GFX12-NEXT: global_store_b128 v64, v[36:39], s[8:9] offset:208 -; GFX12-NEXT: global_store_b128 v64, v[24:27], s[8:9] offset:192 -; GFX12-NEXT: global_store_b128 v64, v[40:43], s[8:9] offset:176 -; GFX12-NEXT: global_store_b128 v64, v[20:23], s[8:9] offset:160 -; GFX12-NEXT: global_store_b128 v64, v[44:47], s[8:9] offset:144 -; GFX12-NEXT: global_store_b128 v64, v[16:19], s[8:9] offset:128 -; GFX12-NEXT: global_store_b128 v64, v[48:51], s[8:9] offset:112 -; GFX12-NEXT: global_store_b128 v64, v[12:15], s[8:9] offset:96 -; GFX12-NEXT: v_mov_b32_e32 v0, s10 +; GFX12-NEXT: v_dual_mov_b32 v12, s28 :: v_dual_mov_b32 v15, s59 +; GFX12-NEXT: v_dual_mov_b32 v14, s58 :: v_dual_mov_b32 v17, s61 +; GFX12-NEXT: s_lshr_b32 s14, s1, 8 +; GFX12-NEXT: s_mov_b32 s62, s1 +; GFX12-NEXT: s_bfe_i64 s[16:17], s[2:3], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v16, s60 :: v_dual_mov_b32 v19, s27 +; GFX12-NEXT: v_dual_mov_b32 v18, s26 :: v_dual_mov_b32 v21, s31 +; GFX12-NEXT: s_lshr_b32 s64, s0, 16 +; GFX12-NEXT: s_lshr_b32 s66, s0, 24 +; GFX12-NEXT: s_ashr_i64 s[12:13], s[0:1], 56 +; GFX12-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v20, s30 :: v_dual_mov_b32 v23, s35 +; GFX12-NEXT: v_mov_b32_e32 v22, s34 +; GFX12-NEXT: s_clause 0x5 +; GFX12-NEXT: global_store_b128 v24, v[0:3], s[8:9] offset:176 +; GFX12-NEXT: global_store_b128 v24, v[4:7], s[8:9] offset:160 +; GFX12-NEXT: global_store_b128 v24, v[8:11], s[8:9] offset:144 +; GFX12-NEXT: global_store_b128 v24, v[12:15], s[8:9] offset:128 +; GFX12-NEXT: global_store_b128 v24, v[16:19], s[8:9] offset:112 +; GFX12-NEXT: global_store_b128 v24, v[20:23], s[8:9] offset:96 +; GFX12-NEXT: v_dual_mov_b32 v1, s25 :: v_dual_mov_b32 v0, s24 +; GFX12-NEXT: v_dual_mov_b32 v3, s23 :: v_dual_mov_b32 v2, s22 +; GFX12-NEXT: v_mov_b32_e32 v5, s17 +; GFX12-NEXT: s_lshr_b32 s68, s0, 8 +; GFX12-NEXT: s_bfe_i64 s[6:7], s[62:63], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s21 +; GFX12-NEXT: v_dual_mov_b32 v6, s20 :: v_dual_mov_b32 v9, s19 +; GFX12-NEXT: s_bfe_i64 s[2:3], s[66:67], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[4:5], s[64:65], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v8, s18 :: v_dual_mov_b32 v11, s13 +; GFX12-NEXT: v_dual_mov_b32 v10, s12 :: v_dual_mov_b32 v13, s7 +; GFX12-NEXT: s_bfe_i64 s[10:11], s[0:1], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[0:1], s[68:69], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v12, s6 :: v_dual_mov_b32 v15, s15 +; GFX12-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v17, s5 +; GFX12-NEXT: v_dual_mov_b32 v16, s4 :: v_dual_mov_b32 v19, s3 +; GFX12-NEXT: v_dual_mov_b32 v18, s2 :: v_dual_mov_b32 v21, s11 +; GFX12-NEXT: v_dual_mov_b32 v20, s10 :: v_dual_mov_b32 v23, s1 +; GFX12-NEXT: v_mov_b32_e32 v22, s0 ; GFX12-NEXT: s_clause 0x5 -; GFX12-NEXT: global_store_b128 v64, v[52:55], s[8:9] offset:80 -; GFX12-NEXT: global_store_b128 v64, v[8:11], s[8:9] offset:64 -; GFX12-NEXT: global_store_b128 v64, v[56:59], s[8:9] offset:48 -; GFX12-NEXT: global_store_b128 v64, v[4:7], s[8:9] offset:32 -; GFX12-NEXT: global_store_b128 v64, v[60:63], s[8:9] offset:16 
-; GFX12-NEXT: global_store_b128 v64, v[0:3], s[8:9] +; GFX12-NEXT: global_store_b128 v24, v[0:3], s[8:9] offset:80 +; GFX12-NEXT: global_store_b128 v24, v[4:7], s[8:9] offset:64 +; GFX12-NEXT: global_store_b128 v24, v[8:11], s[8:9] offset:48 +; GFX12-NEXT: global_store_b128 v24, v[12:15], s[8:9] offset:32 +; GFX12-NEXT: global_store_b128 v24, v[16:19], s[8:9] offset:16 +; GFX12-NEXT: global_store_b128 v24, v[20:23], s[8:9] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -9373,7 +9346,6 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i16(ptr addrspace(1) %out ; GFX8-NOHSA-LABEL: constant_zextload_v2i8_to_v2i16: ; GFX8-NOHSA: ; %bb.0: ; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 @@ -9381,10 +9353,9 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i16(ptr addrspace(1) %out ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NOHSA-NEXT: s_waitcnt vmcnt(0) -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e32 v4, 8, v2 -; GFX8-NOHSA-NEXT: v_and_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX8-NOHSA-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v3, 8, v2 +; GFX8-NOHSA-NEXT: v_and_b32_e32 v3, 0xff0000, v3 +; GFX8-NOHSA-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NOHSA-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NOHSA-NEXT: s_endpgm ; @@ -9417,11 +9388,13 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i16(ptr addrspace(1) %out ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v1, v0, s[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_and_b32_e32 v2, 0xff, v1 -; GFX12-NEXT: v_lshrrev_b16 v1, 8, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-NEXT: v_lshl_or_b32 v1, v1, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v1 +; GFX12-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_lshrrev_b32_e32 v2, 8, v2 +; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -9477,6 +9450,7 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i16(ptr addrspace(1) %out ; GFX8-NOHSA: ; %bb.0: ; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, 8 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 @@ -9485,8 +9459,8 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i16(ptr addrspace(1) %out ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NOHSA-NEXT: s_waitcnt vmcnt(0) ; GFX8-NOHSA-NEXT: v_and_b32_sdwa v3, v3, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NOHSA-NEXT: v_ashrrev_i16_e32 v2, 8, v2 -; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX8-NOHSA-NEXT: v_lshlrev_b32_sdwa v2, v4, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NOHSA-NEXT: 
v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NOHSA-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX8-NOHSA-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NOHSA-NEXT: s_endpgm @@ -9530,11 +9504,13 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i16(ptr addrspace(1) %out ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v1, v0, s[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_bfe_i32 v2, v1, 0, 8 -; GFX12-NEXT: v_ashrrev_i16 v1, 8, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-NEXT: v_lshl_or_b32 v1, v1, 16, v2 +; GFX12-NEXT: v_bfe_i32 v2, v1, 0, 16 +; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 8 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_lshrrev_b32_e32 v2, 8, v2 +; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -9594,14 +9570,15 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(ptr addrspace(1) %out ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v2, 8, s2 ; GFX8-NOHSA-NEXT: s_lshr_b32 s0, s2, 24 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NOHSA-NEXT: s_and_b32 s1, s2, 0xff -; GFX8-NOHSA-NEXT: v_alignbit_b32 v3, s0, v3, 16 -; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NOHSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v3 -; GFX8-NOHSA-NEXT: v_or_b32_e32 v2, s1, v2 +; GFX8-NOHSA-NEXT: s_lshl_b32 s2, s2, 8 +; GFX8-NOHSA-NEXT: v_alignbit_b32 v2, s0, v2, 16 +; GFX8-NOHSA-NEXT: s_and_b32 s0, s2, 0xff0000 +; GFX8-NOHSA-NEXT: s_or_b32 s0, s1, s0 +; GFX8-NOHSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v2 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NOHSA-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NOHSA-NEXT: s_endpgm ; @@ -9655,24 +9632,19 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(ptr addrspace(1) %out ; GFX12-LABEL: constant_zextload_v4i8_to_v4i16: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_mov_b32_e32 v3, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshr_b32 s3, s2, 16 -; GFX12-NEXT: v_and_b32_e64 v0, 0xff, s2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_and_b32_e64 v1, 0xff, s3 -; GFX12-NEXT: v_lshrrev_b16 v2, 8, s2 -; GFX12-NEXT: s_lshr_b32 s2, s2, 24 -; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX12-NEXT: v_lshl_or_b32 v0, v2, 16, v0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_lshl_or_b32 v1, s2, 16, v1 -; GFX12-NEXT: global_store_b64 v3, v[0:1], s[0:1] +; GFX12-NEXT: s_bfe_u32 s3, s2, 0x80008 +; GFX12-NEXT: s_lshr_b32 s4, s2, 24 +; GFX12-NEXT: s_and_b32 s5, s2, 0xff +; GFX12-NEXT: s_bfe_u32 s2, s2, 0x80010 +; GFX12-NEXT: s_pack_ll_b32_b16 s3, s5, s3 +; GFX12-NEXT: s_pack_ll_b32_b16 s2, s2, s4 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-NEXT: v_mov_b32_e32 v0, s3 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 
; GFX12-NEXT: s_endpgm @@ -9738,16 +9710,18 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i16(ptr addrspace(1) %out ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_lshr_b32 s0, s2, 16 -; GFX8-NOHSA-NEXT: s_ashr_i32 s1, s2, 24 -; GFX8-NOHSA-NEXT: s_bfe_i32 s0, s0, 0x80000 +; GFX8-NOHSA-NEXT: s_sext_i32_i16 s1, s2 ; GFX8-NOHSA-NEXT: s_bfe_i32 s3, s2, 0x80000 -; GFX8-NOHSA-NEXT: v_ashrrev_i16_e64 v2, 8, s2 -; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s1, 16 +; GFX8-NOHSA-NEXT: s_ashr_i32 s2, s2, 24 +; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s1, 8 +; GFX8-NOHSA-NEXT: s_bfe_i32 s0, s0, 0x80000 +; GFX8-NOHSA-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX8-NOHSA-NEXT: s_lshl_b32 s2, s2, 16 +; GFX8-NOHSA-NEXT: s_and_b32 s1, s1, 0xffff0000 ; GFX8-NOHSA-NEXT: s_and_b32 s0, 0xffff, s0 -; GFX8-NOHSA-NEXT: s_and_b32 s2, 0xffff, s3 -; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NOHSA-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NOHSA-NEXT: v_or_b32_e32 v2, s2, v2 +; GFX8-NOHSA-NEXT: s_or_b32 s1, s3, s1 +; GFX8-NOHSA-NEXT: s_or_b32 s0, s0, s2 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s1 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s0 ; GFX8-NOHSA-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NOHSA-NEXT: s_endpgm @@ -9811,19 +9785,17 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i16(ptr addrspace(1) %out ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_bfe_i32 s4, s2, 0x80000 ; GFX12-NEXT: s_lshr_b32 s3, s2, 16 -; GFX12-NEXT: v_ashrrev_i16 v0, 8, s2 -; GFX12-NEXT: v_and_b32_e64 v1, 0xffff, s4 -; GFX12-NEXT: s_ashr_i32 s2, s2, 24 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_sext_i32_i16 s5, s2 +; GFX12-NEXT: s_ashr_i32 s4, s2, 24 +; GFX12-NEXT: s_bfe_i32 s2, s2, 0x80000 +; GFX12-NEXT: s_lshr_b32 s5, s5, 8 ; GFX12-NEXT: s_bfe_i32 s3, s3, 0x80000 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_pack_ll_b32_b16 s2, s3, s2 -; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: v_lshl_or_b32 v0, v0, 16, v1 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-NEXT: s_pack_ll_b32_b16 s2, s2, s5 +; GFX12-NEXT: s_pack_ll_b32_b16 s3, s3, s4 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -9901,23 +9873,25 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(ptr addrspace(1) %out ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: s_lshr_b32 s0, s2, 24 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: s_lshr_b32 s1, s3, 24 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v3, 8, s2 ; GFX8-NOHSA-NEXT: s_bfe_u32 s4, s3, 0x80010 -; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s1, 16 +; GFX8-NOHSA-NEXT: s_and_b32 s5, s3, 0xff +; GFX8-NOHSA-NEXT: s_lshl_b32 s3, s3, 8 ; GFX8-NOHSA-NEXT: v_alignbit_b32 v0, s0, v0, 16 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v2, 8, s3 -; GFX8-NOHSA-NEXT: s_or_b32 s0, s4, s1 +; GFX8-NOHSA-NEXT: s_and_b32 s0, s2, 0xff +; GFX8-NOHSA-NEXT: s_lshl_b32 s2, s2, 8 +; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s1, 16 +; GFX8-NOHSA-NEXT: s_and_b32 s3, s3, 0xff0000 +; GFX8-NOHSA-NEXT: s_and_b32 s2, s2, 0xff0000 +; GFX8-NOHSA-NEXT: s_or_b32 s1, s4, s1 +; GFX8-NOHSA-NEXT: s_or_b32 s3, s5, s3 +; GFX8-NOHSA-NEXT: s_or_b32 s0, s0, s2 ; 
GFX8-NOHSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0 -; GFX8-NOHSA-NEXT: s_and_b32 s1, s2, 0xff -; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; GFX8-NOHSA-NEXT: v_or_b32_e32 v0, s1, v0 -; GFX8-NOHSA-NEXT: s_and_b32 s1, s3, 0xff -; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NOHSA-NEXT: v_or_b32_e32 v2, s1, v2 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_endpgm ; @@ -10004,26 +9978,22 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(ptr addrspace(1) %out ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshr_b32 s5, s2, 16 -; GFX12-NEXT: s_lshr_b32 s6, s3, 16 -; GFX12-NEXT: v_and_b32_e64 v0, 0xff, s2 -; GFX12-NEXT: v_and_b32_e64 v2, 0xff, s3 -; GFX12-NEXT: v_and_b32_e64 v3, 0xff, s6 -; GFX12-NEXT: v_and_b32_e64 v5, 0xff, s5 -; GFX12-NEXT: v_mov_b32_e32 v4, 0 -; GFX12-NEXT: v_lshrrev_b16 v1, 8, s3 -; GFX12-NEXT: v_lshrrev_b16 v6, 8, s2 -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX12-NEXT: s_lshr_b32 s4, s2, 24 -; GFX12-NEXT: s_lshr_b32 s2, s3, 24 -; GFX12-NEXT: v_lshl_or_b32 v0, v6, 16, v0 -; GFX12-NEXT: v_lshl_or_b32 v2, v1, 16, v2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_lshl_or_b32 v3, s2, 16, v3 -; GFX12-NEXT: v_lshl_or_b32 v1, s4, 16, v5 +; GFX12-NEXT: s_bfe_u32 s4, s2, 0x80008 +; GFX12-NEXT: s_lshr_b32 s5, s2, 24 +; GFX12-NEXT: s_bfe_u32 s6, s3, 0x80008 +; GFX12-NEXT: s_lshr_b32 s7, s3, 24 +; GFX12-NEXT: s_bfe_u32 s8, s3, 0x80010 +; GFX12-NEXT: s_and_b32 s3, s3, 0xff +; GFX12-NEXT: s_bfe_u32 s9, s2, 0x80010 +; GFX12-NEXT: s_and_b32 s2, s2, 0xff +; GFX12-NEXT: s_pack_ll_b32_b16 s7, s8, s7 +; GFX12-NEXT: s_pack_ll_b32_b16 s3, s3, s6 +; GFX12-NEXT: s_pack_ll_b32_b16 s2, s2, s4 +; GFX12-NEXT: s_pack_ll_b32_b16 s4, s9, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s4 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: v_mov_b32_e32 v2, s3 ; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -10113,29 +10083,33 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i16(ptr addrspace(1) %out ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOHSA-NEXT: s_sext_i32_i16 s0, s3 +; GFX8-NOHSA-NEXT: s_bfe_i32 s1, s3, 0x80000 +; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s0, 8 +; GFX8-NOHSA-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NOHSA-NEXT: s_and_b32 s0, s0, 0xffff0000 +; GFX8-NOHSA-NEXT: s_or_b32 s7, s1, s0 +; GFX8-NOHSA-NEXT: s_sext_i32_i16 s0, s2 ; GFX8-NOHSA-NEXT: s_bfe_i32 s6, s2, 0x80000 -; GFX8-NOHSA-NEXT: v_ashrrev_i16_e64 v0, 8, s2 -; GFX8-NOHSA-NEXT: s_ashr_i64 s[0:1], s[2:3], 56 +; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s0, 8 +; GFX8-NOHSA-NEXT: s_and_b32 s6, 0xffff, s6 +; GFX8-NOHSA-NEXT: s_and_b32 s0, s0, 0xffff0000 ; GFX8-NOHSA-NEXT: s_lshr_b32 s5, s3, 16 -; GFX8-NOHSA-NEXT: s_and_b32 s1, 0xffff, s6 -; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NOHSA-NEXT: v_or_b32_e32 v0, s1, v0 +; GFX8-NOHSA-NEXT: s_or_b32 s6, s6, s0 +; GFX8-NOHSA-NEXT: s_ashr_i64 s[0:1], s[2:3], 56 ; GFX8-NOHSA-NEXT: s_bfe_i32 s1, s5, 0x80000 +; 
GFX8-NOHSA-NEXT: s_lshr_b32 s4, s2, 16 ; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s0, 16 ; GFX8-NOHSA-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NOHSA-NEXT: s_or_b32 s0, s1, s0 -; GFX8-NOHSA-NEXT: s_bfe_i32 s1, s3, 0x80000 -; GFX8-NOHSA-NEXT: v_ashrrev_i16_e64 v1, 8, s3 -; GFX8-NOHSA-NEXT: s_lshr_b32 s4, s2, 16 -; GFX8-NOHSA-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NOHSA-NEXT: v_or_b32_e32 v2, s1, v1 ; GFX8-NOHSA-NEXT: s_ashr_i32 s1, s2, 24 ; GFX8-NOHSA-NEXT: s_bfe_i32 s2, s4, 0x80000 ; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NOHSA-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX8-NOHSA-NEXT: s_or_b32 s1, s2, s1 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s7 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_endpgm @@ -10236,26 +10210,26 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i16(ptr addrspace(1) %out ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_bfe_i32 s8, s2, 0x80000 -; GFX12-NEXT: s_bfe_i32 s9, s3, 0x80000 +; GFX12-NEXT: s_ashr_i64 s[4:5], s[2:3], 56 ; GFX12-NEXT: s_lshr_b32 s6, s2, 16 ; GFX12-NEXT: s_lshr_b32 s7, s3, 16 -; GFX12-NEXT: v_ashrrev_i16 v0, 8, s2 -; GFX12-NEXT: v_ashrrev_i16 v2, 8, s3 -; GFX12-NEXT: s_ashr_i64 s[4:5], s[2:3], 56 -; GFX12-NEXT: v_and_b32_e64 v3, 0xffff, s8 -; GFX12-NEXT: v_and_b32_e64 v5, 0xffff, s9 -; GFX12-NEXT: s_ashr_i32 s2, s2, 24 -; GFX12-NEXT: s_bfe_i32 s3, s6, 0x80000 -; GFX12-NEXT: s_bfe_i32 s5, s7, 0x80000 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_pack_ll_b32_b16 s2, s3, s2 -; GFX12-NEXT: s_pack_ll_b32_b16 s3, s5, s4 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s2 -; GFX12-NEXT: v_lshl_or_b32 v0, v0, 16, v3 -; GFX12-NEXT: v_lshl_or_b32 v2, v2, 16, v5 -; GFX12-NEXT: v_mov_b32_e32 v3, s3 +; GFX12-NEXT: s_bfe_i32 s5, s3, 0x80000 +; GFX12-NEXT: s_sext_i32_i16 s3, s3 +; GFX12-NEXT: s_ashr_i32 s8, s2, 24 +; GFX12-NEXT: s_bfe_i32 s9, s2, 0x80000 +; GFX12-NEXT: s_sext_i32_i16 s2, s2 +; GFX12-NEXT: s_bfe_i32 s7, s7, 0x80000 +; GFX12-NEXT: s_lshr_b32 s3, s3, 8 +; GFX12-NEXT: s_bfe_i32 s6, s6, 0x80000 +; GFX12-NEXT: s_lshr_b32 s2, s2, 8 +; GFX12-NEXT: s_pack_ll_b32_b16 s4, s7, s4 +; GFX12-NEXT: s_pack_ll_b32_b16 s3, s5, s3 +; GFX12-NEXT: s_pack_ll_b32_b16 s2, s9, s2 +; GFX12-NEXT: s_pack_ll_b32_b16 s5, s6, s8 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s4 +; GFX12-NEXT: v_mov_b32_e32 v2, s3 ; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -10373,45 +10347,47 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v0, 8, s4 ; GFX8-NOHSA-NEXT: s_lshr_b32 s3, s4, 24 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s4 -; GFX8-NOHSA-NEXT: v_alignbit_b32 v1, s3, v1, 16 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NOHSA-NEXT: v_alignbit_b32 v0, s3, v0, 16 ; GFX8-NOHSA-NEXT: s_and_b32 s3, s4, 0xff -; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NOHSA-NEXT: v_or_b32_e32 v0, s3, v0 -; GFX8-NOHSA-NEXT: s_lshr_b32 s3, s7, 24 -; GFX8-NOHSA-NEXT: s_lshl_b32 s3, s3, 
16 -; GFX8-NOHSA-NEXT: s_bfe_u32 s4, s7, 0x80010 +; GFX8-NOHSA-NEXT: s_lshl_b32 s4, s4, 8 ; GFX8-NOHSA-NEXT: s_lshr_b32 s8, s5, 24 +; GFX8-NOHSA-NEXT: s_and_b32 s4, s4, 0xff0000 ; GFX8-NOHSA-NEXT: s_bfe_u32 s9, s5, 0x80010 +; GFX8-NOHSA-NEXT: s_lshl_b32 s8, s8, 16 +; GFX8-NOHSA-NEXT: s_or_b32 s4, s3, s4 +; GFX8-NOHSA-NEXT: s_lshr_b32 s3, s7, 24 +; GFX8-NOHSA-NEXT: s_lshr_b32 s2, s6, 24 +; GFX8-NOHSA-NEXT: s_or_b32 s8, s9, s8 +; GFX8-NOHSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0 +; GFX8-NOHSA-NEXT: s_lshl_b32 s3, s3, 16 +; GFX8-NOHSA-NEXT: s_bfe_u32 s9, s7, 0x80010 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NOHSA-NEXT: s_and_b32 s10, s5, 0xff ; GFX8-NOHSA-NEXT: s_lshl_b32 s5, s5, 8 -; GFX8-NOHSA-NEXT: s_or_b32 s3, s4, s3 -; GFX8-NOHSA-NEXT: s_and_b32 s4, s7, 0xff +; GFX8-NOHSA-NEXT: s_or_b32 s3, s9, s3 +; GFX8-NOHSA-NEXT: s_and_b32 s9, s7, 0xff ; GFX8-NOHSA-NEXT: s_lshl_b32 s7, s7, 8 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v2, 8, s6 -; GFX8-NOHSA-NEXT: s_lshr_b32 s2, s6, 24 -; GFX8-NOHSA-NEXT: s_lshl_b32 s8, s8, 16 +; GFX8-NOHSA-NEXT: v_alignbit_b32 v0, s2, v0, 16 +; GFX8-NOHSA-NEXT: s_and_b32 s2, s6, 0xff +; GFX8-NOHSA-NEXT: s_lshl_b32 s6, s6, 8 ; GFX8-NOHSA-NEXT: s_and_b32 s5, s5, 0xff0000 ; GFX8-NOHSA-NEXT: s_and_b32 s7, s7, 0xff0000 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s6 -; GFX8-NOHSA-NEXT: s_or_b32 s8, s9, s8 +; GFX8-NOHSA-NEXT: s_and_b32 s6, s6, 0xff0000 ; GFX8-NOHSA-NEXT: s_or_b32 s5, s10, s5 -; GFX8-NOHSA-NEXT: s_or_b32 s4, s4, s7 -; GFX8-NOHSA-NEXT: v_alignbit_b32 v3, s2, v3, 16 -; GFX8-NOHSA-NEXT: s_and_b32 s2, s6, 0xff -; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NOHSA-NEXT: v_or_b32_e32 v2, s2, v2 +; GFX8-NOHSA-NEXT: s_or_b32 s7, s9, s7 +; GFX8-NOHSA-NEXT: s_or_b32 s2, s2, s6 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s3 -; GFX8-NOHSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v3 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NOHSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s7 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s2 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[6:7], v[2:5] -; GFX8-NOHSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v1 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s5 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s8 @@ -10568,46 +10544,38 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshr_b32 s3, s6, 16 -; GFX12-NEXT: s_lshr_b32 s9, s7, 16 -; GFX12-NEXT: s_lshr_b32 s11, s4, 16 -; GFX12-NEXT: s_lshr_b32 s13, s5, 16 -; GFX12-NEXT: v_and_b32_e64 v4, 0xff, s5 -; GFX12-NEXT: v_and_b32_e64 v5, 0xff, s4 -; GFX12-NEXT: v_and_b32_e64 v6, 0xff, s7 -; GFX12-NEXT: v_and_b32_e64 v7, 0xff, s6 -; GFX12-NEXT: v_and_b32_e64 v11, 0xff, s9 -; GFX12-NEXT: v_and_b32_e64 v12, 0xff, s3 -; GFX12-NEXT: v_and_b32_e64 v9, 0xff, s13 -; GFX12-NEXT: v_and_b32_e64 v10, 0xff, s11 -; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_and_b32 v5, 0xffff, v5 -; GFX12-NEXT: v_lshrrev_b16 v1, 8, s6 -; GFX12-NEXT: v_lshrrev_b16 v3, 8, s7 -; GFX12-NEXT: v_lshrrev_b16 v0, 8, s4 -; GFX12-NEXT: v_lshrrev_b16 v2, 8, s5 -; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX12-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX12-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX12-NEXT: 
v_and_b32_e32 v12, 0xffff, v12 -; GFX12-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX12-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX12-NEXT: s_lshr_b32 s2, s6, 24 -; GFX12-NEXT: s_lshr_b32 s8, s7, 24 -; GFX12-NEXT: s_lshr_b32 s10, s4, 24 -; GFX12-NEXT: s_lshr_b32 s12, s5, 24 -; GFX12-NEXT: v_lshl_or_b32 v2, v2, 16, v4 -; GFX12-NEXT: v_lshl_or_b32 v0, v0, 16, v5 -; GFX12-NEXT: v_lshl_or_b32 v6, v3, 16, v6 -; GFX12-NEXT: v_lshl_or_b32 v4, v1, 16, v7 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_lshl_or_b32 v7, s8, 16, v11 -; GFX12-NEXT: v_lshl_or_b32 v5, s2, 16, v12 -; GFX12-NEXT: v_lshl_or_b32 v3, s12, 16, v9 -; GFX12-NEXT: v_lshl_or_b32 v1, s10, 16, v10 +; GFX12-NEXT: s_bfe_u32 s2, s6, 0x80008 +; GFX12-NEXT: s_lshr_b32 s3, s6, 24 +; GFX12-NEXT: s_bfe_u32 s8, s7, 0x80008 +; GFX12-NEXT: s_lshr_b32 s9, s7, 24 +; GFX12-NEXT: s_bfe_u32 s16, s7, 0x80010 +; GFX12-NEXT: s_and_b32 s7, s7, 0xff +; GFX12-NEXT: s_bfe_u32 s17, s6, 0x80010 +; GFX12-NEXT: s_and_b32 s6, s6, 0xff +; GFX12-NEXT: s_bfe_u32 s10, s4, 0x80008 +; GFX12-NEXT: s_lshr_b32 s11, s4, 24 +; GFX12-NEXT: s_bfe_u32 s12, s5, 0x80008 +; GFX12-NEXT: s_lshr_b32 s13, s5, 24 +; GFX12-NEXT: s_bfe_u32 s14, s5, 0x80010 +; GFX12-NEXT: s_and_b32 s5, s5, 0xff +; GFX12-NEXT: s_bfe_u32 s15, s4, 0x80010 +; GFX12-NEXT: s_and_b32 s4, s4, 0xff +; GFX12-NEXT: s_pack_ll_b32_b16 s9, s16, s9 +; GFX12-NEXT: s_pack_ll_b32_b16 s7, s7, s8 +; GFX12-NEXT: s_pack_ll_b32_b16 s2, s6, s2 +; GFX12-NEXT: s_pack_ll_b32_b16 s3, s17, s3 +; GFX12-NEXT: s_pack_ll_b32_b16 s13, s14, s13 +; GFX12-NEXT: s_pack_ll_b32_b16 s5, s5, s12 +; GFX12-NEXT: s_pack_ll_b32_b16 s11, s15, s11 +; GFX12-NEXT: s_pack_ll_b32_b16 s4, s4, s10 +; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s9 +; GFX12-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v5, s11 +; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v7, s13 +; GFX12-NEXT: v_mov_b32_e32 v6, s5 ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16 -; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1] +; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16 +; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -10749,57 +10717,69 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NOHSA-NEXT: s_bfe_i32 s10, s5, 0x80000 -; GFX8-NOHSA-NEXT: v_ashrrev_i16_e64 v0, 8, s5 ; GFX8-NOHSA-NEXT: s_lshr_b32 s3, s5, 16 -; GFX8-NOHSA-NEXT: s_and_b32 s10, 0xffff, s10 -; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX8-NOHSA-NEXT: s_sext_i32_i16 s10, s5 +; GFX8-NOHSA-NEXT: s_bfe_i32 s11, s5, 0x80000 +; GFX8-NOHSA-NEXT: s_ashr_i32 s5, s5, 16 +; GFX8-NOHSA-NEXT: s_lshl_b32 s5, s5, 8 +; GFX8-NOHSA-NEXT: s_bfe_i32 s3, s3, 0x80000 ; GFX8-NOHSA-NEXT: s_lshr_b32 s2, s4, 16 -; GFX8-NOHSA-NEXT: s_bfe_i32 s5, s4, 0x80000 -; GFX8-NOHSA-NEXT: v_or_b32_e32 v2, s10, v0 -; GFX8-NOHSA-NEXT: v_ashrrev_i16_e64 v0, 8, s4 -; GFX8-NOHSA-NEXT: s_bfe_i32 s4, s3, 0x80000 -; GFX8-NOHSA-NEXT: v_ashrrev_i16_e64 v1, 8, s3 -; GFX8-NOHSA-NEXT: s_and_b32 s4, 0xffff, s4 -; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NOHSA-NEXT: v_or_b32_e32 v3, s4, v1 -; GFX8-NOHSA-NEXT: s_bfe_i32 s3, s2, 0x80000 -; GFX8-NOHSA-NEXT: v_ashrrev_i16_e64 v1, 8, s2 +; GFX8-NOHSA-NEXT: s_and_b32 s5, s5, 0xffff0000 +; 
GFX8-NOHSA-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX8-NOHSA-NEXT: s_sext_i32_i16 s12, s4 +; GFX8-NOHSA-NEXT: s_lshl_b32 s10, s10, 8 +; GFX8-NOHSA-NEXT: s_or_b32 s5, s3, s5 +; GFX8-NOHSA-NEXT: s_ashr_i32 s3, s4, 24 +; GFX8-NOHSA-NEXT: s_bfe_i32 s2, s2, 0x80000 +; GFX8-NOHSA-NEXT: s_and_b32 s11, 0xffff, s11 +; GFX8-NOHSA-NEXT: s_lshl_b32 s12, s12, 8 +; GFX8-NOHSA-NEXT: s_and_b32 s10, s10, 0xffff0000 +; GFX8-NOHSA-NEXT: s_lshl_b32 s3, s3, 16 +; GFX8-NOHSA-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8-NOHSA-NEXT: s_or_b32 s10, s11, s10 +; GFX8-NOHSA-NEXT: s_and_b32 s11, s12, 0xffff0000 +; GFX8-NOHSA-NEXT: s_bfe_i32 s12, s4, 0x80000 +; GFX8-NOHSA-NEXT: s_or_b32 s4, s2, s3 +; GFX8-NOHSA-NEXT: s_sext_i32_i16 s2, s7 +; GFX8-NOHSA-NEXT: s_lshl_b32 s2, s2, 8 +; GFX8-NOHSA-NEXT: s_bfe_i32 s3, s7, 0x80000 +; GFX8-NOHSA-NEXT: s_and_b32 s12, 0xffff, s12 +; GFX8-NOHSA-NEXT: s_and_b32 s2, s2, 0xffff0000 +; GFX8-NOHSA-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX8-NOHSA-NEXT: s_or_b32 s11, s12, s11 +; GFX8-NOHSA-NEXT: s_or_b32 s12, s3, s2 +; GFX8-NOHSA-NEXT: s_sext_i32_i16 s2, s6 +; GFX8-NOHSA-NEXT: s_lshl_b32 s2, s2, 8 +; GFX8-NOHSA-NEXT: s_bfe_i32 s3, s6, 0x80000 +; GFX8-NOHSA-NEXT: s_and_b32 s2, s2, 0xffff0000 ; GFX8-NOHSA-NEXT: s_and_b32 s3, 0xffff, s3 -; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NOHSA-NEXT: s_lshr_b32 s9, s7, 16 -; GFX8-NOHSA-NEXT: v_or_b32_e32 v1, s3, v1 +; GFX8-NOHSA-NEXT: s_or_b32 s13, s3, s2 ; GFX8-NOHSA-NEXT: s_ashr_i64 s[2:3], s[6:7], 56 ; GFX8-NOHSA-NEXT: s_bfe_i32 s3, s9, 0x80000 +; GFX8-NOHSA-NEXT: s_lshr_b32 s8, s6, 16 ; GFX8-NOHSA-NEXT: s_lshl_b32 s2, s2, 16 ; GFX8-NOHSA-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX8-NOHSA-NEXT: s_or_b32 s2, s3, s2 -; GFX8-NOHSA-NEXT: s_bfe_i32 s3, s7, 0x80000 -; GFX8-NOHSA-NEXT: v_ashrrev_i16_e64 v4, 8, s7 -; GFX8-NOHSA-NEXT: s_and_b32 s3, 0xffff, s3 -; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX8-NOHSA-NEXT: v_or_b32_e32 v6, s3, v4 -; GFX8-NOHSA-NEXT: s_bfe_i32 s3, s6, 0x80000 -; GFX8-NOHSA-NEXT: v_ashrrev_i16_e64 v4, 8, s6 -; GFX8-NOHSA-NEXT: s_lshr_b32 s8, s6, 16 -; GFX8-NOHSA-NEXT: s_and_b32 s3, 0xffff, s3 -; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX8-NOHSA-NEXT: v_or_b32_e32 v4, s3, v4 -; GFX8-NOHSA-NEXT: s_bfe_i32 s3, s8, 0x80000 -; GFX8-NOHSA-NEXT: s_and_b32 s5, 0xffff, s5 -; GFX8-NOHSA-NEXT: s_and_b32 s3, 0xffff, s3 -; GFX8-NOHSA-NEXT: v_ashrrev_i16_e64 v5, 8, s8 -; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s2 +; GFX8-NOHSA-NEXT: s_ashr_i32 s3, s6, 24 +; GFX8-NOHSA-NEXT: s_bfe_i32 s6, s8, 0x80000 +; GFX8-NOHSA-NEXT: s_lshl_b32 s3, s3, 16 +; GFX8-NOHSA-NEXT: s_and_b32 s6, 0xffff, s6 +; GFX8-NOHSA-NEXT: s_or_b32 s3, s6, s3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16 -; GFX8-NOHSA-NEXT: v_or_b32_e32 v5, s3, v5 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s3 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s2 -; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] -; GFX8-NOHSA-NEXT: v_or_b32_e32 v0, s5, v0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s13 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s12 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s11 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s10 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NOHSA-NEXT: 
v_mov_b32_e32 v4, s0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_endpgm @@ -10980,46 +10960,44 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_ashr_i64 s[2:3], s[6:7], 56 ; GFX12-NEXT: s_lshr_b32 s8, s6, 16 -; GFX12-NEXT: v_ashrrev_i16 v5, 8, s6 -; GFX12-NEXT: s_bfe_i32 s6, s6, 0x80000 +; GFX12-NEXT: s_lshr_b32 s9, s7, 16 +; GFX12-NEXT: s_bfe_i32 s3, s7, 0x80000 +; GFX12-NEXT: s_sext_i32_i16 s7, s7 +; GFX12-NEXT: s_ashr_i32 s16, s6, 24 +; GFX12-NEXT: s_bfe_i32 s17, s6, 0x80000 +; GFX12-NEXT: s_sext_i32_i16 s6, s6 ; GFX12-NEXT: s_lshr_b32 s10, s4, 16 ; GFX12-NEXT: s_lshr_b32 s11, s5, 16 -; GFX12-NEXT: v_ashrrev_i16 v1, 8, s4 -; GFX12-NEXT: s_bfe_i32 s4, s4, 0x80000 -; GFX12-NEXT: v_ashrrev_i16 v0, 8, s5 -; GFX12-NEXT: s_bfe_i32 s5, s5, 0x80000 -; GFX12-NEXT: s_bfe_i32 s12, s7, 0x80000 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_ashr_i64 s[2:3], s[6:7], 56 -; GFX12-NEXT: v_and_b32_e64 v12, 0xffff, s6 -; GFX12-NEXT: s_bfe_i32 s6, s8, 0x80000 -; GFX12-NEXT: s_lshr_b32 s9, s7, 16 -; GFX12-NEXT: v_and_b32_e64 v7, 0xffff, s4 -; GFX12-NEXT: s_bfe_i32 s3, s11, 0x80000 -; GFX12-NEXT: s_bfe_i32 s4, s10, 0x80000 -; GFX12-NEXT: v_ashrrev_i16 v2, 8, s7 -; GFX12-NEXT: v_and_b32_e64 v4, 0xffff, s5 -; GFX12-NEXT: v_and_b32_e64 v11, 0xffff, s12 -; GFX12-NEXT: v_ashrrev_i16 v13, 8, s8 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_and_b32_e64 v16, 0xffff, s6 -; GFX12-NEXT: v_ashrrev_i16 v9, 8, s11 -; GFX12-NEXT: v_ashrrev_i16 v10, 8, s10 -; GFX12-NEXT: s_bfe_i32 s5, s9, 0x80000 -; GFX12-NEXT: v_and_b32_e64 v14, 0xffff, s3 -; GFX12-NEXT: v_and_b32_e64 v15, 0xffff, s4 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_pack_ll_b32_b16 s2, s5, s2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v3, s2 -; GFX12-NEXT: v_lshl_or_b32 v6, v0, 16, v4 -; GFX12-NEXT: v_lshl_or_b32 v4, v1, 16, v7 -; GFX12-NEXT: v_lshl_or_b32 v2, v2, 16, v11 -; GFX12-NEXT: v_lshl_or_b32 v0, v5, 16, v12 -; GFX12-NEXT: v_lshl_or_b32 v1, v13, 16, v16 -; GFX12-NEXT: v_lshl_or_b32 v7, v9, 16, v14 -; GFX12-NEXT: v_lshl_or_b32 v5, v10, 16, v15 +; GFX12-NEXT: s_ashr_i32 s12, s5, 16 +; GFX12-NEXT: s_bfe_i32 s13, s5, 0x80000 +; GFX12-NEXT: s_sext_i32_i16 s5, s5 +; GFX12-NEXT: s_ashr_i32 s14, s4, 24 +; GFX12-NEXT: s_bfe_i32 s15, s4, 0x80000 +; GFX12-NEXT: s_sext_i32_i16 s4, s4 +; GFX12-NEXT: s_bfe_i32 s9, s9, 0x80000 +; GFX12-NEXT: s_lshr_b32 s7, s7, 8 +; GFX12-NEXT: s_bfe_i32 s8, s8, 0x80000 +; GFX12-NEXT: s_lshr_b32 s6, s6, 8 +; GFX12-NEXT: s_lshr_b32 s12, s12, 8 +; GFX12-NEXT: s_bfe_i32 s11, s11, 0x80000 +; GFX12-NEXT: s_lshr_b32 s5, s5, 8 +; GFX12-NEXT: s_bfe_i32 s10, s10, 0x80000 +; GFX12-NEXT: s_lshr_b32 s4, s4, 8 +; GFX12-NEXT: s_pack_ll_b32_b16 s2, s9, s2 +; GFX12-NEXT: s_pack_ll_b32_b16 s3, s3, s7 +; GFX12-NEXT: s_pack_ll_b32_b16 s6, s17, s6 +; GFX12-NEXT: s_pack_ll_b32_b16 s7, s8, s16 +; GFX12-NEXT: s_pack_ll_b32_b16 s11, s11, s12 +; GFX12-NEXT: s_pack_ll_b32_b16 s5, s13, s5 +; GFX12-NEXT: s_pack_ll_b32_b16 s10, s10, s14 +; GFX12-NEXT: s_pack_ll_b32_b16 s4, s15, s4 +; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s2 +; GFX12-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v5, s10 +; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v7, s11 +; GFX12-NEXT: v_mov_b32_e32 v6, s5 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v8, 
v[0:3], s[0:1] offset:16 ; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1] @@ -11224,90 +11202,94 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_lshr_b32 s14, s1, 24 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v0, 8, s0 -; GFX8-NOHSA-NEXT: s_lshl_b32 s14, s14, 16 ; GFX8-NOHSA-NEXT: s_bfe_u32 s15, s1, 0x80010 -; GFX8-NOHSA-NEXT: s_lshr_b32 s13, s0, 24 -; GFX8-NOHSA-NEXT: s_or_b32 s14, s15, s14 -; GFX8-NOHSA-NEXT: s_and_b32 s15, s1, 0xff +; GFX8-NOHSA-NEXT: s_and_b32 s16, s1, 0xff ; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s1, 8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s0 -; GFX8-NOHSA-NEXT: s_and_b32 s0, s0, 0xff -; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX8-NOHSA-NEXT: s_lshl_b32 s14, s14, 16 ; GFX8-NOHSA-NEXT: s_and_b32 s1, s1, 0xff0000 -; GFX8-NOHSA-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX8-NOHSA-NEXT: s_lshr_b32 s13, s0, 24 +; GFX8-NOHSA-NEXT: s_or_b32 s14, s15, s14 +; GFX8-NOHSA-NEXT: s_or_b32 s15, s16, s1 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NOHSA-NEXT: s_and_b32 s1, s0, 0xff +; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s0, 8 +; GFX8-NOHSA-NEXT: s_and_b32 s0, s0, 0xff0000 +; GFX8-NOHSA-NEXT: v_alignbit_b32 v0, s13, v0, 16 +; GFX8-NOHSA-NEXT: s_or_b32 s13, s1, s0 ; GFX8-NOHSA-NEXT: s_lshr_b32 s0, s3, 24 -; GFX8-NOHSA-NEXT: s_or_b32 s15, s15, s1 ; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s0, 16 ; GFX8-NOHSA-NEXT: s_bfe_u32 s1, s3, 0x80010 -; GFX8-NOHSA-NEXT: v_alignbit_b32 v1, s13, v1, 16 -; GFX8-NOHSA-NEXT: s_or_b32 s13, s1, s0 +; GFX8-NOHSA-NEXT: s_or_b32 s16, s1, s0 ; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s3, 8 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v2, 8, s2 ; GFX8-NOHSA-NEXT: s_and_b32 s0, s3, 0xff ; GFX8-NOHSA-NEXT: s_and_b32 s1, s1, 0xff0000 ; GFX8-NOHSA-NEXT: s_or_b32 s3, s0, s1 +; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s2, 8 ; GFX8-NOHSA-NEXT: s_and_b32 s0, s2, 0xff -; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NOHSA-NEXT: v_or_b32_e32 v2, s0, v2 +; GFX8-NOHSA-NEXT: s_and_b32 s1, s1, 0xff0000 +; GFX8-NOHSA-NEXT: s_lshr_b32 s12, s2, 24 +; GFX8-NOHSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NOHSA-NEXT: s_or_b32 s2, s0, s1 ; GFX8-NOHSA-NEXT: s_lshr_b32 s0, s5, 24 ; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s0, 16 ; GFX8-NOHSA-NEXT: s_bfe_u32 s1, s5, 0x80010 -; GFX8-NOHSA-NEXT: s_lshr_b32 s12, s2, 24 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s2 -; GFX8-NOHSA-NEXT: s_or_b32 s2, s1, s0 +; GFX8-NOHSA-NEXT: v_alignbit_b32 v0, s12, v0, 16 +; GFX8-NOHSA-NEXT: s_or_b32 s12, s1, s0 ; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s5, 8 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v4, 8, s4 ; GFX8-NOHSA-NEXT: s_and_b32 s0, s5, 0xff ; GFX8-NOHSA-NEXT: s_and_b32 s1, s1, 0xff0000 ; GFX8-NOHSA-NEXT: s_or_b32 s5, s0, s1 +; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s4, 8 ; GFX8-NOHSA-NEXT: s_and_b32 s0, s4, 0xff -; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX8-NOHSA-NEXT: v_or_b32_e32 v4, s0, v4 -; GFX8-NOHSA-NEXT: s_lshr_b32 s0, s7, 24 +; GFX8-NOHSA-NEXT: s_and_b32 s1, s1, 0xff0000 ; GFX8-NOHSA-NEXT: s_lshr_b32 s11, s4, 24 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NOHSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NOHSA-NEXT: s_or_b32 s4, s0, s1 +; GFX8-NOHSA-NEXT: s_lshr_b32 s0, s7, 24 ; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s0, 16 ; GFX8-NOHSA-NEXT: s_bfe_u32 s1, s7, 0x80010 -; GFX8-NOHSA-NEXT: s_lshl_b32 s4, s7, 8 ; GFX8-NOHSA-NEXT: s_or_b32 s0, s1, s0 ; GFX8-NOHSA-NEXT: s_and_b32 s1, s7, 0xff -; GFX8-NOHSA-NEXT: 
s_and_b32 s4, s4, 0xff0000 +; GFX8-NOHSA-NEXT: s_lshl_b32 s7, s7, 8 +; GFX8-NOHSA-NEXT: v_alignbit_b32 v0, s11, v0, 16 +; GFX8-NOHSA-NEXT: s_and_b32 s7, s7, 0xff0000 ; GFX8-NOHSA-NEXT: s_lshr_b32 s10, s6, 24 -; GFX8-NOHSA-NEXT: s_or_b32 s1, s1, s4 -; GFX8-NOHSA-NEXT: s_and_b32 s4, s6, 0xff +; GFX8-NOHSA-NEXT: v_and_b32_e32 v5, 0xff00ff, v0 +; GFX8-NOHSA-NEXT: s_or_b32 s1, s1, s7 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NOHSA-NEXT: s_and_b32 s7, s6, 0xff +; GFX8-NOHSA-NEXT: s_lshl_b32 s6, s6, 8 +; GFX8-NOHSA-NEXT: s_and_b32 s6, s6, 0xff0000 +; GFX8-NOHSA-NEXT: s_or_b32 s6, s7, s6 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s0 ; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 48 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s1 ; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v6, 8, s6 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s6 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s1 -; GFX8-NOHSA-NEXT: v_alignbit_b32 v7, s10, v7, 16 -; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX8-NOHSA-NEXT: v_alignbit_b32 v0, s10, v0, 16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s0 ; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 32 -; GFX8-NOHSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v7 -; GFX8-NOHSA-NEXT: v_or_b32_e32 v6, s4, v6 +; GFX8-NOHSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s6 ; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[10:11], v[6:9] -; GFX8-NOHSA-NEXT: v_alignbit_b32 v5, s11, v5, 16 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s1 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s0 ; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 16 -; GFX8-NOHSA-NEXT: v_and_b32_e32 v5, 0xff00ff, v5 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s5 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s2 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s12 ; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0 -; GFX8-NOHSA-NEXT: v_alignbit_b32 v3, s12, v3, 16 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] -; GFX8-NOHSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s1 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s3 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s13 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[6:7], v[2:5] -; GFX8-NOHSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v1 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s13 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s8 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s15 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s14 @@ -11594,87 +11576,68 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshr_b32 s25, s1, 16 -; GFX12-NEXT: s_lshr_b32 s21, s3, 16 -; GFX12-NEXT: s_lshr_b32 s23, s0, 16 -; GFX12-NEXT: v_and_b32_e64 v6, 0xff, s1 -; GFX12-NEXT: v_and_b32_e64 v10, 0xff, s3 -; GFX12-NEXT: v_and_b32_e64 v11, 0xff, s2 -; GFX12-NEXT: v_and_b32_e64 v12, 0xff, s5 -; GFX12-NEXT: v_and_b32_e64 v13, 0xff, s4 -; GFX12-NEXT: v_and_b32_e64 v14, 0xff, s25 -; GFX12-NEXT: v_and_b32_e64 v7, 0xff, s0 -; GFX12-NEXT: v_and_b32_e64 v15, 0xff, s23 -; GFX12-NEXT: v_and_b32_e64 v17, 0xff, s21 -; GFX12-NEXT: s_lshr_b32 s17, s5, 16 -; GFX12-NEXT: v_lshrrev_b16 v8, 8, s4 -; GFX12-NEXT: v_lshrrev_b16 v9, 8, s5 -; GFX12-NEXT: v_lshrrev_b16 v3, 8, s2 -; GFX12-NEXT: v_lshrrev_b16 v4, 8, s3 -; GFX12-NEXT: v_lshrrev_b16 v2, 8, s1 -; GFX12-NEXT: v_and_b32_e64 v19, 0xff, s17 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX12-NEXT: 
v_and_b32_e32 v10, 0xffff, v10 -; GFX12-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX12-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX12-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GFX12-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GFX12-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_and_b32 v7, 0xffff, v7 -; GFX12-NEXT: v_lshrrev_b16 v0, 8, s0 -; GFX12-NEXT: v_and_b32_e32 v20, 0xffff, v15 -; GFX12-NEXT: v_and_b32_e32 v15, 0xffff, v17 -; GFX12-NEXT: s_lshr_b32 s11, s6, 16 -; GFX12-NEXT: s_lshr_b32 s13, s7, 16 -; GFX12-NEXT: s_lshr_b32 s24, s1, 24 -; GFX12-NEXT: s_lshr_b32 s15, s4, 16 -; GFX12-NEXT: s_lshr_b32 s20, s3, 24 -; GFX12-NEXT: s_lshr_b32 s19, s2, 16 -; GFX12-NEXT: v_and_b32_e32 v17, 0xffff, v19 -; GFX12-NEXT: v_lshl_or_b32 v2, v2, 16, v6 -; GFX12-NEXT: v_lshl_or_b32 v6, v4, 16, v10 -; GFX12-NEXT: v_lshl_or_b32 v4, v3, 16, v11 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_lshl_or_b32 v3, s24, 16, v14 -; GFX12-NEXT: v_lshl_or_b32 v10, v9, 16, v12 -; GFX12-NEXT: v_lshl_or_b32 v8, v8, 16, v13 -; GFX12-NEXT: v_and_b32_e64 v9, 0xff, s7 -; GFX12-NEXT: v_and_b32_e64 v12, 0xff, s6 -; GFX12-NEXT: v_and_b32_e64 v13, 0xff, s13 -; GFX12-NEXT: v_and_b32_e64 v14, 0xff, s11 -; GFX12-NEXT: v_lshl_or_b32 v0, v0, 16, v7 -; GFX12-NEXT: v_lshl_or_b32 v7, s20, 16, v15 -; GFX12-NEXT: v_and_b32_e64 v15, 0xff, s15 -; GFX12-NEXT: v_and_b32_e64 v18, 0xff, s19 -; GFX12-NEXT: s_lshr_b32 s16, s5, 24 -; GFX12-NEXT: v_lshrrev_b16 v1, 8, s6 -; GFX12-NEXT: v_lshrrev_b16 v5, 8, s7 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_lshl_or_b32 v11, s16, 16, v17 -; GFX12-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX12-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX12-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GFX12-NEXT: v_and_b32_e32 v17, 0xffff, v14 -; GFX12-NEXT: v_and_b32_e32 v19, 0xffff, v15 -; GFX12-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GFX12-NEXT: s_lshr_b32 s10, s6, 24 -; GFX12-NEXT: s_lshr_b32 s12, s7, 24 -; GFX12-NEXT: s_lshr_b32 s14, s4, 24 -; GFX12-NEXT: s_lshr_b32 s18, s2, 24 -; GFX12-NEXT: v_lshl_or_b32 v14, v5, 16, v9 -; GFX12-NEXT: v_lshl_or_b32 v12, v1, 16, v12 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_lshl_or_b32 v15, s12, 16, v13 -; GFX12-NEXT: v_lshl_or_b32 v13, s10, 16, v17 -; GFX12-NEXT: s_lshr_b32 s22, s0, 24 -; GFX12-NEXT: v_lshl_or_b32 v9, s14, 16, v19 -; GFX12-NEXT: v_lshl_or_b32 v5, s18, 16, v18 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_lshl_or_b32 v1, s22, 16, v20 +; GFX12-NEXT: s_bfe_u32 s12, s7, 0x80008 +; GFX12-NEXT: s_lshr_b32 s13, s7, 24 +; GFX12-NEXT: s_bfe_u32 s33, s7, 0x80010 +; GFX12-NEXT: s_and_b32 s7, s7, 0xff +; GFX12-NEXT: s_bfe_u32 s10, s6, 0x80008 +; GFX12-NEXT: s_lshr_b32 s11, s6, 24 +; GFX12-NEXT: s_pack_ll_b32_b16 s7, s7, s12 +; GFX12-NEXT: s_and_b32 s12, s6, 0xff +; GFX12-NEXT: s_bfe_u32 s6, s6, 0x80010 +; GFX12-NEXT: s_bfe_u32 s14, s4, 0x80008 +; GFX12-NEXT: s_lshr_b32 s15, s4, 24 +; GFX12-NEXT: s_bfe_u32 s16, s5, 0x80008 +; GFX12-NEXT: s_lshr_b32 s17, s5, 24 +; GFX12-NEXT: s_bfe_u32 s30, s5, 0x80010 +; GFX12-NEXT: s_and_b32 s5, s5, 0xff +; GFX12-NEXT: s_bfe_u32 s31, s4, 0x80010 +; GFX12-NEXT: s_and_b32 s4, s4, 0xff +; GFX12-NEXT: s_bfe_u32 s18, s2, 0x80008 +; GFX12-NEXT: s_lshr_b32 s19, s2, 24 +; GFX12-NEXT: s_bfe_u32 s20, s3, 0x80008 +; GFX12-NEXT: s_lshr_b32 s21, s3, 24 +; GFX12-NEXT: s_bfe_u32 s28, s3, 0x80010 +; GFX12-NEXT: s_and_b32 s3, s3, 0xff +; GFX12-NEXT: s_bfe_u32 s29, s2, 0x80010 +; GFX12-NEXT: s_and_b32 s2, s2, 0xff +; GFX12-NEXT: s_pack_ll_b32_b16 s13, s33, s13 +; GFX12-NEXT: s_pack_ll_b32_b16 s10, s12, s10 +; GFX12-NEXT: s_pack_ll_b32_b16 s6, s6, s11 +; GFX12-NEXT: 
s_bfe_u32 s22, s0, 0x80008 +; GFX12-NEXT: s_lshr_b32 s23, s0, 24 +; GFX12-NEXT: s_bfe_u32 s24, s1, 0x80008 +; GFX12-NEXT: s_lshr_b32 s25, s1, 24 +; GFX12-NEXT: s_bfe_u32 s26, s1, 0x80010 +; GFX12-NEXT: s_and_b32 s1, s1, 0xff +; GFX12-NEXT: s_bfe_u32 s27, s0, 0x80010 +; GFX12-NEXT: s_and_b32 s0, s0, 0xff +; GFX12-NEXT: s_pack_ll_b32_b16 s17, s30, s17 +; GFX12-NEXT: s_pack_ll_b32_b16 s5, s5, s16 +; GFX12-NEXT: s_pack_ll_b32_b16 s15, s31, s15 +; GFX12-NEXT: s_pack_ll_b32_b16 s4, s4, s14 +; GFX12-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v1, s6 +; GFX12-NEXT: s_pack_ll_b32_b16 s21, s28, s21 +; GFX12-NEXT: s_pack_ll_b32_b16 s3, s3, s20 +; GFX12-NEXT: s_pack_ll_b32_b16 s19, s29, s19 +; GFX12-NEXT: s_pack_ll_b32_b16 s2, s2, s18 +; GFX12-NEXT: v_dual_mov_b32 v0, s10 :: v_dual_mov_b32 v3, s13 +; GFX12-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v5, s15 +; GFX12-NEXT: s_pack_ll_b32_b16 s25, s26, s25 +; GFX12-NEXT: s_pack_ll_b32_b16 s1, s1, s24 +; GFX12-NEXT: s_pack_ll_b32_b16 s23, s27, s23 +; GFX12-NEXT: s_pack_ll_b32_b16 s0, s0, s22 +; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v7, s17 +; GFX12-NEXT: v_dual_mov_b32 v6, s5 :: v_dual_mov_b32 v9, s19 +; GFX12-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v11, s21 +; GFX12-NEXT: v_dual_mov_b32 v10, s3 :: v_dual_mov_b32 v13, s23 +; GFX12-NEXT: v_dual_mov_b32 v12, s0 :: v_dual_mov_b32 v15, s25 +; GFX12-NEXT: v_mov_b32_e32 v14, s1 ; GFX12-NEXT: s_clause 0x3 -; GFX12-NEXT: global_store_b128 v16, v[12:15], s[8:9] offset:48 -; GFX12-NEXT: global_store_b128 v16, v[8:11], s[8:9] offset:32 -; GFX12-NEXT: global_store_b128 v16, v[4:7], s[8:9] offset:16 -; GFX12-NEXT: global_store_b128 v16, v[0:3], s[8:9] +; GFX12-NEXT: global_store_b128 v16, v[0:3], s[8:9] offset:48 +; GFX12-NEXT: global_store_b128 v16, v[4:7], s[8:9] offset:32 +; GFX12-NEXT: global_store_b128 v16, v[8:11], s[8:9] offset:16 +; GFX12-NEXT: global_store_b128 v16, v[12:15], s[8:9] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -11926,114 +11889,137 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NOHSA-NEXT: s_bfe_i32 s18, s1, 0x80000 -; GFX8-NOHSA-NEXT: v_ashrrev_i16_e64 v0, 8, s1 ; GFX8-NOHSA-NEXT: s_lshr_b32 s17, s1, 16 -; GFX8-NOHSA-NEXT: s_and_b32 s18, 0xffff, s18 -; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX8-NOHSA-NEXT: s_sext_i32_i16 s18, s1 +; GFX8-NOHSA-NEXT: s_bfe_i32 s19, s1, 0x80000 +; GFX8-NOHSA-NEXT: s_ashr_i32 s1, s1, 16 +; GFX8-NOHSA-NEXT: s_lshl_b32 s18, s18, 8 +; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s1, 8 +; GFX8-NOHSA-NEXT: s_bfe_i32 s17, s17, 0x80000 ; GFX8-NOHSA-NEXT: s_lshr_b32 s16, s0, 16 -; GFX8-NOHSA-NEXT: v_or_b32_e32 v2, s18, v0 -; GFX8-NOHSA-NEXT: s_bfe_i32 s1, s0, 0x80000 -; GFX8-NOHSA-NEXT: v_ashrrev_i16_e64 v0, 8, s0 -; GFX8-NOHSA-NEXT: s_bfe_i32 s0, s17, 0x80000 -; GFX8-NOHSA-NEXT: v_ashrrev_i16_e64 v1, 8, s17 -; GFX8-NOHSA-NEXT: s_and_b32 s0, 0xffff, s0 -; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NOHSA-NEXT: v_or_b32_e32 v3, s0, v1 -; GFX8-NOHSA-NEXT: s_bfe_i32 s0, s16, 0x80000 -; GFX8-NOHSA-NEXT: v_ashrrev_i16_e64 v1, 8, s16 -; GFX8-NOHSA-NEXT: s_and_b32 s0, 0xffff, s0 -; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NOHSA-NEXT: v_or_b32_e32 v1, s0, v1 -; GFX8-NOHSA-NEXT: s_bfe_i32 s0, s3, 0x80000 -; GFX8-NOHSA-NEXT: v_ashrrev_i16_e64 v4, 8, s3 -; GFX8-NOHSA-NEXT: s_and_b32 s0, 0xffff, s0 -; 
GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX8-NOHSA-NEXT: v_or_b32_e32 v6, s0, v4 -; GFX8-NOHSA-NEXT: s_bfe_i32 s0, s2, 0x80000 -; GFX8-NOHSA-NEXT: v_ashrrev_i16_e64 v4, 8, s2 +; GFX8-NOHSA-NEXT: s_and_b32 s18, s18, 0xffff0000 +; GFX8-NOHSA-NEXT: s_and_b32 s19, 0xffff, s19 +; GFX8-NOHSA-NEXT: s_and_b32 s1, s1, 0xffff0000 +; GFX8-NOHSA-NEXT: s_and_b32 s17, 0xffff, s17 +; GFX8-NOHSA-NEXT: s_or_b32 s18, s19, s18 +; GFX8-NOHSA-NEXT: s_sext_i32_i16 s19, s0 +; GFX8-NOHSA-NEXT: s_bfe_i32 s20, s0, 0x80000 +; GFX8-NOHSA-NEXT: s_or_b32 s17, s17, s1 +; GFX8-NOHSA-NEXT: s_ashr_i32 s0, s0, 24 +; GFX8-NOHSA-NEXT: s_bfe_i32 s1, s16, 0x80000 +; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s0, 16 +; GFX8-NOHSA-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NOHSA-NEXT: s_or_b32 s16, s1, s0 +; GFX8-NOHSA-NEXT: s_sext_i32_i16 s0, s3 +; GFX8-NOHSA-NEXT: s_lshl_b32 s19, s19, 8 +; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s0, 8 +; GFX8-NOHSA-NEXT: s_bfe_i32 s1, s3, 0x80000 +; GFX8-NOHSA-NEXT: s_and_b32 s19, s19, 0xffff0000 +; GFX8-NOHSA-NEXT: s_and_b32 s20, 0xffff, s20 +; GFX8-NOHSA-NEXT: s_and_b32 s0, s0, 0xffff0000 +; GFX8-NOHSA-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NOHSA-NEXT: s_or_b32 s19, s20, s19 +; GFX8-NOHSA-NEXT: s_or_b32 s20, s1, s0 +; GFX8-NOHSA-NEXT: s_sext_i32_i16 s0, s2 +; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s0, 8 +; GFX8-NOHSA-NEXT: s_bfe_i32 s1, s2, 0x80000 +; GFX8-NOHSA-NEXT: s_and_b32 s0, s0, 0xffff0000 +; GFX8-NOHSA-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NOHSA-NEXT: s_lshr_b32 s15, s3, 16 -; GFX8-NOHSA-NEXT: s_and_b32 s0, 0xffff, s0 -; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX8-NOHSA-NEXT: v_or_b32_e32 v4, s0, v4 -; GFX8-NOHSA-NEXT: s_bfe_i32 s0, s15, 0x80000 -; GFX8-NOHSA-NEXT: v_ashrrev_i16_e64 v5, 8, s15 +; GFX8-NOHSA-NEXT: s_or_b32 s21, s1, s0 +; GFX8-NOHSA-NEXT: s_ashr_i32 s0, s3, 16 +; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s0, 8 +; GFX8-NOHSA-NEXT: s_bfe_i32 s1, s15, 0x80000 ; GFX8-NOHSA-NEXT: s_lshr_b32 s14, s2, 16 -; GFX8-NOHSA-NEXT: s_and_b32 s0, 0xffff, s0 -; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX8-NOHSA-NEXT: v_or_b32_e32 v7, s0, v5 -; GFX8-NOHSA-NEXT: s_bfe_i32 s0, s14, 0x80000 -; GFX8-NOHSA-NEXT: v_ashrrev_i16_e64 v5, 8, s14 +; GFX8-NOHSA-NEXT: s_and_b32 s0, s0, 0xffff0000 +; GFX8-NOHSA-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NOHSA-NEXT: s_or_b32 s3, s1, s0 +; GFX8-NOHSA-NEXT: s_ashr_i32 s0, s2, 24 +; GFX8-NOHSA-NEXT: s_bfe_i32 s1, s14, 0x80000 +; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s0, 16 +; GFX8-NOHSA-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NOHSA-NEXT: s_or_b32 s2, s1, s0 +; GFX8-NOHSA-NEXT: s_sext_i32_i16 s0, s5 +; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s0, 8 +; GFX8-NOHSA-NEXT: s_bfe_i32 s1, s5, 0x80000 +; GFX8-NOHSA-NEXT: s_and_b32 s0, s0, 0xffff0000 +; GFX8-NOHSA-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NOHSA-NEXT: s_or_b32 s14, s1, s0 +; GFX8-NOHSA-NEXT: s_sext_i32_i16 s0, s4 +; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s0, 8 +; GFX8-NOHSA-NEXT: s_bfe_i32 s1, s4, 0x80000 +; GFX8-NOHSA-NEXT: s_and_b32 s0, s0, 0xffff0000 ; GFX8-NOHSA-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NOHSA-NEXT: s_and_b32 s0, 0xffff, s0 -; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX8-NOHSA-NEXT: s_lshr_b32 s13, s5, 16 -; GFX8-NOHSA-NEXT: v_or_b32_e32 v0, s1, v0 -; GFX8-NOHSA-NEXT: v_or_b32_e32 v5, s0, v5 +; GFX8-NOHSA-NEXT: s_or_b32 s15, s1, s0 ; GFX8-NOHSA-NEXT: s_ashr_i64 s[0:1], s[4:5], 56 ; GFX8-NOHSA-NEXT: s_bfe_i32 s1, s13, 0x80000 +; GFX8-NOHSA-NEXT: s_lshr_b32 s12, s4, 16 ; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s0, 16 ; GFX8-NOHSA-NEXT: s_and_b32 s1, 0xffff, s1 -; 
GFX8-NOHSA-NEXT: s_or_b32 s2, s1, s0 -; GFX8-NOHSA-NEXT: s_bfe_i32 s0, s5, 0x80000 -; GFX8-NOHSA-NEXT: v_ashrrev_i16_e64 v8, 8, s5 -; GFX8-NOHSA-NEXT: s_and_b32 s0, 0xffff, s0 -; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX8-NOHSA-NEXT: v_or_b32_e32 v10, s0, v8 -; GFX8-NOHSA-NEXT: s_bfe_i32 s0, s4, 0x80000 -; GFX8-NOHSA-NEXT: v_ashrrev_i16_e64 v8, 8, s4 -; GFX8-NOHSA-NEXT: s_lshr_b32 s12, s4, 16 -; GFX8-NOHSA-NEXT: s_and_b32 s0, 0xffff, s0 -; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX8-NOHSA-NEXT: v_or_b32_e32 v8, s0, v8 -; GFX8-NOHSA-NEXT: s_bfe_i32 s0, s12, 0x80000 -; GFX8-NOHSA-NEXT: v_ashrrev_i16_e64 v9, 8, s12 -; GFX8-NOHSA-NEXT: s_and_b32 s0, 0xffff, s0 -; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX8-NOHSA-NEXT: v_or_b32_e32 v9, s0, v9 -; GFX8-NOHSA-NEXT: s_bfe_i32 s0, s7, 0x80000 -; GFX8-NOHSA-NEXT: v_ashrrev_i16_e64 v11, 8, s7 -; GFX8-NOHSA-NEXT: s_and_b32 s0, 0xffff, s0 -; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX8-NOHSA-NEXT: v_or_b32_e32 v13, s0, v11 -; GFX8-NOHSA-NEXT: s_bfe_i32 s0, s6, 0x80000 -; GFX8-NOHSA-NEXT: v_ashrrev_i16_e64 v11, 8, s6 -; GFX8-NOHSA-NEXT: s_lshr_b32 s11, s7, 16 -; GFX8-NOHSA-NEXT: s_and_b32 s0, 0xffff, s0 -; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX8-NOHSA-NEXT: v_or_b32_e32 v11, s0, v11 -; GFX8-NOHSA-NEXT: s_bfe_i32 s0, s11, 0x80000 -; GFX8-NOHSA-NEXT: v_ashrrev_i16_e64 v12, 8, s11 +; GFX8-NOHSA-NEXT: s_or_b32 s5, s1, s0 +; GFX8-NOHSA-NEXT: s_ashr_i32 s0, s4, 24 +; GFX8-NOHSA-NEXT: s_bfe_i32 s1, s12, 0x80000 +; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s0, 16 +; GFX8-NOHSA-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NOHSA-NEXT: s_or_b32 s4, s1, s0 +; GFX8-NOHSA-NEXT: s_sext_i32_i16 s0, s7 +; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s0, 8 +; GFX8-NOHSA-NEXT: s_bfe_i32 s1, s7, 0x80000 +; GFX8-NOHSA-NEXT: s_and_b32 s0, s0, 0xffff0000 +; GFX8-NOHSA-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NOHSA-NEXT: s_lshr_b32 s10, s6, 16 -; GFX8-NOHSA-NEXT: s_and_b32 s0, 0xffff, s0 -; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX8-NOHSA-NEXT: v_or_b32_e32 v14, s0, v12 -; GFX8-NOHSA-NEXT: s_bfe_i32 s0, s10, 0x80000 -; GFX8-NOHSA-NEXT: v_ashrrev_i16_e64 v12, 8, s10 -; GFX8-NOHSA-NEXT: s_and_b32 s0, 0xffff, s0 -; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX8-NOHSA-NEXT: v_or_b32_e32 v12, s0, v12 +; GFX8-NOHSA-NEXT: s_lshr_b32 s11, s7, 16 +; GFX8-NOHSA-NEXT: s_or_b32 s0, s1, s0 +; GFX8-NOHSA-NEXT: s_sext_i32_i16 s1, s6 +; GFX8-NOHSA-NEXT: s_ashr_i32 s7, s7, 16 +; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s1, 8 +; GFX8-NOHSA-NEXT: s_bfe_i32 s12, s6, 0x80000 +; GFX8-NOHSA-NEXT: s_lshl_b32 s7, s7, 8 +; GFX8-NOHSA-NEXT: s_bfe_i32 s11, s11, 0x80000 +; GFX8-NOHSA-NEXT: s_ashr_i32 s6, s6, 24 +; GFX8-NOHSA-NEXT: s_bfe_i32 s10, s10, 0x80000 +; GFX8-NOHSA-NEXT: s_and_b32 s1, s1, 0xffff0000 +; GFX8-NOHSA-NEXT: s_and_b32 s12, 0xffff, s12 +; GFX8-NOHSA-NEXT: s_and_b32 s7, s7, 0xffff0000 +; GFX8-NOHSA-NEXT: s_and_b32 s11, 0xffff, s11 +; GFX8-NOHSA-NEXT: s_lshl_b32 s6, s6, 16 +; GFX8-NOHSA-NEXT: s_and_b32 s10, 0xffff, s10 +; GFX8-NOHSA-NEXT: s_or_b32 s1, s12, s1 +; GFX8-NOHSA-NEXT: s_or_b32 s7, s11, s7 +; GFX8-NOHSA-NEXT: s_or_b32 s6, s10, s6 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 48 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s1 ; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v16, s1 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 32 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s6 +; 
GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s7 ; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[15:16], v[11:14] -; GFX8-NOHSA-NEXT: s_nop 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s1 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 16 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s2 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s15 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s14 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11] -; GFX8-NOHSA-NEXT: s_nop 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s1 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] -; GFX8-NOHSA-NEXT: s_nop 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s21 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s20 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s8 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s19 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s16 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s18 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s17 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s9 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_endpgm @@ -12372,93 +12358,87 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshr_b32 s12, s4, 16 -; GFX12-NEXT: s_lshr_b32 s14, s2, 16 -; GFX12-NEXT: v_ashrrev_i16 v4, 8, s2 -; GFX12-NEXT: s_bfe_i32 s2, s2, 0x80000 -; GFX12-NEXT: s_bfe_i32 s20, s5, 0x80000 -; GFX12-NEXT: v_ashrrev_i16 v7, 8, s4 -; GFX12-NEXT: s_bfe_i32 s4, s4, 0x80000 -; GFX12-NEXT: s_lshr_b32 s17, s1, 16 -; GFX12-NEXT: s_lshr_b32 s15, s3, 16 +; GFX12-NEXT: s_lshr_b32 s13, s5, 16 ; GFX12-NEXT: s_lshr_b32 s16, s0, 16 -; GFX12-NEXT: v_ashrrev_i16 v0, 8, s1 -; GFX12-NEXT: s_bfe_i32 s18, s1, 0x80000 -; GFX12-NEXT: v_ashrrev_i16 v1, 8, s0 -; GFX12-NEXT: s_bfe_i32 s19, s0, 0x80000 -; GFX12-NEXT: v_ashrrev_i16 v5, 8, s5 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_lshr_b32 s17, s1, 16 +; GFX12-NEXT: s_ashr_i32 s18, s1, 16 +; GFX12-NEXT: s_bfe_i32 s19, s1, 0x80000 +; GFX12-NEXT: s_sext_i32_i16 s20, s1 +; GFX12-NEXT: s_ashr_i32 s21, s0, 24 +; GFX12-NEXT: s_bfe_i32 s22, s0, 0x80000 +; GFX12-NEXT: s_sext_i32_i16 s23, s0 ; GFX12-NEXT: s_ashr_i64 s[0:1], s[4:5], 56 -; GFX12-NEXT: v_and_b32_e64 v10, 0xffff, s2 -; GFX12-NEXT: v_and_b32_e64 v12, 0xffff, s20 -; GFX12-NEXT: s_bfe_i32 s1, s17, 0x80000 -; GFX12-NEXT: v_ashrrev_i16 v3, 8, s3 -; GFX12-NEXT: s_bfe_i32 s3, s3, 0x80000 -; GFX12-NEXT: s_bfe_i32 s2, s15, 0x80000 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_and_b32_e64 v14, 0xffff, s1 -; GFX12-NEXT: s_bfe_i32 s1, s12, 0x80000 -; GFX12-NEXT: v_and_b32_e64 v2, 0xffff, s18 -; GFX12-NEXT: v_and_b32_e64 v6, 0xffff, s19 -; GFX12-NEXT: v_and_b32_e64 v8, 0xffff, s3 -; GFX12-NEXT: v_ashrrev_i16 v11, 8, s15 -; GFX12-NEXT: v_and_b32_e64 v13, 0xffff, s4 -; GFX12-NEXT: v_and_b32_e64 v15, 0xffff, s2 -; GFX12-NEXT: v_lshl_or_b32 v4, v4, 16, v10 -; GFX12-NEXT: v_lshl_or_b32 v10, v5, 16, v12 -; GFX12-NEXT: 
s_wait_alu 0xfffe -; GFX12-NEXT: v_and_b32_e64 v5, 0xffff, s1 -; GFX12-NEXT: s_bfe_i32 s1, s7, 0x80000 +; GFX12-NEXT: s_lshr_b32 s12, s4, 16 +; GFX12-NEXT: s_bfe_i32 s1, s5, 0x80000 +; GFX12-NEXT: s_sext_i32_i16 s5, s5 +; GFX12-NEXT: s_bfe_i32 s13, s13, 0x80000 +; GFX12-NEXT: s_lshr_b32 s5, s5, 8 +; GFX12-NEXT: s_pack_ll_b32_b16 s0, s13, s0 +; GFX12-NEXT: s_ashr_i32 s13, s4, 24 +; GFX12-NEXT: s_bfe_i32 s12, s12, 0x80000 +; GFX12-NEXT: s_pack_ll_b32_b16 s1, s1, s5 +; GFX12-NEXT: s_pack_ll_b32_b16 s5, s12, s13 +; GFX12-NEXT: s_sext_i32_i16 s12, s4 +; GFX12-NEXT: s_bfe_i32 s4, s4, 0x80000 +; GFX12-NEXT: s_lshr_b32 s12, s12, 8 +; GFX12-NEXT: s_ashr_i32 s13, s7, 16 +; GFX12-NEXT: s_pack_ll_b32_b16 s4, s4, s12 +; GFX12-NEXT: s_lshr_b32 s12, s13, 8 +; GFX12-NEXT: s_sext_i32_i16 s13, s7 ; GFX12-NEXT: s_lshr_b32 s11, s7, 16 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_and_b32_e64 v12, 0xffff, s1 -; GFX12-NEXT: s_bfe_i32 s1, s6, 0x80000 +; GFX12-NEXT: s_bfe_i32 s7, s7, 0x80000 +; GFX12-NEXT: s_lshr_b32 s13, s13, 8 ; GFX12-NEXT: s_lshr_b32 s10, s6, 16 -; GFX12-NEXT: v_lshl_or_b32 v2, v0, 16, v2 -; GFX12-NEXT: v_lshl_or_b32 v0, v1, 16, v6 -; GFX12-NEXT: v_lshl_or_b32 v6, v3, 16, v8 -; GFX12-NEXT: v_lshl_or_b32 v8, v7, 16, v13 -; GFX12-NEXT: v_lshl_or_b32 v7, v11, 16, v15 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_and_b32_e64 v15, 0xffff, s1 -; GFX12-NEXT: s_bfe_i32 s1, s11, 0x80000 -; GFX12-NEXT: s_lshr_b32 s13, s5, 16 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_and_b32_e64 v22, 0xffff, s1 -; GFX12-NEXT: s_bfe_i32 s1, s10, 0x80000 -; GFX12-NEXT: v_ashrrev_i16 v9, 8, s17 -; GFX12-NEXT: s_bfe_i32 s3, s14, 0x80000 -; GFX12-NEXT: v_ashrrev_i16 v11, 8, s7 -; GFX12-NEXT: v_ashrrev_i16 v13, 8, s6 -; GFX12-NEXT: v_ashrrev_i16 v21, 8, s11 -; GFX12-NEXT: v_ashrrev_i16 v23, 8, s10 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_and_b32_e64 v24, 0xffff, s1 -; GFX12-NEXT: s_bfe_i32 s5, s16, 0x80000 -; GFX12-NEXT: v_ashrrev_i16 v1, 8, s12 -; GFX12-NEXT: v_ashrrev_i16 v18, 8, s14 -; GFX12-NEXT: s_bfe_i32 s4, s13, 0x80000 -; GFX12-NEXT: v_and_b32_e64 v20, 0xffff, s3 -; GFX12-NEXT: v_ashrrev_i16 v17, 8, s16 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_and_b32_e64 v19, 0xffff, s5 -; GFX12-NEXT: s_pack_ll_b32_b16 s0, s4, s0 -; GFX12-NEXT: v_mov_b32_e32 v16, 0 -; GFX12-NEXT: v_lshl_or_b32 v3, v9, 16, v14 -; GFX12-NEXT: v_lshl_or_b32 v14, v11, 16, v12 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v11, s0 -; GFX12-NEXT: v_lshl_or_b32 v12, v13, 16, v15 -; GFX12-NEXT: v_lshl_or_b32 v15, v21, 16, v22 -; GFX12-NEXT: v_lshl_or_b32 v13, v23, 16, v24 -; GFX12-NEXT: v_lshl_or_b32 v9, v1, 16, v5 -; GFX12-NEXT: v_lshl_or_b32 v5, v18, 16, v20 -; GFX12-NEXT: v_lshl_or_b32 v1, v17, 16, v19 +; GFX12-NEXT: s_bfe_i32 s11, s11, 0x80000 +; GFX12-NEXT: s_pack_ll_b32_b16 s7, s7, s13 +; GFX12-NEXT: s_sext_i32_i16 s13, s6 +; GFX12-NEXT: s_lshr_b32 s14, s2, 16 +; GFX12-NEXT: s_lshr_b32 s15, s3, 16 +; GFX12-NEXT: s_ashr_i32 s24, s3, 16 +; GFX12-NEXT: s_bfe_i32 s25, s3, 0x80000 +; GFX12-NEXT: s_sext_i32_i16 s3, s3 +; GFX12-NEXT: s_ashr_i32 s26, s2, 24 +; GFX12-NEXT: s_bfe_i32 s27, s2, 0x80000 +; GFX12-NEXT: s_sext_i32_i16 s2, s2 +; GFX12-NEXT: s_pack_ll_b32_b16 s11, s11, s12 +; GFX12-NEXT: s_ashr_i32 s12, s6, 24 +; GFX12-NEXT: s_bfe_i32 s6, s6, 0x80000 +; GFX12-NEXT: s_lshr_b32 s13, s13, 8 +; GFX12-NEXT: s_bfe_i32 s10, s10, 0x80000 +; GFX12-NEXT: s_lshr_b32 s24, s24, 8 +; GFX12-NEXT: s_bfe_i32 s15, s15, 0x80000 +; GFX12-NEXT: s_lshr_b32 s3, s3, 8 +; GFX12-NEXT: s_bfe_i32 s14, s14, 0x80000 +; GFX12-NEXT: 
s_lshr_b32 s2, s2, 8 +; GFX12-NEXT: s_pack_ll_b32_b16 s6, s6, s13 +; GFX12-NEXT: s_pack_ll_b32_b16 s10, s10, s12 +; GFX12-NEXT: s_lshr_b32 s18, s18, 8 +; GFX12-NEXT: s_bfe_i32 s17, s17, 0x80000 +; GFX12-NEXT: s_lshr_b32 s20, s20, 8 +; GFX12-NEXT: s_bfe_i32 s16, s16, 0x80000 +; GFX12-NEXT: s_lshr_b32 s23, s23, 8 +; GFX12-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v1, s10 +; GFX12-NEXT: s_pack_ll_b32_b16 s15, s15, s24 +; GFX12-NEXT: s_pack_ll_b32_b16 s3, s25, s3 +; GFX12-NEXT: s_pack_ll_b32_b16 s14, s14, s26 +; GFX12-NEXT: s_pack_ll_b32_b16 s2, s27, s2 +; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s11 +; GFX12-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v5, s5 +; GFX12-NEXT: s_pack_ll_b32_b16 s17, s17, s18 +; GFX12-NEXT: s_pack_ll_b32_b16 s18, s19, s20 +; GFX12-NEXT: s_pack_ll_b32_b16 s16, s16, s21 +; GFX12-NEXT: s_pack_ll_b32_b16 s19, s22, s23 +; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v7, s0 +; GFX12-NEXT: v_dual_mov_b32 v6, s1 :: v_dual_mov_b32 v9, s14 +; GFX12-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v11, s15 +; GFX12-NEXT: v_dual_mov_b32 v10, s3 :: v_dual_mov_b32 v13, s16 +; GFX12-NEXT: v_dual_mov_b32 v12, s19 :: v_dual_mov_b32 v15, s17 +; GFX12-NEXT: v_mov_b32_e32 v14, s18 ; GFX12-NEXT: s_clause 0x3 -; GFX12-NEXT: global_store_b128 v16, v[12:15], s[8:9] offset:48 -; GFX12-NEXT: global_store_b128 v16, v[8:11], s[8:9] offset:32 -; GFX12-NEXT: global_store_b128 v16, v[4:7], s[8:9] offset:16 -; GFX12-NEXT: global_store_b128 v16, v[0:3], s[8:9] +; GFX12-NEXT: global_store_b128 v16, v[0:3], s[8:9] offset:48 +; GFX12-NEXT: global_store_b128 v16, v[4:7], s[8:9] offset:32 +; GFX12-NEXT: global_store_b128 v16, v[8:11], s[8:9] offset:16 +; GFX12-NEXT: global_store_b128 v16, v[12:15], s[8:9] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i8.ll b/llvm/test/CodeGen/AMDGPU/load-global-i8.ll index 6ed99f7074b641..add5f13bd2d996 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i8.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i8.ll @@ -1,6 +1,6 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GCN-NOHSA,SI,FUNC %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GCN-HSA,SI,FUNC %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GCN-NOHSA,VI,FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GCN-NOHSA,FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GCN-HSA,FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GCN-NOHSA,FUNC %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=EG -check-prefix=FUNC %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck 
-allow-deprecated-dag-overlap -check-prefix=EG -check-prefix=FUNC %s @@ -163,8 +163,7 @@ define amdgpu_kernel void @global_sextload_v2i8_to_v2i32(ptr addrspace(1) %out, ; GCN-NOHSA: buffer_load_dword v ; GCN-HSA: flat_load_dword v -; SI-DAG: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, 8, 8 -; VI-DAG: v_lshrrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}} +; GCN-DAG: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, 8, 8 ; GCN-DAG: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, 16, 8 ; GCN-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xff, @@ -186,16 +185,12 @@ entry: ; GCN-NOHSA: buffer_load_dword v ; GCN-HSA: flat_load_dword v -;FIXME: Need to optimize this sequence to avoid extra shift on VI. - ; t23: i16 = truncate t18 ; t49: i16 = srl t23, Constant:i32<8> ; t57: i32 = any_extend t49 ; t58: i32 = sign_extend_inreg t57, ValueType:ch:i8 -; SI-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 8, 8 -; VI-DAG: v_lshrrev_b16_e32 [[SHIFT:v[0-9]+]], 8, v{{[0-9]+}} -; VI-DAG: v_bfe_i32 v{{[0-9]+}}, [[SHIFT]], 0, 8 +; GCN-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 8, 8 ; GCN-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8 ; GCN-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 16, 8 diff --git a/llvm/test/CodeGen/AMDGPU/load-local-i8.ll b/llvm/test/CodeGen/AMDGPU/load-local-i8.ll index a2e55ce06b5252..97314910f82809 100644 --- a/llvm/test/CodeGen/AMDGPU/load-local-i8.ll +++ b/llvm/test/CodeGen/AMDGPU/load-local-i8.ll @@ -1,5 +1,5 @@ -; RUN: llc -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,SI,SICIVI,FUNC %s -; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=-enable-ds128 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,VI,SICIVI,FUNC %s +; RUN: llc -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,SICIVI,FUNC %s +; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=-enable-ds128 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,SICIVI,FUNC %s ; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-enable-ds128 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX9,FUNC %s ; RUN: llc -mtriple=r600-- -mcpu=redwood -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=EG -check-prefix=FUNC %s @@ -161,12 +161,8 @@ define amdgpu_kernel void @local_zextload_v2i8_to_v2i32(ptr addrspace(3) %out, p ; t31: i32 = any_extend t23 ; t33: i32 = sign_extend_inreg t31, ValueType:ch:i8 -; SI-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 8, 8 -; SI-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8 - -; VI-DAG: v_lshrrev_b16_e32 [[SHIFT:v[0-9]+]], 8, v{{[0-9]+}} -; VI-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8 -; VI-DAG: v_bfe_i32 v{{[0-9]+}}, [[SHIFT]], 0, 8 +; GCN-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 8, 8 +; GCN-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8 ; EG: LDS_USHORT_READ_RET ; EG-DAG: BFE_INT @@ -182,8 +178,7 @@ define amdgpu_kernel void @local_sextload_v2i8_to_v2i32(ptr addrspace(3) %out, p ; GFX9-NOT: m0 ; GCN: ds_read_b32 -; SI-DAG: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, 8, 8 -; VI-DAG: v_lshrrev_b16_e32 v{{[0-9]+}}, 8, {{v[0-9]+}} +; GCN-DAG: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, 8, 8 ; GCN-DAG: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, 16, 8 ; GCN-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xff, diff --git a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll index 0fb9e2572446b5..68b07bae032139 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll +++ 
b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals --version 3 +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -O3 --amdgpu-lower-module-lds-strategy=module < %s | FileCheck -check-prefix=GCN %s ; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds --amdgpu-lower-module-lds-strategy=module < %s | FileCheck %s ; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds --amdgpu-lower-module-lds-strategy=module < %s | FileCheck %s @@ -28,7 +28,7 @@ define protected amdgpu_kernel void @test(ptr addrspace(1) nocapture %ptr.coerce ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: ds_write_b8 v1, v2 offset:6 ; GCN-NEXT: ds_write_b16 v1, v3 offset:4 -; GCN-NEXT: v_cmp_eq_u16_sdwa s[2:3], v3, v0 src0_sel:BYTE_0 src1_sel:DWORD +; GCN-NEXT: v_cmp_eq_u32_sdwa s[2:3], v3, v0 src0_sel:BYTE_0 src1_sel:DWORD ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] ; GCN-NEXT: global_store_byte v1, v0, s[0:1] ; GCN-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll index 686797f290b97f..05ef2698c1f774 100644 --- a/llvm/test/CodeGen/AMDGPU/min.ll +++ b/llvm/test/CodeGen/AMDGPU/min.ll @@ -590,54 +590,66 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32] ; VI-LABEL: s_test_imin_sle_v4i8: ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[6:7], 0x28 -; VI-NEXT: s_load_dword s3, s[6:7], 0x4c ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s3, s[6:7], 0x4c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_lshrrev_b16_e64 v0, 8, s2 -; VI-NEXT: v_lshrrev_b16_e64 v1, 8, s3 ; VI-NEXT: s_ashr_i32 s4, s2, 24 ; VI-NEXT: s_bfe_i32 s5, s2, 0x80010 +; VI-NEXT: s_bfe_i32 s6, s2, 0x80008 ; VI-NEXT: s_sext_i32_i8 s2, s2 -; VI-NEXT: s_ashr_i32 s6, s3, 24 -; VI-NEXT: s_bfe_i32 s7, s3, 0x80010 +; VI-NEXT: s_ashr_i32 s7, s3, 24 +; VI-NEXT: s_bfe_i32 s8, s3, 0x80010 +; VI-NEXT: s_bfe_i32 s9, s3, 0x80008 ; VI-NEXT: s_sext_i32_i8 s3, s3 -; VI-NEXT: s_min_i32 s4, s4, s6 ; VI-NEXT: s_min_i32 s2, s2, s3 -; VI-NEXT: s_min_i32 s3, s5, s7 -; VI-NEXT: v_min_i32_sdwa v0, sext(v0), sext(v1) dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 -; VI-NEXT: v_lshlrev_b16_e64 v1, 8, s4 -; VI-NEXT: v_mov_b32_e32 v2, s3 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_min_i32 s3, s6, s9 +; VI-NEXT: s_min_i32 s5, s5, s8 +; VI-NEXT: s_min_i32 s4, s4, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xff +; VI-NEXT: s_lshl_b32 s3, s3, 8 +; VI-NEXT: s_and_b32 s2, s2, 0xff +; VI-NEXT: s_lshl_b32 s4, s4, 24 +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s2, s2, s3 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s2, s2, 0xffff +; VI-NEXT: s_or_b32 s2, s2, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: s_test_imin_sle_v4i8: ; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s3, s[6:7], 0x4c ; GFX9-NEXT: s_load_dword s2, s[6:7], 0x28 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s3, s[6:7], 0x4c ; 
GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s4, s2, 16 -; GFX9-NEXT: s_bfe_i32 s6, s4, 0x80000 -; GFX9-NEXT: s_lshr_b32 s7, s3, 16 -; GFX9-NEXT: s_bfe_i32 s8, s7, 0x80000 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_lshr_b32 s5, s2, 16 +; GFX9-NEXT: s_lshr_b32 s8, s3, 16 +; GFX9-NEXT: s_ashr_i32 s9, s3, 24 +; GFX9-NEXT: s_ashr_i32 s6, s2, 24 +; GFX9-NEXT: s_bfe_i32 s8, s8, 0x80000 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: s_bfe_i32 s5, s5, 0x80000 +; GFX9-NEXT: s_sext_i32_i16 s7, s3 +; GFX9-NEXT: v_min_i16_e32 v1, s6, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, s8 -; GFX9-NEXT: s_bfe_i32 s9, s3, 0x80000 -; GFX9-NEXT: v_min_i16_sdwa v1, sext(s4), sext(v1) dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 -; GFX9-NEXT: v_min_i16_e32 v2, s6, v2 -; GFX9-NEXT: s_bfe_i32 s5, s2, 0x80000 +; GFX9-NEXT: s_sext_i32_i16 s4, s2 +; GFX9-NEXT: s_lshr_b32 s7, s7, 8 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX9-NEXT: v_min_i16_e32 v2, s5, v2 +; GFX9-NEXT: s_lshr_b32 s4, s4, 8 +; GFX9-NEXT: s_bfe_i32 s3, s3, 0x80000 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-NEXT: v_mov_b32_e32 v3, s9 -; GFX9-NEXT: v_min_i16_sdwa v2, sext(s2), sext(v2) dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 -; GFX9-NEXT: v_min_i16_e32 v3, s5, v3 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: s_bfe_i32 s2, s2, 0x80000 +; GFX9-NEXT: v_min_i16_e32 v2, s4, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX9-NEXT: v_min_i16_e32 v3, s2, v3 ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] @@ -650,26 +662,28 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32] ; GFX10-NEXT: s_load_dword s3, s[6:7], 0x4c ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_lshr_b32 s4, s2, 16 -; GFX10-NEXT: s_lshr_b32 s5, s3, 16 -; GFX10-NEXT: v_ashrrev_i16 v0, 8, s2 -; GFX10-NEXT: v_ashrrev_i16 v1, 8, s4 -; GFX10-NEXT: v_ashrrev_i16 v2, 8, s5 -; GFX10-NEXT: v_ashrrev_i16 v3, 8, s3 +; GFX10-NEXT: s_sext_i32_i16 s4, s2 +; GFX10-NEXT: s_sext_i32_i16 s7, s3 +; GFX10-NEXT: s_ashr_i32 s6, s2, 24 +; GFX10-NEXT: s_ashr_i32 s9, s3, 24 +; GFX10-NEXT: s_lshr_b32 s4, s4, 8 +; GFX10-NEXT: s_lshr_b32 s7, s7, 8 +; GFX10-NEXT: v_min_i16 v0, s6, s9 +; GFX10-NEXT: v_min_i16 v1, s4, s7 +; GFX10-NEXT: s_lshr_b32 s5, s2, 16 +; GFX10-NEXT: s_lshr_b32 s8, s3, 16 ; GFX10-NEXT: s_bfe_i32 s2, s2, 0x80000 -; GFX10-NEXT: s_bfe_i32 s3, s3, 0x80000 -; GFX10-NEXT: s_bfe_i32 s4, s4, 0x80000 -; GFX10-NEXT: v_min_i16 v1, v1, v2 -; GFX10-NEXT: v_min_i16 v0, v0, v3 ; GFX10-NEXT: s_bfe_i32 s5, s5, 0x80000 -; GFX10-NEXT: v_min_i16 v2, s2, s3 -; GFX10-NEXT: v_min_i16 v3, s4, s5 -; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1 -; GFX10-NEXT: v_lshlrev_b16 v0, 8, v0 -; GFX10-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: s_bfe_i32 s4, s8, 0x80000 +; GFX10-NEXT: s_bfe_i32 s3, s3, 0x80000 +; GFX10-NEXT: v_min_i16 v2, s5, s4 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX10-NEXT: 
v_min_i16 v3, s2, s3 +; GFX10-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX10-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; @@ -679,34 +693,36 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32] ; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x28 ; GFX11-NEXT: s_load_b32 s1, s[2:3], 0x4c ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s4, s0, 16 -; GFX11-NEXT: s_lshr_b32 s5, s1, 16 -; GFX11-NEXT: v_ashrrev_i16 v0, 8, s0 -; GFX11-NEXT: v_ashrrev_i16 v1, 8, s1 -; GFX11-NEXT: v_ashrrev_i16 v2, 8, s4 -; GFX11-NEXT: v_ashrrev_i16 v3, 8, s5 +; GFX11-NEXT: s_sext_i32_i16 s4, s0 +; GFX11-NEXT: s_lshr_b32 s5, s0, 16 +; GFX11-NEXT: s_sext_i32_i16 s7, s1 +; GFX11-NEXT: s_lshr_b32 s8, s1, 16 +; GFX11-NEXT: s_ashr_i32 s6, s0, 24 ; GFX11-NEXT: s_bfe_i32 s0, s0, 0x80000 +; GFX11-NEXT: s_ashr_i32 s9, s1, 24 ; GFX11-NEXT: s_bfe_i32 s1, s1, 0x80000 -; GFX11-NEXT: s_bfe_i32 s4, s4, 0x80000 +; GFX11-NEXT: s_lshr_b32 s4, s4, 8 ; GFX11-NEXT: s_bfe_i32 s5, s5, 0x80000 -; GFX11-NEXT: v_min_i16 v4, s0, s1 -; GFX11-NEXT: v_min_i16 v5, s4, s5 -; GFX11-NEXT: v_min_i16 v2, v2, v3 -; GFX11-NEXT: v_min_i16 v0, v0, v1 +; GFX11-NEXT: s_lshr_b32 s7, s7, 8 +; GFX11-NEXT: s_bfe_i32 s8, s8, 0x80000 +; GFX11-NEXT: v_min_i16 v0, s6, s9 +; GFX11-NEXT: v_min_i16 v1, s0, s1 +; GFX11-NEXT: v_min_i16 v2, s5, s8 +; GFX11-NEXT: v_min_i16 v3, s4, s7 ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v4 -; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v5 -; GFX11-NEXT: v_lshlrev_b16 v2, 8, v2 -; GFX11-NEXT: v_lshlrev_b16 v0, 8, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v2, v0, s[0:1] ; GFX11-NEXT: s_nop 0 diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll index 9b44b58c4a01e7..30a40e6af85389 100644 --- a/llvm/test/CodeGen/AMDGPU/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/mul.ll @@ -1512,29 +1512,29 @@ define amdgpu_kernel void @s_mul_i1(ptr addrspace(1) %out, [8 x i32], i1 %a, [8 ; ; VI-LABEL: s_mul_i1: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x70 -; VI-NEXT: s_load_dword s5, s[2:3], 0x4c +; VI-NEXT: s_load_dword s4, s[2:3], 0x4c +; 
VI-NEXT: s_load_dword s5, s[2:3], 0x70 ; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mul_i32 s4, s4, s5 +; VI-NEXT: s_and_b32 s4, s4, 1 ; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mul_lo_u16_e32 v0, s5, v0 -; VI-NEXT: v_and_b32_e32 v0, 1, v0 ; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: s_mul_i1: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x70 -; GFX9-NEXT: s_load_dword s5, s[2:3], 0x4c +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x4c +; GFX9-NEXT: s_load_dword s5, s[2:3], 0x70 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mul_i32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s4, s4, 1 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mul_lo_u16_e32 v0, s5, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -1545,10 +1545,11 @@ define amdgpu_kernel void @s_mul_i1(ptr addrspace(1) %out, [8 x i32], i1 %a, [8 ; GFX10-NEXT: s_load_dword s5, s[2:3], 0x70 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 -; GFX10-NEXT: s_mov_b32 s2, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mul_lo_u16 v0, s4, s5 -; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX10-NEXT: s_mul_i32 s4, s4, s5 +; GFX10-NEXT: s_and_b32 s2, s4, 1 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: s_mov_b32 s2, -1 ; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; @@ -1559,11 +1560,12 @@ define amdgpu_kernel void @s_mul_i1(ptr addrspace(1) %out, [8 x i32], i1 %a, [8 ; GFX11-NEXT: s_load_b32 s5, s[2:3], 0x70 ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mul_lo_u16 v0, s4, s5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX11-NEXT: s_mul_i32 s4, s4, s5 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_b32 s2, s4, 1 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: buffer_store_b8 v0, off, s[0:3], 0 ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1576,11 +1578,12 @@ define amdgpu_kernel void @s_mul_i1(ptr addrspace(1) %out, [8 x i32], i1 %a, [8 ; GFX12-NEXT: s_load_b32 s5, s[2:3], 0x70 ; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-NEXT: s_mov_b32 s3, 0x31016000 -; GFX12-NEXT: s_mov_b32 s2, -1 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mul_lo_u16 v0, s4, s5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX12-NEXT: s_mul_i32 s4, s4, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_b32 s2, s4, 1 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-NEXT: s_mov_b32 s2, -1 ; GFX12-NEXT: buffer_store_b8 v0, off, s[0:3], null ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1655,7 +1658,7 @@ define amdgpu_kernel void @v_mul_i1(ptr addrspace(1) %out, ptr addrspace(1) %in) ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mul_lo_u16_e32 v0, v0, v1 +; VI-NEXT: v_mul_lo_u32 v0, v0, v1 ; VI-NEXT: v_and_b32_e32 v0, 1, v0 ; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; VI-NEXT: 
s_endpgm @@ -1675,7 +1678,7 @@ define amdgpu_kernel void @v_mul_i1(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mul_lo_u16_e32 v0, v0, v1 +; GFX9-NEXT: v_mul_lo_u32 v0, v0, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm @@ -1696,7 +1699,7 @@ define amdgpu_kernel void @v_mul_i1(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GFX10-NEXT: s_mov_b32 s0, s4 ; GFX10-NEXT: s_mov_b32 s1, s5 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mul_lo_u16 v0, v0, v1 +; GFX10-NEXT: v_mul_lo_u32 v0, v0, v1 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX10-NEXT: s_endpgm @@ -1717,7 +1720,7 @@ define amdgpu_kernel void @v_mul_i1(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GFX11-NEXT: s_mov_b32 s4, s0 ; GFX11-NEXT: s_mov_b32 s5, s1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mul_lo_u16 v0, v0, v1 +; GFX11-NEXT: v_mul_lo_u32 v0, v0, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX11-NEXT: buffer_store_b8 v0, off, s[4:7], 0 @@ -1741,7 +1744,7 @@ define amdgpu_kernel void @v_mul_i1(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GFX12-NEXT: s_mov_b32 s4, s0 ; GFX12-NEXT: s_mov_b32 s5, s1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mul_lo_u16 v0, v0, v1 +; GFX12-NEXT: v_mul_lo_u32 v0, v0, v1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX12-NEXT: buffer_store_b8 v0, off, s[4:7], null diff --git a/llvm/test/CodeGen/AMDGPU/permute_i8.ll b/llvm/test/CodeGen/AMDGPU/permute_i8.ll index 050300a69c46bb..57f5473749513f 100644 --- a/llvm/test/CodeGen/AMDGPU/permute_i8.ll +++ b/llvm/test/CodeGen/AMDGPU/permute_i8.ll @@ -611,50 +611,65 @@ define amdgpu_kernel void @shuffle8i8(ptr addrspace(1) %in0, ptr addrspace(1) %i ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_lshr_b32 s3, s3, 8 -; GFX10-NEXT: s_lshr_b32 s4, s9, 16 -; GFX10-NEXT: v_lshlrev_b16 v0, 8, s9 -; GFX10-NEXT: v_and_b32_e64 v1, 0xffffff00, s8 -; GFX10-NEXT: v_lshlrev_b16 v2, 8, s4 -; GFX10-NEXT: v_lshlrev_b16 v3, 8, s8 -; GFX10-NEXT: s_lshr_b32 s4, s2, 16 -; GFX10-NEXT: v_or_b32_sdwa v0, s3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v1, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v2, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v3, s2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; GFX10-NEXT: s_bfe_u32 s3, s3, 0x80008 +; GFX10-NEXT: s_lshl_b32 s5, s9, 8 +; GFX10-NEXT: s_bfe_u32 s9, s9, 0x100010 +; GFX10-NEXT: s_bfe_u32 s4, s2, 0x80008 +; GFX10-NEXT: s_lshl_b32 s6, s8, 8 +; GFX10-NEXT: s_and_b32 s7, s8, 0xff00 +; GFX10-NEXT: s_bfe_u32 s8, s2, 0x80010 +; 
GFX10-NEXT: s_and_b32 s2, s2, 0xff +; GFX10-NEXT: s_or_b32 s3, s3, s5 +; GFX10-NEXT: s_lshl_b32 s5, s9, 8 +; GFX10-NEXT: s_or_b32 s4, s4, s6 +; GFX10-NEXT: s_or_b32 s6, s8, s7 +; GFX10-NEXT: s_or_b32 s2, s2, s5 +; GFX10-NEXT: s_and_b32 s4, s4, 0xffff +; GFX10-NEXT: s_lshl_b32 s3, s3, 16 +; GFX10-NEXT: s_and_b32 s2, s2, 0xffff +; GFX10-NEXT: s_lshl_b32 s5, s6, 16 +; GFX10-NEXT: s_or_b32 s3, s4, s3 +; GFX10-NEXT: s_or_b32 s2, s2, s5 +; GFX10-NEXT: v_mov_b32_e32 v0, s3 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: shuffle8i8: ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: v_mov_b32_e32 v0, 0xffffff00 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX9-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s3, s3, 8 -; GFX9-NEXT: v_lshlrev_b16_e64 v1, 8, s9 -; GFX9-NEXT: v_or_b32_sdwa v4, s3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_lshr_b32 s3, s9, 16 -; GFX9-NEXT: s_lshr_b32 s4, s2, 16 -; GFX9-NEXT: v_lshlrev_b16_e64 v3, 8, s8 -; GFX9-NEXT: v_and_b32_e32 v0, s8, v0 -; GFX9-NEXT: v_lshlrev_b16_e64 v1, 8, s3 -; GFX9-NEXT: v_or_b32_sdwa v3, s2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_bfe_u32 s4, s2, 0x80008 +; GFX9-NEXT: s_lshl_b32 s6, s8, 8 +; GFX9-NEXT: s_lshl_b32 s5, s9, 8 +; GFX9-NEXT: s_bfe_u32 s3, s3, 0x80008 +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: s_bfe_u32 s6, s9, 0x100010 +; GFX9-NEXT: s_and_b32 s7, s8, 0xff00 +; GFX9-NEXT: s_or_b32 s3, s3, s5 +; GFX9-NEXT: s_bfe_u32 s5, s2, 0x80010 +; GFX9-NEXT: s_and_b32 s2, s2, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s6, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s7 +; GFX9-NEXT: s_or_b32 s2, s2, s6 +; GFX9-NEXT: s_and_b32 s2, s2, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s3, s3, 16 +; GFX9-NEXT: s_or_b32 s2, s2, s5 +; GFX9-NEXT: s_or_b32 s3, s4, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm bb: diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll index b7113a65607fc5..2aaffd7121ae97 100644 --- a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll +++ b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll @@ -33,10 +33,9 @@ define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) inreg %out, i8 zero ; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-NEXT: ; %bb.0: -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NEXT: s_and_b32 s0, s4, 0xff ; GFX940-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NEXT: v_and_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX940-NEXT: v_mov_b32_e32 v1, s0 ; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-NEXT: s_endpgm ; @@ -44,10 +43,9 @@ define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) inreg %out, i8 zero ; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. ; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-NEXT: ; %bb.0: -; GFX90a-NEXT: s_mov_b32 s0, 0xffff -; GFX90a-NEXT: v_mov_b32_e32 v1, s8 +; GFX90a-NEXT: s_and_b32 s0, s8, 0xff ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NEXT: v_and_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX90a-NEXT: v_mov_b32_e32 v1, s0 ; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] ; GFX90a-NEXT: s_endpgm %ext = zext i8 %arg0 to i32 @@ -164,22 +162,18 @@ define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) inreg %out, <2 ; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. ; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-NEXT: ; %bb.0: -; GFX940-NEXT: s_lshr_b32 s0, s4, 8 -; GFX940-NEXT: v_lshlrev_b16_e64 v0, 8, s0 -; GFX940-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX940-NEXT: v_mov_b32_e32 v1, 0 -; GFX940-NEXT: global_store_short v1, v0, s[2:3] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 ; GFX940-NEXT: s_endpgm ; ; GFX90a-LABEL: ptr1_v2i8_preload_arg: ; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. ; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-NEXT: ; %bb.0: -; GFX90a-NEXT: s_lshr_b32 s0, s8, 8 -; GFX90a-NEXT: v_lshlrev_b16_e64 v0, 8, s0 -; GFX90a-NEXT: v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX90a-NEXT: v_mov_b32_e32 v1, 0 -; GFX90a-NEXT: global_store_short v1, v0, s[6:7] +; GFX90a-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-NEXT: v_mov_b32_e32 v1, s8 +; GFX90a-NEXT: global_store_short v0, v1, s[6:7] ; GFX90a-NEXT: s_endpgm store <2 x i8> %in, ptr addrspace(1) %out ret void @@ -388,36 +382,36 @@ define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture inreg %ou ; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-NEXT: ; %bb.0: -; GFX940-NEXT: s_lshr_b32 s0, s4, 8 -; GFX940-NEXT: v_lshlrev_b16_e64 v0, 8, s0 -; GFX940-NEXT: s_lshr_b32 s0, s4, 24 -; GFX940-NEXT: v_lshlrev_b16_e64 v1, 8, s0 -; GFX940-NEXT: s_lshr_b32 s0, s4, 16 -; GFX940-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX940-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX940-NEXT: v_mov_b32_e32 v2, s5 -; GFX940-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX940-NEXT: v_mov_b32_e32 v1, 0 -; GFX940-NEXT: global_store_byte v1, v2, s[2:3] offset:4 sc0 sc1 -; GFX940-NEXT: global_store_dword v1, v0, s[2:3] sc0 sc1 +; GFX940-NEXT: s_lshr_b32 s1, s4, 24 +; GFX940-NEXT: s_and_b32 s0, s4, 0xffff +; GFX940-NEXT: s_lshl_b32 s1, s1, 8 +; GFX940-NEXT: s_bfe_u32 s4, s4, 0x80010 +; GFX940-NEXT: s_or_b32 s1, s4, s1 +; GFX940-NEXT: s_lshl_b32 s1, s1, 16 +; GFX940-NEXT: s_or_b32 s0, s0, s1 +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-NEXT: global_store_byte v0, v1, s[2:3] offset:4 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-NEXT: s_endpgm ; ; GFX90a-LABEL: v5i8_preload_arg: ; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. ; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-NEXT: ; %bb.0: -; GFX90a-NEXT: s_lshr_b32 s0, s8, 8 -; GFX90a-NEXT: v_lshlrev_b16_e64 v0, 8, s0 -; GFX90a-NEXT: s_lshr_b32 s0, s8, 24 -; GFX90a-NEXT: v_lshlrev_b16_e64 v1, 8, s0 -; GFX90a-NEXT: s_lshr_b32 s0, s8, 16 -; GFX90a-NEXT: v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX90a-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX90a-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX90a-NEXT: v_mov_b32_e32 v1, 0 -; GFX90a-NEXT: v_mov_b32_e32 v2, s9 -; GFX90a-NEXT: global_store_byte v1, v2, s[6:7] offset:4 -; GFX90a-NEXT: global_store_dword v1, v0, s[6:7] +; GFX90a-NEXT: s_lshr_b32 s1, s8, 24 +; GFX90a-NEXT: s_lshl_b32 s1, s1, 8 +; GFX90a-NEXT: s_bfe_u32 s2, s8, 0x80010 +; GFX90a-NEXT: s_or_b32 s1, s2, s1 +; GFX90a-NEXT: s_and_b32 s0, s8, 0xffff +; GFX90a-NEXT: s_lshl_b32 s1, s1, 16 +; GFX90a-NEXT: s_or_b32 s0, s0, s1 +; GFX90a-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-NEXT: v_mov_b32_e32 v1, s9 +; GFX90a-NEXT: global_store_byte v0, v1, s[6:7] offset:4 +; GFX90a-NEXT: v_mov_b32_e32 v1, s0 +; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] ; GFX90a-NEXT: s_endpgm store <5 x i8> %in, ptr addrspace(1) %out, align 4 ret void @@ -478,23 +472,22 @@ define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) inreg %out, <8 x i8 ; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-NEXT: ; %bb.0: -; GFX940-NEXT: s_lshr_b32 s0, s5, 8 -; GFX940-NEXT: v_lshlrev_b16_e64 v0, 8, s0 -; GFX940-NEXT: s_lshr_b32 s0, s5, 24 -; GFX940-NEXT: v_lshlrev_b16_e64 v1, 8, s0 -; GFX940-NEXT: s_lshr_b32 s0, s5, 16 -; GFX940-NEXT: v_or_b32_sdwa v0, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX940-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX940-NEXT: s_lshr_b32 s0, s4, 8 -; GFX940-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX940-NEXT: v_lshlrev_b16_e64 v0, 8, s0 -; GFX940-NEXT: s_lshr_b32 s0, s4, 24 -; GFX940-NEXT: v_lshlrev_b16_e64 v2, 8, s0 -; GFX940-NEXT: s_lshr_b32 s0, s4, 16 -; GFX940-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX940-NEXT: v_or_b32_sdwa v2, s0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX940-NEXT: s_lshr_b32 s1, s5, 24 +; GFX940-NEXT: s_and_b32 s0, s5, 0xffff +; GFX940-NEXT: s_lshl_b32 s1, s1, 8 +; GFX940-NEXT: s_bfe_u32 s5, s5, 0x80010 +; GFX940-NEXT: s_or_b32 s1, s5, s1 +; GFX940-NEXT: s_lshl_b32 s1, s1, 16 +; GFX940-NEXT: s_lshr_b32 s5, s4, 24 +; GFX940-NEXT: s_or_b32 s0, s0, s1 +; GFX940-NEXT: s_and_b32 s1, s4, 0xffff +; GFX940-NEXT: s_lshl_b32 s5, s5, 8 +; GFX940-NEXT: s_bfe_u32 s4, s4, 0x80010 +; GFX940-NEXT: s_or_b32 s4, s4, s5 +; GFX940-NEXT: s_lshl_b32 s4, s4, 16 +; GFX940-NEXT: s_or_b32 s1, s1, s4 +; GFX940-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-NEXT: v_mov_b32_e32 v1, s0 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1 ; GFX940-NEXT: s_endpgm @@ -503,22 +496,22 @@ define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) inreg %out, <8 x i8 ; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-NEXT: ; %bb.0: -; GFX90a-NEXT: s_lshr_b32 s0, s9, 8 -; GFX90a-NEXT: v_lshlrev_b16_e64 v0, 8, s0 -; GFX90a-NEXT: s_lshr_b32 s0, s9, 24 -; GFX90a-NEXT: v_lshlrev_b16_e64 v1, 8, s0 -; GFX90a-NEXT: s_lshr_b32 s0, s9, 16 -; GFX90a-NEXT: v_or_b32_sdwa v0, s9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX90a-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX90a-NEXT: s_lshr_b32 s0, s8, 8 -; GFX90a-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX90a-NEXT: v_lshlrev_b16_e64 v0, 8, s0 -; GFX90a-NEXT: s_lshr_b32 s0, s8, 24 -; GFX90a-NEXT: v_lshlrev_b16_e64 v2, 8, s0 -; GFX90a-NEXT: s_lshr_b32 s0, s8, 16 -; GFX90a-NEXT: v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX90a-NEXT: v_or_b32_sdwa v2, s0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX90a-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX90a-NEXT: s_lshr_b32 s1, s9, 24 +; GFX90a-NEXT: s_lshl_b32 s1, s1, 8 +; GFX90a-NEXT: s_bfe_u32 s2, s9, 0x80010 +; GFX90a-NEXT: s_or_b32 s1, s2, s1 +; GFX90a-NEXT: s_lshr_b32 s2, s8, 24 +; GFX90a-NEXT: s_lshl_b32 s2, s2, 8 +; GFX90a-NEXT: s_bfe_u32 s3, s8, 0x80010 +; GFX90a-NEXT: s_and_b32 s0, s9, 0xffff +; GFX90a-NEXT: s_lshl_b32 s1, s1, 16 +; GFX90a-NEXT: s_or_b32 s2, s3, s2 +; GFX90a-NEXT: s_or_b32 s0, s0, s1 +; GFX90a-NEXT: s_and_b32 s1, s8, 0xffff +; GFX90a-NEXT: s_lshl_b32 s2, s2, 16 +; GFX90a-NEXT: s_or_b32 s1, s1, s2 +; GFX90a-NEXT: v_mov_b32_e32 v0, s1 +; GFX90a-NEXT: v_mov_b32_e32 v1, s0 ; GFX90a-NEXT: v_mov_b32_e32 v2, 0 ; GFX90a-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX90a-NEXT: s_endpgm @@ -782,44 +775,38 @@ define amdgpu_kernel void @v7i8_kernel_preload_arg(ptr addrspace(1) inreg %out, ; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-NEXT: ; %bb.0: -; GFX940-NEXT: s_lshr_b32 s0, s4, 8 -; GFX940-NEXT: v_lshlrev_b16_e64 v0, 8, s0 -; GFX940-NEXT: s_lshr_b32 s0, s4, 24 -; GFX940-NEXT: v_lshlrev_b16_e64 v1, 8, s0 -; GFX940-NEXT: s_lshr_b32 s0, s4, 16 -; GFX940-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX940-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX940-NEXT: s_lshr_b32 s0, s5, 8 -; GFX940-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX940-NEXT: v_lshlrev_b16_e64 v1, 8, s0 -; GFX940-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-NEXT: v_mov_b32_e32 v3, s5 -; GFX940-NEXT: v_or_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX940-NEXT: global_store_byte_d16_hi v2, v3, s[2:3] offset:6 sc0 sc1 -; GFX940-NEXT: global_store_short v2, v1, s[2:3] offset:4 sc0 sc1 -; GFX940-NEXT: global_store_dword v2, v0, s[2:3] sc0 sc1 +; GFX940-NEXT: s_lshr_b32 s1, s4, 24 +; GFX940-NEXT: s_and_b32 s0, s4, 0xffff +; GFX940-NEXT: s_lshl_b32 s1, s1, 8 +; GFX940-NEXT: s_bfe_u32 s4, s4, 0x80010 +; GFX940-NEXT: s_or_b32 s1, s4, s1 +; GFX940-NEXT: s_lshl_b32 s1, s1, 16 +; GFX940-NEXT: s_or_b32 s0, s0, s1 +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-NEXT: global_store_byte_d16_hi v0, v1, s[2:3] offset:6 sc0 sc1 +; GFX940-NEXT: global_store_short v0, v1, s[2:3] offset:4 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-NEXT: s_endpgm ; ; GFX90a-LABEL: v7i8_kernel_preload_arg: ; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-NEXT: ; %bb.0: -; GFX90a-NEXT: s_lshr_b32 s0, s8, 8 -; GFX90a-NEXT: v_lshlrev_b16_e64 v0, 8, s0 -; GFX90a-NEXT: s_lshr_b32 s0, s8, 24 -; GFX90a-NEXT: v_lshlrev_b16_e64 v1, 8, s0 -; GFX90a-NEXT: s_lshr_b32 s0, s8, 16 -; GFX90a-NEXT: v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX90a-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX90a-NEXT: s_lshr_b32 s0, s9, 8 -; GFX90a-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX90a-NEXT: v_lshlrev_b16_e64 v1, 8, s0 -; GFX90a-NEXT: v_mov_b32_e32 v2, 0 -; GFX90a-NEXT: v_mov_b32_e32 v3, s9 -; GFX90a-NEXT: v_or_b32_sdwa v1, s9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX90a-NEXT: global_store_byte_d16_hi v2, v3, s[6:7] offset:6 -; GFX90a-NEXT: global_store_short v2, v1, s[6:7] offset:4 -; GFX90a-NEXT: global_store_dword v2, v0, s[6:7] +; GFX90a-NEXT: s_lshr_b32 s1, s8, 24 +; GFX90a-NEXT: s_lshl_b32 s1, s1, 8 +; GFX90a-NEXT: s_bfe_u32 s2, s8, 0x80010 +; GFX90a-NEXT: s_or_b32 s1, s2, s1 +; GFX90a-NEXT: s_and_b32 s0, s8, 0xffff +; GFX90a-NEXT: s_lshl_b32 s1, s1, 16 +; GFX90a-NEXT: s_or_b32 s0, s0, s1 +; GFX90a-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-NEXT: v_mov_b32_e32 v1, s9 +; GFX90a-NEXT: global_store_byte_d16_hi v0, v1, s[6:7] offset:6 +; GFX90a-NEXT: global_store_short v0, v1, s[6:7] offset:4 +; GFX90a-NEXT: v_mov_b32_e32 v1, s0 +; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] ; GFX90a-NEXT: s_endpgm store <7 x i8> %in, ptr addrspace(1) %out ret void @@ -948,13 +935,14 @@ define amdgpu_kernel void @i16_v2i8_kernel_preload_arg(ptr addrspace(1) inreg %o ; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-NEXT: ; %bb.0: ; GFX940-NEXT: s_lshr_b32 s0, s4, 24 -; GFX940-NEXT: v_lshlrev_b16_e64 v0, 8, s0 -; GFX940-NEXT: s_lshr_b32 s0, s4, 16 -; GFX940-NEXT: v_mov_b32_e32 v1, 0 -; GFX940-NEXT: v_mov_b32_e32 v2, s4 -; GFX940-NEXT: v_or_b32_sdwa v0, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX940-NEXT: global_store_short v1, v2, s[2:3] sc0 sc1 -; GFX940-NEXT: global_store_short v1, v0, s[6:7] sc0 sc1 +; GFX940-NEXT: s_lshl_b32 s0, s0, 8 +; GFX940-NEXT: s_bfe_u32 s1, s4, 0x80010 +; GFX940-NEXT: s_or_b32 s0, s1, s0 +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-NEXT: global_store_short v0, v1, s[6:7] sc0 sc1 ; GFX940-NEXT: s_endpgm ; ; GFX90a-LABEL: i16_v2i8_kernel_preload_arg: @@ -962,13 +950,14 @@ define amdgpu_kernel void @i16_v2i8_kernel_preload_arg(ptr addrspace(1) inreg %o ; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-NEXT: ; %bb.0: ; GFX90a-NEXT: s_lshr_b32 s0, s8, 24 -; GFX90a-NEXT: v_lshlrev_b16_e64 v0, 8, s0 -; GFX90a-NEXT: s_lshr_b32 s0, s8, 16 -; GFX90a-NEXT: v_mov_b32_e32 v1, 0 -; GFX90a-NEXT: v_mov_b32_e32 v2, s8 -; GFX90a-NEXT: v_or_b32_sdwa v0, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX90a-NEXT: global_store_short v1, v2, s[6:7] -; GFX90a-NEXT: global_store_short v1, v0, s[10:11] +; GFX90a-NEXT: s_lshl_b32 s0, s0, 8 +; GFX90a-NEXT: s_bfe_u32 s1, s8, 0x80010 +; GFX90a-NEXT: s_or_b32 s0, s1, s0 +; GFX90a-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-NEXT: v_mov_b32_e32 v1, s8 +; GFX90a-NEXT: global_store_short v0, v1, s[6:7] +; GFX90a-NEXT: v_mov_b32_e32 v1, s0 +; GFX90a-NEXT: global_store_short 
v0, v1, s[10:11] ; GFX90a-NEXT: s_endpgm store i16 %in, ptr addrspace(1) %out store <2 x i8> %in2, ptr addrspace(1) %out2 diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll index 1700ce302cc9db..f299232918d99b 100644 --- a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll +++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll @@ -120,13 +120,14 @@ define amdgpu_kernel void @scalar_to_vector_v4i16() { ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v0 -; VI-NEXT: v_or_b32_e32 v2, v1, v0 -; VI-NEXT: v_and_b32_e32 v1, 0xffffff00, v2 -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; VI-NEXT: v_or_b32_sdwa v1, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_readfirstlane_b32 s0, v0 +; VI-NEXT: s_lshl_b32 s1, s0, 8 +; VI-NEXT: s_or_b32 s0, s0, s1 +; VI-NEXT: s_lshl_b32 s1, s0, 16 +; VI-NEXT: s_and_b32 s0, s0, 0xffff +; VI-NEXT: s_or_b32 s0, s0, s1 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm bb: @@ -160,13 +161,19 @@ define amdgpu_kernel void @scalar_to_vector_v4f16() { ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v0 -; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_and_b32_e32 v1, 0xffffff00, v0 -; VI-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_readfirstlane_b32 s0, v0 +; VI-NEXT: s_lshl_b32 s1, s0, 8 +; VI-NEXT: s_or_b32 s0, s1, s0 +; VI-NEXT: s_and_b32 s1, s0, 0xff00 +; VI-NEXT: s_bfe_u32 s4, s0, 0x80008 +; VI-NEXT: s_or_b32 s1, s4, s1 +; VI-NEXT: s_and_b32 s0, s0, 0xffff +; VI-NEXT: s_lshl_b32 s4, s1, 16 +; VI-NEXT: s_and_b32 s1, s1, 0xffff +; VI-NEXT: s_or_b32 s1, s1, s4 +; VI-NEXT: s_or_b32 s0, s0, s4 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm bb: diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll index 64b3317edc5192..5a1cc72644d47d 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll @@ -1868,62 +1868,58 @@ define amdgpu_kernel void @pulled_out_test(ptr addrspace(1) %sourceA, ptr addrsp ; NOSDWA-NEXT: v_mov_b32_e32 v0, s0 ; NOSDWA-NEXT: v_mov_b32_e32 v1, s1 ; NOSDWA-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; NOSDWA-NEXT: v_mov_b32_e32 v2, s2 ; NOSDWA-NEXT: v_mov_b32_e32 v3, s3 +; NOSDWA-NEXT: v_mov_b32_e32 v2, s2 ; NOSDWA-NEXT: s_waitcnt vmcnt(0) -; NOSDWA-NEXT: v_and_b32_e32 v4, 0xff, v0 -; NOSDWA-NEXT: v_lshrrev_b32_e32 v5, 8, v0 -; NOSDWA-NEXT: v_lshrrev_b32_e32 v6, 24, v0 -; NOSDWA-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; NOSDWA-NEXT: v_and_b32_e32 v7, 0xff, v1 -; NOSDWA-NEXT: v_lshrrev_b32_e32 v8, 8, v1 -; NOSDWA-NEXT: v_lshrrev_b32_e32 v9, 24, v1 -; NOSDWA-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; NOSDWA-NEXT: v_lshlrev_b16_e32 v5, 8, v5 -; NOSDWA-NEXT: v_lshlrev_b16_e32 v6, 8, v6 -; 
NOSDWA-NEXT: v_and_b32_e32 v0, 0xff, v0 -; NOSDWA-NEXT: v_lshlrev_b16_e32 v8, 8, v8 -; NOSDWA-NEXT: v_lshlrev_b16_e32 v9, 8, v9 -; NOSDWA-NEXT: v_and_b32_e32 v1, 0xff, v1 -; NOSDWA-NEXT: v_or_b32_e32 v4, v4, v5 -; NOSDWA-NEXT: v_or_b32_e32 v0, v0, v6 -; NOSDWA-NEXT: v_or_b32_e32 v5, v7, v8 -; NOSDWA-NEXT: v_or_b32_e32 v1, v1, v9 -; NOSDWA-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; NOSDWA-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; NOSDWA-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; NOSDWA-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; NOSDWA-NEXT: v_or_b32_e32 v0, v4, v0 -; NOSDWA-NEXT: v_or_b32_e32 v1, v5, v1 +; NOSDWA-NEXT: v_readfirstlane_b32 s0, v1 +; NOSDWA-NEXT: v_readfirstlane_b32 s1, v0 +; NOSDWA-NEXT: s_lshr_b32 s3, s1, 24 +; NOSDWA-NEXT: s_lshr_b32 s5, s0, 24 +; NOSDWA-NEXT: s_and_b32 s2, s1, 0xffff +; NOSDWA-NEXT: s_bfe_u32 s1, s1, 0x80010 +; NOSDWA-NEXT: s_and_b32 s4, s0, 0xffff +; NOSDWA-NEXT: s_bfe_u32 s0, s0, 0x80010 +; NOSDWA-NEXT: s_lshl_b32 s3, s3, 8 +; NOSDWA-NEXT: s_lshl_b32 s5, s5, 8 +; NOSDWA-NEXT: s_or_b32 s1, s1, s3 +; NOSDWA-NEXT: s_or_b32 s0, s0, s5 +; NOSDWA-NEXT: s_lshl_b32 s1, s1, 16 +; NOSDWA-NEXT: s_lshl_b32 s0, s0, 16 +; NOSDWA-NEXT: s_or_b32 s1, s2, s1 +; NOSDWA-NEXT: s_or_b32 s0, s4, s0 +; NOSDWA-NEXT: v_mov_b32_e32 v0, s1 +; NOSDWA-NEXT: v_mov_b32_e32 v1, s0 ; NOSDWA-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; NOSDWA-NEXT: s_endpgm ; ; GFX89-LABEL: pulled_out_test: ; GFX89: ; %bb.0: ; %entry ; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX89-NEXT: v_mov_b32_e32 v4, 8 -; GFX89-NEXT: v_mov_b32_e32 v5, 0xff ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: v_mov_b32_e32 v0, s0 ; GFX89-NEXT: v_mov_b32_e32 v1, s1 ; GFX89-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX89-NEXT: v_mov_b32_e32 v2, s2 ; GFX89-NEXT: v_mov_b32_e32 v3, s3 +; GFX89-NEXT: v_mov_b32_e32 v2, s2 ; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: v_lshrrev_b32_sdwa v6, v4, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX89-NEXT: v_lshrrev_b32_e32 v7, 24, v0 -; GFX89-NEXT: v_lshrrev_b32_sdwa v4, v4, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX89-NEXT: v_lshrrev_b32_e32 v9, 24, v1 -; GFX89-NEXT: v_and_b32_sdwa v8, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX89-NEXT: v_and_b32_sdwa v5, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX89-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX89-NEXT: v_lshlrev_b16_e32 v6, 8, v7 -; GFX89-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX89-NEXT: v_lshlrev_b16_e32 v4, 8, v9 -; GFX89-NEXT: v_or_b32_sdwa v6, v8, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX89-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX89-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX89-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX89-NEXT: v_readfirstlane_b32 s0, v1 +; GFX89-NEXT: v_readfirstlane_b32 s1, v0 +; GFX89-NEXT: s_lshr_b32 s3, s1, 24 +; GFX89-NEXT: s_lshr_b32 s5, s0, 24 +; GFX89-NEXT: s_and_b32 s2, s1, 0xffff +; GFX89-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX89-NEXT: s_and_b32 s4, s0, 0xffff +; GFX89-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX89-NEXT: s_lshl_b32 s3, s3, 8 +; GFX89-NEXT: s_lshl_b32 s5, s5, 8 +; GFX89-NEXT: s_or_b32 s1, s1, s3 +; GFX89-NEXT: s_or_b32 s0, s0, s5 
+; GFX89-NEXT: s_lshl_b32 s1, s1, 16 +; GFX89-NEXT: s_lshl_b32 s0, s0, 16 +; GFX89-NEXT: s_or_b32 s1, s2, s1 +; GFX89-NEXT: s_or_b32 s0, s4, s0 +; GFX89-NEXT: v_mov_b32_e32 v0, s1 +; GFX89-NEXT: v_mov_b32_e32 v1, s0 ; GFX89-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX89-NEXT: s_endpgm ; @@ -1931,25 +1927,27 @@ define amdgpu_kernel void @pulled_out_test(ptr addrspace(1) %sourceA, ptr addrsp ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_mov_b32_e32 v3, 8 -; GFX9-NEXT: s_movk_i32 s0, 0xff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_sdwa v4, v3, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v0 -; GFX9-NEXT: v_lshrrev_b32_sdwa v3, v3, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; GFX9-NEXT: v_and_b32_sdwa v6, v0, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_sdwa v8, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v5 -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v7 -; GFX9-NEXT: v_or_b32_sdwa v4, v6, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_readfirstlane_b32 s0, v1 +; GFX9-NEXT: v_readfirstlane_b32 s1, v0 +; GFX9-NEXT: s_lshr_b32 s3, s1, 24 +; GFX9-NEXT: s_lshr_b32 s5, s0, 24 +; GFX9-NEXT: s_and_b32 s2, s1, 0xffff +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX9-NEXT: s_and_b32 s4, s0, 0xffff +; GFX9-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX9-NEXT: s_lshl_b32 s3, s3, 8 +; GFX9-NEXT: s_lshl_b32 s5, s5, 8 +; GFX9-NEXT: s_or_b32 s1, s1, s3 +; GFX9-NEXT: s_or_b32 s0, s0, s5 +; GFX9-NEXT: s_lshl_b32 s1, s1, 16 +; GFX9-NEXT: s_lshl_b32 s0, s0, 16 +; GFX9-NEXT: s_or_b32 s1, s2, s1 +; GFX9-NEXT: s_or_b32 s0, s4, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm ; @@ -1957,24 +1955,27 @@ define amdgpu_kernel void @pulled_out_test(ptr addrspace(1) %sourceA, ptr addrsp ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-NEXT: v_mov_b32_e32 v3, 8 -; GFX10-NEXT: v_mov_b32_e32 v4, 24 -; GFX10-NEXT: v_mov_b32_e32 v5, 0xff ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v6, v3, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_lshrrev_b32_sdwa v7, v4, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_and_b32_sdwa v8, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_lshrrev_b32_sdwa v3, v3, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:DWORD -; GFX10-NEXT: v_lshrrev_b32_sdwa v4, v4, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_and_b32_sdwa v5, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v6, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: s_lshr_b32 s3, s0, 24 +; GFX10-NEXT: s_lshr_b32 s5, s1, 24 +; GFX10-NEXT: s_and_b32 s2, s0, 0xffff +; GFX10-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX10-NEXT: s_and_b32 s4, s1, 0xffff +; GFX10-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX10-NEXT: s_lshl_b32 s3, s3, 8 +; GFX10-NEXT: s_lshl_b32 s5, s5, 8 +; GFX10-NEXT: s_or_b32 s0, s0, s3 +; GFX10-NEXT: s_or_b32 s1, s1, s5 +; GFX10-NEXT: s_lshl_b32 s0, s0, 16 +; GFX10-NEXT: s_lshl_b32 s1, s1, 16 +; GFX10-NEXT: s_or_b32 s0, s2, s0 +; GFX10-NEXT: s_or_b32 s1, s4, s1 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX10-NEXT: s_endpgm entry: @@ -2101,16 +2102,16 @@ define void @crash_lshlrevb16_not_reg_op() { ; NOSDWA: ; %bb.0: ; %bb0 ; NOSDWA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; NOSDWA-NEXT: s_mov_b64 s[4:5], 0 -; NOSDWA-NEXT: v_mov_b32_e32 v0, 0x100 ; NOSDWA-NEXT: s_and_b64 vcc, exec, -1 ; NOSDWA-NEXT: .LBB22_1: ; %bb1 ; NOSDWA-NEXT: ; =>This Inner Loop Header: Depth=1 ; NOSDWA-NEXT: s_lshl_b32 s6, s4, 3 -; NOSDWA-NEXT: v_mov_b32_e32 v1, s4 -; NOSDWA-NEXT: v_mov_b32_e32 v2, s5 +; NOSDWA-NEXT: v_mov_b32_e32 v0, s4 +; NOSDWA-NEXT: s_lshr_b32 s6, 0x100, s6 +; NOSDWA-NEXT: v_mov_b32_e32 v1, s5 ; NOSDWA-NEXT: s_mov_b64 s[4:5], 1 -; NOSDWA-NEXT: v_lshrrev_b16_e32 v3, s6, v0 -; NOSDWA-NEXT: flat_store_byte v[1:2], v3 +; NOSDWA-NEXT: v_mov_b32_e32 v2, s6 +; NOSDWA-NEXT: flat_store_byte v[0:1], v2 ; NOSDWA-NEXT: s_mov_b64 vcc, vcc ; NOSDWA-NEXT: s_cbranch_vccnz .LBB22_1 ; NOSDWA-NEXT: ; %bb.2: ; %DummyReturnBlock @@ -2121,16 +2122,16 @@ define void @crash_lshlrevb16_not_reg_op() { ; GFX89: ; %bb.0: ; %bb0 ; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX89-NEXT: s_mov_b64 s[4:5], 0 -; GFX89-NEXT: v_mov_b32_e32 v0, 0x100 ; GFX89-NEXT: s_and_b64 vcc, exec, -1 ; GFX89-NEXT: .LBB22_1: ; %bb1 ; GFX89-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX89-NEXT: s_lshl_b32 s6, s4, 3 -; GFX89-NEXT: v_mov_b32_e32 v1, s4 -; GFX89-NEXT: v_mov_b32_e32 v2, s5 +; GFX89-NEXT: v_mov_b32_e32 v0, s4 +; GFX89-NEXT: s_lshr_b32 s6, 0x100, s6 +; GFX89-NEXT: v_mov_b32_e32 v1, s5 ; GFX89-NEXT: s_mov_b64 s[4:5], 1 -; GFX89-NEXT: v_lshrrev_b16_e32 v3, s6, v0 -; GFX89-NEXT: flat_store_byte v[1:2], v3 +; GFX89-NEXT: v_mov_b32_e32 v2, s6 +; GFX89-NEXT: flat_store_byte v[0:1], v2 ; GFX89-NEXT: s_mov_b64 vcc, vcc ; GFX89-NEXT: s_cbranch_vccnz .LBB22_1 ; GFX89-NEXT: ; %bb.2: ; %DummyReturnBlock @@ -2141,16 +2142,16 @@ define void @crash_lshlrevb16_not_reg_op() { ; GFX9: ; %bb.0: ; %bb0 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; 
GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: v_mov_b32_e32 v0, 0x100 ; GFX9-NEXT: s_and_b64 vcc, exec, -1 ; GFX9-NEXT: .LBB22_1: ; %bb1 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_lshl_b32 s6, s4, 3 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: s_lshr_b32 s6, 0x100, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: s_mov_b64 s[4:5], 1 -; GFX9-NEXT: v_lshrrev_b16_e32 v3, s6, v0 -; GFX9-NEXT: flat_store_byte v[1:2], v3 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: flat_store_byte v[0:1], v2 ; GFX9-NEXT: s_mov_b64 vcc, vcc ; GFX9-NEXT: s_cbranch_vccnz .LBB22_1 ; GFX9-NEXT: ; %bb.2: ; %DummyReturnBlock @@ -2167,7 +2168,8 @@ define void @crash_lshlrevb16_not_reg_op() { ; GFX10-NEXT: s_lshl_b32 s6, s4, 3 ; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-NEXT: v_lshrrev_b16 v2, s6, 0x100 +; GFX10-NEXT: s_lshr_b32 s4, 0x100, s6 +; GFX10-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-NEXT: s_mov_b64 s[4:5], 1 ; GFX10-NEXT: flat_store_byte v[0:1], v2 ; GFX10-NEXT: s_cbranch_vccnz .LBB22_1 diff --git a/llvm/test/CodeGen/AMDGPU/select-i1.ll b/llvm/test/CodeGen/AMDGPU/select-i1.ll index eb7ceb82ff9e98..06a2d86c2755ee 100644 --- a/llvm/test/CodeGen/AMDGPU/select-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/select-i1.ll @@ -4,7 +4,7 @@ ; FIXME: This should go in existing select.ll test, except the current testcase there is broken on GCN ; GCN-LABEL: {{^}}select_i1: -; GCN: v_cndmask_b32 +; GCN: s_cselect_b32 ; GCN-NOT: v_cndmask_b32 define amdgpu_kernel void @select_i1(ptr addrspace(1) %out, i32 %cond, i1 %a, i1 %b) nounwind { %cmp = icmp ugt i32 %cond, 5 @@ -16,14 +16,9 @@ define amdgpu_kernel void @select_i1(ptr addrspace(1) %out, i32 %cond, i1 %a, i1 ; GCN-LABEL: {{^}}s_minmax_i1: ; GCN: s_load_dword [[LOAD:s[0-9]+]], ; GCN: s_bitcmp1_b32 [[LOAD]], 0 -; GCN: s_cselect_b64 vcc, -1, 0 -; GCN-DAG: s_lshr_b32 [[A:s[0-9]+]], [[LOAD]], 8 -; GCN-DAG: s_lshr_b32 [[B:s[0-9]+]], [[LOAD]], 16 - -; GCN: v_mov_b32_e32 [[V_B:v[0-9]+]], [[B]] -; GCN: v_mov_b32_e32 [[V_A:v[0-9]+]], [[A]] -; GCN: v_cndmask_b32_e32 [[SEL:v[0-9]+]], [[V_B]], [[V_A]] -; GCN: v_and_b32_e32 v{{[0-9]+}}, 1, [[SEL]] +; GCN: s_cselect_b32 [[SHIFTVAL:s[0-9]+]], 8, 16 +; GCN: s_lshr_b32 [[LOAD]], [[LOAD]], [[SHIFTVAL]] +; GCN: s_and_b32 [[LOAD]], [[LOAD]], 1 define amdgpu_kernel void @s_minmax_i1(ptr addrspace(1) %out, [8 x i32], i1 zeroext %cond, i1 zeroext %a, i1 zeroext %b) nounwind { %cmp = icmp slt i1 %cond, false %sel = select i1 %cmp, i1 %a, i1 %b diff --git a/llvm/test/CodeGen/AMDGPU/select-vectors.ll b/llvm/test/CodeGen/AMDGPU/select-vectors.ll index cca44548bb8f8b..c8c40d41dab720 100644 --- a/llvm/test/CodeGen/AMDGPU/select-vectors.ll +++ b/llvm/test/CodeGen/AMDGPU/select-vectors.ll @@ -14,7 +14,7 @@ ; This is worse when i16 is legal and packed is not because ; SelectionDAGBuilder for some reason changes the select type. 
-; VI: v_cndmask_b32 +; VI: s_cselect_b64 ; VI: v_cndmask_b32 define amdgpu_kernel void @v_select_v2i8(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 { %a = load <2 x i8>, ptr addrspace(1) %a.ptr, align 2 @@ -111,8 +111,7 @@ define amdgpu_kernel void @v_select_v2i16(ptr addrspace(1) %out, ptr addrspace(1 ; SI: cndmask ; SI-NOT: cndmask -; VI: s_cselect_b32 -; VI: s_cselect_b32 +; VI: s_cselect_b64 ; GFX9: cndmask ; GFX9: cndmask define amdgpu_kernel void @v_select_v3i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/setcc-opt.ll b/llvm/test/CodeGen/AMDGPU/setcc-opt.ll index 46aafed322cd8c..b477a72d3810ea 100644 --- a/llvm/test/CodeGen/AMDGPU/setcc-opt.ll +++ b/llvm/test/CodeGen/AMDGPU/setcc-opt.ll @@ -154,19 +154,12 @@ define amdgpu_kernel void @zext_bool_icmp_ne_neg1(ptr addrspace(1) %out, i32 %a, } ; FUNC-LABEL: {{^}}cmp_zext_k_i8max: -; SI: s_load_dword [[VALUE:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dword [[VALUE:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c -; SI-DAG: s_and_b32 [[B:s[0-9]+]], [[VALUE]], 0xff -; SI: s_cmpk_lg_i32 [[B]], 0xff -; SI: s_cselect_b64 [[CC:[^,]+]], -1, 0 - -; VI: v_mov_b32_e32 [[VK255:v[0-9]+]], 0xff -; VI: s_movk_i32 [[K255:s[0-9]+]], 0xff -; VI: v_and_b32_e32 [[B:v[0-9]+]], [[VALUE]], [[VK255]] -; VI: v_cmp_ne_u16_e32 vcc, [[K255]], [[B]] +; GCN: s_load_dword [[VALUE:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}} +; GCN-DAG: s_and_b32 [[B:s[0-9]+]], [[VALUE]], 0xff +; GCN: s_cmpk_lg_i32 [[B]], 0xff +; GCN: s_cselect_b64 [[CC:[^,]+]], -1, 0 -; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CC]] -; VI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc +; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CC]] ; GCN: buffer_store_byte [[RESULT]] ; GCN: s_endpgm define amdgpu_kernel void @cmp_zext_k_i8max(ptr addrspace(1) %out, i8 %b) nounwind { diff --git a/llvm/test/CodeGen/AMDGPU/sign_extend.ll b/llvm/test/CodeGen/AMDGPU/sign_extend.ll index b54df3b4d0c6c6..5f1e3bd9a9fe1a 100644 --- a/llvm/test/CodeGen/AMDGPU/sign_extend.ll +++ b/llvm/test/CodeGen/AMDGPU/sign_extend.ll @@ -372,14 +372,14 @@ define amdgpu_kernel void @s_sext_v4i8_to_v4i32(ptr addrspace(1) %out, i32 %a) n ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_lshrrev_b16_e64 v0, 8, s4 ; VI-NEXT: s_ashr_i32 s5, s4, 24 ; VI-NEXT: s_bfe_i32 s6, s4, 0x80010 +; VI-NEXT: s_bfe_i32 s7, s4, 0x80008 ; VI-NEXT: s_sext_i32_i8 s4, s4 -; VI-NEXT: v_bfe_i32 v0, v0, 0, 8 -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s7 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 @@ -447,19 +447,18 @@ define amdgpu_kernel void @v_sext_v4i8_to_v4i32(ptr addrspace(1) %out, ptr addrs ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshrrev_b16_e32 v1, 8, v0 -; VI-NEXT: v_ashrrev_i32_e32 v2, 24, v0 -; VI-NEXT: v_bfe_i32 v3, v0, 16, 8 +; VI-NEXT: v_ashrrev_i32_e32 v1, 24, v0 +; VI-NEXT: v_bfe_i32 v2, v0, 16, 8 +; VI-NEXT: v_bfe_i32 v3, v0, 8, 8 ; VI-NEXT: v_bfe_i32 v0, v0, 0, 8 -; VI-NEXT: v_bfe_i32 v1, v1, 0, 8 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v1, off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: 
buffer_store_dword v3, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v1, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_endpgm %a = load i32, ptr addrspace(1) %in %cast = bitcast i32 %a to <4 x i8> diff --git a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll index 448fa7e959a51d..3446e0384cc545 100644 --- a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll @@ -861,26 +861,26 @@ define amdgpu_kernel void @v_min_max_v2i16_user(ptr addrspace(1) %out0, ptr addr ; GFX9-NEXT: global_load_dword v2, v0, s[10:11] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX9-NEXT: v_cmp_gt_i32_sdwa vcc, sext(v1), sext(v2) src0_sel:WORD_0 src1_sel:WORD_0 +; GFX9-NEXT: v_cmp_gt_i32_sdwa s[0:1], sext(v1), sext(v2) src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX9-NEXT: v_cmp_gt_i16_e32 vcc, v1, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v2, v1, vcc -; GFX9-NEXT: v_cmp_gt_i16_e64 s[0:1], v3, v4 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v6, v4, v3, s[0:1] -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1] +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_lshl_or_b32 v5, v6, 16, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 1, v2 +; GFX9-NEXT: v_lshl_or_b32 v4, v6, 16, v4 ; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 -; GFX9-NEXT: v_lshlrev_b16_e32 v3, 1, v4 -; GFX9-NEXT: global_store_dword v0, v5, s[4:5] +; GFX9-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX9-NEXT: global_store_dword v0, v4, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_e32 v0, v2, v3 -; GFX9-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 3, v2 ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm @@ -921,7 +921,7 @@ define amdgpu_kernel void @v_min_max_v2i16_user(ptr addrspace(1) %out0, ptr addr ; VI-NEXT: s_cselect_b32 s0, s2, s4 ; VI-NEXT: s_cselect_b32 s1, s4, s2 ; VI-NEXT: s_and_b32 s0, s0, 0xffff -; VI-NEXT: v_lshlrev_b16_e32 v4, 1, v4 +; VI-NEXT: v_lshlrev_b32_e32 v4, 1, v4 ; VI-NEXT: s_lshl_b32 s2, s3, 16 ; VI-NEXT: s_and_b32 s1, s1, 0xffff ; VI-NEXT: s_or_b32 s0, s0, s5 diff --git a/llvm/test/CodeGen/AMDGPU/srem.ll b/llvm/test/CodeGen/AMDGPU/srem.ll index 7b0241984a3491..1622f498dce65a 100644 --- a/llvm/test/CodeGen/AMDGPU/srem.ll +++ b/llvm/test/CodeGen/AMDGPU/srem.ll @@ -9,17 +9,17 @@ define amdgpu_kernel void @srem_i16_7(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_movk_i32 s0, 0x4925 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_ushort v1, v0, s[6:7] ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_bfe_i32 v2, v1, 0, 16 -; GCN-NEXT: v_mul_lo_u32 v2, v2, s0 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 31, v2 -; GCN-NEXT: v_ashrrev_i32_e32 v2, 17, v2 -; GCN-NEXT: v_add_u16_e32 v2, v2, v3 -; GCN-NEXT: v_mul_lo_u16_e32 v2, 7, v2 -; GCN-NEXT: v_sub_u16_e32 v1, v1, v2 +; GCN-NEXT: v_readfirstlane_b32 s0, v1 +; GCN-NEXT: s_sext_i32_i16 
s0, s0 +; GCN-NEXT: s_mulk_i32 s0, 0x4925 +; GCN-NEXT: s_lshr_b32 s1, s0, 31 +; GCN-NEXT: s_ashr_i32 s0, s0, 17 +; GCN-NEXT: s_add_i32 s0, s0, s1 +; GCN-NEXT: s_mul_i32 s0, s0, 7 +; GCN-NEXT: v_subrev_u32_e32 v1, s0, v1 ; GCN-NEXT: global_store_short v0, v1, s[4:5] ; GCN-NEXT: s_endpgm ; @@ -54,17 +54,17 @@ define amdgpu_kernel void @srem_i16_7(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_mov_b32_e32 v0, s2 ; TONGA-NEXT: v_mov_b32_e32 v1, s3 ; TONGA-NEXT: flat_load_ushort v2, v[0:1] -; TONGA-NEXT: s_movk_i32 s2, 0x4925 +; TONGA-NEXT: v_mov_b32_e32 v0, s0 ; TONGA-NEXT: v_mov_b32_e32 v1, s1 ; TONGA-NEXT: s_waitcnt vmcnt(0) -; TONGA-NEXT: v_bfe_i32 v0, v2, 0, 16 -; TONGA-NEXT: v_mul_lo_u32 v3, v0, s2 -; TONGA-NEXT: v_mov_b32_e32 v0, s0 -; TONGA-NEXT: v_lshrrev_b32_e32 v4, 31, v3 -; TONGA-NEXT: v_ashrrev_i32_e32 v3, 17, v3 -; TONGA-NEXT: v_add_u16_e32 v3, v3, v4 -; TONGA-NEXT: v_mul_lo_u16_e32 v3, 7, v3 -; TONGA-NEXT: v_sub_u16_e32 v2, v2, v3 +; TONGA-NEXT: v_readfirstlane_b32 s0, v2 +; TONGA-NEXT: s_sext_i32_i16 s0, s0 +; TONGA-NEXT: s_mulk_i32 s0, 0x4925 +; TONGA-NEXT: s_lshr_b32 s1, s0, 31 +; TONGA-NEXT: s_ashr_i32 s0, s0, 17 +; TONGA-NEXT: s_add_i32 s0, s0, s1 +; TONGA-NEXT: s_mul_i32 s0, s0, 7 +; TONGA-NEXT: v_subrev_u32_e32 v2, vcc, s0, v2 ; TONGA-NEXT: flat_store_short v[0:1], v2 ; TONGA-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll index 22eb7dddb84f4d..80b0bdd8c03759 100644 --- a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll @@ -41,7 +41,7 @@ define i16 @trunc_bitcast_v2i32_to_i16(<2 x i32> %bar) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u16_e32 v0, 4, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 4, v0 ; VI-NEXT: s_setpc_b64 s[30:31] %load0 = load i32, ptr addrspace(1) undef %load1 = load i32, ptr addrspace(1) null @@ -70,7 +70,7 @@ define i16 @trunc_bitcast_v2f32_to_i16(<2 x float> %bar) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u16_e32 v0, 4, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 4, v0 ; VI-NEXT: s_setpc_b64 s[30:31] %load0 = load float, ptr addrspace(1) undef %load1 = load float, ptr addrspace(1) null diff --git a/llvm/test/CodeGen/AMDGPU/trunc-store.ll b/llvm/test/CodeGen/AMDGPU/trunc-store.ll index efb1a630f927ca..088aff983ddc9c 100644 --- a/llvm/test/CodeGen/AMDGPU/trunc-store.ll +++ b/llvm/test/CodeGen/AMDGPU/trunc-store.ll @@ -58,35 +58,47 @@ define amdgpu_kernel void @truncstore_arg_v16i32_to_v16i8(ptr addrspace(1) %out, ; VI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 ; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_lshlrev_b16_e64 v0, 8, s17 -; VI-NEXT: v_mov_b32_e32 v1, s16 -; VI-NEXT: v_lshlrev_b16_e64 v2, 8, s19 -; VI-NEXT: v_mov_b32_e32 v3, s18 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e64 v0, 8, s13 -; VI-NEXT: v_mov_b32_e32 v1, s12 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e64 v1, 8, s15 -; VI-NEXT: v_mov_b32_e32 v2, s14 -; VI-NEXT: 
v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e64 v0, 8, s9 -; VI-NEXT: v_mov_b32_e32 v1, s8 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e64 v1, 8, s11 -; VI-NEXT: v_mov_b32_e32 v4, s10 -; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e64 v0, 8, s5 -; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e64 v4, 8, s7 -; VI-NEXT: v_mov_b32_e32 v5, s6 -; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_lshl_b32 s2, s19, 8 +; VI-NEXT: s_and_b32 s3, s18, 0xff +; VI-NEXT: s_lshl_b32 s17, s17, 8 +; VI-NEXT: s_and_b32 s16, s16, 0xff +; VI-NEXT: s_or_b32 s2, s3, s2 +; VI-NEXT: s_or_b32 s3, s16, s17 +; VI-NEXT: s_lshl_b32 s2, s2, 16 +; VI-NEXT: s_and_b32 s3, s3, 0xffff +; VI-NEXT: s_or_b32 s2, s3, s2 +; VI-NEXT: s_lshl_b32 s3, s15, 8 +; VI-NEXT: s_and_b32 s14, s14, 0xff +; VI-NEXT: s_lshl_b32 s13, s13, 8 +; VI-NEXT: s_and_b32 s12, s12, 0xff +; VI-NEXT: s_lshl_b32 s11, s11, 8 +; VI-NEXT: s_and_b32 s10, s10, 0xff +; VI-NEXT: s_lshl_b32 s9, s9, 8 +; VI-NEXT: s_and_b32 s8, s8, 0xff +; VI-NEXT: s_lshl_b32 s7, s7, 8 +; VI-NEXT: s_and_b32 s6, s6, 0xff +; VI-NEXT: s_lshl_b32 s5, s5, 8 +; VI-NEXT: s_and_b32 s4, s4, 0xff +; VI-NEXT: s_or_b32 s3, s14, s3 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_lshl_b32 s3, s3, 16 +; VI-NEXT: s_and_b32 s12, s12, 0xffff +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_and_b32 s8, s8, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_or_b32 s3, s12, s3 +; VI-NEXT: s_or_b32 s8, s8, s10 +; VI-NEXT: s_or_b32 s4, s4, s6 ; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s8 +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: v_mov_b32_e32 v3, s2 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm @@ -153,35 +165,47 @@ define amdgpu_kernel void @truncstore_arg_v16i64_to_v16i8(ptr addrspace(1) %out, ; VI-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; VI-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0xa4 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_lshlrev_b16_e64 v0, 8, s26 -; VI-NEXT: v_mov_b32_e32 v1, s24 -; VI-NEXT: v_lshlrev_b16_e64 v2, 8, s30 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v1, s28 -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e64 v0, 8, s18 -; VI-NEXT: v_mov_b32_e32 v1, s16 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e64 v1, 8, s22 -; VI-NEXT: 
v_mov_b32_e32 v2, s20 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e64 v0, 8, s10 -; VI-NEXT: v_mov_b32_e32 v1, s8 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e64 v1, 8, s14 -; VI-NEXT: v_mov_b32_e32 v4, s12 -; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e64 v0, 8, s2 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e64 v4, 8, s6 -; VI-NEXT: v_mov_b32_e32 v5, s4 -; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_lshl_b32 s1, s30, 8 +; VI-NEXT: s_and_b32 s3, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s26, 8 +; VI-NEXT: s_or_b32 s1, s3, s1 +; VI-NEXT: s_and_b32 s3, s24, 0xff +; VI-NEXT: s_or_b32 s3, s3, s5 +; VI-NEXT: s_lshl_b32 s1, s1, 16 +; VI-NEXT: s_and_b32 s3, s3, 0xffff +; VI-NEXT: s_or_b32 s1, s3, s1 +; VI-NEXT: s_lshl_b32 s3, s22, 8 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_or_b32 s3, s5, s3 +; VI-NEXT: s_lshl_b32 s5, s18, 8 +; VI-NEXT: s_and_b32 s7, s16, 0xff +; VI-NEXT: s_or_b32 s5, s7, s5 +; VI-NEXT: s_lshl_b32 s3, s3, 16 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s3, s5, s3 +; VI-NEXT: s_lshl_b32 s5, s14, 8 +; VI-NEXT: s_and_b32 s7, s12, 0xff +; VI-NEXT: s_or_b32 s5, s7, s5 +; VI-NEXT: s_lshl_b32 s7, s10, 8 +; VI-NEXT: s_and_b32 s8, s8, 0xff +; VI-NEXT: s_lshl_b32 s6, s6, 8 +; VI-NEXT: s_and_b32 s4, s4, 0xff +; VI-NEXT: s_lshl_b32 s2, s2, 8 +; VI-NEXT: s_and_b32 s0, s0, 0xff +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: s_or_b32 s0, s0, s2 +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s4, s4, 16 +; VI-NEXT: s_and_b32 s0, s0, 0xffff +; VI-NEXT: s_or_b32 s5, s7, s5 +; VI-NEXT: s_or_b32 s0, s0, s4 ; VI-NEXT: v_mov_b32_e32 v4, s34 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v5, s35 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/uaddo.ll b/llvm/test/CodeGen/AMDGPU/uaddo.ll index c08571a733cc51..1fd5f7f8f9bb3b 100644 --- a/llvm/test/CodeGen/AMDGPU/uaddo.ll +++ b/llvm/test/CodeGen/AMDGPU/uaddo.ll @@ -499,8 +499,10 @@ define amdgpu_kernel void @v_uaddo_i16(ptr addrspace(1) %out, ptr addrspace(1) % ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u16_e32 v5, v4, v5 -; VI-NEXT: v_cmp_lt_u16_e32 vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, v4, v5 +; VI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; VI-NEXT: v_and_b32_e32 v6, 0xffff, v5 +; VI-NEXT: v_cmp_lt_u32_e32 vcc, v6, v4 ; VI-NEXT: flat_store_short v[0:1], v5 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; VI-NEXT: flat_store_byte v[2:3], v0 @@ -514,9 +516,9 @@ define amdgpu_kernel void @v_uaddo_i16(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX9-NEXT: global_load_ushort v1, v0, s[8:9] ; 
GFX9-NEXT: global_load_ushort v2, v0, s[10:11] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v2, v1, v2 -; GFX9-NEXT: v_cmp_lt_u16_e32 vcc, v2, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add_u32_e32 v2, v1, v2 +; GFX9-NEXT: v_cmp_lt_u32_sdwa s[0:1], v2, v1 src0_sel:WORD_0 src1_sel:WORD_0 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX9-NEXT: global_store_short v0, v2, s[4:5] ; GFX9-NEXT: global_store_byte v0, v1, s[6:7] ; GFX9-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/usubo.ll b/llvm/test/CodeGen/AMDGPU/usubo.ll index 36a0cbd3f09703..cf13bb2efcae7c 100644 --- a/llvm/test/CodeGen/AMDGPU/usubo.ll +++ b/llvm/test/CodeGen/AMDGPU/usubo.ll @@ -499,8 +499,10 @@ define amdgpu_kernel void @v_usubo_i16(ptr addrspace(1) %out, ptr addrspace(1) % ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_sub_u16_e32 v5, v4, v5 -; VI-NEXT: v_cmp_gt_u16_e32 vcc, v5, v4 +; VI-NEXT: v_sub_u32_e32 v5, vcc, v4, v5 +; VI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; VI-NEXT: v_and_b32_e32 v6, 0xffff, v5 +; VI-NEXT: v_cmp_gt_u32_e32 vcc, v6, v4 ; VI-NEXT: flat_store_short v[0:1], v5 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; VI-NEXT: flat_store_byte v[2:3], v0 @@ -514,9 +516,9 @@ define amdgpu_kernel void @v_usubo_i16(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX9-NEXT: global_load_ushort v1, v0, s[8:9] ; GFX9-NEXT: global_load_ushort v2, v0, s[10:11] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_sub_u16_e32 v2, v1, v2 -; GFX9-NEXT: v_cmp_gt_u16_e32 vcc, v2, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_sub_u32_e32 v2, v1, v2 +; GFX9-NEXT: v_cmp_gt_u32_sdwa s[0:1], v2, v1 src0_sel:WORD_0 src1_sel:WORD_0 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX9-NEXT: global_store_short v0, v2, s[4:5] ; GFX9-NEXT: global_store_byte v0, v1, s[6:7] ; GFX9-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll b/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll index 85c34e036e1fd9..aec86ec343bdb5 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll @@ -85,8 +85,7 @@ entry: ; GCN-LABEL: {{^}}vector_write_read_bitcast_to_float: ; GCN-ALLOCA: buffer_store_dword -; GCN-PROMOTE: v_cmp_eq_u16 -; GCN-PROMOTE: v_cndmask +; GCN-PROMOTE: s_cmp_eq_u32 ; GCN: s_cbranch diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll b/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll index 50927a2cf21afe..15a83475f368e9 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll @@ -46,9 +46,11 @@ define amdgpu_kernel void @__omp_offloading_16_dd2df_main_l9() { ; CHECK-NEXT: v_readlane_b32 s5, v1, 1 ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] ; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload -; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: s_mov_b32 s4, 0xffff ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_cmp_ne_u16_e64 s[4:5], v0, s4 +; CHECK-NEXT: v_and_b32_e64 v0, s4, v0 +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: v_cmp_ne_u32_e64 s[4:5], v0, s4 ; CHECK-NEXT: s_and_b64 vcc, exec, s[4:5] ; CHECK-NEXT: s_cbranch_vccnz .LBB0_4 ; CHECK-NEXT: ; %bb.3: ; %bb201 diff --git a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll index 978ac548443f73..7a1f05f56a7517 100644 --- a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll +++ 
b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll @@ -318,42 +318,40 @@ define amdgpu_kernel void @widen_v2i8_constant_load(ptr addrspace(4) %arg) { ; VI-LABEL: widen_v2i8_constant_load: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: v_mov_b32_e32 v0, 44 -; VI-NEXT: v_mov_b32_e32 v1, 3 +; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s0, s[0:1], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s1, s0, 0xffff +; VI-NEXT: s_and_b32 s1, s0, 0xffffff00 +; VI-NEXT: s_add_i32 s0, s0, 12 +; VI-NEXT: s_or_b32 s0, s0, 4 +; VI-NEXT: s_and_b32 s0, s0, 0xff +; VI-NEXT: s_or_b32 s0, s1, s0 +; VI-NEXT: s_addk_i32 s0, 0x2c00 +; VI-NEXT: s_or_b32 s0, s0, 0x300 ; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: s_add_i32 s1, s1, 12 -; VI-NEXT: v_add_u32_sdwa v0, vcc, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; VI-NEXT: s_or_b32 s0, s1, 4 -; VI-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v3, s0 -; VI-NEXT: v_mov_b32_e32 v0, 0 -; VI-NEXT: v_mov_b32_e32 v1, 0 -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: widen_v2i8_constant_load: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_add_nc_u16 v0, s0, 12 -; GFX11-NEXT: v_and_b32_e64 v1, 0xffffff00, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_or_b32_e32 v0, 4, v0 -; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX11-NEXT: v_add_nc_u16 v2, v0, 0x2c00 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_or_b32_e32 v2, 0x300, v2 +; GFX11-NEXT: s_add_i32 s1, s0, 12 +; GFX11-NEXT: s_and_b32 s0, s0, 0xff00 +; GFX11-NEXT: s_or_b32 s1, s1, 4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_addk_i32 s0, 0x2c00 +; GFX11-NEXT: s_or_b32 s0, s0, 0x300 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 ; GFX11-NEXT: global_store_b16 v[0:1], v2, off ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) From 904f58e6b9418dab53719c7817e9216b95981a49 Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Thu, 19 Sep 2024 09:12:17 +0200 Subject: [PATCH 162/321] [clang][bytecode] Use field descriptor in IntPointer::atOffset (#109238) We're otherwise still pointing to the old type, but with the new offset. 
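As a minimal sketch of the affected pattern (the names below are hypothetical; the codegen.c test added in this patch exercises the same shape): a constant expression that steps an integral pointer into a field must yield a pointer described by the field's type, not by the enclosing record's.

struct Y { int a[2]; };
/* The member access advances the null-based integral pointer by sizeof(int);
   with this change the resulting IntPointer carries the descriptor of the
   accessed field rather than the descriptor of struct Y itself. */
static int off = (int) &(((struct Y *)0)->a[1]); /* expected to fold to 4 */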
--- clang/lib/AST/ByteCode/Pointer.cpp | 2 +- clang/test/AST/ByteCode/codegen.c | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/clang/lib/AST/ByteCode/Pointer.cpp b/clang/lib/AST/ByteCode/Pointer.cpp index 282953eb991a6b..387cad9b137c02 100644 --- a/clang/lib/AST/ByteCode/Pointer.cpp +++ b/clang/lib/AST/ByteCode/Pointer.cpp @@ -665,7 +665,7 @@ IntPointer IntPointer::atOffset(const ASTContext &ASTCtx, uint64_t FieldOffset = ASTCtx.toCharUnitsFromBits(Layout.getFieldOffset(FieldIndex)) .getQuantity(); - return IntPointer{this->Desc, this->Value + FieldOffset}; + return IntPointer{F->Desc, this->Value + FieldOffset}; } IntPointer IntPointer::baseCast(const ASTContext &ASTCtx, diff --git a/clang/test/AST/ByteCode/codegen.c b/clang/test/AST/ByteCode/codegen.c index 8434992823010e..3c6f17e2b87264 100644 --- a/clang/test/AST/ByteCode/codegen.c +++ b/clang/test/AST/ByteCode/codegen.c @@ -17,3 +17,10 @@ struct B { }; const int A = (char *)(&( (struct B *)(16) )->b[0]) - (char *)(16); // CHECK: @A = constant i32 1 + +struct X { int a[2]; }; +int test(void) { + static int i23 = (int) &(((struct X *)0)->a[1]); + return i23; +} +// CHECK: @test.i23 = internal global i32 4, align 4 From da1a222337681c1823e9d6215451f392131ad8d1 Mon Sep 17 00:00:00 2001 From: pvanhout Date: Thu, 19 Sep 2024 09:23:59 +0200 Subject: [PATCH 163/321] [AMDGPU] Regenerate load-constant-i1 test Fix failure caused by #106383 --- llvm/test/CodeGen/AMDGPU/load-constant-i1.ll | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll index ec7c04a82a1eed..db88ddf1807f38 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll @@ -8969,7 +8969,6 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: v_mov_b32_e32 v13, s3 ; GFX8-NEXT: v_mov_b32_e32 v12, s2 ; GFX8-NEXT: buffer_store_dword v12, off, s[88:91], 0 ; 4-byte Folded Spill -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_store_dword v13, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill ; GFX8-NEXT: flat_store_dwordx4 v[42:43], v[4:7] ; GFX8-NEXT: flat_store_dwordx4 v[45:46], v[8:11] From 4e3781607cd12eb337298ee6d16ebecde4ce5741 Mon Sep 17 00:00:00 2001 From: David Green Date: Thu, 19 Sep 2024 08:32:23 +0100 Subject: [PATCH 164/321] [ARM][MVE] Add vector tests for ucmp/scmp. 
NFC --- llvm/test/CodeGen/Thumb2/mve-scmp.ll | 344 +++++++++++++++++++++++++++ llvm/test/CodeGen/Thumb2/mve-ucmp.ll | 343 ++++++++++++++++++++++++++ 2 files changed, 687 insertions(+) create mode 100644 llvm/test/CodeGen/Thumb2/mve-scmp.ll create mode 100644 llvm/test/CodeGen/Thumb2/mve-ucmp.ll diff --git a/llvm/test/CodeGen/Thumb2/mve-scmp.ll b/llvm/test/CodeGen/Thumb2/mve-scmp.ll new file mode 100644 index 00000000000000..23462384eca901 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve-scmp.ll @@ -0,0 +1,344 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp %s -o - | FileCheck %s + +define arm_aapcs_vfpcc <8 x i8> @s_v8i8(<8 x i8> %a, <8 x i8> %b) { +; CHECK-LABEL: s_v8i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmovlb.s8 q1, q1 +; CHECK-NEXT: vmovlb.s8 q0, q0 +; CHECK-NEXT: vmov.i32 q2, #0x0 +; CHECK-NEXT: vmov.i16 q3, #0x1 +; CHECK-NEXT: vcmp.s16 gt, q0, q1 +; CHECK-NEXT: vpsel q2, q3, q2 +; CHECK-NEXT: vmov.i8 q3, #0xff +; CHECK-NEXT: vcmp.s16 gt, q1, q0 +; CHECK-NEXT: vpsel q0, q3, q2 +; CHECK-NEXT: bx lr +entry: + %c = call <8 x i8> @llvm.scmp(<8 x i8> %a, <8 x i8> %b) + ret <8 x i8> %c +} + +define arm_aapcs_vfpcc <16 x i8> @s_v16i8(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: s_v16i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.i32 q2, #0x0 +; CHECK-NEXT: vmov.i8 q3, #0x1 +; CHECK-NEXT: vcmp.s8 gt, q0, q1 +; CHECK-NEXT: vpsel q2, q3, q2 +; CHECK-NEXT: vmov.i8 q3, #0xff +; CHECK-NEXT: vcmp.s8 gt, q1, q0 +; CHECK-NEXT: vpsel q0, q3, q2 +; CHECK-NEXT: bx lr +entry: + %c = call <16 x i8> @llvm.scmp(<16 x i8> %a, <16 x i8> %b) + ret <16 x i8> %c +} + +define arm_aapcs_vfpcc <4 x i16> @s_v4i16(<4 x i16> %a, <4 x i16> %b) { +; CHECK-LABEL: s_v4i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmovlb.s16 q1, q1 +; CHECK-NEXT: vmovlb.s16 q0, q0 +; CHECK-NEXT: vmov.i32 q2, #0x0 +; CHECK-NEXT: vmov.i32 q3, #0x1 +; CHECK-NEXT: vcmp.s32 gt, q0, q1 +; CHECK-NEXT: vpsel q2, q3, q2 +; CHECK-NEXT: vmov.i8 q3, #0xff +; CHECK-NEXT: vcmp.s32 gt, q1, q0 +; CHECK-NEXT: vpsel q0, q3, q2 +; CHECK-NEXT: bx lr +entry: + %c = call <4 x i16> @llvm.scmp(<4 x i16> %a, <4 x i16> %b) + ret <4 x i16> %c +} + +define arm_aapcs_vfpcc <8 x i16> @s_v8i16(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: s_v8i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.i32 q2, #0x0 +; CHECK-NEXT: vmov.i16 q3, #0x1 +; CHECK-NEXT: vcmp.s16 gt, q0, q1 +; CHECK-NEXT: vpsel q2, q3, q2 +; CHECK-NEXT: vmov.i8 q3, #0xff +; CHECK-NEXT: vcmp.s16 gt, q1, q0 +; CHECK-NEXT: vpsel q0, q3, q2 +; CHECK-NEXT: bx lr +entry: + %c = call <8 x i16> @llvm.scmp(<8 x i16> %a, <8 x i16> %b) + ret <8 x i16> %c +} + +define arm_aapcs_vfpcc <16 x i16> @s_v16i16(<16 x i16> %a, <16 x i16> %b) { +; CHECK-LABEL: s_v16i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vmov.i32 q4, #0x0 +; CHECK-NEXT: vmov.i16 q5, #0x1 +; CHECK-NEXT: vcmp.s16 gt, q0, q2 +; CHECK-NEXT: vmov.i8 q7, #0xff +; CHECK-NEXT: vpsel q6, q5, q4 +; CHECK-NEXT: vcmp.s16 gt, q2, q0 +; CHECK-NEXT: vpsel q0, q7, q6 +; CHECK-NEXT: vcmp.s16 gt, q1, q3 +; CHECK-NEXT: vpsel q2, q5, q4 +; CHECK-NEXT: vcmp.s16 gt, q3, q1 +; CHECK-NEXT: vpsel q1, q7, q2 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: bx lr +entry: + %c = call <16 x i16> @llvm.scmp(<16 x i16> %a, <16 x i16> %b) + ret <16 x i16> %c +} + +define arm_aapcs_vfpcc <2 x i32> 
@s_v2i32(<2 x i32> %a, <2 x i32> %b) { +; CHECK-LABEL: s_v2i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: adr.w r12, .LCPI5_0 +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: vldrw.u32 q3, [r12] +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: movs r6, #0 +; CHECK-NEXT: vmov r4, s2 +; CHECK-NEXT: vmov.i32 q2, #0x0 +; CHECK-NEXT: vmov.i8 q1, #0xff +; CHECK-NEXT: subs r3, r2, r1 +; CHECK-NEXT: asr.w lr, r2, #31 +; CHECK-NEXT: sbcs.w r3, lr, r1, asr #31 +; CHECK-NEXT: csetm r12, lt +; CHECK-NEXT: movs r3, #0 +; CHECK-NEXT: subs r5, r0, r4 +; CHECK-NEXT: bfi r3, r12, #0, #8 +; CHECK-NEXT: asr.w r12, r0, #31 +; CHECK-NEXT: sbcs.w r5, r12, r4, asr #31 +; CHECK-NEXT: csetm r5, lt +; CHECK-NEXT: bfi r3, r5, #8, #8 +; CHECK-NEXT: vmsr p0, r3 +; CHECK-NEXT: asrs r3, r1, #31 +; CHECK-NEXT: subs r1, r1, r2 +; CHECK-NEXT: vpsel q0, q3, q2 +; CHECK-NEXT: sbcs.w r1, r3, r2, asr #31 +; CHECK-NEXT: csetm r1, lt +; CHECK-NEXT: subs r2, r4, r0 +; CHECK-NEXT: bfi r6, r1, #0, #8 +; CHECK-NEXT: asr.w r1, r4, #31 +; CHECK-NEXT: sbcs.w r0, r1, r0, asr #31 +; CHECK-NEXT: csetm r0, lt +; CHECK-NEXT: bfi r6, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r6 +; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI5_0: +; CHECK-NEXT: .long 1 @ 0x1 +; CHECK-NEXT: .long 0 @ 0x0 +; CHECK-NEXT: .long 1 @ 0x1 +; CHECK-NEXT: .long 0 @ 0x0 +entry: + %c = call <2 x i32> @llvm.scmp(<2 x i32> %a, <2 x i32> %b) + ret <2 x i32> %c +} + +define arm_aapcs_vfpcc <4 x i32> @s_v4i32(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: s_v4i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.i32 q2, #0x0 +; CHECK-NEXT: vmov.i32 q3, #0x1 +; CHECK-NEXT: vcmp.s32 gt, q0, q1 +; CHECK-NEXT: vpsel q2, q3, q2 +; CHECK-NEXT: vmov.i8 q3, #0xff +; CHECK-NEXT: vcmp.s32 gt, q1, q0 +; CHECK-NEXT: vpsel q0, q3, q2 +; CHECK-NEXT: bx lr +entry: + %c = call <4 x i32> @llvm.scmp(<4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %c +} + +define arm_aapcs_vfpcc <8 x i32> @s_v8i32(<8 x i32> %a, <8 x i32> %b) { +; CHECK-LABEL: s_v8i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vmov.i32 q4, #0x0 +; CHECK-NEXT: vmov.i32 q5, #0x1 +; CHECK-NEXT: vcmp.s32 gt, q0, q2 +; CHECK-NEXT: vmov.i8 q7, #0xff +; CHECK-NEXT: vpsel q6, q5, q4 +; CHECK-NEXT: vcmp.s32 gt, q2, q0 +; CHECK-NEXT: vpsel q0, q7, q6 +; CHECK-NEXT: vcmp.s32 gt, q1, q3 +; CHECK-NEXT: vpsel q2, q5, q4 +; CHECK-NEXT: vcmp.s32 gt, q3, q1 +; CHECK-NEXT: vpsel q1, q7, q2 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: bx lr +entry: + %c = call <8 x i32> @llvm.scmp(<8 x i32> %a, <8 x i32> %b) + ret <8 x i32> %c +} + +define arm_aapcs_vfpcc <2 x i64> @s_v2i64(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: s_v2i64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: vmov lr, r12, d0 +; CHECK-NEXT: movs r4, #0 +; CHECK-NEXT: vmov r3, r8, d2 +; CHECK-NEXT: movs r0, #0 +; CHECK-NEXT: vmov r6, r7, d3 +; CHECK-NEXT: vmov.i32 q1, #0x0 +; CHECK-NEXT: subs.w r1, r3, lr +; CHECK-NEXT: sbcs.w r1, r8, r12 +; CHECK-NEXT: csetm r1, lt +; CHECK-NEXT: bfi r4, r1, #0, #8 +; CHECK-NEXT: vmov r1, r5, d1 +; CHECK-NEXT: subs r2, r6, r1 +; CHECK-NEXT: sbcs.w r2, r7, r5 +; CHECK-NEXT: csetm r2, lt +; CHECK-NEXT: bfi r4, r2, #8, #8 +; CHECK-NEXT: adr r2, 
.LCPI8_0 +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: subs.w r2, lr, r3 +; CHECK-NEXT: sbcs.w r2, r12, r8 +; CHECK-NEXT: vmsr p0, r4 +; CHECK-NEXT: csetm r2, lt +; CHECK-NEXT: subs r1, r1, r6 +; CHECK-NEXT: sbcs.w r1, r5, r7 +; CHECK-NEXT: bfi r0, r2, #0, #8 +; CHECK-NEXT: csetm r1, lt +; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: bfi r0, r1, #8, #8 +; CHECK-NEXT: vmov.i8 q1, #0xff +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI8_0: +; CHECK-NEXT: .long 1 @ 0x1 +; CHECK-NEXT: .long 0 @ 0x0 +; CHECK-NEXT: .long 1 @ 0x1 +; CHECK-NEXT: .long 0 @ 0x0 +entry: + %c = call <2 x i64> @llvm.scmp(<2 x i64> %a, <2 x i64> %b) + ret <2 x i64> %c +} + +define arm_aapcs_vfpcc <4 x i64> @s_v4i64(<4 x i64> %a, <4 x i64> %b) { +; CHECK-LABEL: s_v4i64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} +; CHECK-NEXT: .pad #4 +; CHECK-NEXT: sub sp, #4 +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: vmov lr, r12, d0 +; CHECK-NEXT: movs r4, #0 +; CHECK-NEXT: vmov r3, r8, d4 +; CHECK-NEXT: vmov.i32 q5, #0x0 +; CHECK-NEXT: vmov r6, r7, d5 +; CHECK-NEXT: mov.w r9, #0 +; CHECK-NEXT: vmov.i8 q2, #0xff +; CHECK-NEXT: subs.w r1, r3, lr +; CHECK-NEXT: sbcs.w r1, r8, r12 +; CHECK-NEXT: csetm r1, lt +; CHECK-NEXT: bfi r4, r1, #0, #8 +; CHECK-NEXT: vmov r1, r5, d1 +; CHECK-NEXT: subs r2, r6, r1 +; CHECK-NEXT: sbcs.w r2, r7, r5 +; CHECK-NEXT: csetm r2, lt +; CHECK-NEXT: bfi r4, r2, #8, #8 +; CHECK-NEXT: adr r2, .LCPI9_0 +; CHECK-NEXT: vldrw.u32 q4, [r2] +; CHECK-NEXT: subs.w r2, lr, r3 +; CHECK-NEXT: sbcs.w r2, r12, r8 +; CHECK-NEXT: mov.w r3, #0 +; CHECK-NEXT: csetm r2, lt +; CHECK-NEXT: subs r1, r1, r6 +; CHECK-NEXT: sbcs.w r1, r5, r7 +; CHECK-NEXT: bfi r3, r2, #0, #8 +; CHECK-NEXT: csetm r1, lt +; CHECK-NEXT: vmsr p0, r4 +; CHECK-NEXT: bfi r3, r1, #8, #8 +; CHECK-NEXT: vpsel q0, q4, q5 +; CHECK-NEXT: vmsr p0, r3 +; CHECK-NEXT: vmov lr, r12, d2 +; CHECK-NEXT: vmov r3, r7, d6 +; CHECK-NEXT: movs r5, #0 +; CHECK-NEXT: vmov r2, r1, d7 +; CHECK-NEXT: vpsel q0, q2, q0 +; CHECK-NEXT: subs.w r6, r3, lr +; CHECK-NEXT: sbcs.w r6, r7, r12 +; CHECK-NEXT: csetm r6, lt +; CHECK-NEXT: bfi r5, r6, #0, #8 +; CHECK-NEXT: vmov r6, r4, d3 +; CHECK-NEXT: subs r0, r2, r6 +; CHECK-NEXT: sbcs.w r0, r1, r4 +; CHECK-NEXT: csetm r0, lt +; CHECK-NEXT: bfi r5, r0, #8, #8 +; CHECK-NEXT: subs.w r0, lr, r3 +; CHECK-NEXT: sbcs.w r0, r12, r7 +; CHECK-NEXT: vmsr p0, r5 +; CHECK-NEXT: csetm r0, lt +; CHECK-NEXT: vpsel q1, q4, q5 +; CHECK-NEXT: bfi r9, r0, #0, #8 +; CHECK-NEXT: subs r0, r6, r2 +; CHECK-NEXT: sbcs.w r0, r4, r1 +; CHECK-NEXT: csetm r0, lt +; CHECK-NEXT: bfi r9, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r9 +; CHECK-NEXT: vpsel q1, q2, q1 +; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: add sp, #4 +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI9_0: +; CHECK-NEXT: .long 1 @ 0x1 +; CHECK-NEXT: .long 0 @ 0x0 +; CHECK-NEXT: .long 1 @ 0x1 +; CHECK-NEXT: .long 0 @ 0x0 +entry: + %c = call <4 x i64> @llvm.scmp(<4 x i64> %a, <4 x i64> %b) + ret <4 x i64> %c +} + +define arm_aapcs_vfpcc <16 x i8> @signOf_neon(<8 x i16> %s0_lo, <8 x i16> %s0_hi, <8 x i16> %s1_lo, <8 x i16> %s1_hi) { +; CHECK-LABEL: signOf_neon: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush {d8, d9, 
d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: .pad #16 +; CHECK-NEXT: sub sp, #16 +; CHECK-NEXT: vmov.i32 q4, #0x0 +; CHECK-NEXT: vmov.i16 q5, #0x1 +; CHECK-NEXT: vcmp.s16 gt, q1, q3 +; CHECK-NEXT: vmov.i8 q7, #0xff +; CHECK-NEXT: vpsel q6, q5, q4 +; CHECK-NEXT: vcmp.s16 gt, q3, q1 +; CHECK-NEXT: vpsel q1, q7, q6 +; CHECK-NEXT: vcmp.s16 gt, q0, q2 +; CHECK-NEXT: vpsel q3, q5, q4 +; CHECK-NEXT: vcmp.s16 gt, q2, q0 +; CHECK-NEXT: mov r0, sp +; CHECK-NEXT: vpsel q0, q7, q3 +; CHECK-NEXT: vstrb.16 q1, [r0, #8] +; CHECK-NEXT: vstrb.16 q0, [r0] +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: add sp, #16 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: bx lr +entry: + %0 = shufflevector <8 x i16> %s0_lo, <8 x i16> %s0_hi, <16 x i32> + %1 = shufflevector <8 x i16> %s1_lo, <8 x i16> %s1_hi, <16 x i32> + %or.i = tail call <16 x i8> @llvm.scmp.v16i8.v16i16(<16 x i16> %0, <16 x i16> %1) + ret <16 x i8> %or.i +} diff --git a/llvm/test/CodeGen/Thumb2/mve-ucmp.ll b/llvm/test/CodeGen/Thumb2/mve-ucmp.ll new file mode 100644 index 00000000000000..92dc9a01d2116b --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve-ucmp.ll @@ -0,0 +1,343 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp %s -o - | FileCheck %s + +define arm_aapcs_vfpcc <8 x i8> @u_v8i8(<8 x i8> %a, <8 x i8> %b) { +; CHECK-LABEL: u_v8i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmovlb.u8 q1, q1 +; CHECK-NEXT: vmovlb.u8 q0, q0 +; CHECK-NEXT: vmov.i32 q2, #0x0 +; CHECK-NEXT: vmov.i16 q3, #0x1 +; CHECK-NEXT: vcmp.u16 hi, q0, q1 +; CHECK-NEXT: vpsel q2, q3, q2 +; CHECK-NEXT: vmov.i8 q3, #0xff +; CHECK-NEXT: vcmp.u16 hi, q1, q0 +; CHECK-NEXT: vpsel q0, q3, q2 +; CHECK-NEXT: bx lr +entry: + %c = call <8 x i8> @llvm.ucmp(<8 x i8> %a, <8 x i8> %b) + ret <8 x i8> %c +} + +define arm_aapcs_vfpcc <16 x i8> @u_v16i8(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: u_v16i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.i32 q2, #0x0 +; CHECK-NEXT: vmov.i8 q3, #0x1 +; CHECK-NEXT: vcmp.u8 hi, q0, q1 +; CHECK-NEXT: vpsel q2, q3, q2 +; CHECK-NEXT: vmov.i8 q3, #0xff +; CHECK-NEXT: vcmp.u8 hi, q1, q0 +; CHECK-NEXT: vpsel q0, q3, q2 +; CHECK-NEXT: bx lr +entry: + %c = call <16 x i8> @llvm.ucmp(<16 x i8> %a, <16 x i8> %b) + ret <16 x i8> %c +} + +define arm_aapcs_vfpcc <4 x i16> @u_v4i16(<4 x i16> %a, <4 x i16> %b) { +; CHECK-LABEL: u_v4i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmovlb.u16 q1, q1 +; CHECK-NEXT: vmovlb.u16 q0, q0 +; CHECK-NEXT: vmov.i32 q2, #0x0 +; CHECK-NEXT: vmov.i32 q3, #0x1 +; CHECK-NEXT: vcmp.u32 hi, q0, q1 +; CHECK-NEXT: vpsel q2, q3, q2 +; CHECK-NEXT: vmov.i8 q3, #0xff +; CHECK-NEXT: vcmp.u32 hi, q1, q0 +; CHECK-NEXT: vpsel q0, q3, q2 +; CHECK-NEXT: bx lr +entry: + %c = call <4 x i16> @llvm.ucmp(<4 x i16> %a, <4 x i16> %b) + ret <4 x i16> %c +} + +define arm_aapcs_vfpcc <8 x i16> @u_v8i16(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: u_v8i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.i32 q2, #0x0 +; CHECK-NEXT: vmov.i16 q3, #0x1 +; CHECK-NEXT: vcmp.u16 hi, q0, q1 +; CHECK-NEXT: vpsel q2, q3, q2 +; CHECK-NEXT: vmov.i8 q3, #0xff +; CHECK-NEXT: vcmp.u16 hi, q1, q0 +; CHECK-NEXT: vpsel q0, q3, q2 +; CHECK-NEXT: bx lr +entry: + %c = call <8 x i16> @llvm.ucmp(<8 x i16> %a, <8 x i16> %b) + ret <8 x i16> %c +} + +define arm_aapcs_vfpcc <16 x i16> @u_v16i16(<16 x i16> %a, <16 x i16> %b) { +; CHECK-LABEL: u_v16i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} 
+; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vmov.i32 q4, #0x0 +; CHECK-NEXT: vmov.i16 q5, #0x1 +; CHECK-NEXT: vcmp.u16 hi, q0, q2 +; CHECK-NEXT: vmov.i8 q7, #0xff +; CHECK-NEXT: vpsel q6, q5, q4 +; CHECK-NEXT: vcmp.u16 hi, q2, q0 +; CHECK-NEXT: vpsel q0, q7, q6 +; CHECK-NEXT: vcmp.u16 hi, q1, q3 +; CHECK-NEXT: vpsel q2, q5, q4 +; CHECK-NEXT: vcmp.u16 hi, q3, q1 +; CHECK-NEXT: vpsel q1, q7, q2 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: bx lr +entry: + %c = call <16 x i16> @llvm.ucmp(<16 x i16> %a, <16 x i16> %b) + ret <16 x i16> %c +} + +define arm_aapcs_vfpcc <2 x i32> @u_v2i32(<2 x i32> %a, <2 x i32> %b) { +; CHECK-LABEL: u_v2i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: vmov.i64 q2, #0xffffffff +; CHECK-NEXT: movs r4, #0 +; CHECK-NEXT: vand q1, q1, q2 +; CHECK-NEXT: vand q0, q0, q2 +; CHECK-NEXT: vmov lr, r12, d0 +; CHECK-NEXT: movs r0, #0 +; CHECK-NEXT: vmov r3, r8, d2 +; CHECK-NEXT: vmov r6, r7, d3 +; CHECK-NEXT: vmov.i32 q1, #0x0 +; CHECK-NEXT: subs.w r1, r3, lr +; CHECK-NEXT: sbcs.w r1, r8, r12 +; CHECK-NEXT: csetm r1, lo +; CHECK-NEXT: bfi r4, r1, #0, #8 +; CHECK-NEXT: vmov r1, r5, d1 +; CHECK-NEXT: subs r2, r6, r1 +; CHECK-NEXT: sbcs.w r2, r7, r5 +; CHECK-NEXT: csetm r2, lo +; CHECK-NEXT: bfi r4, r2, #8, #8 +; CHECK-NEXT: adr r2, .LCPI5_0 +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: subs.w r2, lr, r3 +; CHECK-NEXT: sbcs.w r2, r12, r8 +; CHECK-NEXT: vmsr p0, r4 +; CHECK-NEXT: csetm r2, lo +; CHECK-NEXT: subs r1, r1, r6 +; CHECK-NEXT: sbcs.w r1, r5, r7 +; CHECK-NEXT: bfi r0, r2, #0, #8 +; CHECK-NEXT: csetm r1, lo +; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: bfi r0, r1, #8, #8 +; CHECK-NEXT: vmov.i8 q1, #0xff +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI5_0: +; CHECK-NEXT: .long 1 @ 0x1 +; CHECK-NEXT: .long 0 @ 0x0 +; CHECK-NEXT: .long 1 @ 0x1 +; CHECK-NEXT: .long 0 @ 0x0 +entry: + %c = call <2 x i32> @llvm.ucmp(<2 x i32> %a, <2 x i32> %b) + ret <2 x i32> %c +} + +define arm_aapcs_vfpcc <4 x i32> @u_v4i32(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: u_v4i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.i32 q2, #0x0 +; CHECK-NEXT: vmov.i32 q3, #0x1 +; CHECK-NEXT: vcmp.u32 hi, q0, q1 +; CHECK-NEXT: vpsel q2, q3, q2 +; CHECK-NEXT: vmov.i8 q3, #0xff +; CHECK-NEXT: vcmp.u32 hi, q1, q0 +; CHECK-NEXT: vpsel q0, q3, q2 +; CHECK-NEXT: bx lr +entry: + %c = call <4 x i32> @llvm.ucmp(<4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %c +} + +define arm_aapcs_vfpcc <8 x i32> @u_v8i32(<8 x i32> %a, <8 x i32> %b) { +; CHECK-LABEL: u_v8i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vmov.i32 q4, #0x0 +; CHECK-NEXT: vmov.i32 q5, #0x1 +; CHECK-NEXT: vcmp.u32 hi, q0, q2 +; CHECK-NEXT: vmov.i8 q7, #0xff +; CHECK-NEXT: vpsel q6, q5, q4 +; CHECK-NEXT: vcmp.u32 hi, q2, q0 +; CHECK-NEXT: vpsel q0, q7, q6 +; CHECK-NEXT: vcmp.u32 hi, q1, q3 +; CHECK-NEXT: vpsel q2, q5, q4 +; CHECK-NEXT: vcmp.u32 hi, q3, q1 +; CHECK-NEXT: vpsel q1, q7, q2 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: bx lr +entry: + %c = call <8 x i32> @llvm.ucmp(<8 x i32> %a, <8 x i32> %b) + ret <8 x i32> %c +} + +define arm_aapcs_vfpcc <2 x i64> @u_v2i64(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: u_v2i64: +; 
CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: vmov lr, r12, d0 +; CHECK-NEXT: movs r4, #0 +; CHECK-NEXT: vmov r3, r8, d2 +; CHECK-NEXT: movs r0, #0 +; CHECK-NEXT: vmov r6, r7, d3 +; CHECK-NEXT: vmov.i32 q1, #0x0 +; CHECK-NEXT: subs.w r1, r3, lr +; CHECK-NEXT: sbcs.w r1, r8, r12 +; CHECK-NEXT: csetm r1, lo +; CHECK-NEXT: bfi r4, r1, #0, #8 +; CHECK-NEXT: vmov r1, r5, d1 +; CHECK-NEXT: subs r2, r6, r1 +; CHECK-NEXT: sbcs.w r2, r7, r5 +; CHECK-NEXT: csetm r2, lo +; CHECK-NEXT: bfi r4, r2, #8, #8 +; CHECK-NEXT: adr r2, .LCPI8_0 +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: subs.w r2, lr, r3 +; CHECK-NEXT: sbcs.w r2, r12, r8 +; CHECK-NEXT: vmsr p0, r4 +; CHECK-NEXT: csetm r2, lo +; CHECK-NEXT: subs r1, r1, r6 +; CHECK-NEXT: sbcs.w r1, r5, r7 +; CHECK-NEXT: bfi r0, r2, #0, #8 +; CHECK-NEXT: csetm r1, lo +; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: bfi r0, r1, #8, #8 +; CHECK-NEXT: vmov.i8 q1, #0xff +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI8_0: +; CHECK-NEXT: .long 1 @ 0x1 +; CHECK-NEXT: .long 0 @ 0x0 +; CHECK-NEXT: .long 1 @ 0x1 +; CHECK-NEXT: .long 0 @ 0x0 +entry: + %c = call <2 x i64> @llvm.ucmp(<2 x i64> %a, <2 x i64> %b) + ret <2 x i64> %c +} + +define arm_aapcs_vfpcc <4 x i64> @u_v4i64(<4 x i64> %a, <4 x i64> %b) { +; CHECK-LABEL: u_v4i64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} +; CHECK-NEXT: .pad #4 +; CHECK-NEXT: sub sp, #4 +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: vmov lr, r12, d0 +; CHECK-NEXT: movs r4, #0 +; CHECK-NEXT: vmov r3, r8, d4 +; CHECK-NEXT: vmov.i32 q5, #0x0 +; CHECK-NEXT: vmov r6, r7, d5 +; CHECK-NEXT: mov.w r9, #0 +; CHECK-NEXT: vmov.i8 q2, #0xff +; CHECK-NEXT: subs.w r1, r3, lr +; CHECK-NEXT: sbcs.w r1, r8, r12 +; CHECK-NEXT: csetm r1, lo +; CHECK-NEXT: bfi r4, r1, #0, #8 +; CHECK-NEXT: vmov r1, r5, d1 +; CHECK-NEXT: subs r2, r6, r1 +; CHECK-NEXT: sbcs.w r2, r7, r5 +; CHECK-NEXT: csetm r2, lo +; CHECK-NEXT: bfi r4, r2, #8, #8 +; CHECK-NEXT: adr r2, .LCPI9_0 +; CHECK-NEXT: vldrw.u32 q4, [r2] +; CHECK-NEXT: subs.w r2, lr, r3 +; CHECK-NEXT: sbcs.w r2, r12, r8 +; CHECK-NEXT: mov.w r3, #0 +; CHECK-NEXT: csetm r2, lo +; CHECK-NEXT: subs r1, r1, r6 +; CHECK-NEXT: sbcs.w r1, r5, r7 +; CHECK-NEXT: bfi r3, r2, #0, #8 +; CHECK-NEXT: csetm r1, lo +; CHECK-NEXT: vmsr p0, r4 +; CHECK-NEXT: bfi r3, r1, #8, #8 +; CHECK-NEXT: vpsel q0, q4, q5 +; CHECK-NEXT: vmsr p0, r3 +; CHECK-NEXT: vmov lr, r12, d2 +; CHECK-NEXT: vmov r3, r7, d6 +; CHECK-NEXT: movs r5, #0 +; CHECK-NEXT: vmov r2, r1, d7 +; CHECK-NEXT: vpsel q0, q2, q0 +; CHECK-NEXT: subs.w r6, r3, lr +; CHECK-NEXT: sbcs.w r6, r7, r12 +; CHECK-NEXT: csetm r6, lo +; CHECK-NEXT: bfi r5, r6, #0, #8 +; CHECK-NEXT: vmov r6, r4, d3 +; CHECK-NEXT: subs r0, r2, r6 +; CHECK-NEXT: sbcs.w r0, r1, r4 +; CHECK-NEXT: csetm r0, lo +; CHECK-NEXT: bfi r5, r0, #8, #8 +; CHECK-NEXT: subs.w r0, lr, r3 +; CHECK-NEXT: sbcs.w r0, r12, r7 +; CHECK-NEXT: vmsr p0, r5 +; CHECK-NEXT: csetm r0, lo +; CHECK-NEXT: vpsel q1, q4, q5 +; CHECK-NEXT: bfi r9, r0, #0, #8 +; CHECK-NEXT: subs r0, r6, r2 +; CHECK-NEXT: sbcs.w r0, r4, r1 +; CHECK-NEXT: csetm r0, lo +; CHECK-NEXT: bfi r9, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r9 +; CHECK-NEXT: vpsel q1, q2, q1 +; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: add sp, 
#4 +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI9_0: +; CHECK-NEXT: .long 1 @ 0x1 +; CHECK-NEXT: .long 0 @ 0x0 +; CHECK-NEXT: .long 1 @ 0x1 +; CHECK-NEXT: .long 0 @ 0x0 +entry: + %c = call <4 x i64> @llvm.ucmp(<4 x i64> %a, <4 x i64> %b) + ret <4 x i64> %c +} + +define arm_aapcs_vfpcc <16 x i8> @signOf_neon(<8 x i16> %s0_lo, <8 x i16> %s0_hi, <8 x i16> %s1_lo, <8 x i16> %s1_hi) { +; CHECK-LABEL: signOf_neon: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: .pad #16 +; CHECK-NEXT: sub sp, #16 +; CHECK-NEXT: vmov.i32 q4, #0x0 +; CHECK-NEXT: vmov.i16 q5, #0x1 +; CHECK-NEXT: vcmp.u16 hi, q1, q3 +; CHECK-NEXT: vmov.i8 q7, #0xff +; CHECK-NEXT: vpsel q6, q5, q4 +; CHECK-NEXT: vcmp.u16 hi, q3, q1 +; CHECK-NEXT: vpsel q1, q7, q6 +; CHECK-NEXT: vcmp.u16 hi, q0, q2 +; CHECK-NEXT: vpsel q3, q5, q4 +; CHECK-NEXT: vcmp.u16 hi, q2, q0 +; CHECK-NEXT: mov r0, sp +; CHECK-NEXT: vpsel q0, q7, q3 +; CHECK-NEXT: vstrb.16 q1, [r0, #8] +; CHECK-NEXT: vstrb.16 q0, [r0] +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: add sp, #16 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: bx lr +entry: + %0 = shufflevector <8 x i16> %s0_lo, <8 x i16> %s0_hi, <16 x i32> + %1 = shufflevector <8 x i16> %s1_lo, <8 x i16> %s1_hi, <16 x i32> + %or.i = tail call <16 x i8> @llvm.ucmp.v16i8.v16i16(<16 x i16> %0, <16 x i16> %1) + ret <16 x i8> %or.i +} From dc6876fc9890f556ed1692141643c73b1542ee5b Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Thu, 19 Sep 2024 09:38:20 +0200 Subject: [PATCH 165/321] [ValueTracking] Use isSafeToSpeculativelyExecuteWithVariableReplaced() in more places (#109149) This replaces some uses of isSafeToSpeculativelyExecute() with isSafeToSpeculativelyExecuteWithVariableReplaced(), in cases where we are guarding against operand changes rather than plain speculation. I believe that this is NFC with the current implementation of the function (as it only does something different from loads), but this makes us more defensive against future generalizations. --- llvm/lib/Analysis/LazyValueInfo.cpp | 3 ++- llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp | 5 +++-- llvm/lib/Transforms/InstCombine/InstructionCombining.cpp | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Analysis/LazyValueInfo.cpp b/llvm/lib/Analysis/LazyValueInfo.cpp index 69e0627a89cc29..30dc4ae30dbfa5 100644 --- a/llvm/lib/Analysis/LazyValueInfo.cpp +++ b/llvm/lib/Analysis/LazyValueInfo.cpp @@ -1572,7 +1572,8 @@ ValueLatticeElement LazyValueInfoImpl::getValueAtUse(const Use &U) { // This also disallows looking through phi nodes: If the phi node is part // of a cycle, we might end up reasoning about values from different cycle // iterations (PR60629). - if (!CurrI->hasOneUse() || !isSafeToSpeculativelyExecute(CurrI)) + if (!CurrI->hasOneUse() || + !isSafeToSpeculativelyExecuteWithVariableReplaced(CurrI)) break; CurrU = &*CurrI->use_begin(); } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp index e018f80dc3b2c8..d9b4faff4c004d 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp @@ -476,7 +476,8 @@ Instruction *InstCombinerImpl::visitExtractElementInst(ExtractElementInst &EI) { // it may make the operand poison.
BinaryOperator *BO; if (match(SrcVec, m_BinOp(BO)) && cheapToScalarize(SrcVec, Index) && - (HasKnownValidIndex || isSafeToSpeculativelyExecute(BO))) { + (HasKnownValidIndex || + isSafeToSpeculativelyExecuteWithVariableReplaced(BO))) { // extelt (binop X, Y), Index --> binop (extelt X, Index), (extelt Y, Index) Value *X = BO->getOperand(0), *Y = BO->getOperand(1); Value *E0 = Builder.CreateExtractElement(X, Index); @@ -2777,7 +2778,7 @@ Instruction *InstCombinerImpl::simplifyBinOpSplats(ShuffleVectorInst &SVI) { return nullptr; auto *BinOp = cast(Op0); - if (!isSafeToSpeculativelyExecute(BinOp)) + if (!isSafeToSpeculativelyExecuteWithVariableReplaced(BinOp)) return nullptr; Value *NewBO = Builder.CreateBinOp(BinOp->getOpcode(), X, Y); diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index aa3f3fbdaeffa0..1e606c51f72cdb 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -2105,7 +2105,7 @@ Instruction *InstCombinerImpl::foldVectorBinop(BinaryOperator &Inst) { // It may not be safe to reorder shuffles and things like div, urem, etc. // because we may trap when executing those ops on unknown vector elements. // See PR20059. - if (!isSafeToSpeculativelyExecute(&Inst)) + if (!isSafeToSpeculativelyExecuteWithVariableReplaced(&Inst)) return nullptr; auto createBinOpShuffle = [&](Value *X, Value *Y, ArrayRef M) { From c18be32185ca10e55bdef0f2d43629ccfb7e89eb Mon Sep 17 00:00:00 2001 From: Phoebe Wang Date: Thu, 19 Sep 2024 15:39:07 +0800 Subject: [PATCH 166/321] Reland "[X86][BF16] Add libcall for F80 -> BF16 (#109116)" (#109143) This reverts commit ababfee78714313a0cad87591b819f0944b90d09. Add X86 FP80 check. --- compiler-rt/lib/builtins/CMakeLists.txt | 1 + compiler-rt/lib/builtins/fp_trunc.h | 12 ++++++++++++ compiler-rt/lib/builtins/truncxfbf2.c | 16 ++++++++++++++++ llvm/include/llvm/IR/RuntimeLibcalls.def | 1 + llvm/lib/CodeGen/TargetLoweringBase.cpp | 2 ++ llvm/test/CodeGen/X86/bfloat.ll | 22 ++++++++++++++++++++++ 6 files changed, 54 insertions(+) create mode 100644 compiler-rt/lib/builtins/truncxfbf2.c diff --git a/compiler-rt/lib/builtins/CMakeLists.txt b/compiler-rt/lib/builtins/CMakeLists.txt index 2c3b0fa84a4782..9a0a50ee7003f1 100644 --- a/compiler-rt/lib/builtins/CMakeLists.txt +++ b/compiler-rt/lib/builtins/CMakeLists.txt @@ -192,6 +192,7 @@ set(GENERIC_SOURCES set(BF16_SOURCES extendbfsf2.c truncdfbf2.c + truncxfbf2.c truncsfbf2.c ) diff --git a/compiler-rt/lib/builtins/fp_trunc.h b/compiler-rt/lib/builtins/fp_trunc.h index 141fe63e132d96..a1bd881eb57c9e 100644 --- a/compiler-rt/lib/builtins/fp_trunc.h +++ b/compiler-rt/lib/builtins/fp_trunc.h @@ -35,6 +35,18 @@ static const int srcSigFracBits = 52; // srcBits - srcSigFracBits - 1 static const int srcExpBits = 11; +#elif defined SRC_80 +typedef xf_float src_t; +typedef __uint128_t src_rep_t; +#define SRC_REP_C (__uint128_t) +// sign bit, exponent and significand occupy the lower 80 bits. +static const int srcBits = 80; +static const int srcSigFracBits = 63; +// -1 accounts for the sign bit. +// -1 accounts for the explicitly stored integer bit. 
+// srcBits - srcSigFracBits - 1 - 1 +static const int srcExpBits = 15; + #elif defined SRC_QUAD typedef tf_float src_t; typedef __uint128_t src_rep_t; diff --git a/compiler-rt/lib/builtins/truncxfbf2.c b/compiler-rt/lib/builtins/truncxfbf2.c new file mode 100644 index 00000000000000..aa4108c334eebc --- /dev/null +++ b/compiler-rt/lib/builtins/truncxfbf2.c @@ -0,0 +1,16 @@ +//===-- lib/truncxfbf2.c - long double -> bfloat conversion -------*- C -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#if defined(CRT_HAS_TF_MODE) && __LDBL_MANT_DIG__ == 64 && defined(__x86_64__) +#define SRC_80 +#define DST_BFLOAT +#include "fp_trunc_impl.inc" + +COMPILER_RT_ABI dst_t __truncxfbf2(long double a) { return __truncXfYf2__(a); } + +#endif diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.def b/llvm/include/llvm/IR/RuntimeLibcalls.def index c3d5ef9f4e4f82..69cf43140ad4bd 100644 --- a/llvm/include/llvm/IR/RuntimeLibcalls.def +++ b/llvm/include/llvm/IR/RuntimeLibcalls.def @@ -367,6 +367,7 @@ HANDLE_LIBCALL(FPROUND_F128_F16, "__trunctfhf2") HANDLE_LIBCALL(FPROUND_PPCF128_F16, "__trunctfhf2") HANDLE_LIBCALL(FPROUND_F32_BF16, "__truncsfbf2") HANDLE_LIBCALL(FPROUND_F64_BF16, "__truncdfbf2") +HANDLE_LIBCALL(FPROUND_F80_BF16, "__truncxfbf2") HANDLE_LIBCALL(FPROUND_F64_F32, "__truncdfsf2") HANDLE_LIBCALL(FPROUND_F80_F32, "__truncxfsf2") HANDLE_LIBCALL(FPROUND_F128_F32, "__trunctfsf2") diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index eb3190c7cd247a..9fdde454559171 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -169,6 +169,8 @@ RTLIB::Libcall RTLIB::getFPROUND(EVT OpVT, EVT RetVT) { return FPROUND_F32_BF16; if (OpVT == MVT::f64) return FPROUND_F64_BF16; + if (OpVT == MVT::f80) + return FPROUND_F80_BF16; } else if (RetVT == MVT::f32) { if (OpVT == MVT::f64) return FPROUND_F64_F32; diff --git a/llvm/test/CodeGen/X86/bfloat.ll b/llvm/test/CodeGen/X86/bfloat.ll index 3759909a2ccc8e..3144fd56d9ccf3 100644 --- a/llvm/test/CodeGen/X86/bfloat.ll +++ b/llvm/test/CodeGen/X86/bfloat.ll @@ -1953,3 +1953,25 @@ define void @PR92471(ptr %0, ptr %1) nounwind { store <7 x float> %4, ptr %1, align 4 ret void } + +define bfloat @PR108936(x86_fp80 %0) nounwind { +; X86-LABEL: PR108936: +; X86: # %bb.0: +; X86-NEXT: subl $12, %esp +; X86-NEXT: fldt {{[0-9]+}}(%esp) +; X86-NEXT: fstpt (%esp) +; X86-NEXT: calll __truncxfbf2 +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl +; +; CHECK-LABEL: PR108936: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $24, %rsp +; CHECK-NEXT: fldt {{[0-9]+}}(%rsp) +; CHECK-NEXT: fstpt (%rsp) +; CHECK-NEXT: callq __truncxfbf2@PLT +; CHECK-NEXT: addq $24, %rsp +; CHECK-NEXT: retq + %2 = fptrunc x86_fp80 %0 to bfloat + ret bfloat %2 +} From 4ec4ac15ed47ccb52d79e01c038865817d0cedf6 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Thu, 19 Sep 2024 09:39:35 +0200 Subject: [PATCH 167/321] [SCEVExpander] Fix addrec cost model (#106704) The current isHighCostExpansion cost model for addrecs computes the cost for some kind of polynomial expansion that does not appear to have any relation to addrec expansion whatsoever. A literal expansion of an affine addrec is a phi and add (plus the expansion of start and step). 
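As a concrete illustration, the new per-addrec cost boils down to the
following sketch (a hypothetical standalone helper with invented names,
for illustration only; the actual change lives inside
costAndCollectOperands() below):

```c++
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetTransformInfo.h"
using namespace llvm;

// One phi and one add per recurrence level. The costs of expanding the
// start and step operands are charged separately when those operands are
// visited.
static InstructionCost
addRecExpansionCost(const SCEVAddRecExpr *S, const TargetTransformInfo &TTI,
                    TargetTransformInfo::TargetCostKind CostKind) {
  unsigned NumRecurrences = S->getNumOperands() - 1;
  return (TTI.getCFInstrCost(Instruction::PHI, CostKind) +
          TTI.getArithmeticInstrCost(Instruction::Add, S->getType(),
                                     CostKind)) *
         NumRecurrences;
}
```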
For a non-affine addrec, we get another phi+add for each additional addrec
nested in the step recurrence.

This partially `fixes` https://github.com/llvm/llvm-project/issues/53205
(the runtime unroll test case in this PR).
---
 .../Utils/ScalarEvolutionExpander.cpp | 48 +++-------
 .../rewrite-loop-exit-values-phi.ll | 13 +--
 .../X86/runtime-unroll-addrec-cost.ll | 91 ++++++++++++++++---
 3 files changed, 96 insertions(+), 56 deletions(-)

diff --git a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
index c7d758aa575e61..0927a3015818fd 100644
--- a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
+++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
@@ -1911,43 +1911,17 @@ template <typename T> static InstructionCost costAndCollectOperands(
 break;
 }
 case scAddRecExpr: {
- // In this polynominal, we may have some zero operands, and we shouldn't
- // really charge for those. So how many non-zero coefficients are there?
- int NumTerms = llvm::count_if(S->operands(), [](const SCEV *Op) {
- return !Op->isZero();
- });
-
- assert(NumTerms >= 1 && "Polynominal should have at least one term.");
- assert(!(*std::prev(S->operands().end()))->isZero() &&
- "Last operand should not be zero");
-
- // Ignoring constant term (operand 0), how many of the coefficients are u> 1?
- int NumNonZeroDegreeNonOneTerms =
- llvm::count_if(S->operands(), [](const SCEV *Op) {
- auto *SConst = dyn_cast<SCEVConstant>(Op);
- return !SConst || SConst->getAPInt().ugt(1);
- });
-
- // Much like with normal add expr, the polynominal will require
- // one less addition than the number of it's terms.
- InstructionCost AddCost = ArithCost(Instruction::Add, NumTerms - 1,
- /*MinIdx*/ 1, /*MaxIdx*/ 1);
- // Here, *each* one of those will require a multiplication.
- InstructionCost MulCost =
- ArithCost(Instruction::Mul, NumNonZeroDegreeNonOneTerms);
- Cost = AddCost + MulCost;
-
- // What is the degree of this polynominal?
- int PolyDegree = S->getNumOperands() - 1;
- assert(PolyDegree >= 1 && "Should be at least affine.");
-
- // The final term will be:
- // Op_{PolyDegree} * x ^ {PolyDegree}
- // Where x ^ {PolyDegree} will again require PolyDegree-1 mul operations.
- // Note that x ^ {PolyDegree} = x * x ^ {PolyDegree-1} so charging for
- // x ^ {PolyDegree} will give us x ^ {2} .. x ^ {PolyDegree-1} for free.
- // FIXME: this is conservatively correct, but might be overly pessimistic.
- Cost += MulCost * (PolyDegree - 1);
+ // Addrec expands to a phi and add per recurrence.
+ unsigned NumRecurrences = S->getNumOperands() - 1;
+ Cost += TTI.getCFInstrCost(Instruction::PHI, CostKind) * NumRecurrences;
+ Cost +=
+ TTI.getArithmeticInstrCost(Instruction::Add, S->getType(), CostKind) *
+ NumRecurrences;
+ // AR start is used in phi.
+ Worklist.emplace_back(Instruction::PHI, 0, S->getOperand(0));
+ // Other operands are used in add.
+ for (const SCEV *Op : S->operands().drop_front()) + Worklist.emplace_back(Instruction::Add, 1, Op); break; } } diff --git a/llvm/test/Transforms/IndVarSimplify/rewrite-loop-exit-values-phi.ll b/llvm/test/Transforms/IndVarSimplify/rewrite-loop-exit-values-phi.ll index 63f3da7af46ecc..37bc67c23adb75 100644 --- a/llvm/test/Transforms/IndVarSimplify/rewrite-loop-exit-values-phi.ll +++ b/llvm/test/Transforms/IndVarSimplify/rewrite-loop-exit-values-phi.ll @@ -14,27 +14,28 @@ define dso_local void @hoge() local_unnamed_addr { ; CHECK-LABEL: @hoge( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[N:%.*]] = sdiv exact i64 undef, 40 +; CHECK-NEXT: [[TMP0:%.*]] = sub i64 undef, [[N]] ; CHECK-NEXT: br label [[HEADER:%.*]] ; CHECK: header: -; CHECK-NEXT: [[IDX:%.*]] = phi i64 [ [[IDX_NEXT:%.*]], [[LATCH:%.*]] ], [ undef, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[LATCH:%.*]] ], [ [[TMP0]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[IDX:%.*]] = phi i64 [ [[IDX_NEXT:%.*]], [[LATCH]] ], [ undef, [[ENTRY]] ] ; CHECK-NEXT: [[COND:%.*]] = icmp sgt i64 [[N]], [[IDX]] ; CHECK-NEXT: br i1 [[COND]], label [[END:%.*]], label [[INNER_PREHEADER:%.*]] ; CHECK: inner.preheader: ; CHECK-NEXT: br label [[INNER:%.*]] ; CHECK: inner: ; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[INNER]] ], [ 0, [[INNER_PREHEADER]] ] -; CHECK-NEXT: [[J:%.*]] = phi i64 [ [[J_NEXT:%.*]], [[INNER]] ], [ [[N]], [[INNER_PREHEADER]] ] -; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 -; CHECK-NEXT: [[J_NEXT]] = add nsw i64 [[J]], 1 +; CHECK-NEXT: [[I_NEXT]] = add nuw i64 [[I]], 1 ; CHECK-NEXT: store i64 undef, ptr @ptr, align 8 -; CHECK-NEXT: [[COND1:%.*]] = icmp slt i64 [[J]], [[IDX]] -; CHECK-NEXT: br i1 [[COND1]], label [[INNER]], label [[INNER_EXIT:%.*]] +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[I_NEXT]], [[INDVARS_IV]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[INNER]], label [[INNER_EXIT:%.*]] ; CHECK: inner_exit: ; CHECK-NEXT: [[INDVAR:%.*]] = phi i64 [ [[I_NEXT]], [[INNER]] ] ; CHECK-NEXT: [[INDVAR_USE:%.*]] = add i64 [[INDVAR]], 1 ; CHECK-NEXT: br label [[LATCH]] ; CHECK: latch: ; CHECK-NEXT: [[IDX_NEXT]] = add nsw i64 [[IDX]], -1 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], -1 ; CHECK-NEXT: br label [[HEADER]] ; CHECK: end: ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/LoopUnroll/X86/runtime-unroll-addrec-cost.ll b/llvm/test/Transforms/LoopUnroll/X86/runtime-unroll-addrec-cost.ll index 3c2c840aeb8103..367ef90d6a3862 100644 --- a/llvm/test/Transforms/LoopUnroll/X86/runtime-unroll-addrec-cost.ll +++ b/llvm/test/Transforms/LoopUnroll/X86/runtime-unroll-addrec-cost.ll @@ -13,17 +13,27 @@ define void @selsort(ptr %array) #0 { ; CHECK-NEXT: br i1 [[CMP21_NOT]], label %[[FOR_END18:.*]], label %[[FOR_BODY_LR_PH:.*]] ; CHECK: [[FOR_BODY_LR_PH]]: ; CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[ARRAY]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP0]], -1 +; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[TMP0]], -2 ; CHECK-NEXT: br label %[[FOR_BODY:.*]] ; CHECK: [[FOR_BODY]]: ; CHECK-NEXT: [[BASE_022:%.*]] = phi i64 [ 0, %[[FOR_BODY_LR_PH]] ], [ [[ADD:%.*]], %[[FOR_END:.*]] ] +; CHECK-NEXT: [[TMP10:%.*]] = mul i64 [[BASE_022]], -1 +; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP7]], [[TMP10]] +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[TMP8]], [[TMP10]] ; CHECK-NEXT: [[ADD]] = add nuw i64 [[BASE_022]], 1 ; CHECK-NEXT: [[CMP318:%.*]] = icmp ult i64 [[ADD]], [[TMP0]] ; CHECK-NEXT: br i1 [[CMP318]], label %[[FOR_BODY4_PREHEADER:.*]], label %[[FOR_END]] ; CHECK: [[FOR_BODY4_PREHEADER]]: -; 
CHECK-NEXT: br label %[[FOR_BODY4:.*]] -; CHECK: [[FOR_BODY4]]: -; CHECK-NEXT: [[MIN_020:%.*]] = phi i64 [ [[SPEC_SELECT:%.*]], %[[FOR_BODY4]] ], [ [[BASE_022]], %[[FOR_BODY4_PREHEADER]] ] -; CHECK-NEXT: [[C_019:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY4]] ], [ [[ADD]], %[[FOR_BODY4_PREHEADER]] ] +; CHECK-NEXT: [[XTRAITER:%.*]] = and i64 [[TMP11]], 3 +; CHECK-NEXT: [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0 +; CHECK-NEXT: br i1 [[LCMP_MOD]], label %[[FOR_BODY4_PROL_PREHEADER:.*]], label %[[FOR_BODY4_PROL_LOOPEXIT:.*]] +; CHECK: [[FOR_BODY4_PROL_PREHEADER]]: +; CHECK-NEXT: br label %[[FOR_BODY4_PROL:.*]] +; CHECK: [[FOR_BODY4_PROL]]: +; CHECK-NEXT: [[MIN_020:%.*]] = phi i64 [ [[SPEC_SELECT:%.*]], %[[FOR_BODY4_PROL]] ], [ [[BASE_022]], %[[FOR_BODY4_PROL_PREHEADER]] ] +; CHECK-NEXT: [[C_019:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY4_PROL]] ], [ [[ADD]], %[[FOR_BODY4_PROL_PREHEADER]] ] +; CHECK-NEXT: [[PROL_ITER:%.*]] = phi i64 [ 0, %[[FOR_BODY4_PROL_PREHEADER]] ], [ [[PROL_ITER_NEXT:%.*]], %[[FOR_BODY4_PROL]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 [[C_019]] ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 [[MIN_020]] @@ -32,18 +42,69 @@ define void @selsort(ptr %array) #0 { ; CHECK-NEXT: [[SPEC_SELECT]] = select i1 [[CMP7]], i64 [[C_019]], i64 [[MIN_020]] ; CHECK-NEXT: [[INC]] = add nuw i64 [[C_019]], 1 ; CHECK-NEXT: [[CMP3:%.*]] = icmp ult i64 [[INC]], [[TMP0]] -; CHECK-NEXT: br i1 [[CMP3]], label %[[FOR_BODY4]], label %[[FOR_END_LOOPEXIT:.*]] -; CHECK: [[FOR_END_LOOPEXIT]]: -; CHECK-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i64 [ [[SPEC_SELECT]], %[[FOR_BODY4]] ] -; CHECK-NEXT: br label %[[FOR_END]] -; CHECK: [[FOR_END]]: -; CHECK-NEXT: [[MIN_0_LCSSA:%.*]] = phi i64 [ [[BASE_022]], %[[FOR_BODY]] ], [ [[SPEC_SELECT_LCSSA]], %[[FOR_END_LOOPEXIT]] ] +; CHECK-NEXT: [[PROL_ITER_NEXT]] = add i64 [[PROL_ITER]], 1 +; CHECK-NEXT: [[PROL_ITER_CMP:%.*]] = icmp ne i64 [[PROL_ITER_NEXT]], [[XTRAITER]] +; CHECK-NEXT: br i1 [[PROL_ITER_CMP]], label %[[FOR_BODY4_PROL]], label %[[FOR_BODY4_PROL_LOOPEXIT_UNR_LCSSA:.*]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[FOR_BODY4_PROL_LOOPEXIT_UNR_LCSSA]]: +; CHECK-NEXT: [[MIN_020_UNR_PH:%.*]] = phi i64 [ [[SPEC_SELECT]], %[[FOR_BODY4_PROL]] ] +; CHECK-NEXT: [[C_019_UNR_PH:%.*]] = phi i64 [ [[INC]], %[[FOR_BODY4_PROL]] ] +; CHECK-NEXT: [[SPEC_SELECT_LCSSA_UNR_PH:%.*]] = phi i64 [ [[SPEC_SELECT]], %[[FOR_BODY4_PROL]] ] +; CHECK-NEXT: br label %[[FOR_BODY4_PROL_LOOPEXIT]] +; CHECK: [[FOR_BODY4_PROL_LOOPEXIT]]: +; CHECK-NEXT: [[MIN_020_UNR:%.*]] = phi i64 [ [[BASE_022]], %[[FOR_BODY4_PREHEADER]] ], [ [[MIN_020_UNR_PH]], %[[FOR_BODY4_PROL_LOOPEXIT_UNR_LCSSA]] ] +; CHECK-NEXT: [[C_019_UNR:%.*]] = phi i64 [ [[ADD]], %[[FOR_BODY4_PREHEADER]] ], [ [[C_019_UNR_PH]], %[[FOR_BODY4_PROL_LOOPEXIT_UNR_LCSSA]] ] +; CHECK-NEXT: [[SPEC_SELECT_LCSSA_UNR:%.*]] = phi i64 [ poison, %[[FOR_BODY4_PREHEADER]] ], [ [[SPEC_SELECT_LCSSA_UNR_PH]], %[[FOR_BODY4_PROL_LOOPEXIT_UNR_LCSSA]] ] +; CHECK-NEXT: [[TMP9:%.*]] = icmp ult i64 [[TMP6]], 3 +; CHECK-NEXT: br i1 [[TMP9]], label %[[FOR_END_LOOPEXIT:.*]], label %[[FOR_BODY4_PREHEADER_NEW:.*]] +; CHECK: [[FOR_BODY4_PREHEADER_NEW]]: +; CHECK-NEXT: br label %[[FOR_BODY4:.*]] +; CHECK: [[FOR_BODY4]]: +; CHECK-NEXT: [[MIN_20:%.*]] = phi i64 [ [[MIN_020_UNR]], %[[FOR_BODY4_PREHEADER_NEW]] ], [ [[SPEC_SELECT_3:%.*]], %[[FOR_BODY4]] ] +; CHECK-NEXT: [[MIN_0_LCSSA:%.*]] = phi i64 [ [[C_019_UNR]], 
%[[FOR_BODY4_PREHEADER_NEW]] ], [ [[INC_3:%.*]], %[[FOR_BODY4]] ] ; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 [[MIN_0_LCSSA]] ; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX9]], align 4 -; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 [[BASE_022]] +; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 [[MIN_20]] ; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX11]], align 4 -; CHECK-NEXT: store i32 [[TMP5]], ptr [[ARRAYIDX9]], align 4 -; CHECK-NEXT: store i32 [[TMP4]], ptr [[ARRAYIDX11]], align 4 +; CHECK-NEXT: [[CMP8:%.*]] = icmp ult i32 [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[SPEC_SELECT1:%.*]] = select i1 [[CMP8]], i64 [[MIN_0_LCSSA]], i64 [[MIN_20]] +; CHECK-NEXT: [[INC1:%.*]] = add nuw i64 [[MIN_0_LCSSA]], 1 +; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 [[INC1]] +; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX_1]], align 4 +; CHECK-NEXT: [[ARRAYIDX6_1:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 [[SPEC_SELECT1]] +; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX6_1]], align 4 +; CHECK-NEXT: [[CMP7_1:%.*]] = icmp ult i32 [[TMP12]], [[TMP13]] +; CHECK-NEXT: [[SPEC_SELECT_1:%.*]] = select i1 [[CMP7_1]], i64 [[INC1]], i64 [[SPEC_SELECT1]] +; CHECK-NEXT: [[INC_1:%.*]] = add nuw i64 [[MIN_0_LCSSA]], 2 +; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 [[INC_1]] +; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[ARRAYIDX_2]], align 4 +; CHECK-NEXT: [[ARRAYIDX6_2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 [[SPEC_SELECT_1]] +; CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr [[ARRAYIDX6_2]], align 4 +; CHECK-NEXT: [[CMP7_2:%.*]] = icmp ult i32 [[TMP14]], [[TMP15]] +; CHECK-NEXT: [[SPEC_SELECT_2:%.*]] = select i1 [[CMP7_2]], i64 [[INC_1]], i64 [[SPEC_SELECT_1]] +; CHECK-NEXT: [[INC_2:%.*]] = add nuw i64 [[MIN_0_LCSSA]], 3 +; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 [[INC_2]] +; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[ARRAYIDX_3]], align 4 +; CHECK-NEXT: [[ARRAYIDX6_3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 [[SPEC_SELECT_2]] +; CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[ARRAYIDX6_3]], align 4 +; CHECK-NEXT: [[CMP7_3:%.*]] = icmp ult i32 [[TMP16]], [[TMP17]] +; CHECK-NEXT: [[SPEC_SELECT_3]] = select i1 [[CMP7_3]], i64 [[INC_2]], i64 [[SPEC_SELECT_2]] +; CHECK-NEXT: [[INC_3]] = add nuw i64 [[MIN_0_LCSSA]], 4 +; CHECK-NEXT: [[CMP3_3:%.*]] = icmp ult i64 [[INC_3]], [[TMP0]] +; CHECK-NEXT: br i1 [[CMP3_3]], label %[[FOR_BODY4]], label %[[FOR_END_LOOPEXIT_UNR_LCSSA:.*]] +; CHECK: [[FOR_END_LOOPEXIT_UNR_LCSSA]]: +; CHECK-NEXT: [[SPEC_SELECT_LCSSA_PH:%.*]] = phi i64 [ [[SPEC_SELECT_3]], %[[FOR_BODY4]] ] +; CHECK-NEXT: br label %[[FOR_END_LOOPEXIT]] +; CHECK: [[FOR_END_LOOPEXIT]]: +; CHECK-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i64 [ [[SPEC_SELECT_LCSSA_UNR]], %[[FOR_BODY4_PROL_LOOPEXIT]] ], [ [[SPEC_SELECT_LCSSA_PH]], %[[FOR_END_LOOPEXIT_UNR_LCSSA]] ] +; CHECK-NEXT: br label %[[FOR_END]] +; CHECK: [[FOR_END]]: +; CHECK-NEXT: [[MIN_0_LCSSA1:%.*]] = phi i64 [ [[BASE_022]], %[[FOR_BODY]] ], [ [[SPEC_SELECT_LCSSA]], %[[FOR_END_LOOPEXIT]] ] +; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 [[MIN_0_LCSSA1]] +; CHECK-NEXT: [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX10]], align 4 +; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 [[BASE_022]] +; CHECK-NEXT: [[TMP19:%.*]] = load i32, ptr [[ARRAYIDX12]], align 4 
+; CHECK-NEXT: store i32 [[TMP19]], ptr [[ARRAYIDX10]], align 4
+; CHECK-NEXT: store i32 [[TMP18]], ptr [[ARRAYIDX12]], align 4
 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[ADD]], [[TMP0]]
 ; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END18_LOOPEXIT:.*]], label %[[FOR_BODY]]
 ; CHECK: [[FOR_END18_LOOPEXIT]]:
@@ -96,3 +157,7 @@ for.end18: ; preds = %for.end, %entry
 }

 attributes #0 = { "tune-cpu"="generic" }
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.unroll.disable"}
+;.

From 4c50112ba1fb6b3847decebd6f1e374c61950be9 Mon Sep 17 00:00:00 2001
From: David Green
Date: Thu, 19 Sep 2024 08:50:43 +0100
Subject: [PATCH 168/321] [AArch64] Add patterns for 64bit vector addp

This extends the existing patterns for addp to 64bit outputs with a single
input. Whilst the general pattern is similar to the 128bit patterns
(add(uzp1(extract_lo, extract_hi), uzp2(extract_lo, extract_hi))), at the
late stage other optimizations have happened to turn the first uzp1 into
trunc and the second into extract(uzp2) with undef.

Fixes #109108
---
 llvm/lib/Target/AArch64/AArch64InstrInfo.td | 12 +++++++++
 llvm/test/CodeGen/AArch64/addp-shuffle.ll | 30 ++++++++++++++++++---
 2 files changed, 38 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index a47de9a12caca5..c040ef1862f21a 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -9634,6 +9634,18 @@
 def : Pat<(v16i8 (add (AArch64uzp1 (v16i8 FPR128:$Rn), (v16i8 FPR128:$Rm)),
 (AArch64uzp2 (v16i8 FPR128:$Rn), (v16i8 FPR128:$Rm)))),
 (v16i8 (ADDPv16i8 $Rn, $Rm))>;
+def : Pat<(v2i32 (add (AArch64zip1 (extract_subvector (v4i32 FPR128:$Rn), (i64 0)),
+ (extract_subvector (v4i32 FPR128:$Rn), (i64 2))),
+ (AArch64zip2 (extract_subvector (v4i32 FPR128:$Rn), (i64 0)),
+ (extract_subvector (v4i32 FPR128:$Rn), (i64 2))))),
+ (EXTRACT_SUBREG (ADDPv4i32 $Rn, $Rn), dsub)>;
+def : Pat<(v4i16 (add (trunc (v4i32 (bitconvert FPR128:$Rn))),
+ (extract_subvector (AArch64uzp2 (v8i16 FPR128:$Rn), undef), (i64 0)))),
+ (EXTRACT_SUBREG (ADDPv8i16 $Rn, $Rn), dsub)>;
+def : Pat<(v8i8 (add (trunc (v8i16 (bitconvert FPR128:$Rn))),
+ (extract_subvector (AArch64uzp2 (v16i8 FPR128:$Rn), undef), (i64 0)))),
+ (EXTRACT_SUBREG (ADDPv16i8 $Rn, $Rn), dsub)>;
+
 def : Pat<(v2f64 (fadd (AArch64zip1 (v2f64 FPR128:$Rn), (v2f64 FPR128:$Rm)),
 (AArch64zip2 (v2f64 FPR128:$Rn), (v2f64 FPR128:$Rm)))),
 (v2f64 (FADDPv2f64 $Rn, $Rm))>;
diff --git a/llvm/test/CodeGen/AArch64/addp-shuffle.ll b/llvm/test/CodeGen/AArch64/addp-shuffle.ll
index fb96d11acc275a..54c96820285d32 100644
--- a/llvm/test/CodeGen/AArch64/addp-shuffle.ll
+++ b/llvm/test/CodeGen/AArch64/addp-shuffle.ll
@@ -27,10 +27,8 @@ define <4 x i32> @deinterleave_shuffle_v8i32_c(<8 x i32> %a) {
 define <2 x i32> @deinterleave_shuffle_v4i32(<4 x i32> %a) {
 ; CHECK-LABEL: deinterleave_shuffle_v4i32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT: zip1 v2.2s, v0.2s, v1.2s
-; CHECK-NEXT: zip2 v0.2s, v0.2s, v1.2s
-; CHECK-NEXT: add v0.2s, v2.2s, v0.2s
+; CHECK-NEXT: addp v0.4s, v0.4s, v0.4s
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT: ret
 %r0 = shufflevector <4 x i32> %a, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
 %r1 = shufflevector <4 x i32> %a, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
 %o = add <2 x i32> %r0, %r1
@@ -49,6 +47,18 @@ define <8 x i16> @deinterleave_shuffle_v16i16(<16 x i16> %a) {
 ret <8 x i16> %o
 }

+define <4 x i16> @deinterleave_shuffle_v8i16(<8 x i16> %a) {
+; CHECK-LABEL: deinterleave_shuffle_v8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: addp v0.8h, v0.8h, v0.8h
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
+ %r0 = shufflevector <8 x i16> %a, <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %r1 = shufflevector <8 x i16> %a, <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ %o = add <4 x i16> %r0, %r1
+ ret <4 x i16> %o
+}
+
 define <16 x i8> @deinterleave_shuffle_v32i8(<32 x i8> %a) {
 ; CHECK-LABEL: deinterleave_shuffle_v32i8:
 ; CHECK: // %bb.0:
@@ -60,6 +70,18 @@ define <16 x i8> @deinterleave_shuffle_v32i8(<32 x i8> %a) {
 ret <16 x i8> %o
 }

+define <8 x i8> @deinterleave_shuffle_v16i8(<16 x i8> %a) {
+; CHECK-LABEL: deinterleave_shuffle_v16i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: addp v0.16b, v0.16b, v0.16b
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
+ %r0 = shufflevector <16 x i8> %a, <16 x i8> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ %r1 = shufflevector <16 x i8> %a, <16 x i8> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ %o = add <8 x i8> %r0, %r1
+ ret <8 x i8> %o
+}
+
 define <4 x i64> @deinterleave_shuffle_v8i64(<8 x i64> %a) {
 ; CHECK-LABEL: deinterleave_shuffle_v8i64:
 ; CHECK: // %bb.0:

From 7183771834d9035ffbedd8f1ff9233b16722b986 Mon Sep 17 00:00:00 2001
From: Nikita Popov
Date: Thu, 19 Sep 2024 09:59:36 +0200
Subject: [PATCH 169/321] [InitUndef] Also handle inline asm (#108951)

InitUndef should also handle early-clobber / undef conflicts in inline
asm operands. Do this by iterating over all_defs() instead of defs().

The newly added ARM test was generating an "unpredictable STXP
instruction, status is also a source" error prior to this change.

Fixes https://github.com/llvm/llvm-project/issues/106380.
---
 llvm/lib/CodeGen/InitUndef.cpp | 2 +-
 llvm/test/CodeGen/AArch64/arm64-ldxr-stxr.ll | 13 +++++++++++++
 llvm/test/CodeGen/Thumb2/mve-intrinsics/vcaddq.ll | 13 +++++++++++++
 3 files changed, 27 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/CodeGen/InitUndef.cpp b/llvm/lib/CodeGen/InitUndef.cpp
index 911e8bb7a4d9ef..d4ac131a32a959 100644
--- a/llvm/lib/CodeGen/InitUndef.cpp
+++ b/llvm/lib/CodeGen/InitUndef.cpp
@@ -98,7 +98,7 @@ INITIALIZE_PASS(InitUndef, DEBUG_TYPE, INIT_UNDEF_NAME, false, false)
 char &llvm::InitUndefID = InitUndef::ID;

 static bool isEarlyClobberMI(MachineInstr &MI) {
- return llvm::any_of(MI.defs(), [](const MachineOperand &DefMO) {
+ return llvm::any_of(MI.all_defs(), [](const MachineOperand &DefMO) {
 return DefMO.isReg() && DefMO.isEarlyClobber();
 });
 }
diff --git a/llvm/test/CodeGen/AArch64/arm64-ldxr-stxr.ll b/llvm/test/CodeGen/AArch64/arm64-ldxr-stxr.ll
index 4fb0c2775a7a7a..b498611242d469 100644
--- a/llvm/test/CodeGen/AArch64/arm64-ldxr-stxr.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-ldxr-stxr.ll
@@ -364,6 +364,19 @@ define dso_local i32 @test_stxp_undef(ptr %p, i64 %x) nounwind {
 ret i32 %res
 }

+; Same as previous test, but using inline asm.
+define dso_local i32 @test_stxp_undef_inline_asm(ptr %p, i64 %x) nounwind {
+; CHECK-LABEL: test_stxp_undef_inline_asm:
+; CHECK: // %bb.0:
+; CHECK-NEXT: //APP
+; CHECK-NEXT: stxp w8, x9, x1, [x0]
+; CHECK-NEXT: //NO_APP
+; CHECK-NEXT: mov w0, w8
+; CHECK-NEXT: ret
+ %res = call i32 asm sideeffect "stxp ${0:w}, ${2}, ${3}, [${1}]", "=&r,r,r,r,~{memory}"(ptr %p, i64 undef, i64 %x)
+ ret i32 %res
+}
+
 declare i32 @llvm.aarch64.stlxr.p0(i64, ptr) nounwind

 ;; NOTE: These prefixes are unused and the list is autogenerated.
Do not add tests below this line:
; FALLBACK: {{.*}}
diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vcaddq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vcaddq.ll
index 02234c63725360..322086829953dd 100644
--- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vcaddq.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vcaddq.ll
@@ -710,6 +710,19 @@ entry:
 ret <4 x i32> %0
 }

+define arm_aapcs_vfpcc <4 x i32> @test_vhcaddq_rot270_s32_undef_inline_asm() {
+; CHECK-LABEL: test_vhcaddq_rot270_s32_undef_inline_asm:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: @APP
+; CHECK-NEXT: vhcadd.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}, #270
+; CHECK-NOT: vhcadd.s32 q[[REG:[0-9]+]], q{{[0-9]+}}, q[[REG]], #270
+; CHECK-NEXT: @NO_APP
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call <4 x i32> asm sideeffect "vhcadd.s32 ${0}, ${1}, ${2}, #270", "=&w,w,w,~{memory}"(<4 x i32> undef, <4 x i32> undef)
+ ret <4 x i32> %0
+}
+
 define arm_aapcs_vfpcc <16 x i8> @test_vhcaddq_rot90_x_s8(<16 x i8> %a, <16 x i8> %b, i16 zeroext %p) {
 ; CHECK-LABEL: test_vhcaddq_rot90_x_s8:
 ; CHECK: @ %bb.0: @ %entry

From edc71e22c004d3b3dfc535f7917ea0b47a282ac8 Mon Sep 17 00:00:00 2001
From: Elvis Wang
Date: Thu, 19 Sep 2024 16:00:21 +0800
Subject: [PATCH 170/321] [RISCV][TTI] Add instruction cost for vp.load/store.
 (#109245)

This patch makes the instruction cost of vp.load/store the same as their
non-vp counterparts.
---
 .../Target/RISCV/RISCVTargetTransformInfo.cpp | 18 ++++++
 .../CostModel/RISCV/rvv-intrinsics.ll | 64 +++++++++----------
 2 files changed, 50 insertions(+), 32 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 5d280b44630aef..2c4aad68e1abd0 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -1048,6 +1048,24 @@ RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
 return getCmpSelInstrCost(*FOp, ICA.getArgTypes()[0], ICA.getReturnType(),
 UI->getPredicate(), CostKind);
 }
+ // vp load/store
+ case Intrinsic::vp_load:
+ case Intrinsic::vp_store: {
+ Intrinsic::ID IID = ICA.getID();
+ std::optional<unsigned> FOp = VPIntrinsic::getFunctionalOpcodeForVP(IID);
+ auto *UI = dyn_cast<VPIntrinsic>(ICA.getInst());
+
+ if (!UI)
+ break;
+ assert(FOp.has_value());
+ if (ICA.getID() == Intrinsic::vp_load)
+ return getMemoryOpCost(
+ *FOp, ICA.getReturnType(), UI->getPointerAlignment(),
+ UI->getOperand(0)->getType()->getPointerAddressSpace(), CostKind);
+ return getMemoryOpCost(
+ *FOp, ICA.getArgTypes()[0], UI->getPointerAlignment(),
+ UI->getOperand(1)->getType()->getPointerAddressSpace(), CostKind);
+ }
 }

 if (ST->hasVInstructions() && RetTy->isVectorTy()) {
diff --git a/llvm/test/Analysis/CostModel/RISCV/rvv-intrinsics.ll b/llvm/test/Analysis/CostModel/RISCV/rvv-intrinsics.ll
index 40aad95e715afd..6fe65d97d7448f 100644
--- a/llvm/test/Analysis/CostModel/RISCV/rvv-intrinsics.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/rvv-intrinsics.ll
@@ -852,37 +852,37 @@ define void @load() {
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; TYPEBASED-LABEL: 'load'
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %t0 = call <2 x i8> @llvm.vp.load.v2i8.p0(ptr undef, <2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t0 = call <2 x i8> @llvm.vp.load.v2i8.p0(ptr undef, <2 x i1> undef, i32 undef)
 ; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t1 = load <2 x i8>, ptr undef, align 2
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %t2 = call <4 x i8> @llvm.vp.load.v4i8.p0(ptr undef, <4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t2 = call <4 x i8> @llvm.vp.load.v4i8.p0(ptr undef, <4 x i1> undef, i32 undef)
 ; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t3 = load <4 x i8>, ptr undef, align 4
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %t4 = call <8 x i8> @llvm.vp.load.v8i8.p0(ptr undef, <8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t4 = call <8 x i8> @llvm.vp.load.v8i8.p0(ptr undef, <8 x i1> undef, i32 undef)
 ; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t5 = load <8 x i8>, ptr undef, align 8
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 110 for instruction: %t6 = call <16 x i8> @llvm.vp.load.v16i8.p0(ptr undef, <16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t6 = call <16 x i8> @llvm.vp.load.v16i8.p0(ptr undef, <16 x i1> undef, i32 undef)
 ; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t7 = load <16 x i8>, ptr undef, align 16
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %t8 = call <2 x i64> @llvm.vp.load.v2i64.p0(ptr undef, <2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t8 = call <2 x i64> @llvm.vp.load.v2i64.p0(ptr undef, <2 x i1> undef, i32 undef)
 ; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t9 = load <2 x i64>, ptr undef, align 16
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %t10 = call <4 x i64> @llvm.vp.load.v4i64.p0(ptr undef, <4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t10 = call <4 x i64> @llvm.vp.load.v4i64.p0(ptr undef, <4 x i1> undef, i32 undef)
 ; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t12 = load <4 x i64>, ptr undef, align 32
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %t13 = call <8 x i64> @llvm.vp.load.v8i64.p0(ptr undef, <8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t13 = call <8 x i64> @llvm.vp.load.v8i64.p0(ptr undef, <8 x i1> undef, i32 undef)
 ; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t14 = load <8 x i64>, ptr undef, align 64
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 110 for instruction: %t15 = call <16 x i64> @llvm.vp.load.v16i64.p0(ptr undef, <16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t15 = call <16 x i64> @llvm.vp.load.v16i64.p0(ptr undef, <16 x i1> undef, i32 undef)
 ; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t16 = load <16 x i64>, ptr undef, align 128
-; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %t17 = call <vscale x 2 x i8> @llvm.vp.load.nxv2i8.p0(ptr undef, <vscale x 2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t17 = call <vscale x 2 x i8> @llvm.vp.load.nxv2i8.p0(ptr undef, <vscale x 2 x i1> undef, i32 undef)
 ; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t18 = load <vscale x 2 x i8>, ptr undef, align 2
-; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %t19 = call <vscale x 4 x i8> @llvm.vp.load.nxv4i8.p0(ptr undef, <vscale x 4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t19 = call <vscale x 4 x i8> @llvm.vp.load.nxv4i8.p0(ptr undef, <vscale x 4 x i1> undef, i32 undef)
 ; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t20 = load <vscale x 4 x i8>, ptr undef, align 4
-; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %t21 = call <vscale x 8 x i8> @llvm.vp.load.nxv8i8.p0(ptr undef, <vscale x 8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t21 = call <vscale x 8 x i8> @llvm.vp.load.nxv8i8.p0(ptr undef, <vscale x 8 x i1> undef, i32 undef)
 ; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t22 = load <vscale x 8 x i8>, ptr undef, align 8
-; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %t23 = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr undef, <vscale x 16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t23 = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr undef, <vscale x 16 x i1> undef, i32 undef)
 ; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t24 = load <vscale x 16 x i8>, ptr undef, align 16
-; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %t25 = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr undef, <vscale x 2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t25 = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr undef, <vscale x 2 x i1> undef, i32 undef)
 ; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t26 = load <vscale x 2 x i64>, ptr undef, align 16
-; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %t27 = call <vscale x 4 x i64> @llvm.vp.load.nxv4i64.p0(ptr undef, <vscale x 4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t27 = call <vscale x 4 x i64> @llvm.vp.load.nxv4i64.p0(ptr undef, <vscale x 4 x i1> undef, i32 undef)
 ; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t28 = load <vscale x 4 x i64>, ptr undef, align 32
-; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %t29 = call <vscale x 8 x i64> @llvm.vp.load.nxv8i64.p0(ptr undef, <vscale x 8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t29 = call <vscale x 8 x i64> @llvm.vp.load.nxv8i64.p0(ptr undef, <vscale x 8 x i1> undef, i32 undef)
 ; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t30 = load <vscale x 8 x i64>, ptr undef, align 64
-; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %t31 = call <vscale x 16 x i64> @llvm.vp.load.nxv16i64.p0(ptr undef, <vscale x 16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t31 = call <vscale x 16 x i64> @llvm.vp.load.nxv16i64.p0(ptr undef, <vscale x 16 x i1> undef, i32 undef)
 ; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t32 = load <vscale x 16 x i64>, ptr undef, align 128
 ; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
@@ -958,37 +958,37 @@ define void @store() {
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; TYPEBASED-LABEL: 'store'
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.vp.store.v2i8.p0(<2 x i8> undef, ptr undef, <2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.vp.store.v2i8.p0(<2 x i8> undef, ptr undef, <2 x i1> undef, i32 undef)
 ; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x i8> undef, ptr undef, align 2
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.vp.store.v4i8.p0(<4 x i8> undef, ptr undef, <4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.vp.store.v4i8.p0(<4 x i8> undef, ptr undef, <4 x i1> undef, i32 undef)
 ; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x i8> undef, ptr undef, align 4
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 54 for instruction: call void @llvm.vp.store.v8i8.p0(<8 x i8> undef, ptr undef, <8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.vp.store.v8i8.p0(<8 x i8> undef, ptr undef, <8 x i1> undef, i32 undef)
 ; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <8 x i8> undef, ptr undef, align 8
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 110 for instruction: call void @llvm.vp.store.v16i8.p0(<16 x i8> undef, ptr undef, <16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.vp.store.v16i8.p0(<16 x i8> undef, ptr undef, <16 x i1> undef, i32 undef)
 ; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <16 x i8> undef, ptr undef, align 16
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.vp.store.v2i64.p0(<2 x i64> undef, ptr undef, <2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.vp.store.v2i64.p0(<2 x i64> undef, ptr undef, <2 x i1> undef, i32 undef)
 ; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x i64> undef, ptr undef, align 16
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.vp.store.v4i64.p0(<4 x i64> undef, ptr undef, <4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.vp.store.v4i64.p0(<4 x i64> undef, ptr undef, <4 x i1> undef, i32 undef)
 ; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <4 x i64> undef, ptr undef, align 32
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 54 for instruction: call void @llvm.vp.store.v8i64.p0(<8 x i64> undef, ptr undef, <8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.vp.store.v8i64.p0(<8 x i64> undef, ptr undef, <8 x i1> undef, i32 undef)
 ; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <8 x i64> undef, ptr undef, align 64
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 110 for instruction: call void @llvm.vp.store.v16i64.p0(<16 x i64> undef, ptr undef, <16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.vp.store.v16i64.p0(<16 x i64> undef, ptr undef, <16 x i1> undef, i32 undef)
 ; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: store <16 x i64> undef, ptr undef, align 128
-; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.vp.store.nxv2i8.p0(<vscale x 2 x i8> undef, ptr undef, <vscale x 2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.vp.store.nxv2i8.p0(<vscale x 2 x i8> undef, ptr undef, <vscale x 2 x i1> undef, i32 undef)
 ; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <vscale x 2 x i8> undef, ptr undef, align 2
-; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.vp.store.nxv4i8.p0(<vscale x 4 x i8> undef, ptr undef, <vscale x 4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.vp.store.nxv4i8.p0(<vscale x 4 x i8> undef, ptr undef, <vscale x 4 x i1> undef, i32 undef)
 ; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <vscale x 4 x i8> undef, ptr undef, align 4
-; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.vp.store.nxv8i8.p0(<vscale x 8 x i8> undef, ptr undef, <vscale x 8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.vp.store.nxv8i8.p0(<vscale x 8 x i8> undef, ptr undef, <vscale x 8 x i1> undef, i32 undef)
 ; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <vscale x 8 x i8> undef, ptr undef, align 8
-; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> undef, ptr undef, <vscale x 16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> undef, ptr undef, <vscale x 16 x i1> undef, i32 undef)
 ; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <vscale x 16 x i8> undef, ptr undef, align 16
-; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> undef, ptr undef, <vscale x 2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> undef, ptr undef, <vscale x 2 x i1> undef, i32 undef)
 ; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <vscale x 2 x i64> undef, ptr undef, align 16
-; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.vp.store.nxv4i64.p0(<vscale x 4 x i64> undef, ptr undef, <vscale x 4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.vp.store.nxv4i64.p0(<vscale x 4 x i64> undef, ptr undef, <vscale x 4 x i1> undef, i32 undef)
 ; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <vscale x 4 x i64> undef, ptr undef, align 32
-; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.vp.store.nxv8i64.p0(<vscale x 8 x i64> undef, ptr undef, <vscale x 8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.vp.store.nxv8i64.p0(<vscale x 8 x i64> undef, ptr undef, <vscale x 8 x i1> undef, i32 undef)
 ; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: store <vscale x 8 x i64> undef, ptr undef, align 64
-; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.vp.store.nxv16i64.p0(<vscale x 16 x i64> undef, ptr undef, <vscale x 16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.vp.store.nxv16i64.p0(<vscale x 16 x i64> undef, ptr undef, <vscale x 16 x i1> undef, i32 undef)
 ; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: store <vscale x 16 x i64> undef, ptr undef, align 128
 ; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;

From 0a3b6af768c95f8d8260aa28adbccabd60f7e9e5 Mon Sep 17 00:00:00 2001
From: Rainer Orth
Date: Thu, 19 Sep 2024 10:04:18 +0200
Subject: [PATCH 171/321] [ASan][test] Skip Linux/odr_c_test.c on SPARC
 (#109111)

When ASan testing is enabled on SPARC as per PR #107405, the
```
AddressSanitizer-sparc-linux :: TestCases/Linux/odr_c_test.c
```
test `FAIL`s on Linux/sparc64:
```
+ projects/compiler-rt/test/asan/SPARCLinuxConfig/TestCases/Linux/Output/odr_c_test.c.tmp
+ count 0
Expected 0 lines, got 13.
AddressSanitizer:DEADLYSIGNAL
=================================================================
==4165420==ERROR: AddressSanitizer: BUS on unknown address (pc 0x7012d5b4 bp 0xffa3b938 sp 0xffa3b8d0 T0)
==4165420==The signal is caused by a READ memory access.
==4165420==Hint: this fault was caused by a dereference of a high value address (see register values below). Disassemble the provided pc to learn which register was used.
```

The test relies on an unaligned access, which cannot work on a
strict-alignment target like SPARC. Thus this patch skips the test.

Tested on `sparc64-unknown-linux-gnu`.
---
 compiler-rt/test/asan/TestCases/Linux/odr_c_test.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/compiler-rt/test/asan/TestCases/Linux/odr_c_test.c b/compiler-rt/test/asan/TestCases/Linux/odr_c_test.c
index 9929b4a67af38f..4aafe6888c6843 100644
--- a/compiler-rt/test/asan/TestCases/Linux/odr_c_test.c
+++ b/compiler-rt/test/asan/TestCases/Linux/odr_c_test.c
@@ -10,6 +10,9 @@
 // RUN: %clang_asan -fcommon %s -fPIC -shared -mllvm -asan-use-private-alias=1 -o %dynamiclib2 -DFILE2
 // RUN: %run %t 2>&1 | count 0

+// Unaligned accesses don't work on strict-alignment targets like SPARC.
+// UNSUPPORTED: sparc-target-arch
+
 // CHECK: The following global variable is not properly aligned.
 // CHECK: ERROR: AddressSanitizer: odr-violation
 #if defined(FILE1)

From 752e10379c2ffb4f6eebf490f1fab7eb769dfbf6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Krist=C3=B3f=20Umann?=
Date: Thu, 19 Sep 2024 10:04:47 +0200
Subject: =?UTF-8?q?[analyzer]=20Explicitly=20register=20No?=
 =?UTF-8?q?StoreFuncVisitor=20from=20alpha.unix.cst=E2=80=A6=20(#108373)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

…ring.UninitRead

This is a drastic simplification of #106982. If you read that patch, this
is the same thing with all BugReporterVisitors.cpp and SValBuilder.cpp
changes removed! (since all replies came regarding changes to those files,
I felt the new PR was justified)

The patch was inspired by a pretty poor bug report on FFMpeg:

![image](https://github.com/user-attachments/assets/8f4e03d8-45a4-4ea2-a63d-3ab78d097be9)

In this bug report, block is uninitialized, hence the bug report that it
should not have been passed to memcpy. The confusing part is in line 93,
where block was passed as a non-const pointer to seq_unpack_rle_block,
which was obviously meant to initialize block. As developers, we know that
clang likely didn't skip this function and found a path of execution on
which this initialization failed, but NoStoreFuncVisitor failed to attach
the usual "returning without writing to block" message.

I fixed this by tracking the actual element which was found to be
uninitialized, instead of the entire array. (Remember, we heuristically
only check whether the first and last-to-access elements are initialized,
not the entire array.) This is how the bug report looks now, with
'seq_unpack_rle_block' having notes describing the path of execution and
lack of a value change:

![image](https://github.com/user-attachments/assets/8de5d101-052e-4ecb-9cd9-7c29724333d2)
![image](https://github.com/user-attachments/assets/8bf52a95-62de-44e7-aef8-03a46a3fa08e)

Since NoStoreFuncVisitor was a TU-local class, I moved it back to
BugReporterVisitors.h, and registered it manually in CStringChecker.cpp.
This was done because we don't have a good trackRegionValue() function,
only a trackExpressionValue() function. We have an expression for the
array, but not for its first (or last-to-access) element, so I only had a
MemRegion on hand.
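For checkers in a similar position, the registration boils down to the
following sketch (a minimal, hypothetical helper assuming only a
const MemRegion * is at hand and that the visitor is exported from
BugReporterVisitors.h as done in this patch):

```c++
#include "clang/StaticAnalyzer/Core/BugReporter/BugReporter.h"
#include "clang/StaticAnalyzer/Core/BugReporter/BugReporterVisitors.h"
#include "clang/StaticAnalyzer/Core/PathSensitive/MemRegion.h"
using namespace clang;
using namespace clang::ento;

// There is no trackRegionValue() counterpart to trackExpressionValue(), so
// when only a region is available, attach NoStoreFuncVisitor by hand.
static void trackNoStoreForRegion(PathSensitiveBugReport &Report,
                                  const MemRegion *R) {
  if (const auto *SR = dyn_cast_or_null<SubRegion>(R))
    Report.addVisitor<NoStoreFuncVisitor>(SR);
}
```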
---
 .../Core/BugReporter/BugReporterVisitors.h | 85 ++++++++++++++++++
 .../Core/PathSensitive/CheckerContext.h | 1 +
 .../StaticAnalyzer/Core/PathSensitive/SVals.h | 2 +
 .../Checkers/CStringChecker.cpp | 13 ++-
 .../Core/BugReporterVisitors.cpp | 89 ------------------------
 clang/lib/StaticAnalyzer/Core/SVals.cpp | 17 ++++
 .../test/Analysis/cstring-uninitread-notes.c | 25 ++++++
 7 files changed, 139 insertions(+), 93 deletions(-)
 create mode 100644 clang/test/Analysis/cstring-uninitread-notes.c

diff --git a/clang/include/clang/StaticAnalyzer/Core/BugReporter/BugReporterVisitors.h b/clang/include/clang/StaticAnalyzer/Core/BugReporter/BugReporterVisitors.h
index f97514955a5913..56f7ca63d00621 100644
--- a/clang/include/clang/StaticAnalyzer/Core/BugReporter/BugReporterVisitors.h
+++ b/clang/include/clang/StaticAnalyzer/Core/BugReporter/BugReporterVisitors.h
@@ -718,6 +718,91 @@ class NoStateChangeFuncVisitor : public BugReporterVisitor {
 PathSensitiveBugReport &R) final;
 };

+/// Put a diagnostic on return statement of all inlined functions
+/// for which the region of interest \p RegionOfInterest was passed into,
+/// but not written inside, and it has caused an undefined read or a null
+/// pointer dereference outside.
+class NoStoreFuncVisitor final : public NoStateChangeFuncVisitor {
+ const SubRegion *RegionOfInterest;
+ MemRegionManager &MmrMgr;
+ const SourceManager &SM;
+ const PrintingPolicy &PP;
+
+ /// Recursion limit for dereferencing fields when looking for the
+ /// region of interest.
+ /// The limit of two indicates that we will dereference fields only once.
+ static const unsigned DEREFERENCE_LIMIT = 2;
+
+ using RegionVector = SmallVector<const MemRegion *, 5>;
+
+public:
+ NoStoreFuncVisitor(
+ const SubRegion *R,
+ bugreporter::TrackingKind TKind = bugreporter::TrackingKind::Thorough)
+ : NoStateChangeFuncVisitor(TKind), RegionOfInterest(R),
+ MmrMgr(R->getMemRegionManager()),
+ SM(MmrMgr.getContext().getSourceManager()),
+ PP(MmrMgr.getContext().getPrintingPolicy()) {}
+
+ void Profile(llvm::FoldingSetNodeID &ID) const override {
+ static int Tag = 0;
+ ID.AddPointer(&Tag);
+ ID.AddPointer(RegionOfInterest);
+ }
+
+private:
+ /// \return Whether \c RegionOfInterest was modified at \p CurrN compared to
+ /// the value it holds in \p CallExitBeginN.
+ bool wasModifiedBeforeCallExit(const ExplodedNode *CurrN,
+ const ExplodedNode *CallExitBeginN) override;
+
+ /// Attempts to find the region of interest in a given record decl,
+ /// by either following the base classes or fields.
+ /// Dereferences fields up to a given recursion limit.
+ /// Note that \p Vec is passed by value, leading to quadratic copying cost,
+ /// but it's OK in practice since its length is limited to DEREFERENCE_LIMIT.
+ /// \return A chain of fields leading to the region of interest or std::nullopt.
+ const std::optional<RegionVector>
+ findRegionOfInterestInRecord(const RecordDecl *RD, ProgramStateRef State,
+ const MemRegion *R, const RegionVector &Vec = {},
+ int depth = 0);
+
+ // Region of interest corresponds to an IVar, exiting a method
+ // which could have written into that IVar, but did not.
+ PathDiagnosticPieceRef maybeEmitNoteForObjCSelf(PathSensitiveBugReport &R, + const ObjCMethodCall &Call, + const ExplodedNode *N) final; + + PathDiagnosticPieceRef maybeEmitNoteForCXXThis(PathSensitiveBugReport &R, + const CXXConstructorCall &Call, + const ExplodedNode *N) final; + + PathDiagnosticPieceRef + maybeEmitNoteForParameters(PathSensitiveBugReport &R, const CallEvent &Call, + const ExplodedNode *N) final; + + /// Consume the information on the no-store stack frame in order to + /// either emit a note or suppress the report entirely. + /// \return Diagnostics piece for region not modified in the current function, + /// if it decides to emit one. + PathDiagnosticPieceRef + maybeEmitNote(PathSensitiveBugReport &R, const CallEvent &Call, + const ExplodedNode *N, const RegionVector &FieldChain, + const MemRegion *MatchedRegion, StringRef FirstElement, + bool FirstIsReferenceType, unsigned IndirectionLevel); + + bool prettyPrintRegionName(const RegionVector &FieldChain, + const MemRegion *MatchedRegion, + StringRef FirstElement, bool FirstIsReferenceType, + unsigned IndirectionLevel, + llvm::raw_svector_ostream &os); + + StringRef prettyPrintFirstElement(StringRef FirstElement, + bool MoreItemsExpected, + int IndirectionLevel, + llvm::raw_svector_ostream &os); +}; + } // namespace ento } // namespace clang diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h index 0365f9e41312df..168983fd5cb686 100644 --- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h +++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h @@ -69,6 +69,7 @@ class CheckerContext { /// the state of the program before the checker ran. Note, checkers should /// not retain the node in their state since the nodes might get invalidated. ExplodedNode *getPredecessor() { return Pred; } + const ProgramPoint getLocation() const { return Location; } const ProgramStateRef &getState() const { return Pred->getState(); } /// Check if the checker changed the state of the execution; ex: added diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SVals.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SVals.h index def2970d448d48..a054a819a15a85 100644 --- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SVals.h +++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SVals.h @@ -89,6 +89,8 @@ class SVal { SValKind getKind() const { return Kind; } + StringRef getKindStr() const; + // This method is required for using SVal in a FoldingSetNode. It // extracts a unique signature for this SVal object. 
 void Profile(llvm::FoldingSetNodeID &ID) const {
diff --git a/clang/lib/StaticAnalyzer/Checkers/CStringChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/CStringChecker.cpp
index 8dd08f14b2728b..21a2d8828249d1 100644
--- a/clang/lib/StaticAnalyzer/Checkers/CStringChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/CStringChecker.cpp
@@ -16,6 +16,7 @@
 #include "clang/Basic/Builtins.h"
 #include "clang/Basic/CharInfo.h"
 #include "clang/StaticAnalyzer/Checkers/BuiltinCheckerRegistration.h"
+#include "clang/StaticAnalyzer/Core/BugReporter/BugReporterVisitors.h"
 #include "clang/StaticAnalyzer/Core/BugReporter/BugType.h"
 #include "clang/StaticAnalyzer/Core/Checker.h"
 #include "clang/StaticAnalyzer/Core/CheckerManager.h"
@@ -337,7 +338,8 @@ class CStringChecker : public Checker< eval::Call,
 const Stmt *S, StringRef WarningMsg) const;
 void emitAdditionOverflowBug(CheckerContext &C, ProgramStateRef State) const;
 void emitUninitializedReadBug(CheckerContext &C, ProgramStateRef State,
- const Expr *E, StringRef Msg) const;
+ const Expr *E, const MemRegion *R,
+ StringRef Msg) const;
 ProgramStateRef checkAdditionOverflow(CheckerContext &C,
 ProgramStateRef state,
 NonLoc left,
@@ -474,7 +476,8 @@ ProgramStateRef CStringChecker::checkInit(CheckerContext &C,
 OS << "The first element of the ";
 printIdxWithOrdinalSuffix(OS, Buffer.ArgumentIndex + 1);
 OS << " argument is undefined";
- emitUninitializedReadBug(C, State, Buffer.Expression, OS.str());
+ emitUninitializedReadBug(C, State, Buffer.Expression,
+ FirstElementVal->getAsRegion(), OS.str());
 return nullptr;
 }
@@ -538,7 +541,8 @@ ProgramStateRef CStringChecker::checkInit(CheckerContext &C,
 OS << ") in the ";
 printIdxWithOrdinalSuffix(OS, Buffer.ArgumentIndex + 1);
 OS << " argument is undefined";
- emitUninitializedReadBug(C, State, Buffer.Expression, OS.str());
+ emitUninitializedReadBug(C, State, Buffer.Expression,
+ LastElementVal.getAsRegion(), OS.str());
 return nullptr;
 }
 return State;
@@ -818,7 +822,7 @@ void CStringChecker::emitNullArgBug(CheckerContext &C, ProgramStateRef State,

 void CStringChecker::emitUninitializedReadBug(CheckerContext &C,
 ProgramStateRef State,
- const Expr *E,
+ const Expr *E, const MemRegion *R,
 StringRef Msg) const {
 if (ExplodedNode *N = C.generateErrorNode(State)) {
 if (!BT_UninitRead)
@@ -831,6 +835,7 @@ void CStringChecker::emitUninitializedReadBug(CheckerContext &C,
 Report->getLocation());
 Report->addRange(E->getSourceRange());
 bugreporter::trackExpressionValue(N, E, *Report);
+ Report->addVisitor<NoStoreFuncVisitor>(R->castAs<SubRegion>());
 C.emitReport(std::move(Report));
 }
 }
diff --git a/clang/lib/StaticAnalyzer/Core/BugReporterVisitors.cpp b/clang/lib/StaticAnalyzer/Core/BugReporterVisitors.cpp
index 7102bf51a57e8b..68c8a8dc682507 100644
--- a/clang/lib/StaticAnalyzer/Core/BugReporterVisitors.cpp
+++ b/clang/lib/StaticAnalyzer/Core/BugReporterVisitors.cpp
@@ -522,95 +522,6 @@ PathDiagnosticPieceRef NoStateChangeFuncVisitor::VisitNode(
 return maybeEmitNoteForParameters(R, *Call, N);
 }

-//===----------------------------------------------------------------------===//
-// Implementation of NoStoreFuncVisitor.
-//===----------------------------------------------------------------------===//
-
-namespace {
-/// Put a diagnostic on return statement of all inlined functions
-/// for which the region of interest \p RegionOfInterest was passed into,
-/// but not written inside, and it has caused an undefined read or a null
-/// pointer dereference outside.
-class NoStoreFuncVisitor final : public NoStateChangeFuncVisitor {
- const SubRegion *RegionOfInterest;
- MemRegionManager &MmrMgr;
- const SourceManager &SM;
- const PrintingPolicy &PP;
-
- /// Recursion limit for dereferencing fields when looking for the
- /// region of interest.
- /// The limit of two indicates that we will dereference fields only once.
- static const unsigned DEREFERENCE_LIMIT = 2;
-
- using RegionVector = SmallVector<const MemRegion *, 5>;
-
-public:
- NoStoreFuncVisitor(const SubRegion *R, bugreporter::TrackingKind TKind)
- : NoStateChangeFuncVisitor(TKind), RegionOfInterest(R),
- MmrMgr(R->getMemRegionManager()),
- SM(MmrMgr.getContext().getSourceManager()),
- PP(MmrMgr.getContext().getPrintingPolicy()) {}
-
- void Profile(llvm::FoldingSetNodeID &ID) const override {
- static int Tag = 0;
- ID.AddPointer(&Tag);
- ID.AddPointer(RegionOfInterest);
- }
-
-private:
- /// \return Whether \c RegionOfInterest was modified at \p CurrN compared to
- /// the value it holds in \p CallExitBeginN.
- bool wasModifiedBeforeCallExit(const ExplodedNode *CurrN,
- const ExplodedNode *CallExitBeginN) override;
-
- /// Attempts to find the region of interest in a given record decl,
- /// by either following the base classes or fields.
- /// Dereferences fields up to a given recursion limit.
- /// Note that \p Vec is passed by value, leading to quadratic copying cost,
- /// but it's OK in practice since its length is limited to DEREFERENCE_LIMIT.
- /// \return A chain fields leading to the region of interest or std::nullopt.
- const std::optional<RegionVector>
- findRegionOfInterestInRecord(const RecordDecl *RD, ProgramStateRef State,
- const MemRegion *R, const RegionVector &Vec = {},
- int depth = 0);
-
- // Region of interest corresponds to an IVar, exiting a method
- // which could have written into that IVar, but did not.
- PathDiagnosticPieceRef maybeEmitNoteForObjCSelf(PathSensitiveBugReport &R,
- const ObjCMethodCall &Call,
- const ExplodedNode *N) final;
-
- PathDiagnosticPieceRef maybeEmitNoteForCXXThis(PathSensitiveBugReport &R,
- const CXXConstructorCall &Call,
- const ExplodedNode *N) final;
-
- PathDiagnosticPieceRef
- maybeEmitNoteForParameters(PathSensitiveBugReport &R, const CallEvent &Call,
- const ExplodedNode *N) final;
-
- /// Consume the information on the no-store stack frame in order to
- /// either emit a note or suppress the report enirely.
- /// \return Diagnostics piece for region not modified in the current function,
- /// if it decides to emit one.
- PathDiagnosticPieceRef
- maybeEmitNote(PathSensitiveBugReport &R, const CallEvent &Call,
- const ExplodedNode *N, const RegionVector &FieldChain,
- const MemRegion *MatchedRegion, StringRef FirstElement,
- bool FirstIsReferenceType, unsigned IndirectionLevel);
-
- bool prettyPrintRegionName(const RegionVector &FieldChain,
- const MemRegion *MatchedRegion,
- StringRef FirstElement, bool FirstIsReferenceType,
- unsigned IndirectionLevel,
- llvm::raw_svector_ostream &os);
-
- StringRef prettyPrintFirstElement(StringRef FirstElement,
- bool MoreItemsExpected,
- int IndirectionLevel,
- llvm::raw_svector_ostream &os);
-};
-} // namespace
-
 /// \return Whether the method declaration \p Parent
 /// syntactically has a binary operation writing into the ivar \p Ivar.
static bool potentiallyWritesIntoIvar(const Decl *Parent, diff --git a/clang/lib/StaticAnalyzer/Core/SVals.cpp b/clang/lib/StaticAnalyzer/Core/SVals.cpp index 291e4fa752a8f7..84e7e033404c03 100644 --- a/clang/lib/StaticAnalyzer/Core/SVals.cpp +++ b/clang/lib/StaticAnalyzer/Core/SVals.cpp @@ -263,6 +263,23 @@ bool SVal::isZeroConstant() const { // Pretty-Printing. //===----------------------------------------------------------------------===// +StringRef SVal::getKindStr() const { + switch (getKind()) { +#define BASIC_SVAL(Id, Parent) \ + case Id##Kind: \ + return #Id; +#define LOC_SVAL(Id, Parent) \ + case Loc##Id##Kind: \ + return #Id; +#define NONLOC_SVAL(Id, Parent) \ + case NonLoc##Id##Kind: \ + return #Id; +#include "clang/StaticAnalyzer/Core/PathSensitive/SVals.def" +#undef REGION + } + llvm_unreachable("Unknown kind!"); +} + LLVM_DUMP_METHOD void SVal::dump() const { dumpToStream(llvm::errs()); } void SVal::printJson(raw_ostream &Out, bool AddQuotes) const { diff --git a/clang/test/Analysis/cstring-uninitread-notes.c b/clang/test/Analysis/cstring-uninitread-notes.c new file mode 100644 index 00000000000000..b62519a85c8cc9 --- /dev/null +++ b/clang/test/Analysis/cstring-uninitread-notes.c @@ -0,0 +1,25 @@ +// RUN: %clang_analyze_cc1 -verify %s \ +// RUN: -analyzer-checker=core,alpha.unix.cstring \ +// RUN: -analyzer-output=text + +#include "Inputs/system-header-simulator.h" + +// Inspired by a report on ffmpeg, libavcodec/tiertexseqv.c, seq_decode_op1(). +int coin(); + +void maybeWrite(const char *src, unsigned size, int *dst) { + if (coin()) // expected-note{{Assuming the condition is false}} + // expected-note@-1{{Taking false branch}} + memcpy(dst, src, size); +} // expected-note{{Returning without writing to '*dst'}} + +void returning_without_writing_to_memcpy(const char *src, unsigned size) { + int block[8 * 8]; // expected-note{{'block' initialized here}} + // expected-note@+1{{Calling 'maybeWrite'}} + maybeWrite(src, size, block); // expected-note{{Returning from 'maybeWrite'}} + + int buf[8 * 8]; + memcpy(buf, &block[0], 8); // expected-warning{{The first element of the 2nd argument is undefined [alpha.unix.cstring.UninitializedRead]}} + // expected-note@-1{{The first element of the 2nd argument is undefined}} + // expected-note@-2{{Other elements might also be undefined}} +}
From 30cdf1e959d2e4dee1c871ff37470dcdb7e8d099 Mon Sep 17 00:00:00 2001
From: Nikita Popov
Date: Thu, 19 Sep 2024 10:19:15 +0200
Subject: [PATCH 173/321] [SimplifyCFG] Pass context instruction to isSafeToSpeculativelyExecute() (#109132)

Pass speculation target and assumption cache to isSafeToSpeculativelyExecute() calls. This allows speculating based on dereferenceable/align assumptions, but the primary motivation here is to avoid regressions from planned changes to fix https://github.com/llvm/llvm-project/issues/108854.

--- llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 32 +++++++++---------- .../SimplifyCFG/speculate-derefable-load.ll | 19 +++-------- 2 files changed, 20 insertions(+), 31 deletions(-) diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index 09461e65e2dc21..69c4475a494cbe 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -398,9 +398,6 @@ static void addPredecessorToBlock(BasicBlock *Succ, BasicBlock *NewPred, /// expensive.
static InstructionCost computeSpeculationCost(const User *I, const TargetTransformInfo &TTI) { - assert((!isa<Instruction>(I) || - isSafeToSpeculativelyExecute(cast<Instruction>(I))) && - "Instruction is not safe to speculatively execute!"); return TTI.getInstructionCost(I, TargetTransformInfo::TCK_SizeAndLatency); } @@ -421,12 +418,11 @@ static InstructionCost computeSpeculationCost(const User *I, /// After this function returns, Cost is increased by the cost of /// V plus its non-dominating operands. If that cost is greater than /// Budget, false is returned and Cost is undefined. -static bool dominatesMergePoint(Value *V, BasicBlock *BB, +static bool dominatesMergePoint(Value *V, BasicBlock *BB, Instruction *InsertPt, SmallPtrSetImpl<Instruction *> &AggressiveInsts, - InstructionCost &Cost, - InstructionCost Budget, + InstructionCost &Cost, InstructionCost Budget, const TargetTransformInfo &TTI, - unsigned Depth = 0) { + AssumptionCache *AC, unsigned Depth = 0) { // It is possible to hit a zero-cost cycle (phi/gep instructions for example), // so limit the recursion depth. // TODO: While this recursion limit does prevent pathological behavior, it @@ -461,7 +457,7 @@ static bool dominatesMergePoint(Value *V, BasicBlock *BB, // Okay, it looks like the instruction IS in the "condition". Check to // see if it's a cheap instruction to unconditionally compute, and if it // only uses stuff defined outside of the condition. If so, hoist it out. - if (!isSafeToSpeculativelyExecute(I)) + if (!isSafeToSpeculativelyExecute(I, InsertPt, AC)) return false; Cost += computeSpeculationCost(I, TTI); @@ -480,8 +476,8 @@ static bool dominatesMergePoint(Value *V, BasicBlock *BB, // Okay, we can only really hoist these out if their operands do // not take us over the cost threshold. for (Use &Op : I->operands()) - if (!dominatesMergePoint(Op, BB, AggressiveInsts, Cost, Budget, TTI, - Depth + 1)) + if (!dominatesMergePoint(Op, BB, InsertPt, AggressiveInsts, Cost, Budget, + TTI, AC, Depth + 1)) return false; // Okay, it's safe to do this! Remember this instruction. AggressiveInsts.insert(I); @@ -3140,7 +3136,8 @@ bool SimplifyCFGOpt::speculativelyExecuteBB(BranchInst *BI, return false; // Don't hoist the instruction if it's unsafe or expensive. - if (!IsSafeCheapLoadStore && !isSafeToSpeculativelyExecute(&I) && + if (!IsSafeCheapLoadStore && + !isSafeToSpeculativelyExecute(&I, BI, Options.AC) && !(HoistCondStores && !SpeculatedStoreValue && (SpeculatedStoreValue = isSafeToSpeculateStore(&I, BB, ThenBB, EndBB)))) @@ -3651,7 +3648,8 @@ static bool foldCondBranchOnValueKnownInPredecessor(BranchInst *BI, /// Given a BB that starts with the specified two-entry PHI node, /// see if we can eliminate it. static bool foldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI, - DomTreeUpdater *DTU, const DataLayout &DL, + DomTreeUpdater *DTU, AssumptionCache *AC, + const DataLayout &DL, bool SpeculateUnpredictables) { // Ok, this is a two entry PHI node. Check to see if this is a simple "if // statement", which has a very simple dominance structure.
Basically, we @@ -3741,10 +3739,10 @@ static bool foldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI, continue; } - if (!dominatesMergePoint(PN->getIncomingValue(0), BB, AggressiveInsts, - Cost, Budget, TTI) || - !dominatesMergePoint(PN->getIncomingValue(1), BB, AggressiveInsts, - Cost, Budget, TTI)) + if (!dominatesMergePoint(PN->getIncomingValue(0), BB, DomBI, + AggressiveInsts, Cost, Budget, TTI, AC) || + !dominatesMergePoint(PN->getIncomingValue(1), BB, DomBI, + AggressiveInsts, Cost, Budget, TTI, AC)) return Changed; } @@ -8116,7 +8114,7 @@ bool SimplifyCFGOpt::simplifyOnce(BasicBlock *BB) { // eliminate it, do so now. if (auto *PN = dyn_cast<PHINode>(BB->begin())) if (PN->getNumIncomingValues() == 2) - if (foldTwoEntryPHINode(PN, TTI, DTU, DL, + if (foldTwoEntryPHINode(PN, TTI, DTU, Options.AC, DL, Options.SpeculateUnpredictables)) return true; } diff --git a/llvm/test/Transforms/SimplifyCFG/speculate-derefable-load.ll b/llvm/test/Transforms/SimplifyCFG/speculate-derefable-load.ll index 9e3f333018e680..8c7afa4598bd4b 100644 --- a/llvm/test/Transforms/SimplifyCFG/speculate-derefable-load.ll +++ b/llvm/test/Transforms/SimplifyCFG/speculate-derefable-load.ll @@ -4,14 +4,10 @@ define i64 @align_deref_align(i1 %c, ptr %p) { ; CHECK-LABEL: define i64 @align_deref_align( ; CHECK-SAME: i1 [[C:%.*]], ptr [[P:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[P]], i64 8), "align"(ptr [[P]], i64 8) ] -; CHECK-NEXT: br i1 [[C]], label %[[IF:.*]], label %[[EXIT:.*]] -; CHECK: [[IF]]: ; CHECK-NEXT: [[V:%.*]] = load i64, ptr [[P]], align 8 -; CHECK-NEXT: br label %[[EXIT]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[V]], %[[IF]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[RES:%.*]] = select i1 [[C]], i64 [[V]], i64 0 ; CHECK-NEXT: ret i64 [[RES]] ; entry: @@ -30,17 +26,12 @@ exit: define i64 @assume_deref_align2(i1 %c1, i32 %x, ptr %p) { ; CHECK-LABEL: define i64 @assume_deref_align2( ; CHECK-SAME: i1 [[C1:%.*]], i32 [[X:%.*]], ptr [[P:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[P]], i64 8), "align"(ptr [[P]], i64 8) ] -; CHECK-NEXT: br i1 [[C1]], label %[[IF1:.*]], label %[[EXIT:.*]] -; CHECK: [[IF1]]: ; CHECK-NEXT: [[C2:%.*]] = icmp ugt i32 [[X]], 10 -; CHECK-NEXT: br i1 [[C2]], label %[[IF2:.*]], label %[[EXIT]] -; CHECK: [[IF2]]: ; CHECK-NEXT: [[V:%.*]] = load i64, ptr [[P]], align 8 -; CHECK-NEXT: br label %[[EXIT]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[V]], %[[IF2]] ], [ 1, %[[IF1]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[C2]], i64 [[V]], i64 1 +; CHECK-NEXT: [[RES:%.*]] = select i1 [[C1]], i64 [[SPEC_SELECT]], i64 0 ; CHECK-NEXT: ret i64 [[RES]] ; entry:
From 959448fbd6bc6f74fb3f9655b1387d0e8a272ab8 Mon Sep 17 00:00:00 2001
From: yonghong-song
Date: Thu, 19 Sep 2024 10:21:58 +0200
Subject: [PATCH 174/321] [Transforms][IPO] Add func suffix in ArgumentPromotion and DeadArgume… (#105742)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

…ntElimination

ArgumentPromotion and DeadArgumentElimination passes could change function signatures but the function name remains the same as before the transformation. This makes tracing with bpf programs hard, since users tend to refer to the function signature as written in the source. See discussion [1] for details.
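
To make the failure mode concrete, here is a minimal sketch (hypothetical IR, not taken from the kernel build below). Eliminating the dead argument %b rewrites the signature of @foo but, before this patch, keeps its name, so a bpf probe written against the two-argument foo from the source silently attaches to a function with a different signature:

```
; Before DeadArgumentElimination: %b is never used.
define internal i32 @foo(i32 %a, i32 %b) {
  ret i32 %a
}

; After the pass the name stays @foo even though the signature is now
; (i32 %a) only; with this patch the function is instead renamed to
; @foo.argelim, making the changed signature visible to tracers.
```
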
This patch adds a suffix to functions whose signatures are changed. The suffix lets users know that the function signature has changed and that they need to inspect the IR or binary to find the modified signature before tracing those functions. The suffix for ArgumentPromotion is ".argprom" and the suffixes for DeadArgumentElimination are ".argelim" and ".retelim". The suffix also gives users a hint about what kind of transformation has been done.

With this patch, I built a recent Linux kernel with full LTO enabled. I got 4 functions with argpromotion like
```
set_track_update.argelim.argprom
pmd_trans_huge_lock.argprom
...
```
I got 1058 functions with only deadargelim like
```
process_bit0.argelim
pci_io_ecs_init.argelim
...
```
I got 3 functions with both argpromotion and deadargelim
```
set_track_update.argelim.argprom
zero_pud_populate.argelim.argprom
zero_pmd_populate.argelim.argprom
```

[1] https://github.com/llvm/llvm-project/issues/104678

--- llvm/lib/Transforms/IPO/ArgumentPromotion.cpp | 1 + .../IPO/DeadArgumentElimination.cpp | 4 +++ .../remove-dead-function-spurious-ref-edge.ll | 4 +-- llvm/test/BugPoint/remove_arguments_test.ll | 2 +- llvm/test/CodeGen/AArch64/arg_promotion.ll | 16 +++++----- llvm/test/CodeGen/AMDGPU/internalize.ll | 2 +- .../ThinLTO/X86/memprof-aliased-location1.ll | 24 +++++++------- .../ThinLTO/X86/memprof-aliased-location2.ll | 24 +++++++------- llvm/test/ThinLTO/X86/memprof-basic.ll | 19 ++++++++++- .../X86/memprof-duplicate-context-ids.ll | 14 +++++++- .../ThinLTO/X86/memprof-funcassigncloning.ll | 19 ++++++++++- llvm/test/ThinLTO/X86/memprof-indirectcall.ll | 15 ++++++++- llvm/test/ThinLTO/X86/memprof-inlined.ll | 15 ++++++++- .../2008-02-01-ReturnAttrs.ll | 4 +-- .../ArgumentPromotion/BPF/argpromotion.ll | 2 +- .../ArgumentPromotion/X86/attributes.ll | 4 +-- .../X86/min-legal-vector-width.ll | 32 +++++++++---------- .../ArgumentPromotion/X86/thiscall.ll | 4 +-- .../ArgumentPromotion/actual-arguments.ll | 10 +++--- .../aggregate-promote-dead-gep.ll | 4 +-- .../ArgumentPromotion/aggregate-promote.ll | 4 +-- .../Transforms/ArgumentPromotion/align.ll | 16 +++++----- .../Transforms/ArgumentPromotion/allocsize.ll | 16 +++++----- .../Transforms/ArgumentPromotion/attrs.ll | 4 +-- .../Transforms/ArgumentPromotion/basictest.ll | 8 ++--- .../Transforms/ArgumentPromotion/bitcasts.ll | 8 ++--- .../Transforms/ArgumentPromotion/byval-2.ll | 4 +-- .../ArgumentPromotion/byval-with-padding.ll | 4 +-- .../Transforms/ArgumentPromotion/byval.ll | 20 ++++++------ .../Transforms/ArgumentPromotion/chained.ll | 4 +-- .../ArgumentPromotion/control-flow2.ll | 4 +-- .../Transforms/ArgumentPromotion/crash.ll | 2 +- llvm/test/Transforms/ArgumentPromotion/dbg.ll | 4 +-- .../test/Transforms/ArgumentPromotion/fp80.ll | 12 +++---- .../Transforms/ArgumentPromotion/inalloca.ll | 4 +-- .../ArgumentPromotion/invalidation.ll | 6 ++-- ...lignment-value-overflows-addrspace-size.ll | 8 ++--- .../ArgumentPromotion/max-elements-limit.ll | 4 +-- .../Transforms/ArgumentPromotion/metadata.ll | 8 ++--- .../min-legal-vector-width.ll | 4 +-- .../nonzero-address-spaces.ll | 4 +-- .../ArgumentPromotion/opaque-ptr.ll | 4 +-- .../Transforms/ArgumentPromotion/pr27568.ll | 4 +-- .../Transforms/ArgumentPromotion/pr32917.ll | 4 +-- .../pr33641_remove_arg_dbgvalue.ll | 2 +- .../Transforms/ArgumentPromotion/profile.ll | 4 +-- .../propagate-remove-dead-args.ll | 18 +++++------ .../recursion/aggregate-promote-recursive.ll | 6 ++-- .../argpromotion-recursion-pr1259.ll | 8 ++--- .../recursion/recursion-mixed-calls.ll | 12
+++---- .../recursion/recursion-non-zero-offset.ll | 8 ++--- .../ArgumentPromotion/reserve-tbaa.ll | 4 +-- .../test/Transforms/ArgumentPromotion/sret.ll | 4 +-- .../ArgumentPromotion/store-into-inself.ll | 4 +-- .../ArgumentPromotion/unused-argument.ll | 8 ++--- ...r_cached_analysis_for_deleted_functions.ll | 4 +-- .../DeadArgElim/2007-02-07-FuncRename.ll | 2 +- .../DeadArgElim/2007-12-20-ParamAttrs.ll | 4 +-- .../DeadArgElim/2010-04-30-DbgInfo.ll | 4 +-- .../test/Transforms/DeadArgElim/aggregates.ll | 10 +++--- .../Transforms/DeadArgElim/call_profile.ll | 4 +-- llvm/test/Transforms/DeadArgElim/comdat.ll | 2 +- .../dbginfo-update-dbgval-local.ll | 6 ++-- llvm/test/Transforms/DeadArgElim/dbginfo.ll | 2 +- .../test/Transforms/DeadArgElim/deadretval.ll | 4 +-- llvm/test/Transforms/DeadArgElim/fct_ptr.ll | 2 +- .../Transforms/DeadArgElim/func_metadata.ll | 4 +-- llvm/test/Transforms/DeadArgElim/funclet.ll | 2 +- llvm/test/Transforms/DeadArgElim/keepalive.ll | 4 +-- .../DeadArgElim/nonzero-address-spaces.ll | 4 +-- llvm/test/Transforms/DeadArgElim/returned.ll | 10 +++--- .../Transforms/DeadArgElim/variadic_safety.ll | 2 +- .../function-specialization2.ll | 12 +++---- .../global-var-constants.ll | 14 ++++---- .../non-argument-tracked.ll | 24 +++++++------- .../specialization-order.ll | 12 +++---- llvm/test/Transforms/PhaseOrdering/dae-dce.ll | 6 ++-- .../dce-after-argument-promotion.ll | 4 +-- llvm/test/Transforms/SCCP/recursion.ll | 6 ++-- 79 files changed, 342 insertions(+), 263 deletions(-) diff --git a/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp b/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp index 1f9b546ed29996..c8b75dd475ae44 100644 --- a/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp +++ b/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp @@ -215,6 +215,7 @@ doPromotion(Function *F, FunctionAnalysisManager &FAM, F->getParent()->getFunctionList().insert(F->getIterator(), NF); NF->takeName(F); + NF->setName(NF->getName() + ".argprom"); // Loop over all the callers of the function, transforming the call sites to // pass in the loaded pointers. diff --git a/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp b/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp index d1548592b1ce26..b912cc66d19db5 100644 --- a/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp +++ b/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp @@ -889,6 +889,10 @@ bool DeadArgumentEliminationPass::removeDeadStuffFromFunction(Function *F) { // it again. 
F->getParent()->getFunctionList().insert(F->getIterator(), NF); NF->takeName(F); + if (NumArgumentsEliminated) + NF->setName(NF->getName() + ".argelim"); + else + NF->setName(NF->getName() + ".retelim"); NF->IsNewDbgInfoFormat = F->IsNewDbgInfoFormat; // Loop over all the callers of the function, transforming the call sites to diff --git a/llvm/test/Analysis/LazyCallGraph/remove-dead-function-spurious-ref-edge.ll b/llvm/test/Analysis/LazyCallGraph/remove-dead-function-spurious-ref-edge.ll index 2bc486f541c71f..4f16c02b1473ff 100644 --- a/llvm/test/Analysis/LazyCallGraph/remove-dead-function-spurious-ref-edge.ll +++ b/llvm/test/Analysis/LazyCallGraph/remove-dead-function-spurious-ref-edge.ll @@ -9,7 +9,7 @@ define internal void @a() alwaysinline { } define internal void @b(ptr) noinline { -; CHECK-LABEL: @b( +; CHECK-LABEL: @b.argprom( ; CHECK-NEXT: ret void ; ret void @@ -17,7 +17,7 @@ define internal void @b(ptr) noinline { define internal void @c() noinline { ; CHECK-LABEL: @c( -; CHECK-NEXT: call void @b() +; CHECK-NEXT: call void @b.argprom() ; CHECK-NEXT: ret void ; call void @b(ptr @a) diff --git a/llvm/test/BugPoint/remove_arguments_test.ll b/llvm/test/BugPoint/remove_arguments_test.ll index 9e9c51eaafc383..bb93e45e4b46ef 100644 --- a/llvm/test/BugPoint/remove_arguments_test.ll +++ b/llvm/test/BugPoint/remove_arguments_test.ll @@ -11,7 +11,7 @@ declare i32 @test2() -; CHECK: define void @test() { +; CHECK: define void @test.argelim() { define i32 @test(i32 %A, ptr %B, float %C) { call i32 @test2() ret i32 %1 diff --git a/llvm/test/CodeGen/AArch64/arg_promotion.ll b/llvm/test/CodeGen/AArch64/arg_promotion.ll index cc37d230c6cbe4..724a7f109f1e29 100644 --- a/llvm/test/CodeGen/AArch64/arg_promotion.ll +++ b/llvm/test/CodeGen/AArch64/arg_promotion.ll @@ -38,16 +38,16 @@ define dso_local void @caller_4xi32(ptr noalias %src, ptr noalias %dst) #1 { ; CHECK-LABEL: define dso_local void @caller_4xi32( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[SRC_VAL:%.*]] = load <4 x i32>, ptr [[SRC:%.*]], align 16 -; CHECK-NEXT: call fastcc void @callee_4xi32(<4 x i32> [[SRC_VAL]], ptr noalias [[DST:%.*]]) +; CHECK-NEXT: call fastcc void @callee_4xi32.argprom.argprom(<4 x i32> [[SRC_VAL]], ptr noalias [[DST:%.*]]) ; CHECK-NEXT: ret void ; entry: - call fastcc void @callee_4xi32(ptr noalias %src, ptr noalias %dst) + call fastcc void @callee_4xi32.argprom(ptr noalias %src, ptr noalias %dst) ret void } -define internal fastcc void @callee_4xi32(ptr noalias %src, ptr noalias %dst) #1 { -; CHECK-LABEL: define internal fastcc void @callee_4xi32( +define internal fastcc void @callee_4xi32.argprom(ptr noalias %src, ptr noalias %dst) #1 { +; CHECK-LABEL: define internal fastcc void @callee_4xi32.argprom.argprom( ; CHECK-NEXT: entry: ; CHECK-NEXT: store <4 x i32> [[SRC_0_VAL:%.*]], ptr [[DST:%.*]], align 16 ; CHECK-NEXT: ret void @@ -65,7 +65,7 @@ define dso_local void @caller_i256(ptr noalias %src, ptr noalias %dst) #0 { ; CHECK-LABEL: define dso_local void @caller_i256( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[SRC_VAL:%.*]] = load i256, ptr [[SRC:%.*]], align 16 -; CHECK-NEXT: call fastcc void @callee_i256(i256 [[SRC_VAL]], ptr noalias [[DST:%.*]]) +; CHECK-NEXT: call fastcc void @callee_i256.argprom(i256 [[SRC_VAL]], ptr noalias [[DST:%.*]]) ; CHECK-NEXT: ret void ; entry: @@ -74,7 +74,7 @@ entry: } define internal fastcc void @callee_i256(ptr noalias %src, ptr noalias %dst) #0 { -; CHECK-LABEL: define internal fastcc void @callee_i256( +; CHECK-LABEL: define internal fastcc void @callee_i256.argprom( ; CHECK-NEXT: entry: 
; CHECK-NEXT: store i256 [[SRC_0_VAL:%.*]], ptr [[DST:%.*]], align 16 ; CHECK-NEXT: ret void @@ -159,7 +159,7 @@ define dso_local void @caller_struct4xi32(ptr noalias %src, ptr noalias %dst) #1 ; CHECK-NEXT: [[SRC_VAL:%.*]] = load <4 x i32>, ptr [[SRC:%.*]], align 16 ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[SRC]], i64 16 ; CHECK-NEXT: [[SRC_VAL1:%.*]] = load <4 x i32>, ptr [[TMP0]], align 16 -; CHECK-NEXT: call fastcc void @callee_struct4xi32(<4 x i32> [[SRC_VAL]], <4 x i32> [[SRC_VAL1]], ptr noalias [[DST:%.*]]) +; CHECK-NEXT: call fastcc void @callee_struct4xi32.argprom(<4 x i32> [[SRC_VAL]], <4 x i32> [[SRC_VAL1]], ptr noalias [[DST:%.*]]) ; CHECK-NEXT: ret void ; entry: @@ -168,7 +168,7 @@ entry: } define internal fastcc void @callee_struct4xi32(ptr noalias %src, ptr noalias %dst) #1 { -; CHECK-LABEL: define internal fastcc void @callee_struct4xi32( +; CHECK-LABEL: define internal fastcc void @callee_struct4xi32.argprom( ; CHECK-NEXT: entry: ; CHECK-NEXT: store <4 x i32> [[SRC_0_VAL:%.*]], ptr [[DST:%.*]], align 16 ; CHECK-NEXT: [[DST2:%.*]] = getelementptr inbounds [[STRUCT_4XI32:%.*]], ptr [[DST]], i64 0, i32 1 diff --git a/llvm/test/CodeGen/AMDGPU/internalize.ll b/llvm/test/CodeGen/AMDGPU/internalize.ll index 6b2a4d5fc328b4..08b42f93bf5f47 100644 --- a/llvm/test/CodeGen/AMDGPU/internalize.ll +++ b/llvm/test/CodeGen/AMDGPU/internalize.ll @@ -10,7 +10,7 @@ ; ALL: gvar_used @gvar_used = addrspace(1) global i32 undef, align 4 -; OPT: define internal fastcc void @func_used_noinline( +; OPT: define internal fastcc void @func_used_noinline.argelim( ; OPT-NONE: define fastcc void @func_used_noinline( define fastcc void @func_used_noinline(ptr addrspace(1) %out, i32 %tid) #1 { entry: diff --git a/llvm/test/ThinLTO/X86/memprof-aliased-location1.ll b/llvm/test/ThinLTO/X86/memprof-aliased-location1.ll index 42819d5421ca0f..8be9727b316d28 100644 --- a/llvm/test/ThinLTO/X86/memprof-aliased-location1.ll +++ b/llvm/test/ThinLTO/X86/memprof-aliased-location1.ll @@ -84,22 +84,22 @@ attributes #0 = { noinline optnone } ;; The first call to foo does not allocate cold memory. It should call the ;; original functions, which ultimately call the original allocation decorated ;; with a "notcold" attribute. -; IR: call {{.*}} @_Z3foov() +; IR: call {{.*}} @_Z3foov.retelim() ;; The second call to foo allocates cold memory. It should call cloned functions ;; which ultimately call a cloned allocation decorated with a "cold" attribute. 
-; IR: call {{.*}} @_Z3foov.memprof.1() -; IR: define internal {{.*}} @_Z3barv() +; IR: call {{.*}} @_Z3foov.memprof.1.retelim() +; IR: define internal {{.*}} @_Z3barv.retelim() ; IR: call {{.*}} @_Znam(i64 0) #[[NOTCOLD:[0-9]+]] -; IR: define internal {{.*}} @_Z3bazv() -; IR: call {{.*}} @_Z3barv() -; IR: define internal {{.*}} @_Z3foov() -; IR: call {{.*}} @_Z3bazv() -; IR: define internal {{.*}} @_Z3barv.memprof.1() +; IR: define internal {{.*}} @_Z3bazv.retelim() +; IR: call {{.*}} @_Z3barv.retelim() +; IR: define internal {{.*}} @_Z3foov.retelim() +; IR: call {{.*}} @_Z3bazv.retelim() +; IR: define internal {{.*}} @_Z3barv.memprof.1.retelim() ; IR: call {{.*}} @_Znam(i64 0) #[[COLD:[0-9]+]] -; IR: define internal {{.*}} @_Z3bazv.memprof.1() -; IR: call {{.*}} @_Z3barv.memprof.1() -; IR: define internal {{.*}} @_Z3foov.memprof.1() -; IR: call {{.*}} @_Z3bazv.memprof.1() +; IR: define internal {{.*}} @_Z3bazv.memprof.1.retelim() +; IR: call {{.*}} @_Z3barv.memprof.1.retelim() +; IR: define internal {{.*}} @_Z3foov.memprof.1.retelim() +; IR: call {{.*}} @_Z3bazv.memprof.1.retelim() ; IR: attributes #[[NOTCOLD]] = { "memprof"="notcold" } ; IR: attributes #[[COLD]] = { "memprof"="cold" } diff --git a/llvm/test/ThinLTO/X86/memprof-aliased-location2.ll b/llvm/test/ThinLTO/X86/memprof-aliased-location2.ll index 663f8525043c2f..4c18cf8226c8bb 100644 --- a/llvm/test/ThinLTO/X86/memprof-aliased-location2.ll +++ b/llvm/test/ThinLTO/X86/memprof-aliased-location2.ll @@ -84,22 +84,22 @@ attributes #0 = { noinline optnone } ;; The first call to foo does not allocate cold memory. It should call the ;; original functions, which ultimately call the original allocation decorated ;; with a "notcold" attribute. -; IR: call {{.*}} @_Z3foov() +; IR: call {{.*}} @_Z3foov.retelim() ;; The second call to foo allocates cold memory. It should call cloned functions ;; which ultimately call a cloned allocation decorated with a "cold" attribute. -; IR: call {{.*}} @_Z3foov.memprof.1() -; IR: define internal {{.*}} @_Z3barv() +; IR: call {{.*}} @_Z3foov.memprof.1.retelim() +; IR: define internal {{.*}} @_Z3barv.retelim() ; IR: call {{.*}} @_Znam(i64 0) #[[NOTCOLD:[0-9]+]] -; IR: define internal {{.*}} @_Z3bazv() -; IR: call {{.*}} @_Z3barv() -; IR: define internal {{.*}} @_Z3foov() -; IR: call {{.*}} @_Z3bazv() -; IR: define internal {{.*}} @_Z3barv.memprof.1() +; IR: define internal {{.*}} @_Z3bazv.retelim() +; IR: call {{.*}} @_Z3barv.retelim() +; IR: define internal {{.*}} @_Z3foov.retelim() +; IR: call {{.*}} @_Z3bazv.retelim() +; IR: define internal {{.*}} @_Z3barv.memprof.1.retelim() ; IR: call {{.*}} @_Znam(i64 0) #[[COLD:[0-9]+]] -; IR: define internal {{.*}} @_Z3bazv.memprof.1() -; IR: call {{.*}} @_Z3barv.memprof.1() -; IR: define internal {{.*}} @_Z3foov.memprof.1() -; IR: call {{.*}} @_Z3bazv.memprof.1() +; IR: define internal {{.*}} @_Z3bazv.memprof.1.retelim() +; IR: call {{.*}} @_Z3barv.memprof.1.retelim() +; IR: define internal {{.*}} @_Z3foov.memprof.1.retelim() +; IR: call {{.*}} @_Z3bazv.memprof.1.retelim() ; IR: attributes #[[NOTCOLD]] = { "memprof"="notcold" } ; IR: attributes #[[COLD]] = { "memprof"="cold" } diff --git a/llvm/test/ThinLTO/X86/memprof-basic.ll b/llvm/test/ThinLTO/X86/memprof-basic.ll index 6922dbfd368467..b7aadf8e32a771 100644 --- a/llvm/test/ThinLTO/X86/memprof-basic.ll +++ b/llvm/test/ThinLTO/X86/memprof-basic.ll @@ -53,7 +53,7 @@ ;; We should have cloned bar, baz, and foo, for the cold memory allocation. 
; RUN: cat %t.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED -; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --check-prefix=IR +; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --check-prefix=IRNODIST ;; Try again but with distributed ThinLTO @@ -303,6 +303,23 @@ attributes #0 = { noinline optnone } ; IR: attributes #[[NOTCOLD]] = { "memprof"="notcold" } ; IR: attributes #[[COLD]] = { "memprof"="cold" } +; IRNODIST: define {{.*}} @main +; IRNODIST: call {{.*}} @_Z3foov.retelim() +; IRNODIST: call {{.*}} @_Z3foov.memprof.1.retelim() +; IRNODIST: define internal {{.*}} @_Z3barv.retelim() +; IRNODIST: call {{.*}} @_Znam(i64 0) #[[NOTCOLD:[0-9]+]] +; IRNODIST: define internal {{.*}} @_Z3bazv.retelim() +; IRNODIST: call {{.*}} @_Z3barv.retelim() +; IRNODIST: define internal {{.*}} @_Z3foov.retelim() +; IRNODIST: call {{.*}} @_Z3bazv.retelim() +; IRNODIST: define internal {{.*}} @_Z3barv.memprof.1.retelim() +; IRNODIST: call {{.*}} @_Znam(i64 0) #[[COLD:[0-9]+]] +; IRNODIST: define internal {{.*}} @_Z3bazv.memprof.1.retelim() +; IRNODIST: call {{.*}} @_Z3barv.memprof.1.retelim() +; IRNODIST: define internal {{.*}} @_Z3foov.memprof.1.retelim() +; IRNODIST: call {{.*}} @_Z3bazv.memprof.1.retelim() +; IRNODIST: attributes #[[NOTCOLD]] = { "memprof"="notcold" } +; IRNODIST: attributes #[[COLD]] = { "memprof"="cold" } ; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) ; STATS-BE: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) during ThinLTO backend diff --git a/llvm/test/ThinLTO/X86/memprof-duplicate-context-ids.ll b/llvm/test/ThinLTO/X86/memprof-duplicate-context-ids.ll index 65d794e9cba87c..bfc7b02a956c6f 100644 --- a/llvm/test/ThinLTO/X86/memprof-duplicate-context-ids.ll +++ b/llvm/test/ThinLTO/X86/memprof-duplicate-context-ids.ll @@ -68,7 +68,7 @@ ; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP \ ; RUN: --check-prefix=STATS --check-prefix=STATS-BE --check-prefix=REMARKS -; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --check-prefix=IR +; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --check-prefix=IRNODIST ;; Try again but with distributed ThinLTO @@ -247,6 +247,18 @@ attributes #0 = { noinline optnone} ; IR: attributes #[[NOTCOLD]] = { "memprof"="notcold" } ; IR: attributes #[[COLD]] = { "memprof"="cold" } +; IRNODIST: define internal {{.*}} @_Z1Dv.retelim() +; IRNODIST: call {{.*}} @_Znam(i64 0) #[[NOTCOLD:[0-9]+]] +; IRNODIST: define internal {{.*}} @_Z1Fv.retelim() +; IRNODIST: call {{.*}} @_Z1Dv.retelim() +; IRNODIST: define internal {{.*}} @_Z1Bv.retelim() +; IRNODIST: call {{.*}} @_Z1Dv.memprof.1.retelim() +; IRNODIST: define internal {{.*}} @_Z1Ev.retelim() +; IRNODIST: call {{.*}} @_Z1Dv.memprof.1.retelim() +; IRNODIST: define internal {{.*}} @_Z1Dv.memprof.1.retelim() +; IRNODIST: call {{.*}} @_Znam(i64 0) #[[COLD:[0-9]+]] +; IRNODIST: attributes #[[NOTCOLD]] = { "memprof"="notcold" } +; IRNODIST: attributes #[[COLD]] = { "memprof"="cold" } ; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) ; STATS-BE: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) during ThinLTO backend diff --git a/llvm/test/ThinLTO/X86/memprof-funcassigncloning.ll b/llvm/test/ThinLTO/X86/memprof-funcassigncloning.ll index f1a494d077fefc..4153524bf44706 100644 --- a/llvm/test/ThinLTO/X86/memprof-funcassigncloning.ll +++ b/llvm/test/ThinLTO/X86/memprof-funcassigncloning.ll @@ -61,7 +61,7 @@ ; RUN: -o %t.out 
2>&1 | FileCheck %s --check-prefix=DUMP \ ; RUN: --check-prefix=STATS --check-prefix=STATS-BE --check-prefix=REMARKS -; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --check-prefix=IR +; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --check-prefix=IRNODIST ;; Try again but with distributed ThinLTO @@ -283,6 +283,23 @@ attributes #0 = { noinline optnone } ; IR: attributes #[[NOTCOLD]] = { "memprof"="notcold" } ; IR: attributes #[[COLD]] = { "memprof"="cold" } +; IRNODIST: define internal {{.*}} @_Z1EPPcS0_.argelim( +; IRNODIST: call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD:[0-9]+]] +; IRNODIST: call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD]] +; IRNODIST: define internal {{.*}} @_Z1BPPcS0_( +; IRNODIST: call {{.*}} @_Z1EPPcS0_.argelim( +; IRNODIST: define internal {{.*}} @_Z1CPPcS0_( +; IRNODIST: call {{.*}} @_Z1EPPcS0_.memprof.3.argelim( +; IRNODIST: define internal {{.*}} @_Z1DPPcS0_( +; IRNODIST: call {{.*}} @_Z1EPPcS0_.memprof.2.argelim( +; IRNODIST: define internal {{.*}} @_Z1EPPcS0_.memprof.2.argelim( +; IRNODIST: call {{.*}} @_Znam(i64 noundef 10) #[[COLD:[0-9]+]] +; IRNODIST: call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD]] +; IRNODIST: define internal {{.*}} @_Z1EPPcS0_.memprof.3.argelim( +; IRNODIST: call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD]] +; IRNODIST: call {{.*}} @_Znam(i64 noundef 10) #[[COLD]] +; IRNODIST: attributes #[[NOTCOLD]] = { "memprof"="notcold" } +; IRNODIST: attributes #[[COLD]] = { "memprof"="cold" } ; STATS: 2 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) ; STATS-BE: 2 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) during ThinLTO backend diff --git a/llvm/test/ThinLTO/X86/memprof-indirectcall.ll b/llvm/test/ThinLTO/X86/memprof-indirectcall.ll index 07a52f441ca278..ba8811b46175e3 100644 --- a/llvm/test/ThinLTO/X86/memprof-indirectcall.ll +++ b/llvm/test/ThinLTO/X86/memprof-indirectcall.ll @@ -74,7 +74,7 @@ ;; from main allocating cold memory. 
; RUN: cat %t.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED -; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --check-prefix=IR +; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --check-prefix=IRNODIST ;; Try again but with distributed ThinLTO @@ -419,6 +419,19 @@ attributes #0 = { noinline optnone } ; IR: attributes #[[NOTCOLD]] = { "memprof"="notcold" } ; IR: attributes #[[COLD]] = { "memprof"="cold" } +; IRNODIST: define {{.*}} @main( +; IRNODIST: call {{.*}} @_Z3foov.argelim() +; IRNODIST: call {{.*}} @_Z3foov.memprof.1.argelim() +; IRNODIST: call {{.*}} @_Z3barP1A.argelim( +; IRNODIST: call {{.*}} @_Z3barP1A.argelim( +; IRNODIST: call {{.*}} @_Z3barP1A.argelim( +; IRNODIST: call {{.*}} @_Z3barP1A.argelim( +; IRNODIST: define internal {{.*}} @_Z3foov.argelim() +; IRNODIST: call {{.*}} @_Znam(i64 0) #[[NOTCOLD:[0-9]+]] +; IRNODIST: define internal {{.*}} @_Z3foov.memprof.1.argelim() +; IRNODIST: call {{.*}} @_Znam(i64 0) #[[COLD:[0-9]+]] +; IRNODIST: attributes #[[NOTCOLD]] = { "memprof"="notcold" } +; IRNODIST: attributes #[[COLD]] = { "memprof"="cold" } ; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) ; STATS-BE: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) during ThinLTO backend diff --git a/llvm/test/ThinLTO/X86/memprof-inlined.ll b/llvm/test/ThinLTO/X86/memprof-inlined.ll index 89df345b220423..7111a536a3110a 100644 --- a/llvm/test/ThinLTO/X86/memprof-inlined.ll +++ b/llvm/test/ThinLTO/X86/memprof-inlined.ll @@ -63,7 +63,7 @@ ;; cold memory. ; RUN: cat %t.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED -; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --check-prefix=IR +; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --check-prefix=IRNODIST ;; Try again but with distributed ThinLTO @@ -323,6 +323,19 @@ attributes #0 = { noinline optnone } ; IR: attributes #[[NOTCOLD]] = { "memprof"="notcold" } ; IR: attributes #[[COLD]] = { "memprof"="cold" } +; IRNODIST: define internal {{.*}} @_Z3barv.retelim() +; IRNODIST: call {{.*}} @_Znam(i64 0) #[[NOTCOLD:[0-9]+]] +; IRNODIST: define internal {{.*}} @_Z3foov.retelim() +; IRNODIST: call {{.*}} @_Z3barv.retelim() +; IRNODIST: define {{.*}} @main() +; IRNODIST: call {{.*}} @_Z3foov.retelim() +; IRNODIST: call {{.*}} @_Z3foov.memprof.1.retelim() +; IRNODIST: define internal {{.*}} @_Z3barv.memprof.1.retelim() +; IRNODIST: call {{.*}} @_Znam(i64 0) #[[COLD:[0-9]+]] +; IRNODIST: define internal {{.*}} @_Z3foov.memprof.1.retelim() +; IRNODIST: call {{.*}} @_Z3barv.memprof.1.retelim() +; IRNODIST: attributes #[[NOTCOLD]] = { "memprof"="notcold" } +; IRNODIST: attributes #[[COLD]] = { "memprof"="cold" } ; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) ; STATS-BE: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) during ThinLTO backend diff --git a/llvm/test/Transforms/ArgumentPromotion/2008-02-01-ReturnAttrs.ll b/llvm/test/Transforms/ArgumentPromotion/2008-02-01-ReturnAttrs.ll index daa4e1fb757d21..51839033177034 100644 --- a/llvm/test/Transforms/ArgumentPromotion/2008-02-01-ReturnAttrs.ll +++ b/llvm/test/Transforms/ArgumentPromotion/2008-02-01-ReturnAttrs.ll @@ -3,7 +3,7 @@ ; RUN: cat %t | FileCheck -check-prefix=REMARK %s define internal i32 @deref(ptr %x) nounwind { -; CHECK-LABEL: define {{[^@]+}}@deref +; CHECK-LABEL: define {{[^@]+}}@deref.argprom ; CHECK-SAME: (i32 [[X_0_VAL:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: 
ret i32 [[X_0_VAL]] @@ -29,7 +29,7 @@ define i32 @f(i32 %x) { ; CHECK-NEXT: [[X_ADDR:%.*]] = alloca i32, align 4 ; CHECK-NEXT: store i32 [[X]], ptr [[X_ADDR]], align 4 ; CHECK-NEXT: [[X_ADDR_VAL:%.*]] = load i32, ptr [[X_ADDR]], align 4 -; CHECK-NEXT: [[TEMP1:%.*]] = call i32 @deref(i32 [[X_ADDR_VAL]]) +; CHECK-NEXT: [[TEMP1:%.*]] = call i32 @deref.argprom(i32 [[X_ADDR_VAL]]) ; CHECK-NEXT: ret i32 [[TEMP1]] ; entry: diff --git a/llvm/test/Transforms/ArgumentPromotion/BPF/argpromotion.ll b/llvm/test/Transforms/ArgumentPromotion/BPF/argpromotion.ll index 6c39f27115ada4..f317a5a4533484 100644 --- a/llvm/test/Transforms/ArgumentPromotion/BPF/argpromotion.ll +++ b/llvm/test/Transforms/ArgumentPromotion/BPF/argpromotion.ll @@ -85,4 +85,4 @@ entry: ; Without number-of-argument constraint, argpromotion will create a function signature with 5 arguments, which equals ; the maximum number of argument permitted by bpf backend, so argpromotion result code does work. ; -; CHECK: i32 @foo2(i32 %p1.0.val, i32 %p1.4.val, i32 %p2.8.val, i32 %p2.16.val, i32 %p3.20.val) +; CHECK: i32 @foo2.argprom(i32 %p1.0.val, i32 %p1.4.val, i32 %p2.8.val, i32 %p2.16.val, i32 %p3.20.val) diff --git a/llvm/test/Transforms/ArgumentPromotion/X86/attributes.ll b/llvm/test/Transforms/ArgumentPromotion/X86/attributes.ll index a64b7346d83618..6d34fb57c9dcf7 100644 --- a/llvm/test/Transforms/ArgumentPromotion/X86/attributes.ll +++ b/llvm/test/Transforms/ArgumentPromotion/X86/attributes.ll @@ -42,7 +42,7 @@ bb: } define internal fastcc void @promote_avx2(ptr %arg, ptr readonly %arg1) #0 { -; CHECK-LABEL: define {{[^@]+}}@promote_avx2 +; CHECK-LABEL: define {{[^@]+}}@promote_avx2.argprom ; CHECK-SAME: (ptr [[ARG:%.*]], <4 x i64> [[ARG1_VAL:%.*]]) ; CHECK-NEXT: bb: ; CHECK-NEXT: store <4 x i64> [[ARG1_VAL]], ptr [[ARG]] @@ -62,7 +62,7 @@ define void @promote(ptr %arg) #0 { ; CHECK-NEXT: [[TMP2:%.*]] = alloca <4 x i64>, align 32 ; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 32 [[TMP]], i8 0, i64 32, i1 false) ; CHECK-NEXT: [[TMP_VAL:%.*]] = load <4 x i64>, ptr [[TMP]] -; CHECK-NEXT: call fastcc void @promote_avx2(ptr [[TMP2]], <4 x i64> [[TMP_VAL]]) +; CHECK-NEXT: call fastcc void @promote_avx2.argprom(ptr [[TMP2]], <4 x i64> [[TMP_VAL]]) ; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i64>, ptr [[TMP2]], align 32 ; CHECK-NEXT: store <4 x i64> [[TMP4]], ptr [[ARG]], align 2 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/ArgumentPromotion/X86/min-legal-vector-width.ll b/llvm/test/Transforms/ArgumentPromotion/X86/min-legal-vector-width.ll index 3373c09d5f91aa..99aa19e72371fb 100644 --- a/llvm/test/Transforms/ArgumentPromotion/X86/min-legal-vector-width.ll +++ b/llvm/test/Transforms/ArgumentPromotion/X86/min-legal-vector-width.ll @@ -7,7 +7,7 @@ target triple = "x86_64-unknown-linux-gnu" ; This should promote define internal fastcc void @callee_avx512_legal512_prefer512_call_avx512_legal512_prefer512(ptr %arg, ptr readonly %arg1) #0 { -; CHECK-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer512_call_avx512_legal512_prefer512 +; CHECK-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer512_call_avx512_legal512_prefer512.argprom ; CHECK-SAME: (ptr [[ARG:%.*]], <8 x i64> [[ARG1_VAL:%.*]]) ; CHECK-NEXT: bb: ; CHECK-NEXT: store <8 x i64> [[ARG1_VAL]], ptr [[ARG]] @@ -27,7 +27,7 @@ define void @avx512_legal512_prefer512_call_avx512_legal512_prefer512(ptr %arg) ; CHECK-NEXT: [[TMP2:%.*]] = alloca <8 x i64>, align 32 ; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 32 [[TMP]], i8 0, i64 32, i1 false) ; CHECK-NEXT: 
[[TMP_VAL:%.*]] = load <8 x i64>, ptr [[TMP]] -; CHECK-NEXT: call fastcc void @callee_avx512_legal512_prefer512_call_avx512_legal512_prefer512(ptr [[TMP2]], <8 x i64> [[TMP_VAL]]) +; CHECK-NEXT: call fastcc void @callee_avx512_legal512_prefer512_call_avx512_legal512_prefer512.argprom(ptr [[TMP2]], <8 x i64> [[TMP_VAL]]) ; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr [[TMP2]], align 32 ; CHECK-NEXT: store <8 x i64> [[TMP4]], ptr [[ARG]], align 2 ; CHECK-NEXT: ret void @@ -44,7 +44,7 @@ bb: ; This should promote define internal fastcc void @callee_avx512_legal512_prefer256_call_avx512_legal512_prefer256(ptr %arg, ptr readonly %arg1) #1 { -; CHECK-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer256_call_avx512_legal512_prefer256 +; CHECK-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer256_call_avx512_legal512_prefer256.argprom ; CHECK-SAME: (ptr [[ARG:%.*]], <8 x i64> [[ARG1_VAL:%.*]]) ; CHECK-NEXT: bb: ; CHECK-NEXT: store <8 x i64> [[ARG1_VAL]], ptr [[ARG]] @@ -64,7 +64,7 @@ define void @avx512_legal512_prefer256_call_avx512_legal512_prefer256(ptr %arg) ; CHECK-NEXT: [[TMP2:%.*]] = alloca <8 x i64>, align 32 ; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 32 [[TMP]], i8 0, i64 32, i1 false) ; CHECK-NEXT: [[TMP_VAL:%.*]] = load <8 x i64>, ptr [[TMP]] -; CHECK-NEXT: call fastcc void @callee_avx512_legal512_prefer256_call_avx512_legal512_prefer256(ptr [[TMP2]], <8 x i64> [[TMP_VAL]]) +; CHECK-NEXT: call fastcc void @callee_avx512_legal512_prefer256_call_avx512_legal512_prefer256.argprom(ptr [[TMP2]], <8 x i64> [[TMP_VAL]]) ; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr [[TMP2]], align 32 ; CHECK-NEXT: store <8 x i64> [[TMP4]], ptr [[ARG]], align 2 ; CHECK-NEXT: ret void @@ -81,7 +81,7 @@ bb: ; This should promote define internal fastcc void @callee_avx512_legal512_prefer512_call_avx512_legal512_prefer256(ptr %arg, ptr readonly %arg1) #1 { -; CHECK-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer512_call_avx512_legal512_prefer256 +; CHECK-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer512_call_avx512_legal512_prefer256.argprom ; CHECK-SAME: (ptr [[ARG:%.*]], <8 x i64> [[ARG1_VAL:%.*]]) ; CHECK-NEXT: bb: ; CHECK-NEXT: store <8 x i64> [[ARG1_VAL]], ptr [[ARG]] @@ -101,7 +101,7 @@ define void @avx512_legal512_prefer512_call_avx512_legal512_prefer256(ptr %arg) ; CHECK-NEXT: [[TMP2:%.*]] = alloca <8 x i64>, align 32 ; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 32 [[TMP]], i8 0, i64 32, i1 false) ; CHECK-NEXT: [[TMP_VAL:%.*]] = load <8 x i64>, ptr [[TMP]] -; CHECK-NEXT: call fastcc void @callee_avx512_legal512_prefer512_call_avx512_legal512_prefer256(ptr [[TMP2]], <8 x i64> [[TMP_VAL]]) +; CHECK-NEXT: call fastcc void @callee_avx512_legal512_prefer512_call_avx512_legal512_prefer256.argprom(ptr [[TMP2]], <8 x i64> [[TMP_VAL]]) ; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr [[TMP2]], align 32 ; CHECK-NEXT: store <8 x i64> [[TMP4]], ptr [[ARG]], align 2 ; CHECK-NEXT: ret void @@ -118,7 +118,7 @@ bb: ; This should promote define internal fastcc void @callee_avx512_legal512_prefer256_call_avx512_legal512_prefer512(ptr %arg, ptr readonly %arg1) #0 { -; CHECK-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer256_call_avx512_legal512_prefer512 +; CHECK-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer256_call_avx512_legal512_prefer512.argprom ; CHECK-SAME: (ptr [[ARG:%.*]], <8 x i64> [[ARG1_VAL:%.*]]) ; CHECK-NEXT: bb: ; CHECK-NEXT: store <8 x i64> [[ARG1_VAL]], ptr [[ARG]] @@ -138,7 +138,7 @@ define void 
@avx512_legal512_prefer256_call_avx512_legal512_prefer512(ptr %arg) ; CHECK-NEXT: [[TMP2:%.*]] = alloca <8 x i64>, align 32 ; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 32 [[TMP]], i8 0, i64 32, i1 false) ; CHECK-NEXT: [[TMP_VAL:%.*]] = load <8 x i64>, ptr [[TMP]] -; CHECK-NEXT: call fastcc void @callee_avx512_legal512_prefer256_call_avx512_legal512_prefer512(ptr [[TMP2]], <8 x i64> [[TMP_VAL]]) +; CHECK-NEXT: call fastcc void @callee_avx512_legal512_prefer256_call_avx512_legal512_prefer512.argprom(ptr [[TMP2]], <8 x i64> [[TMP_VAL]]) ; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr [[TMP2]], align 32 ; CHECK-NEXT: store <8 x i64> [[TMP4]], ptr [[ARG]], align 2 ; CHECK-NEXT: ret void @@ -229,7 +229,7 @@ bb: ; This should promote define internal fastcc void @callee_avx2_legal256_prefer256_call_avx2_legal512_prefer256(ptr %arg, ptr readonly %arg1) #3 { -; CHECK-LABEL: define {{[^@]+}}@callee_avx2_legal256_prefer256_call_avx2_legal512_prefer256 +; CHECK-LABEL: define {{[^@]+}}@callee_avx2_legal256_prefer256_call_avx2_legal512_prefer256.argprom ; CHECK-SAME: (ptr [[ARG:%.*]], <8 x i64> [[ARG1_VAL:%.*]]) ; CHECK-NEXT: bb: ; CHECK-NEXT: store <8 x i64> [[ARG1_VAL]], ptr [[ARG]] @@ -249,7 +249,7 @@ define void @avx2_legal256_prefer256_call_avx2_legal512_prefer256(ptr %arg) #4 { ; CHECK-NEXT: [[TMP2:%.*]] = alloca <8 x i64>, align 32 ; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 32 [[TMP]], i8 0, i64 32, i1 false) ; CHECK-NEXT: [[TMP_VAL:%.*]] = load <8 x i64>, ptr [[TMP]] -; CHECK-NEXT: call fastcc void @callee_avx2_legal256_prefer256_call_avx2_legal512_prefer256(ptr [[TMP2]], <8 x i64> [[TMP_VAL]]) +; CHECK-NEXT: call fastcc void @callee_avx2_legal256_prefer256_call_avx2_legal512_prefer256.argprom(ptr [[TMP2]], <8 x i64> [[TMP_VAL]]) ; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr [[TMP2]], align 32 ; CHECK-NEXT: store <8 x i64> [[TMP4]], ptr [[ARG]], align 2 ; CHECK-NEXT: ret void @@ -266,7 +266,7 @@ bb: ; This should promote define internal fastcc void @callee_avx2_legal512_prefer256_call_avx2_legal256_prefer256(ptr %arg, ptr readonly %arg1) #4 { -; CHECK-LABEL: define {{[^@]+}}@callee_avx2_legal512_prefer256_call_avx2_legal256_prefer256 +; CHECK-LABEL: define {{[^@]+}}@callee_avx2_legal512_prefer256_call_avx2_legal256_prefer256.argprom ; CHECK-SAME: (ptr [[ARG:%.*]], <8 x i64> [[ARG1_VAL:%.*]]) ; CHECK-NEXT: bb: ; CHECK-NEXT: store <8 x i64> [[ARG1_VAL]], ptr [[ARG]] @@ -286,7 +286,7 @@ define void @avx2_legal512_prefer256_call_avx2_legal256_prefer256(ptr %arg) #3 { ; CHECK-NEXT: [[TMP2:%.*]] = alloca <8 x i64>, align 32 ; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 32 [[TMP]], i8 0, i64 32, i1 false) ; CHECK-NEXT: [[TMP_VAL:%.*]] = load <8 x i64>, ptr [[TMP]] -; CHECK-NEXT: call fastcc void @callee_avx2_legal512_prefer256_call_avx2_legal256_prefer256(ptr [[TMP2]], <8 x i64> [[TMP_VAL]]) +; CHECK-NEXT: call fastcc void @callee_avx2_legal512_prefer256_call_avx2_legal256_prefer256.argprom(ptr [[TMP2]], <8 x i64> [[TMP_VAL]]) ; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr [[TMP2]], align 32 ; CHECK-NEXT: store <8 x i64> [[TMP4]], ptr [[ARG]], align 2 ; CHECK-NEXT: ret void @@ -303,7 +303,7 @@ bb: ; If the arguments are scalar, its ok to promote. 
define internal i32 @scalar_callee_avx512_legal256_prefer256_call_avx512_legal512_prefer256(ptr %X, ptr %Y) #2 { -; CHECK-LABEL: define {{[^@]+}}@scalar_callee_avx512_legal256_prefer256_call_avx512_legal512_prefer256 +; CHECK-LABEL: define {{[^@]+}}@scalar_callee_avx512_legal256_prefer256_call_avx512_legal512_prefer256.argprom ; CHECK-SAME: (i32 [[X_VAL:%.*]], i32 [[Y_VAL:%.*]]) ; CHECK-NEXT: [[C:%.*]] = add i32 [[X_VAL]], [[Y_VAL]] ; CHECK-NEXT: ret i32 [[C]] @@ -321,7 +321,7 @@ define i32 @scalar_avx512_legal256_prefer256_call_avx512_legal512_prefer256(ptr ; CHECK-NEXT: store i32 1, ptr [[A]] ; CHECK-NEXT: [[A_VAL:%.*]] = load i32, ptr [[A]] ; CHECK-NEXT: [[B_VAL:%.*]] = load i32, ptr [[B]] -; CHECK-NEXT: [[C:%.*]] = call i32 @scalar_callee_avx512_legal256_prefer256_call_avx512_legal512_prefer256(i32 [[A_VAL]], i32 [[B_VAL]]) +; CHECK-NEXT: [[C:%.*]] = call i32 @scalar_callee_avx512_legal256_prefer256_call_avx512_legal512_prefer256.argprom(i32 [[A_VAL]], i32 [[B_VAL]]) ; CHECK-NEXT: ret i32 [[C]] ; %A = alloca i32 @@ -332,7 +332,7 @@ define i32 @scalar_avx512_legal256_prefer256_call_avx512_legal512_prefer256(ptr ; If the arguments are scalar, its ok to promote. define internal i32 @scalar_callee_avx512_legal512_prefer256_call_avx512_legal256_prefer256(ptr %X, ptr %Y) #2 { -; CHECK-LABEL: define {{[^@]+}}@scalar_callee_avx512_legal512_prefer256_call_avx512_legal256_prefer256 +; CHECK-LABEL: define {{[^@]+}}@scalar_callee_avx512_legal512_prefer256_call_avx512_legal256_prefer256.argprom ; CHECK-SAME: (i32 [[X_VAL:%.*]], i32 [[Y_VAL:%.*]]) ; CHECK-NEXT: [[C:%.*]] = add i32 [[X_VAL]], [[Y_VAL]] ; CHECK-NEXT: ret i32 [[C]] @@ -350,7 +350,7 @@ define i32 @scalar_avx512_legal512_prefer256_call_avx512_legal256_prefer256(ptr ; CHECK-NEXT: store i32 1, ptr [[A]] ; CHECK-NEXT: [[A_VAL:%.*]] = load i32, ptr [[A]] ; CHECK-NEXT: [[B_VAL:%.*]] = load i32, ptr [[B]] -; CHECK-NEXT: [[C:%.*]] = call i32 @scalar_callee_avx512_legal512_prefer256_call_avx512_legal256_prefer256(i32 [[A_VAL]], i32 [[B_VAL]]) +; CHECK-NEXT: [[C:%.*]] = call i32 @scalar_callee_avx512_legal512_prefer256_call_avx512_legal256_prefer256.argprom(i32 [[A_VAL]], i32 [[B_VAL]]) ; CHECK-NEXT: ret i32 [[C]] ; %A = alloca i32 diff --git a/llvm/test/Transforms/ArgumentPromotion/X86/thiscall.ll b/llvm/test/Transforms/ArgumentPromotion/X86/thiscall.ll index 2195e437bc8637..22e2c92617182f 100644 --- a/llvm/test/Transforms/ArgumentPromotion/X86/thiscall.ll +++ b/llvm/test/Transforms/ArgumentPromotion/X86/thiscall.ll @@ -23,7 +23,7 @@ define internal x86_thiscallcc void @internalfun(ptr %this, ptr inalloca(<{ %str ; ARGPROMOTION-NEXT: call void @ext(ptr inalloca(<{ [[STRUCT_A]] }>) [[ARGMEM]]) ; ARGPROMOTION-NEXT: ret void ; -; GLOBALOPT_ARGPROMOTION-LABEL: define {{[^@]+}}@internalfun +; GLOBALOPT_ARGPROMOTION-LABEL: define {{[^@]+}}@internalfun.argprom ; GLOBALOPT_ARGPROMOTION-SAME: (ptr [[TMP0:%.*]]) unnamed_addr { ; GLOBALOPT_ARGPROMOTION-NEXT: entry: ; GLOBALOPT_ARGPROMOTION-NEXT: [[A:%.*]] = getelementptr inbounds <{ [[STRUCT_A:%.*]] }>, ptr [[TMP0]], i32 0, i32 0 @@ -56,7 +56,7 @@ define void @exportedfun(ptr %a) { ; GLOBALOPT_ARGPROMOTION-SAME: (ptr [[A:%.*]]) local_unnamed_addr { ; GLOBALOPT_ARGPROMOTION-NEXT: [[INALLOCA_SAVE:%.*]] = tail call ptr @llvm.stacksave.p0() ; GLOBALOPT_ARGPROMOTION-NEXT: [[ARGMEM:%.*]] = alloca inalloca <{ [[STRUCT_A:%.*]] }>, align 4 -; GLOBALOPT_ARGPROMOTION-NEXT: call fastcc void @internalfun(ptr [[ARGMEM]]) +; GLOBALOPT_ARGPROMOTION-NEXT: call fastcc void @internalfun.argprom(ptr [[ARGMEM]]) ; 
GLOBALOPT_ARGPROMOTION-NEXT: call void @llvm.stackrestore.p0(ptr [[INALLOCA_SAVE]]) ; GLOBALOPT_ARGPROMOTION-NEXT: ret void ; diff --git a/llvm/test/Transforms/ArgumentPromotion/actual-arguments.ll b/llvm/test/Transforms/ArgumentPromotion/actual-arguments.ll index 63366ba998c7bb..54e1727b5bca6a 100644 --- a/llvm/test/Transforms/ArgumentPromotion/actual-arguments.ll +++ b/llvm/test/Transforms/ArgumentPromotion/actual-arguments.ll @@ -12,7 +12,7 @@ define internal i32 @test_cannot_promote_1(ptr %p, ptr nocapture readonly %test_ ; CHECK-LABEL: define {{[^@]+}}@test_cannot_promote_1 ; CHECK-SAME: (ptr [[P:%.*]], ptr nocapture readonly [[TEST_C:%.*]]) { ; CHECK-NEXT: [[TEST_C_VAL:%.*]] = load i32, ptr [[TEST_C]], align 4 -; CHECK-NEXT: [[RES:%.*]] = call i32 @callee(ptr [[P]], i32 [[TEST_C_VAL]]) +; CHECK-NEXT: [[RES:%.*]] = call i32 @callee.argprom(ptr [[P]], i32 [[TEST_C_VAL]]) ; CHECK-NEXT: [[LTEST_C:%.*]] = load i32, ptr [[TEST_C]], align 4 ; CHECK-NEXT: [[SUM:%.*]] = add i32 [[LTEST_C]], [[RES]] ; CHECK-NEXT: ret i32 [[SUM]] @@ -33,7 +33,7 @@ define internal i32 @test_cannot_promote_2(ptr %p, ptr nocapture readonly %test_ ; CHECK-LABEL: define {{[^@]+}}@test_cannot_promote_2 ; CHECK-SAME: (ptr [[P:%.*]], ptr nocapture readonly [[TEST_C:%.*]]) { ; CHECK-NEXT: [[TEST_C_VAL:%.*]] = load i32, ptr [[TEST_C]], align 4 -; CHECK-NEXT: [[RES:%.*]] = call i32 @callee(ptr [[P]], i32 [[TEST_C_VAL]]) +; CHECK-NEXT: [[RES:%.*]] = call i32 @callee.argprom(ptr [[P]], i32 [[TEST_C_VAL]]) ; CHECK-NEXT: [[LTEST_C:%.*]] = load i32, ptr [[TEST_C]], align 4 ; CHECK-NEXT: [[SUM:%.*]] = add i32 [[LTEST_C]], [[RES]] ; CHECK-NEXT: ret i32 [[SUM]] @@ -54,7 +54,7 @@ define internal i32 @test_cannot_promote_3(ptr %p, ptr nocapture readonly %test_ ; CHECK-LABEL: define {{[^@]+}}@test_cannot_promote_3 ; CHECK-SAME: (ptr [[P:%.*]], ptr nocapture readonly [[TEST_C:%.*]]) { ; CHECK-NEXT: [[TEST_C_VAL:%.*]] = load i32, ptr [[TEST_C]], align 4 -; CHECK-NEXT: [[RES:%.*]] = call i32 @callee(ptr [[P]], i32 [[TEST_C_VAL]]) +; CHECK-NEXT: [[RES:%.*]] = call i32 @callee.argprom(ptr [[P]], i32 [[TEST_C_VAL]]) ; CHECK-NEXT: [[LTEST_C:%.*]] = load i32, ptr [[TEST_C]], align 4 ; CHECK-NEXT: [[SUM:%.*]] = add i32 [[LTEST_C]], [[RES]] ; CHECK-NEXT: ret i32 [[SUM]] @@ -77,7 +77,7 @@ define internal i32 @test_can_promote_1(ptr %p, ptr nocapture readonly %test_c) ; CHECK-LABEL: define {{[^@]+}}@test_can_promote_1 ; CHECK-SAME: (ptr [[P:%.*]], ptr nocapture readonly [[TEST_C:%.*]]) { ; CHECK-NEXT: [[TEST_C_VAL:%.*]] = load i32, ptr [[TEST_C]], align 4 -; CHECK-NEXT: [[RES:%.*]] = call i32 @callee(ptr [[P]], i32 [[TEST_C_VAL]]) +; CHECK-NEXT: [[RES:%.*]] = call i32 @callee.argprom(ptr [[P]], i32 [[TEST_C_VAL]]) ; CHECK-NEXT: [[LTEST_C:%.*]] = load i32, ptr [[TEST_C]], align 4 ; CHECK-NEXT: [[SUM:%.*]] = add i32 [[LTEST_C]], [[RES]] ; CHECK-NEXT: ret i32 [[SUM]] @@ -101,7 +101,7 @@ define internal i32 @test_can_promote_2(ptr %p, ptr nocapture readonly %test_c) ; CHECK-LABEL: define {{[^@]+}}@test_can_promote_2 ; CHECK-SAME: (ptr [[P:%.*]], ptr nocapture readonly [[TEST_C:%.*]]) { ; CHECK-NEXT: [[TEST_C_VAL:%.*]] = load i32, ptr [[TEST_C]], align 4 -; CHECK-NEXT: [[RES:%.*]] = call i32 @callee(ptr [[P]], i32 [[TEST_C_VAL]]) +; CHECK-NEXT: [[RES:%.*]] = call i32 @callee.argprom(ptr [[P]], i32 [[TEST_C_VAL]]) ; CHECK-NEXT: [[LTEST_C:%.*]] = load i32, ptr [[TEST_C]], align 4 ; CHECK-NEXT: [[SUM:%.*]] = add i32 [[LTEST_C]], [[RES]] ; CHECK-NEXT: ret i32 [[SUM]] diff --git a/llvm/test/Transforms/ArgumentPromotion/aggregate-promote-dead-gep.ll 
b/llvm/test/Transforms/ArgumentPromotion/aggregate-promote-dead-gep.ll index 75e802b1510c56..3ff3ac7ac61d75 100644 --- a/llvm/test/Transforms/ArgumentPromotion/aggregate-promote-dead-gep.ll +++ b/llvm/test/Transforms/ArgumentPromotion/aggregate-promote-dead-gep.ll @@ -5,7 +5,7 @@ @G = constant %T { i32 0, i32 0, i32 17, i32 25 } define internal i32 @test(ptr %p) { -; CHECK-LABEL: define {{[^@]+}}@test +; CHECK-LABEL: define {{[^@]+}}@test.argprom ; CHECK-SAME: (i32 [[P_12_VAL:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[V:%.*]] = add i32 [[P_12_VAL]], 10 @@ -24,7 +24,7 @@ define i32 @caller() { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr @G, i64 12 ; CHECK-NEXT: [[G_VAL:%.*]] = load i32, ptr [[TMP0]], align 4 -; CHECK-NEXT: [[V:%.*]] = call i32 @test(i32 [[G_VAL]]) +; CHECK-NEXT: [[V:%.*]] = call i32 @test.argprom(i32 [[G_VAL]]) ; CHECK-NEXT: ret i32 [[V]] ; entry: diff --git a/llvm/test/Transforms/ArgumentPromotion/aggregate-promote.ll b/llvm/test/Transforms/ArgumentPromotion/aggregate-promote.ll index dc5b376850f08c..cbc3d07efc5e9b 100644 --- a/llvm/test/Transforms/ArgumentPromotion/aggregate-promote.ll +++ b/llvm/test/Transforms/ArgumentPromotion/aggregate-promote.ll @@ -5,7 +5,7 @@ @G = constant %T { i32 0, i32 0, i32 17, i32 25 } define internal i32 @test(ptr %p) { -; CHECK-LABEL: define {{[^@]+}}@test +; CHECK-LABEL: define {{[^@]+}}@test.argprom ; CHECK-SAME: (i32 [[P_8_VAL:%.*]], i32 [[P_12_VAL:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[V:%.*]] = add i32 [[P_12_VAL]], [[P_8_VAL]] @@ -27,7 +27,7 @@ define i32 @caller() { ; CHECK-NEXT: [[G_VAL:%.*]] = load i32, ptr [[TMP0]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr @G, i64 12 ; CHECK-NEXT: [[G_VAL1:%.*]] = load i32, ptr [[TMP1]], align 4 -; CHECK-NEXT: [[V:%.*]] = call i32 @test(i32 [[G_VAL]], i32 [[G_VAL1]]) +; CHECK-NEXT: [[V:%.*]] = call i32 @test.argprom(i32 [[G_VAL]], i32 [[G_VAL1]]) ; CHECK-NEXT: ret i32 [[V]] ; entry: diff --git a/llvm/test/Transforms/ArgumentPromotion/align.ll b/llvm/test/Transforms/ArgumentPromotion/align.ll index 656c7c9da5b4af..251f43b2ae7286 100644 --- a/llvm/test/Transforms/ArgumentPromotion/align.ll +++ b/llvm/test/Transforms/ArgumentPromotion/align.ll @@ -2,7 +2,7 @@ ; RUN: opt -S -passes=argpromotion < %s | FileCheck %s define internal i32 @callee_must_exec(ptr %p) { -; CHECK-LABEL: define {{[^@]+}}@callee_must_exec +; CHECK-LABEL: define {{[^@]+}}@callee_must_exec.argprom ; CHECK-SAME: (i32 [[P_0_VAL:%.*]]) { ; CHECK-NEXT: ret i32 [[P_0_VAL]] ; @@ -14,7 +14,7 @@ define void @caller_must_exec(ptr %p) { ; CHECK-LABEL: define {{[^@]+}}@caller_must_exec ; CHECK-SAME: (ptr [[P:%.*]]) { ; CHECK-NEXT: [[P_VAL:%.*]] = load i32, ptr [[P]], align 16 -; CHECK-NEXT: [[TMP1:%.*]] = call i32 @callee_must_exec(i32 [[P_VAL]]) +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @callee_must_exec.argprom(i32 [[P_VAL]]) ; CHECK-NEXT: ret void ; call i32 @callee_must_exec(ptr %p) @@ -22,7 +22,7 @@ define void @caller_must_exec(ptr %p) { } define internal i32 @callee_guaranteed_aligned_1(i1 %c, ptr %p) { -; CHECK-LABEL: define {{[^@]+}}@callee_guaranteed_aligned_1 +; CHECK-LABEL: define {{[^@]+}}@callee_guaranteed_aligned_1.argprom ; CHECK-SAME: (i1 [[C:%.*]], i32 [[P_0_VAL:%.*]]) { ; CHECK-NEXT: br i1 [[C]], label [[IF:%.*]], label [[ELSE:%.*]] ; CHECK: if: @@ -44,7 +44,7 @@ define void @caller_guaranteed_aligned_1(i1 %c, ptr align 16 dereferenceable(4) ; CHECK-LABEL: define {{[^@]+}}@caller_guaranteed_aligned_1 ; CHECK-SAME: (i1 [[C:%.*]], ptr align 16 dereferenceable(4) [[P:%.*]]) 
{ ; CHECK-NEXT: [[P_VAL:%.*]] = load i32, ptr [[P]], align 16 -; CHECK-NEXT: [[TMP1:%.*]] = call i32 @callee_guaranteed_aligned_1(i1 [[C]], i32 [[P_VAL]]) +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @callee_guaranteed_aligned_1.argprom(i1 [[C]], i32 [[P_VAL]]) ; CHECK-NEXT: ret void ; call i32 @callee_guaranteed_aligned_1(i1 %c, ptr %p) @@ -52,7 +52,7 @@ define void @caller_guaranteed_aligned_1(i1 %c, ptr align 16 dereferenceable(4) } define internal i32 @callee_guaranteed_aligned_2(i1 %c, ptr align 16 dereferenceable(4) %p) { -; CHECK-LABEL: define {{[^@]+}}@callee_guaranteed_aligned_2 +; CHECK-LABEL: define {{[^@]+}}@callee_guaranteed_aligned_2.argprom ; CHECK-SAME: (i1 [[C:%.*]], i32 [[P_0_VAL:%.*]]) { ; CHECK-NEXT: br i1 [[C]], label [[IF:%.*]], label [[ELSE:%.*]] ; CHECK: if: @@ -74,7 +74,7 @@ define void @caller_guaranteed_aligned_2(i1 %c, ptr %p) { ; CHECK-LABEL: define {{[^@]+}}@caller_guaranteed_aligned_2 ; CHECK-SAME: (i1 [[C:%.*]], ptr [[P:%.*]]) { ; CHECK-NEXT: [[P_VAL:%.*]] = load i32, ptr [[P]], align 16 -; CHECK-NEXT: [[TMP1:%.*]] = call i32 @callee_guaranteed_aligned_2(i1 [[C]], i32 [[P_VAL]]) +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @callee_guaranteed_aligned_2.argprom(i1 [[C]], i32 [[P_VAL]]) ; CHECK-NEXT: ret void ; call i32 @callee_guaranteed_aligned_2(i1 %c, ptr %p) @@ -83,7 +83,7 @@ define void @caller_guaranteed_aligned_2(i1 %c, ptr %p) { ; We have seen the offset before but with a lower alignment define internal i32 @callee_guaranteed_aligned_3(i1 %c, ptr align 16 dereferenceable(4) %p) { -; CHECK-LABEL: define {{[^@]+}}@callee_guaranteed_aligned_3 +; CHECK-LABEL: define {{[^@]+}}@callee_guaranteed_aligned_3.argprom ; CHECK-SAME: (i1 [[C:%.*]], i32 [[P_0_VAL:%.*]]) { ; CHECK-NEXT: br i1 [[C]], label [[IF:%.*]], label [[ELSE:%.*]] ; CHECK: if: @@ -106,7 +106,7 @@ define void @caller_guaranteed_aligned_3(i1 %c, ptr %p) { ; CHECK-LABEL: define {{[^@]+}}@caller_guaranteed_aligned_3 ; CHECK-SAME: (i1 [[C:%.*]], ptr [[P:%.*]]) { ; CHECK-NEXT: [[P_VAL:%.*]] = load i32, ptr [[P]], align 16 -; CHECK-NEXT: [[TMP1:%.*]] = call i32 @callee_guaranteed_aligned_3(i1 [[C]], i32 [[P_VAL]]) +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @callee_guaranteed_aligned_3.argprom(i1 [[C]], i32 [[P_VAL]]) ; CHECK-NEXT: ret void ; call i32 @callee_guaranteed_aligned_3(i1 %c, ptr %p) diff --git a/llvm/test/Transforms/ArgumentPromotion/allocsize.ll b/llvm/test/Transforms/ArgumentPromotion/allocsize.ll index 36271e17c9d76d..ca648f5a012cc4 100644 --- a/llvm/test/Transforms/ArgumentPromotion/allocsize.ll +++ b/llvm/test/Transforms/ArgumentPromotion/allocsize.ll @@ -6,7 +6,7 @@ declare ptr @calloc(i64, i64) define internal ptr @my_alloc1(i64 %unchanged, ptr %unused, i64 %size, ptr %unused2) allocsize(2) { ; CHECK: Function Attrs: allocsize(1) -; CHECK-LABEL: define internal ptr @my_alloc1( +; CHECK-LABEL: define internal ptr @my_alloc1.argprom( ; CHECK-SAME: i64 [[UNCHANGED:%.*]], i64 [[SIZE:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[PTR:%.*]] = call ptr @malloc(i64 [[SIZE]]) ; CHECK-NEXT: ret ptr [[PTR]] @@ -17,7 +17,7 @@ define internal ptr @my_alloc1(i64 %unchanged, ptr %unused, i64 %size, ptr %unus define internal ptr @my_alloc2(i64 %unchanged, ptr %unused, i64 %size, i64 %size2, ptr %unused2) allocsize(2,3) { ; CHECK: Function Attrs: allocsize(1,2) -; CHECK-LABEL: define internal ptr @my_alloc2( +; CHECK-LABEL: define internal ptr @my_alloc2.argprom( ; CHECK-SAME: i64 [[UNCHANGED:%.*]], i64 [[SIZE:%.*]], i64 [[SIZE2:%.*]]) #[[ATTR1:[0-9]+]] { ; CHECK-NEXT: [[PTR:%.*]] = call ptr @calloc(i64 [[SIZE]], i64 
[[SIZE2]]) ; CHECK-NEXT: ret ptr [[PTR]] @@ -28,7 +28,7 @@ define internal ptr @my_alloc2(i64 %unchanged, ptr %unused, i64 %size, i64 %size define internal ptr @my_alloc3(i64 %unchanged, ptr %promoted, ptr %promoted2, i64 %size) allocsize(3) { ; CHECK: Function Attrs: allocsize(5) -; CHECK-LABEL: define internal ptr @my_alloc3( +; CHECK-LABEL: define internal ptr @my_alloc3.argprom( ; CHECK-SAME: i64 [[UNCHANGED:%.*]], i32 [[PROMOTED_0_VAL:%.*]], i32 [[PROMOTED_4_VAL:%.*]], i32 [[PROMOTED2_0_VAL:%.*]], i32 [[PROMOTED2_4_VAL:%.*]], i64 [[SIZE:%.*]]) #[[ATTR2:[0-9]+]] { ; CHECK-NEXT: [[PTR:%.*]] = call ptr @malloc(i64 [[SIZE]]) ; CHECK-NEXT: ret ptr [[PTR]] @@ -47,7 +47,7 @@ define internal ptr @my_alloc3(i64 %unchanged, ptr %promoted, ptr %promoted2, i6 define internal ptr @my_alloc4(i64 %unchanged, ptr %promoted, ptr %promoted2, i64 %size, i64 %size2) allocsize(3,4) { ; CHECK: Function Attrs: allocsize(5,6) -; CHECK-LABEL: define internal ptr @my_alloc4( +; CHECK-LABEL: define internal ptr @my_alloc4.argprom( ; CHECK-SAME: i64 [[UNCHANGED:%.*]], i32 [[PROMOTED_0_VAL:%.*]], i32 [[PROMOTED_4_VAL:%.*]], i32 [[PROMOTED2_0_VAL:%.*]], i32 [[PROMOTED2_4_VAL:%.*]], i64 [[SIZE:%.*]], i64 [[SIZE2:%.*]]) #[[ATTR3:[0-9]+]] { ; CHECK-NEXT: [[PTR:%.*]] = call ptr @calloc(i64 [[SIZE]], i64 [[SIZE2]]) ; CHECK-NEXT: ret ptr [[PTR]] @@ -67,22 +67,22 @@ define internal ptr @my_alloc4(i64 %unchanged, ptr %promoted, ptr %promoted2, i6 define void @call_my_alloc(ptr %arg, ptr %arg2) { ; CHECK-LABEL: define void @call_my_alloc( ; CHECK-SAME: ptr [[ARG:%.*]], ptr [[ARG2:%.*]]) { -; CHECK-NEXT: [[TMP1:%.*]] = call ptr @my_alloc1(i64 0, i64 2) -; CHECK-NEXT: [[TMP2:%.*]] = call ptr @my_alloc2(i64 0, i64 2, i64 2) +; CHECK-NEXT: [[TMP1:%.*]] = call ptr @my_alloc1.argprom(i64 0, i64 2) +; CHECK-NEXT: [[TMP2:%.*]] = call ptr @my_alloc2.argprom(i64 0, i64 2, i64 2) ; CHECK-NEXT: [[ARG_VAL:%.*]] = load i32, ptr [[ARG]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[ARG]], i64 4 ; CHECK-NEXT: [[ARG_VAL1:%.*]] = load i32, ptr [[TMP3]], align 4 ; CHECK-NEXT: [[ARG2_VAL:%.*]] = load i32, ptr [[ARG2]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[ARG2]], i64 4 ; CHECK-NEXT: [[ARG2_VAL2:%.*]] = load i32, ptr [[TMP4]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = call ptr @my_alloc3(i64 0, i32 [[ARG_VAL]], i32 [[ARG_VAL1]], i32 [[ARG2_VAL]], i32 [[ARG2_VAL2]], i64 2) +; CHECK-NEXT: [[TMP5:%.*]] = call ptr @my_alloc3.argprom(i64 0, i32 [[ARG_VAL]], i32 [[ARG_VAL1]], i32 [[ARG2_VAL]], i32 [[ARG2_VAL2]], i64 2) ; CHECK-NEXT: [[ARG_VAL3:%.*]] = load i32, ptr [[ARG]], align 4 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[ARG]], i64 4 ; CHECK-NEXT: [[ARG_VAL4:%.*]] = load i32, ptr [[TMP6]], align 4 ; CHECK-NEXT: [[ARG2_VAL5:%.*]] = load i32, ptr [[ARG2]], align 4 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[ARG2]], i64 4 ; CHECK-NEXT: [[ARG2_VAL6:%.*]] = load i32, ptr [[TMP7]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = call ptr @my_alloc4(i64 0, i32 [[ARG_VAL3]], i32 [[ARG_VAL4]], i32 [[ARG2_VAL5]], i32 [[ARG2_VAL6]], i64 2, i64 2) +; CHECK-NEXT: [[TMP8:%.*]] = call ptr @my_alloc4.argprom(i64 0, i32 [[ARG_VAL3]], i32 [[ARG_VAL4]], i32 [[ARG2_VAL5]], i32 [[ARG2_VAL6]], i64 2, i64 2) ; CHECK-NEXT: ret void ; %ptr = call ptr @my_alloc1(i64 0, ptr null, i64 2, ptr null) diff --git a/llvm/test/Transforms/ArgumentPromotion/attrs.ll b/llvm/test/Transforms/ArgumentPromotion/attrs.ll index 2b68ef2e403ba0..665065b3c35096 100644 --- a/llvm/test/Transforms/ArgumentPromotion/attrs.ll +++ 
b/llvm/test/Transforms/ArgumentPromotion/attrs.ll @@ -4,7 +4,7 @@ %struct.ss = type { i32, i64 } define internal void @f(ptr byval(%struct.ss) align 4 %b, ptr byval(i32) align 4 %X, i32 %i) nounwind { -; CHECK-LABEL: define {{[^@]+}}@f +; CHECK-LABEL: define {{[^@]+}}@f.argprom ; CHECK-SAME: (i32 [[B_0:%.*]], i32 [[X:%.*]], i32 [[I:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TEMP:%.*]] = add i32 [[B_0]], 1 @@ -30,7 +30,7 @@ define i32 @test(ptr %X) { ; CHECK-NEXT: store i64 2, ptr [[TEMP4]], align 4 ; CHECK-NEXT: [[S_0_VAL:%.*]] = load i32, ptr [[S]], align 4 ; CHECK-NEXT: [[X_VAL:%.*]] = load i32, ptr [[X]], align 4 -; CHECK-NEXT: call void @f(i32 [[S_0_VAL]], i32 [[X_VAL]], i32 zeroext 0) +; CHECK-NEXT: call void @f.argprom(i32 [[S_0_VAL]], i32 [[X_VAL]], i32 zeroext 0) ; CHECK-NEXT: ret i32 0 ; entry: diff --git a/llvm/test/Transforms/ArgumentPromotion/basictest.ll b/llvm/test/Transforms/ArgumentPromotion/basictest.ll index ba84ac126fe49b..47518f73ec07a7 100644 --- a/llvm/test/Transforms/ArgumentPromotion/basictest.ll +++ b/llvm/test/Transforms/ArgumentPromotion/basictest.ll @@ -3,7 +3,7 @@ target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128" define internal i32 @test(ptr %X, ptr %Y) { -; CHECK-LABEL: define {{[^@]+}}@test +; CHECK-LABEL: define {{[^@]+}}@test.argprom ; CHECK-SAME: (i32 [[X_0_VAL:%.*]], i32 [[Y_0_VAL:%.*]]) { ; CHECK-NEXT: [[C:%.*]] = add i32 [[X_0_VAL]], [[Y_0_VAL]] ; CHECK-NEXT: ret i32 [[C]] @@ -15,9 +15,9 @@ define internal i32 @test(ptr %X, ptr %Y) { } define internal i32 @caller(ptr %B) { -; CHECK-LABEL: define {{[^@]+}}@caller +; CHECK-LABEL: define {{[^@]+}}@caller.argprom ; CHECK-SAME: (i32 [[B_0_VAL:%.*]]) { -; CHECK-NEXT: [[C:%.*]] = call i32 @test(i32 1, i32 [[B_0_VAL]]) +; CHECK-NEXT: [[C:%.*]] = call i32 @test.argprom(i32 1, i32 [[B_0_VAL]]) ; CHECK-NEXT: ret i32 [[C]] ; %A = alloca i32 @@ -28,7 +28,7 @@ define internal i32 @caller(ptr %B) { define i32 @callercaller() { ; CHECK-LABEL: define {{[^@]+}}@callercaller() { -; CHECK-NEXT: [[X:%.*]] = call i32 @caller(i32 2) +; CHECK-NEXT: [[X:%.*]] = call i32 @caller.argprom(i32 2) ; CHECK-NEXT: ret i32 [[X]] ; %B = alloca i32 diff --git a/llvm/test/Transforms/ArgumentPromotion/bitcasts.ll b/llvm/test/Transforms/ArgumentPromotion/bitcasts.ll index 6f2c322d7877be..bc4e5cc13b160a 100644 --- a/llvm/test/Transforms/ArgumentPromotion/bitcasts.ll +++ b/llvm/test/Transforms/ArgumentPromotion/bitcasts.ll @@ -6,7 +6,7 @@ %opaque = type opaque define internal i32 @callee_basic(ptr %p) { -; CHECK-LABEL: define {{[^@]+}}@callee_basic +; CHECK-LABEL: define {{[^@]+}}@callee_basic.argprom ; CHECK-SAME: (i32 [[P_0_VAL:%.*]], i32 [[P_4_VAL:%.*]]) { ; CHECK-NEXT: [[Z:%.*]] = add i32 [[P_0_VAL]], [[P_4_VAL]] ; CHECK-NEXT: ret i32 [[Z]] @@ -24,7 +24,7 @@ define void @caller_basic(ptr %p) { ; CHECK-NEXT: [[P_VAL:%.*]] = load i32, ptr [[P]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[P]], i64 4 ; CHECK-NEXT: [[P_VAL1:%.*]] = load i32, ptr [[TMP2]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = call i32 @callee_basic(i32 [[P_VAL]], i32 [[P_VAL1]]) +; CHECK-NEXT: [[TMP4:%.*]] = call i32 @callee_basic.argprom(i32 [[P_VAL]], i32 [[P_VAL1]]) ; CHECK-NEXT: ret void ; call i32 @callee_basic(ptr %p) @@ -32,7 +32,7 @@ define void @caller_basic(ptr %p) { } define internal i32 @callee_opaque(ptr %p) { -; CHECK-LABEL: define {{[^@]+}}@callee_opaque +; CHECK-LABEL: define {{[^@]+}}@callee_opaque.argprom ; CHECK-SAME: (i32 [[P_0_VAL:%.*]], 
i32 [[P_4_VAL:%.*]]) { ; CHECK-NEXT: [[Z:%.*]] = add i32 [[P_0_VAL]], [[P_4_VAL]] ; CHECK-NEXT: ret i32 [[Z]] @@ -50,7 +50,7 @@ define void @caller_opaque(ptr %p) { ; CHECK-NEXT: [[P_VAL:%.*]] = load i32, ptr [[P]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[P]], i64 4 ; CHECK-NEXT: [[P_VAL1:%.*]] = load i32, ptr [[TMP3]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = call i32 @callee_opaque(i32 [[P_VAL]], i32 [[P_VAL1]]) +; CHECK-NEXT: [[TMP5:%.*]] = call i32 @callee_opaque.argprom(i32 [[P_VAL]], i32 [[P_VAL1]]) ; CHECK-NEXT: ret void ; call i32 @callee_opaque(ptr %p) diff --git a/llvm/test/Transforms/ArgumentPromotion/byval-2.ll b/llvm/test/Transforms/ArgumentPromotion/byval-2.ll index 3d0e9f2958444f..9147a42fc7fc6b 100644 --- a/llvm/test/Transforms/ArgumentPromotion/byval-2.ll +++ b/llvm/test/Transforms/ArgumentPromotion/byval-2.ll @@ -6,7 +6,7 @@ %struct.ss = type { i32, i64 } define internal void @f(ptr byval(%struct.ss) align 8 %b, ptr byval(i32) align 4 %X) nounwind { -; CHECK-LABEL: define {{[^@]+}}@f +; CHECK-LABEL: define {{[^@]+}}@f.argprom ; CHECK-SAME: (i32 [[B_0:%.*]], i32 [[X:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TEMP:%.*]] = add i32 [[B_0]], 1 @@ -31,7 +31,7 @@ define i32 @test(ptr %X) { ; CHECK-NEXT: store i64 2, ptr [[TEMP4]], align 4 ; CHECK-NEXT: [[S_0_VAL:%.*]] = load i32, ptr [[S]], align 4 ; CHECK-NEXT: [[X_VAL:%.*]] = load i32, ptr [[X]], align 4 -; CHECK-NEXT: call void @f(i32 [[S_0_VAL]], i32 [[X_VAL]]) +; CHECK-NEXT: call void @f.argprom(i32 [[S_0_VAL]], i32 [[X_VAL]]) ; CHECK-NEXT: ret i32 0 ; entry: diff --git a/llvm/test/Transforms/ArgumentPromotion/byval-with-padding.ll b/llvm/test/Transforms/ArgumentPromotion/byval-with-padding.ll index 9089470b7d3853..fe3820617e2d77 100644 --- a/llvm/test/Transforms/ArgumentPromotion/byval-with-padding.ll +++ b/llvm/test/Transforms/ArgumentPromotion/byval-with-padding.ll @@ -4,7 +4,7 @@ %struct.A = type { float, [12 x i8], i64, [8 x i8] } define internal float @callee(ptr byval(%struct.A) align 32 %0) { -; CHECK-LABEL: define {{[^@]+}}@callee +; CHECK-LABEL: define {{[^@]+}}@callee.argprom ; CHECK-SAME: (float [[DOT0_VAL:%.*]], i64 [[DOT16_VAL:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = fadd float 0.000000e+00, [[DOT0_VAL]] ; CHECK-NEXT: [[TMP2:%.*]] = uitofp i64 [[DOT16_VAL]] to float @@ -30,7 +30,7 @@ define float @caller(float %0) { ; CHECK-NEXT: [[DOTVAL:%.*]] = load float, ptr [[TMP2]], align 32 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[TMP2]], i64 16 ; CHECK-NEXT: [[DOTVAL1:%.*]] = load i64, ptr [[TMP4]], align 16 -; CHECK-NEXT: [[TMP5:%.*]] = call noundef float @callee(float [[DOTVAL]], i64 [[DOTVAL1]]) +; CHECK-NEXT: [[TMP5:%.*]] = call noundef float @callee.argprom(float [[DOTVAL]], i64 [[DOTVAL1]]) ; CHECK-NEXT: ret float [[TMP5]] ; %2 = alloca %struct.A, align 32 diff --git a/llvm/test/Transforms/ArgumentPromotion/byval.ll b/llvm/test/Transforms/ArgumentPromotion/byval.ll index 13a60a96359212..424425b30767ed 100644 --- a/llvm/test/Transforms/ArgumentPromotion/byval.ll +++ b/llvm/test/Transforms/ArgumentPromotion/byval.ll @@ -6,7 +6,7 @@ target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:1 %struct.ss = type { i32, i64 } define internal void @f(ptr byval(%struct.ss) align 4 %b) nounwind { -; CHECK-LABEL: define {{[^@]+}}@f +; CHECK-LABEL: define {{[^@]+}}@f.argprom ; CHECK-SAME: (i32 [[B_0_VAL:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TEMP2:%.*]] = add i32 [[B_0_VAL]], 1 @@ -20,7 +20,7 @@ entry: } define internal void @g(ptr 
byval(%struct.ss) align 32 %b) nounwind { -; CHECK-LABEL: define {{[^@]+}}@g +; CHECK-LABEL: define {{[^@]+}}@g.argprom ; CHECK-SAME: (i32 [[B_0_VAL:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TEMP2:%.*]] = add i32 [[B_0_VAL]], 1 @@ -56,7 +56,7 @@ entry: ; Transform even if an argument is written to and then is loaded from. define internal void @k(ptr byval(%struct.ss) align 4 %b) nounwind { -; CHECK-LABEL: define {{[^@]+}}@k +; CHECK-LABEL: define {{[^@]+}}@k.argprom ; CHECK-SAME: (i32 [[B_0_VAL:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TEMP2:%.*]] = add i32 [[B_0_VAL]], 1 @@ -72,7 +72,7 @@ entry: ; Transform even if a store instruction is the single user. define internal void @l(ptr byval(%struct.ss) align 4 %b) nounwind { -; CHECK-LABEL: define {{[^@]+}}@l +; CHECK-LABEL: define {{[^@]+}}@l.argprom ; CHECK-SAME: (i32 [[B_0_VAL:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: ret void @@ -85,7 +85,7 @@ entry: ; Transform all the arguments creating the required number of 'alloca's and ; then optimize them out. define internal void @m(ptr byval(%struct.ss) align 4 %b, ptr byval(%struct.ss) align 4 %c) nounwind { -; CHECK-LABEL: define {{[^@]+}}@m +; CHECK-LABEL: define {{[^@]+}}@m.argprom ; CHECK-SAME: (i32 [[B_0_VAL:%.*]], i32 [[C_0_VAL:%.*]], i64 [[C_4_VAL:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TEMP2:%.*]] = add i32 [[B_0_VAL]], 1 @@ -116,19 +116,19 @@ define i32 @main() nounwind { ; CHECK-NEXT: [[TEMP4:%.*]] = getelementptr [[STRUCT_SS]], ptr [[S]], i32 0, i32 1 ; CHECK-NEXT: store i64 2, ptr [[TEMP4]], align 4 ; CHECK-NEXT: [[S_VAL:%.*]] = load i32, ptr [[S]], align 4 -; CHECK-NEXT: call void @f(i32 [[S_VAL]]) +; CHECK-NEXT: call void @f.argprom(i32 [[S_VAL]]) ; CHECK-NEXT: [[S_VAL1:%.*]] = load i32, ptr [[S]], align 4 -; CHECK-NEXT: call void @g(i32 [[S_VAL1]]) +; CHECK-NEXT: call void @g.argprom(i32 [[S_VAL1]]) ; CHECK-NEXT: call void @h(ptr byval([[STRUCT_SS]]) [[S]]) ; CHECK-NEXT: [[S_VAL2:%.*]] = load i32, ptr [[S]], align 4 -; CHECK-NEXT: call void @k(i32 [[S_VAL2]]) +; CHECK-NEXT: call void @k.argprom(i32 [[S_VAL2]]) ; CHECK-NEXT: [[S_VAL3:%.*]] = load i32, ptr [[S]], align 4 -; CHECK-NEXT: call void @l(i32 [[S_VAL3]]) +; CHECK-NEXT: call void @l.argprom(i32 [[S_VAL3]]) ; CHECK-NEXT: [[S_VAL4:%.*]] = load i32, ptr [[S]], align 4 ; CHECK-NEXT: [[S_VAL5:%.*]] = load i32, ptr [[S]], align 4 ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[S]], i64 4 ; CHECK-NEXT: [[S_VAL6:%.*]] = load i64, ptr [[TMP0]], align 8 -; CHECK-NEXT: call void @m(i32 [[S_VAL4]], i32 [[S_VAL5]], i64 [[S_VAL6]]) +; CHECK-NEXT: call void @m.argprom(i32 [[S_VAL4]], i32 [[S_VAL5]], i64 [[S_VAL6]]) ; CHECK-NEXT: ret i32 0 ; entry: diff --git a/llvm/test/Transforms/ArgumentPromotion/chained.ll b/llvm/test/Transforms/ArgumentPromotion/chained.ll index 2fb80a39875688..dba6726ea9b1f7 100644 --- a/llvm/test/Transforms/ArgumentPromotion/chained.ll +++ b/llvm/test/Transforms/ArgumentPromotion/chained.ll @@ -5,7 +5,7 @@ @G2 = constant ptr @G1 define internal i32 @test(ptr %x) { -; CHECK-LABEL: define {{[^@]+}}@test +; CHECK-LABEL: define {{[^@]+}}@test.argprom.argprom ; CHECK-SAME: (i32 [[X_0_VAL_0_VAL:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: ret i32 [[X_0_VAL_0_VAL]] @@ -21,7 +21,7 @@ define i32 @caller() { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[G2_VAL:%.*]] = load ptr, ptr @G2, align 8 ; CHECK-NEXT: [[G2_VAL_VAL:%.*]] = load i32, ptr [[G2_VAL]], align 4 -; CHECK-NEXT: [[X:%.*]] = call i32 @test(i32 [[G2_VAL_VAL]]) +; CHECK-NEXT: [[X:%.*]] = call i32 
@test.argprom.argprom(i32 [[G2_VAL_VAL]]) ; CHECK-NEXT: ret i32 [[X]] ; entry: diff --git a/llvm/test/Transforms/ArgumentPromotion/control-flow2.ll b/llvm/test/Transforms/ArgumentPromotion/control-flow2.ll index 8df89033c0d8da..7fb572551b0cfa 100644 --- a/llvm/test/Transforms/ArgumentPromotion/control-flow2.ll +++ b/llvm/test/Transforms/ArgumentPromotion/control-flow2.ll @@ -4,7 +4,7 @@ target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128" define internal i32 @callee(i1 %C, ptr %P) { -; CHECK-LABEL: define {{[^@]+}}@callee +; CHECK-LABEL: define {{[^@]+}}@callee.argprom ; CHECK-SAME: (i1 [[C:%.*]], i32 [[P_0_VAL:%.*]]) { ; CHECK-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]] ; CHECK: T: @@ -27,7 +27,7 @@ define i32 @foo() { ; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 ; CHECK-NEXT: store i32 17, ptr [[A]], align 4 ; CHECK-NEXT: [[A_VAL:%.*]] = load i32, ptr [[A]], align 4 -; CHECK-NEXT: [[X:%.*]] = call i32 @callee(i1 false, i32 [[A_VAL]]) +; CHECK-NEXT: [[X:%.*]] = call i32 @callee.argprom(i1 false, i32 [[A_VAL]]) ; CHECK-NEXT: ret i32 [[X]] ; %A = alloca i32 ; [#uses=2] diff --git a/llvm/test/Transforms/ArgumentPromotion/crash.ll b/llvm/test/Transforms/ArgumentPromotion/crash.ll index 12caae4dbef8df..0d15d7876dae61 100644 --- a/llvm/test/Transforms/ArgumentPromotion/crash.ll +++ b/llvm/test/Transforms/ArgumentPromotion/crash.ll @@ -44,7 +44,7 @@ bb: } define internal i1 @eggs(ptr %arg) { -; ARGPROMOTION-LABEL: define {{[^@]+}}@eggs() { +; ARGPROMOTION-LABEL: define {{[^@]+}}@eggs.argprom() { ; ARGPROMOTION-NEXT: bb: ; ARGPROMOTION-NEXT: unreachable ; diff --git a/llvm/test/Transforms/ArgumentPromotion/dbg.ll b/llvm/test/Transforms/ArgumentPromotion/dbg.ll index 6a14facfb36a22..15ed2cc2d20a6a 100644 --- a/llvm/test/Transforms/ArgumentPromotion/dbg.ll +++ b/llvm/test/Transforms/ArgumentPromotion/dbg.ll @@ -4,7 +4,7 @@ declare void @sink(i32) define internal void @test(ptr %X) !dbg !2 { -; CHECK-LABEL: define {{[^@]+}}@test +; CHECK-LABEL: define {{[^@]+}}@test.argprom.argprom ; CHECK-SAME: (i32 [[X_0_VAL_0_VAL:%.*]]) !dbg [[DBG3:![0-9]+]] { ; CHECK-NEXT: call void @sink(i32 [[X_0_VAL_0_VAL]]) ; CHECK-NEXT: ret void @@ -37,7 +37,7 @@ define void @caller(ptr %Y, ptr %P) { ; CHECK-SAME: (ptr [[Y:%.*]], ptr [[P:%.*]]) { ; CHECK-NEXT: [[Y_VAL:%.*]] = load ptr, ptr [[Y]], align 8, !dbg [[DBG4:![0-9]+]] ; CHECK-NEXT: [[Y_VAL_VAL:%.*]] = load i32, ptr [[Y_VAL]], align 8, !dbg [[DBG4]] -; CHECK-NEXT: call void @test(i32 [[Y_VAL_VAL]]), !dbg [[DBG4]] +; CHECK-NEXT: call void @test.argprom.argprom(i32 [[Y_VAL_VAL]]), !dbg [[DBG4]] ; CHECK-NEXT: call void @test_byval(ptr byval([[STRUCT_PAIR:%.*]]) align 4 [[P]]), !dbg [[DBG5:![0-9]+]] ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/ArgumentPromotion/fp80.ll b/llvm/test/Transforms/ArgumentPromotion/fp80.ll index 1e3d01a2361b92..7e2a595b8f95f8 100644 --- a/llvm/test/Transforms/ArgumentPromotion/fp80.ll +++ b/llvm/test/Transforms/ArgumentPromotion/fp80.ll @@ -16,12 +16,12 @@ define void @run() { ; CHECK-LABEL: define {{[^@]+}}@run() { ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr @b, i64 10 ; CHECK-NEXT: [[B_VAL:%.*]] = load i8, ptr [[TMP1]], align 1 -; CHECK-NEXT: [[TMP2:%.*]] = tail call i8 @UseLongDoubleUnsafely(i8 [[B_VAL]]) +; CHECK-NEXT: [[TMP2:%.*]] = tail call i8 @UseLongDoubleUnsafely.argprom(i8 [[B_VAL]]) ; CHECK-NEXT: [[B_VAL1:%.*]] = load x86_fp80, ptr @b, align 16 -; CHECK-NEXT: [[TMP3:%.*]] = tail call x86_fp80 @UseLongDoubleSafely(x86_fp80 
[[B_VAL1]]) +; CHECK-NEXT: [[TMP3:%.*]] = tail call x86_fp80 @UseLongDoubleSafely.argprom(x86_fp80 [[B_VAL1]]) ; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_fp80 @UseLongDoubleSafelyNoPromotion(ptr byval([[UNION_U:%.*]]) align 16 @b) ; CHECK-NEXT: [[A_VAL:%.*]] = load i64, ptr @a, align 8 -; CHECK-NEXT: [[TMP5:%.*]] = call i64 @AccessPaddingOfStruct(i64 [[A_VAL]]) +; CHECK-NEXT: [[TMP5:%.*]] = call i64 @AccessPaddingOfStruct.argprom(i64 [[A_VAL]]) ; CHECK-NEXT: [[TMP6:%.*]] = call i64 @CaptureAStruct(ptr byval([[STRUCT_FOO:%.*]]) @a) ; CHECK-NEXT: ret void ; @@ -34,7 +34,7 @@ define void @run() { } define internal i8 @UseLongDoubleUnsafely(ptr byval(%union.u) align 16 %arg) { -; CHECK-LABEL: define {{[^@]+}}@UseLongDoubleUnsafely +; CHECK-LABEL: define {{[^@]+}}@UseLongDoubleUnsafely.argprom ; CHECK-SAME: (i8 [[ARG_10_VAL:%.*]]) { ; CHECK-NEXT: ret i8 [[ARG_10_VAL]] ; @@ -44,7 +44,7 @@ define internal i8 @UseLongDoubleUnsafely(ptr byval(%union.u) align 16 %arg) { } define internal x86_fp80 @UseLongDoubleSafely(ptr byval(%union.u) align 16 %arg) { -; CHECK-LABEL: define {{[^@]+}}@UseLongDoubleSafely +; CHECK-LABEL: define {{[^@]+}}@UseLongDoubleSafely.argprom ; CHECK-SAME: (x86_fp80 [[ARG_0_VAL:%.*]]) { ; CHECK-NEXT: ret x86_fp80 [[ARG_0_VAL]] ; @@ -71,7 +71,7 @@ define internal x86_fp80 @UseLongDoubleSafelyNoPromotion(ptr byval(%union.u) ali } define internal i64 @AccessPaddingOfStruct(ptr byval(%struct.Foo) %a) { -; CHECK-LABEL: define {{[^@]+}}@AccessPaddingOfStruct +; CHECK-LABEL: define {{[^@]+}}@AccessPaddingOfStruct.argprom ; CHECK-SAME: (i64 [[A_0_VAL:%.*]]) { ; CHECK-NEXT: ret i64 [[A_0_VAL]] ; diff --git a/llvm/test/Transforms/ArgumentPromotion/inalloca.ll b/llvm/test/Transforms/ArgumentPromotion/inalloca.ll index f6a101cf38a47a..6a5cf841f99440 100644 --- a/llvm/test/Transforms/ArgumentPromotion/inalloca.ll +++ b/llvm/test/Transforms/ArgumentPromotion/inalloca.ll @@ -7,7 +7,7 @@ target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:1 ; Argpromote + sroa should change this to passing the two integers by value. 
define internal i32 @f(ptr inalloca(%struct.ss) %s) { -; CHECK-LABEL: define {{[^@]+}}@f +; CHECK-LABEL: define {{[^@]+}}@f.argprom ; CHECK-SAME: (i32 [[S_0_VAL:%.*]], i32 [[S_4_VAL:%.*]]) unnamed_addr { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[R:%.*]] = add i32 [[S_0_VAL]], [[S_4_VAL]] @@ -24,7 +24,7 @@ entry: define i32 @main() { ; CHECK-LABEL: define {{[^@]+}}@main() local_unnamed_addr { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[R:%.*]] = call fastcc i32 @f(i32 1, i32 2) +; CHECK-NEXT: [[R:%.*]] = call fastcc i32 @f.argprom(i32 1, i32 2) ; CHECK-NEXT: ret i32 [[R]] ; entry: diff --git a/llvm/test/Transforms/ArgumentPromotion/invalidation.ll b/llvm/test/Transforms/ArgumentPromotion/invalidation.ll index fe8f3b52f8dc5f..66de6bdfec503d 100644 --- a/llvm/test/Transforms/ArgumentPromotion/invalidation.ll +++ b/llvm/test/Transforms/ArgumentPromotion/invalidation.ll @@ -12,7 +12,7 @@ @G = constant i32 0 define internal i32 @a(ptr %x) { -; CHECK-LABEL: define {{[^@]+}}@a +; CHECK-LABEL: define {{[^@]+}}@a.argprom ; CHECK-SAME: (i32 [[X_0_VAL:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: ret i32 [[X_0_VAL]] @@ -26,7 +26,7 @@ define i32 @b() { ; CHECK-LABEL: define {{[^@]+}}@b() { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[G_VAL:%.*]] = load i32, ptr @G, align 4 -; CHECK-NEXT: [[V:%.*]] = call i32 @a(i32 [[G_VAL]]) +; CHECK-NEXT: [[V:%.*]] = call i32 @a.argprom(i32 [[G_VAL]]) ; CHECK-NEXT: ret i32 [[V]] ; entry: @@ -38,7 +38,7 @@ define i32 @c() { ; CHECK-LABEL: define {{[^@]+}}@c() { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[G_VAL:%.*]] = load i32, ptr @G, align 4 -; CHECK-NEXT: [[V1:%.*]] = call i32 @a(i32 [[G_VAL]]) +; CHECK-NEXT: [[V1:%.*]] = call i32 @a.argprom(i32 [[G_VAL]]) ; CHECK-NEXT: [[V2:%.*]] = call i32 @b() ; CHECK-NEXT: [[RESULT:%.*]] = add i32 [[V1]], [[V2]] ; CHECK-NEXT: ret i32 [[RESULT]] diff --git a/llvm/test/Transforms/ArgumentPromotion/load-alignment-value-overflows-addrspace-size.ll b/llvm/test/Transforms/ArgumentPromotion/load-alignment-value-overflows-addrspace-size.ll index 659d1331700a0f..e263330caaf06f 100644 --- a/llvm/test/Transforms/ArgumentPromotion/load-alignment-value-overflows-addrspace-size.ll +++ b/llvm/test/Transforms/ArgumentPromotion/load-alignment-value-overflows-addrspace-size.ll @@ -66,7 +66,7 @@ define internal void @call_load_maxalign_alloca_maxalign() { ; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [13 x i16], align 4294967296, addrspace(5) ; CHECK-NEXT: [[ADDRSPACECAST:%.*]] = addrspacecast ptr addrspace(5) [[ALLOCA]] to ptr ; CHECK-NEXT: [[ADDRSPACECAST_VAL:%.*]] = load i32, ptr [[ADDRSPACECAST]], align 4294967296 -; CHECK-NEXT: call void @load_maxalign1(i32 [[ADDRSPACECAST_VAL]]) +; CHECK-NEXT: call void @load_maxalign1.argprom(i32 [[ADDRSPACECAST_VAL]]) ; CHECK-NEXT: ret void ; bb: @@ -77,7 +77,7 @@ bb: } define internal void @load_maxalign1(ptr %arg) { -; CHECK-LABEL: define internal void @load_maxalign1 +; CHECK-LABEL: define internal void @load_maxalign1.argprom ; CHECK-SAME: (i32 [[ARG_0_VAL:%.*]]) { ; CHECK-NEXT: bb: ; CHECK-NEXT: br label [[BB1:%.*]] @@ -110,7 +110,7 @@ define internal void @call_load_maxalign_alloca_ptr128() { ; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [13 x i16], align 4294967296, addrspace(6) ; CHECK-NEXT: [[ADDRSPACECAST:%.*]] = addrspacecast ptr addrspace(6) [[ALLOCA]] to ptr ; CHECK-NEXT: [[ADDRSPACECAST_VAL:%.*]] = load i32, ptr [[ADDRSPACECAST]], align 4294967296 -; CHECK-NEXT: call void @load_maxalign2(i32 [[ADDRSPACECAST_VAL]]) +; CHECK-NEXT: call void @load_maxalign2.argprom(i32 [[ADDRSPACECAST_VAL]]) ; CHECK-NEXT: ret void ; bb: @@ -121,7 +121,7 
@@ bb: } define internal void @load_maxalign2(ptr %arg) { -; CHECK-LABEL: define internal void @load_maxalign2 +; CHECK-LABEL: define internal void @load_maxalign2.argprom ; CHECK-SAME: (i32 [[ARG_0_VAL:%.*]]) { ; CHECK-NEXT: bb: ; CHECK-NEXT: br label [[BB1:%.*]] diff --git a/llvm/test/Transforms/ArgumentPromotion/max-elements-limit.ll b/llvm/test/Transforms/ArgumentPromotion/max-elements-limit.ll index 06293e8bbe7580..424238280f7fce 100644 --- a/llvm/test/Transforms/ArgumentPromotion/max-elements-limit.ll +++ b/llvm/test/Transforms/ArgumentPromotion/max-elements-limit.ll @@ -2,7 +2,7 @@ ; RUN: opt -passes=argpromotion -S %s | FileCheck %s define internal i32 @callee2(ptr noundef %0) { -; CHECK-LABEL: define {{[^@]+}}@callee2 +; CHECK-LABEL: define {{[^@]+}}@callee2.argprom ; CHECK-SAME: (i32 [[DOT0_VAL:%.*]], i32 [[DOT4_VAL:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = add nsw i32 [[DOT0_VAL]], [[DOT4_VAL]] ; CHECK-NEXT: ret i32 [[TMP1]] @@ -24,7 +24,7 @@ define i32 @caller2(i32 %0, i32 %1) { ; CHECK-NEXT: [[DOTVAL:%.*]] = load i32, ptr [[TMP3]], align 4 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP3]], i64 4 ; CHECK-NEXT: [[DOTVAL1:%.*]] = load i32, ptr [[TMP5]], align 4 -; CHECK-NEXT: [[TMP6:%.*]] = call i32 @callee2(i32 [[DOTVAL]], i32 [[DOTVAL1]]) +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @callee2.argprom(i32 [[DOTVAL]], i32 [[DOTVAL1]]) ; CHECK-NEXT: ret i32 [[TMP6]] ; %3 = alloca [2 x i32], align 4 diff --git a/llvm/test/Transforms/ArgumentPromotion/metadata.ll b/llvm/test/Transforms/ArgumentPromotion/metadata.ll index b3f9fb0c5510e1..caac625cea30f2 100644 --- a/llvm/test/Transforms/ArgumentPromotion/metadata.ll +++ b/llvm/test/Transforms/ArgumentPromotion/metadata.ll @@ -5,7 +5,7 @@ declare void @use.i32(i32) declare void @use.p32(ptr) define internal void @callee(ptr %p1, ptr %p2, ptr %p3, ptr %p4, ptr %p5, ptr %p6, ptr %p7, ptr %p8, ptr %p9, ptr %p10) { -; CHECK-LABEL: define {{[^@]+}}@callee +; CHECK-LABEL: define {{[^@]+}}@callee.argprom ; CHECK-SAME: (i32 [[P1_0_VAL:%.*]], i32 [[P2_0_VAL:%.*]], ptr [[P3_0_VAL:%.*]], ptr [[P4_0_VAL:%.*]], ptr [[P5_0_VAL:%.*]], ptr [[P6_0_VAL:%.*]], ptr [[P7_0_VAL:%.*]], ptr [[P8_0_VAL:%.*]], ptr [[P9_0_VAL:%.*]], ptr [[P10_0_VAL:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = icmp ne ptr [[P4_0_VAL]], null ; CHECK-NEXT: call void @llvm.assume(i1 [[TMP1]]) @@ -57,7 +57,7 @@ define void @caller(ptr %p1, ptr %p2, ptr %p3, ptr %p4, ptr %p5, ptr %p6, ptr %p ; CHECK-NEXT: [[P8_VAL:%.*]] = load ptr, ptr [[P8]], align 8, !align !3, !noundef !1 ; CHECK-NEXT: [[P9_VAL:%.*]] = load ptr, ptr [[P9]], align 8, !noundef !1 ; CHECK-NEXT: [[P10_VAL:%.*]] = load ptr, ptr [[P10]], align 8, !nontemporal !4 -; CHECK-NEXT: call void @callee(i32 [[P1_VAL]], i32 [[P2_VAL]], ptr [[P3_VAL]], ptr [[P4_VAL]], ptr [[P5_VAL]], ptr [[P6_VAL]], ptr [[P7_VAL]], ptr [[P8_VAL]], ptr [[P9_VAL]], ptr [[P10_VAL]]) +; CHECK-NEXT: call void @callee.argprom(i32 [[P1_VAL]], i32 [[P2_VAL]], ptr [[P3_VAL]], ptr [[P4_VAL]], ptr [[P5_VAL]], ptr [[P6_VAL]], ptr [[P7_VAL]], ptr [[P8_VAL]], ptr [[P9_VAL]], ptr [[P10_VAL]]) ; CHECK-NEXT: ret void ; call void @callee(ptr %p1, ptr %p2, ptr %p3, ptr %p4, ptr %p5, ptr %p6, ptr %p7, ptr %p8, ptr %p9, ptr %p10) @@ -65,7 +65,7 @@ define void @caller(ptr %p1, ptr %p2, ptr %p3, ptr %p4, ptr %p5, ptr %p6, ptr %p } define internal ptr @callee_conditional(i1 %c, ptr dereferenceable(8) align 8 %p) { -; CHECK-LABEL: define {{[^@]+}}@callee_conditional +; CHECK-LABEL: define {{[^@]+}}@callee_conditional.argprom ; CHECK-SAME: (i1 [[C:%.*]], ptr [[P_0_VAL:%.*]]) { ; 
CHECK-NEXT: br i1 [[C]], label [[IF:%.*]], label [[ELSE:%.*]] ; CHECK: if: @@ -89,7 +89,7 @@ define void @caller_conditional(i1 %c, ptr %p) { ; CHECK-LABEL: define {{[^@]+}}@caller_conditional ; CHECK-SAME: (i1 [[C:%.*]], ptr [[P:%.*]]) { ; CHECK-NEXT: [[P_VAL:%.*]] = load ptr, ptr [[P]], align 8 -; CHECK-NEXT: [[TMP1:%.*]] = call ptr @callee_conditional(i1 [[C]], ptr [[P_VAL]]) +; CHECK-NEXT: [[TMP1:%.*]] = call ptr @callee_conditional.argprom(i1 [[C]], ptr [[P_VAL]]) ; CHECK-NEXT: ret void ; call ptr @callee_conditional(i1 %c, ptr %p) diff --git a/llvm/test/Transforms/ArgumentPromotion/min-legal-vector-width.ll b/llvm/test/Transforms/ArgumentPromotion/min-legal-vector-width.ll index a5a0fc0cf186b7..8812dc2104feb2 100644 --- a/llvm/test/Transforms/ArgumentPromotion/min-legal-vector-width.ll +++ b/llvm/test/Transforms/ArgumentPromotion/min-legal-vector-width.ll @@ -2,11 +2,11 @@ ; CHECK-LABEL: define i32 @foo() #0 { ; CHECK-NEXT: %.val = load <32 x half>, ptr undef, align 4 -; CHECK-NEXT: call void @bar(<32 x half> %.val) +; CHECK-NEXT: call void @bar.argprom(<32 x half> %.val) ; CHECK-NEXT: ret i32 0 ; CHECK-NEXT: } -; CHECK-LABEL: define internal void @bar(<32 x half> %.0.val) #0 { +; CHECK-LABEL: define internal void @bar.argprom(<32 x half> %.0.val) #0 { ; CHECK-NEXT: ret void ; CHECK-NEXT: } diff --git a/llvm/test/Transforms/ArgumentPromotion/nonzero-address-spaces.ll b/llvm/test/Transforms/ArgumentPromotion/nonzero-address-spaces.ll index 6cabc5bb8f3a90..335275380c11b0 100644 --- a/llvm/test/Transforms/ArgumentPromotion/nonzero-address-spaces.ll +++ b/llvm/test/Transforms/ArgumentPromotion/nonzero-address-spaces.ll @@ -11,7 +11,7 @@ target datalayout = "e-P1-p:16:8-i8:8-i16:8-i32:8-i64:8-f32:8-f64:8-n8-a:8" define i32 @bar() { ; CHECK-LABEL: define {{[^@]+}}@bar() addrspace(1) { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[CALL:%.*]] = call addrspace(1) i32 @foo() +; CHECK-NEXT: [[CALL:%.*]] = call addrspace(1) i32 @foo.argprom() ; CHECK-NEXT: ret i32 [[CALL]] ; @@ -21,7 +21,7 @@ entry: } define internal i32 @foo(ptr) { -; CHECK-LABEL: define {{[^@]+}}@foo() addrspace(1) { +; CHECK-LABEL: define {{[^@]+}}@foo.argprom() addrspace(1) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 ; CHECK-NEXT: call addrspace(0) void asm sideeffect "ldr r0, [r0] \0Abx lr \0A", ""() diff --git a/llvm/test/Transforms/ArgumentPromotion/opaque-ptr.ll b/llvm/test/Transforms/ArgumentPromotion/opaque-ptr.ll index 59699675577cfe..5ca798b3a9d918 100644 --- a/llvm/test/Transforms/ArgumentPromotion/opaque-ptr.ll +++ b/llvm/test/Transforms/ArgumentPromotion/opaque-ptr.ll @@ -2,7 +2,7 @@ ; RUN: opt -S -passes=argpromotion < %s | FileCheck %s define internal i32 @callee_basic(ptr %p) { -; CHECK-LABEL: define {{[^@]+}}@callee_basic +; CHECK-LABEL: define {{[^@]+}}@callee_basic.argprom ; CHECK-SAME: (i32 [[P_0_VAL:%.*]], i32 [[P_4_VAL:%.*]]) { ; CHECK-NEXT: [[Z:%.*]] = add i32 [[P_0_VAL]], [[P_4_VAL]] ; CHECK-NEXT: ret i32 [[Z]] @@ -20,7 +20,7 @@ define void @caller_basic(ptr %p) { ; CHECK-NEXT: [[P_VAL:%.*]] = load i32, ptr [[P]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[P]], i64 4 ; CHECK-NEXT: [[P_VAL1:%.*]] = load i32, ptr [[TMP1]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = call i32 @callee_basic(i32 [[P_VAL]], i32 [[P_VAL1]]) +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @callee_basic.argprom(i32 [[P_VAL]], i32 [[P_VAL1]]) ; CHECK-NEXT: ret void ; call i32 @callee_basic(ptr %p) diff --git a/llvm/test/Transforms/ArgumentPromotion/pr27568.ll 
b/llvm/test/Transforms/ArgumentPromotion/pr27568.ll index cc25088edf52f9..1164bcf4141ed9 100644 --- a/llvm/test/Transforms/ArgumentPromotion/pr27568.ll +++ b/llvm/test/Transforms/ArgumentPromotion/pr27568.ll @@ -5,7 +5,7 @@ target triple = "x86_64-pc-windows-msvc" define internal void @callee(ptr) { -; CHECK-LABEL: define {{[^@]+}}@callee() { +; CHECK-LABEL: define {{[^@]+}}@callee.argprom() { ; CHECK-NEXT: entry: ; CHECK-NEXT: call void @thunk() ; CHECK-NEXT: ret void @@ -24,7 +24,7 @@ define void @test1() personality ptr @__CxxFrameHandler3 { ; CHECK-NEXT: ret void ; CHECK: cpad: ; CHECK-NEXT: [[PAD:%.*]] = cleanuppad within none [] -; CHECK-NEXT: call void @callee() [ "funclet"(token [[PAD]]) ] +; CHECK-NEXT: call void @callee.argprom() [ "funclet"(token [[PAD]]) ] ; CHECK-NEXT: cleanupret from [[PAD]] unwind to caller ; entry: diff --git a/llvm/test/Transforms/ArgumentPromotion/pr32917.ll b/llvm/test/Transforms/ArgumentPromotion/pr32917.ll index dd089a910f5a31..ac0d30999ce0e9 100644 --- a/llvm/test/Transforms/ArgumentPromotion/pr32917.ll +++ b/llvm/test/Transforms/ArgumentPromotion/pr32917.ll @@ -12,7 +12,7 @@ define i32 @fn2() local_unnamed_addr { ; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[TMP3]], i64 -4 ; CHECK-NEXT: [[DOTVAL:%.*]] = load i32, ptr [[TMP4]], align 4 -; CHECK-NEXT: call fastcc void @fn1(i32 [[DOTVAL]]) +; CHECK-NEXT: call fastcc void @fn1.argprom(i32 [[DOTVAL]]) ; CHECK-NEXT: ret i32 undef ; %1 = load i32, ptr @b, align 4 @@ -23,7 +23,7 @@ define i32 @fn2() local_unnamed_addr { } define internal fastcc void @fn1(ptr nocapture readonly) unnamed_addr { -; CHECK-LABEL: define {{[^@]+}}@fn1 +; CHECK-LABEL: define {{[^@]+}}@fn1.argprom ; CHECK-SAME: (i32 [[DOT_4_VAL:%.*]]) unnamed_addr { ; CHECK-NEXT: store i32 [[DOT_4_VAL]], ptr @a, align 4 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/ArgumentPromotion/pr33641_remove_arg_dbgvalue.ll b/llvm/test/Transforms/ArgumentPromotion/pr33641_remove_arg_dbgvalue.ll index 8db0a28e680587..42728abb81e722 100644 --- a/llvm/test/Transforms/ArgumentPromotion/pr33641_remove_arg_dbgvalue.ll +++ b/llvm/test/Transforms/ArgumentPromotion/pr33641_remove_arg_dbgvalue.ll @@ -14,7 +14,7 @@ define void @foo() { } define internal void @bar(ptr %p) { -; CHECK-LABEL: define {{.*}}void @bar() +; CHECK-LABEL: define {{.*}}void @bar.argprom() ; CHECK-NEXT: #dbg_value(ptr undef, !3, !DIExpression(), !5 call void @llvm.dbg.value(metadata ptr %p, metadata !3, metadata !DIExpression()), !dbg !5 ret void diff --git a/llvm/test/Transforms/ArgumentPromotion/profile.ll b/llvm/test/Transforms/ArgumentPromotion/profile.ll index 58d7376b8b7da0..b932f7c762431e 100644 --- a/llvm/test/Transforms/ArgumentPromotion/profile.ll +++ b/llvm/test/Transforms/ArgumentPromotion/profile.ll @@ -6,7 +6,7 @@ target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:1 define void @caller() #0 { ; CHECK-LABEL: define {{[^@]+}}@caller() { -; CHECK-NEXT: call void @promote_i32_ptr(i32 42), !prof [[PROF0:![0-9]+]] +; CHECK-NEXT: call void @promote_i32_ptr.argprom(i32 42), !prof [[PROF0:![0-9]+]] ; CHECK-NEXT: ret void ; %x = alloca i32 @@ -16,7 +16,7 @@ define void @caller() #0 { } define internal void @promote_i32_ptr(ptr %xp) !prof !1 { -; CHECK-LABEL: define {{[^@]+}}@promote_i32_ptr +; CHECK-LABEL: define {{[^@]+}}@promote_i32_ptr.argprom ; CHECK-SAME: (i32 [[XP_0_VAL:%.*]]) !prof [[PROF1:![0-9]+]] { ; CHECK-NEXT: call void @use_i32(i32 [[XP_0_VAL]]) ; CHECK-NEXT: ret void diff --git 
a/llvm/test/Transforms/ArgumentPromotion/propagate-remove-dead-args.ll b/llvm/test/Transforms/ArgumentPromotion/propagate-remove-dead-args.ll index 87a14533cfda26..584ec42ae995c5 100644 --- a/llvm/test/Transforms/ArgumentPromotion/propagate-remove-dead-args.ll +++ b/llvm/test/Transforms/ArgumentPromotion/propagate-remove-dead-args.ll @@ -4,7 +4,7 @@ %ptr.struct = type { ptr, ptr, ptr } define internal void @child(ptr %this, ptr %y, ptr %x) { -; CHECK-LABEL: define internal void @child +; CHECK-LABEL: define internal void @child.argprom ; CHECK-SAME: (ptr [[Y:%.*]], half [[X_0_VAL:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: store half [[X_0_VAL]], ptr [[Y]], align 2 @@ -17,15 +17,15 @@ entry: } define internal void @parent(ptr %this, ptr %p1, ptr %p2) { -; CHECK-LABEL: define internal void @parent +; CHECK-LABEL: define internal void @parent.argprom ; CHECK-SAME: (ptr [[P1:%.*]], ptr [[P2:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[P2_VAL2:%.*]] = load half, ptr [[P2]], align 2 -; CHECK-NEXT: call void @child(ptr [[P1]], half [[P2_VAL2]]) +; CHECK-NEXT: call void @child.argprom(ptr [[P1]], half [[P2_VAL2]]) ; CHECK-NEXT: [[P2_VAL1:%.*]] = load half, ptr [[P2]], align 2 -; CHECK-NEXT: call void @child(ptr [[P1]], half [[P2_VAL1]]) +; CHECK-NEXT: call void @child.argprom(ptr [[P1]], half [[P2_VAL1]]) ; CHECK-NEXT: [[P2_VAL:%.*]] = load half, ptr [[P2]], align 2 -; CHECK-NEXT: call void @child(ptr [[P1]], half [[P2_VAL]]) +; CHECK-NEXT: call void @child.argprom(ptr [[P1]], half [[P2_VAL]]) ; CHECK-NEXT: ret void ; entry: @@ -46,7 +46,7 @@ define void @grandparent() { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[XPTR:%.*]] = alloca i32, align 4 ; CHECK-NEXT: [[YPTR:%.*]] = alloca i32, align 4 -; CHECK-NEXT: call void @parent(ptr [[XPTR]], ptr [[YPTR]]) +; CHECK-NEXT: call void @parent.argprom(ptr [[XPTR]], ptr [[YPTR]]) ; CHECK-NEXT: ret void ; entry: @@ -58,7 +58,7 @@ entry: } define internal ptr @callee(ptr %dead) { -; CHECK-LABEL: define internal ptr @callee() { +; CHECK-LABEL: define internal ptr @callee.argprom() { ; CHECK-NEXT: ret ptr null ; ret ptr null @@ -66,8 +66,8 @@ define internal ptr @callee(ptr %dead) { define void @caller() { ; CHECK-LABEL: define void @caller() { -; CHECK-NEXT: [[TMP1:%.*]] = call ptr @callee() -; CHECK-NEXT: [[TMP2:%.*]] = call ptr @callee() +; CHECK-NEXT: [[TMP1:%.*]] = call ptr @callee.argprom() +; CHECK-NEXT: [[TMP2:%.*]] = call ptr @callee.argprom() ; CHECK-NEXT: ret void ; %ret = call ptr @callee(ptr null) diff --git a/llvm/test/Transforms/ArgumentPromotion/recursion/aggregate-promote-recursive.ll b/llvm/test/Transforms/ArgumentPromotion/recursion/aggregate-promote-recursive.ll index 011ebe4eee76e7..b1d5898a9a1c7b 100644 --- a/llvm/test/Transforms/ArgumentPromotion/recursion/aggregate-promote-recursive.ll +++ b/llvm/test/Transforms/ArgumentPromotion/recursion/aggregate-promote-recursive.ll @@ -5,11 +5,11 @@ @G = constant %T { i32 0, i32 0, i32 17, i32 25 } define internal i32 @test(ptr %p) { -; CHECK-LABEL: define {{[^@]+}}@test +; CHECK-LABEL: define {{[^@]+}}@test.argprom ; CHECK-SAME: (i32 [[P_8_VAL:%.*]], i32 [[P_12_VAL:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[V:%.*]] = add i32 [[P_12_VAL]], [[P_8_VAL]] -; CHECK-NEXT: [[RET:%.*]] = call i32 @test(i32 [[P_8_VAL]], i32 [[P_12_VAL]]) +; CHECK-NEXT: [[RET:%.*]] = call i32 @test.argprom(i32 [[P_8_VAL]], i32 [[P_12_VAL]]) ; CHECK-NEXT: [[ARET:%.*]] = add i32 [[V]], [[RET]] ; CHECK-NEXT: ret i32 [[ARET]] ; @@ -31,7 +31,7 @@ define i32 @caller() { ; CHECK-NEXT: [[G_VAL:%.*]] = load i32, ptr [[TMP0]], align 4 
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr @G, i64 12 ; CHECK-NEXT: [[G_VAL1:%.*]] = load i32, ptr [[TMP1]], align 4 -; CHECK-NEXT: [[V:%.*]] = call i32 @test(i32 [[G_VAL]], i32 [[G_VAL1]]) +; CHECK-NEXT: [[V:%.*]] = call i32 @test.argprom(i32 [[G_VAL]], i32 [[G_VAL1]]) ; CHECK-NEXT: ret i32 [[V]] ; entry: diff --git a/llvm/test/Transforms/ArgumentPromotion/recursion/argpromotion-recursion-pr1259.ll b/llvm/test/Transforms/ArgumentPromotion/recursion/argpromotion-recursion-pr1259.ll index e160dbad92e7b1..28bdc8fc45050f 100644 --- a/llvm/test/Transforms/ArgumentPromotion/recursion/argpromotion-recursion-pr1259.ll +++ b/llvm/test/Transforms/ArgumentPromotion/recursion/argpromotion-recursion-pr1259.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; RUN: opt -S -passes=argpromotion < %s | FileCheck %s define internal i32 @foo(ptr %x, i32 %n, i32 %m) { -; CHECK-LABEL: define internal i32 @foo( +; CHECK-LABEL: define internal i32 @foo.argprom( ; CHECK-SAME: i32 [[X_0_VAL:%.*]], i32 [[N:%.*]], i32 [[M:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[N]], 0 @@ -10,9 +10,9 @@ define internal i32 @foo(ptr %x, i32 %n, i32 %m) { ; CHECK-NEXT: br label %[[RETURN:.*]] ; CHECK: [[COND_FALSE]]: ; CHECK-NEXT: [[SUBVAL:%.*]] = sub i32 [[N]], 1 -; CHECK-NEXT: [[CALLRET:%.*]] = call i32 @foo(i32 [[X_0_VAL]], i32 [[SUBVAL]], i32 [[X_0_VAL]]) +; CHECK-NEXT: [[CALLRET:%.*]] = call i32 @foo.argprom(i32 [[X_0_VAL]], i32 [[SUBVAL]], i32 [[X_0_VAL]]) ; CHECK-NEXT: [[SUBVAL2:%.*]] = sub i32 [[N]], 2 -; CHECK-NEXT: [[CALLRET2:%.*]] = call i32 @foo(i32 [[X_0_VAL]], i32 [[SUBVAL2]], i32 [[M]]) +; CHECK-NEXT: [[CALLRET2:%.*]] = call i32 @foo.argprom(i32 [[X_0_VAL]], i32 [[SUBVAL2]], i32 [[M]]) ; CHECK-NEXT: [[CMP2:%.*]] = add i32 [[CALLRET]], [[CALLRET2]] ; CHECK-NEXT: br label %[[RETURN]] ; CHECK: [[COND_NEXT:.*]]: @@ -51,7 +51,7 @@ define i32 @bar(ptr align(4) dereferenceable(4) %x, i32 %n, i32 %m) { ; CHECK-SAME: ptr align 4 dereferenceable(4) [[X:%.*]], i32 [[N:%.*]], i32 [[M:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[X_VAL:%.*]] = load i32, ptr [[X]], align 4 -; CHECK-NEXT: [[CALLRET3:%.*]] = call i32 @foo(i32 [[X_VAL]], i32 [[N]], i32 [[M]]) +; CHECK-NEXT: [[CALLRET3:%.*]] = call i32 @foo.argprom(i32 [[X_VAL]], i32 [[N]], i32 [[M]]) ; CHECK-NEXT: br label %[[RETURN:.*]] ; CHECK: [[RETURN]]: ; CHECK-NEXT: ret i32 [[CALLRET3]] diff --git a/llvm/test/Transforms/ArgumentPromotion/recursion/recursion-mixed-calls.ll b/llvm/test/Transforms/ArgumentPromotion/recursion/recursion-mixed-calls.ll index 0ec4137aadeb4c..0e048c2726a312 100644 --- a/llvm/test/Transforms/ArgumentPromotion/recursion/recursion-mixed-calls.ll +++ b/llvm/test/Transforms/ArgumentPromotion/recursion/recursion-mixed-calls.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; RUN: opt -S -passes=argpromotion < %s | FileCheck %s define internal i32 @zoo(ptr %x, i32 %m) { -; CHECK-LABEL: define internal i32 @zoo( +; CHECK-LABEL: define internal i32 @zoo.argprom( ; CHECK-SAME: i32 [[X_0_VAL:%.*]], i32 [[M:%.*]]) { ; CHECK-NEXT: [[RESZOO:%.*]] = add i32 [[X_0_VAL]], [[M]] ; CHECK-NEXT: ret i32 [[X_0_VAL]] @@ -12,7 +12,7 @@ define internal i32 @zoo(ptr %x, i32 %m) { } define internal i32 @foo(ptr %x, ptr %y, i32 %n, i32 %m) { -; CHECK-LABEL: define internal i32 @foo( +; CHECK-LABEL: define internal i32 @foo.argprom( ; CHECK-SAME: ptr [[X:%.*]], i32 [[Y_0_VAL:%.*]], i32 [[N:%.*]], i32 
[[M:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[N]], 0 @@ -23,12 +23,12 @@ define internal i32 @foo(ptr %x, ptr %y, i32 %n, i32 %m) { ; CHECK: [[COND_FALSE]]: ; CHECK-NEXT: [[VAL2:%.*]] = load i32, ptr [[X]], align 4 ; CHECK-NEXT: [[SUBVAL:%.*]] = sub i32 [[N]], [[Y_0_VAL]] -; CHECK-NEXT: [[CALLRET:%.*]] = call i32 @foo(ptr [[X]], i32 [[Y_0_VAL]], i32 [[SUBVAL]], i32 [[VAL2]]) +; CHECK-NEXT: [[CALLRET:%.*]] = call i32 @foo.argprom(ptr [[X]], i32 [[Y_0_VAL]], i32 [[SUBVAL]], i32 [[VAL2]]) ; CHECK-NEXT: [[SUBVAL2:%.*]] = sub i32 [[N]], 2 -; CHECK-NEXT: [[CALLRET2:%.*]] = call i32 @foo(ptr [[X]], i32 [[Y_0_VAL]], i32 [[SUBVAL2]], i32 [[M]]) +; CHECK-NEXT: [[CALLRET2:%.*]] = call i32 @foo.argprom(ptr [[X]], i32 [[Y_0_VAL]], i32 [[SUBVAL2]], i32 [[M]]) ; CHECK-NEXT: [[CMP1:%.*]] = add i32 [[CALLRET]], [[CALLRET2]] ; CHECK-NEXT: [[X_VAL:%.*]] = load i32, ptr [[X]], align 4 -; CHECK-NEXT: [[CALLRETFINAL:%.*]] = call i32 @zoo(i32 [[X_VAL]], i32 [[M]]) +; CHECK-NEXT: [[CALLRETFINAL:%.*]] = call i32 @zoo.argprom(i32 [[X_VAL]], i32 [[M]]) ; CHECK-NEXT: [[CMP2:%.*]] = add i32 [[CMP1]], [[CALLRETFINAL]] ; CHECK-NEXT: br label %[[RETURN]] ; CHECK: [[COND_NEXT:.*]]: @@ -70,7 +70,7 @@ define i32 @bar(ptr align(4) dereferenceable(4) %x, ptr align(4) dereferenceable ; CHECK-SAME: ptr align 4 dereferenceable(4) [[X:%.*]], ptr align 4 dereferenceable(4) [[Y:%.*]], i32 [[N:%.*]], i32 [[M:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[Y_VAL:%.*]] = load i32, ptr [[Y]], align 4 -; CHECK-NEXT: [[CALLRET3:%.*]] = call i32 @foo(ptr [[X]], i32 [[Y_VAL]], i32 [[N]], i32 [[M]]) +; CHECK-NEXT: [[CALLRET3:%.*]] = call i32 @foo.argprom(ptr [[X]], i32 [[Y_VAL]], i32 [[N]], i32 [[M]]) ; CHECK-NEXT: br label %[[RETURN:.*]] ; CHECK: [[RETURN]]: ; CHECK-NEXT: ret i32 [[CALLRET3]] diff --git a/llvm/test/Transforms/ArgumentPromotion/recursion/recursion-non-zero-offset.ll b/llvm/test/Transforms/ArgumentPromotion/recursion/recursion-non-zero-offset.ll index 805414de17f133..1ec8ab1edca669 100644 --- a/llvm/test/Transforms/ArgumentPromotion/recursion/recursion-non-zero-offset.ll +++ b/llvm/test/Transforms/ArgumentPromotion/recursion/recursion-non-zero-offset.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; RUN: opt -S -passes=argpromotion < %s | FileCheck %s define internal i32 @foo(ptr %x, i32 %n, i32 %m) { -; CHECK-LABEL: define internal i32 @foo( +; CHECK-LABEL: define internal i32 @foo.argprom( ; CHECK-SAME: i32 [[X_0_VAL:%.*]], i32 [[N:%.*]], i32 [[M:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[N]], 0 @@ -10,9 +10,9 @@ define internal i32 @foo(ptr %x, i32 %n, i32 %m) { ; CHECK-NEXT: br label %[[RETURN:.*]] ; CHECK: [[COND_FALSE]]: ; CHECK-NEXT: [[SUBVAL:%.*]] = sub i32 [[N]], 1 -; CHECK-NEXT: [[CALLRET:%.*]] = call i32 @foo(i32 [[X_0_VAL]], i32 [[SUBVAL]], i32 [[X_0_VAL]]) +; CHECK-NEXT: [[CALLRET:%.*]] = call i32 @foo.argprom(i32 [[X_0_VAL]], i32 [[SUBVAL]], i32 [[X_0_VAL]]) ; CHECK-NEXT: [[SUBVAL2:%.*]] = sub i32 [[N]], 2 -; CHECK-NEXT: [[CALLRET2:%.*]] = call i32 @foo(i32 [[X_0_VAL]], i32 [[SUBVAL2]], i32 [[M]]) +; CHECK-NEXT: [[CALLRET2:%.*]] = call i32 @foo.argprom(i32 [[X_0_VAL]], i32 [[SUBVAL2]], i32 [[M]]) ; CHECK-NEXT: [[CMP2:%.*]] = add i32 [[CALLRET]], [[CALLRET2]] ; CHECK-NEXT: br label %[[RETURN]] ; CHECK: [[COND_NEXT:.*]]: @@ -52,7 +52,7 @@ define i32 @bar(ptr align(4) dereferenceable(4) %x, i32 %n, i32 %m) { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[GEPVAL:%.*]] = 
getelementptr ptr, ptr [[X]], i32 0 ; CHECK-NEXT: [[GEPVAL_VAL:%.*]] = load i32, ptr [[GEPVAL]], align 4 -; CHECK-NEXT: [[CALLRET3:%.*]] = call i32 @foo(i32 [[GEPVAL_VAL]], i32 [[N]], i32 [[M]]) +; CHECK-NEXT: [[CALLRET3:%.*]] = call i32 @foo.argprom(i32 [[GEPVAL_VAL]], i32 [[N]], i32 [[M]]) ; CHECK-NEXT: br label %[[RETURN:.*]] ; CHECK: [[RETURN]]: ; CHECK-NEXT: ret i32 [[CALLRET3]] diff --git a/llvm/test/Transforms/ArgumentPromotion/reserve-tbaa.ll b/llvm/test/Transforms/ArgumentPromotion/reserve-tbaa.ll index f60dd48a464d22..2f13767c643184 100644 --- a/llvm/test/Transforms/ArgumentPromotion/reserve-tbaa.ll +++ b/llvm/test/Transforms/ArgumentPromotion/reserve-tbaa.ll @@ -14,7 +14,7 @@ @d = global i8 0, align 1 define internal fastcc void @fn(ptr nocapture readonly %p1, ptr nocapture readonly %p2) { -; CHECK-LABEL: define {{[^@]+}}@fn +; CHECK-LABEL: define {{[^@]+}}@fn.argprom ; CHECK-SAME: (i32 [[P1_0_VAL:%.*]], i64 [[P2_0_VAL:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CONV:%.*]] = trunc i64 [[P2_0_VAL]] to i32 @@ -40,7 +40,7 @@ define i32 @main() { ; CHECK-NEXT: store i32 1, ptr [[TMP1]], align 4, !tbaa [[TBAA5:![0-9]+]] ; CHECK-NEXT: [[G_VAL:%.*]] = load i32, ptr @g, align 4, !tbaa [[TBAA5]] ; CHECK-NEXT: [[C_VAL:%.*]] = load i64, ptr @c, align 8, !tbaa [[TBAA7:![0-9]+]] -; CHECK-NEXT: call fastcc void @fn(i32 [[G_VAL]], i64 [[C_VAL]]) +; CHECK-NEXT: call fastcc void @fn.argprom(i32 [[G_VAL]], i64 [[C_VAL]]) ; CHECK-NEXT: ret i32 0 ; entry: diff --git a/llvm/test/Transforms/ArgumentPromotion/sret.ll b/llvm/test/Transforms/ArgumentPromotion/sret.ll index fcc868954bc951..80403e1d92d527 100644 --- a/llvm/test/Transforms/ArgumentPromotion/sret.ll +++ b/llvm/test/Transforms/ArgumentPromotion/sret.ll @@ -5,7 +5,7 @@ target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-pc-windows-msvc" define internal void @add(ptr %this, ptr sret(i32) %r) { -; CHECK-LABEL: define {{[^@]+}}@add +; CHECK-LABEL: define {{[^@]+}}@add.argprom ; CHECK-SAME: (i32 [[THIS_0_VAL:%.*]], i32 [[THIS_4_VAL:%.*]], ptr noalias [[R:%.*]]) { ; CHECK-NEXT: [[AB:%.*]] = add i32 [[THIS_0_VAL]], [[THIS_4_VAL]] ; CHECK-NEXT: store i32 [[AB]], ptr [[R]], align 4 @@ -27,7 +27,7 @@ define void @f() { ; CHECK-NEXT: [[PAIR_VAL:%.*]] = load i32, ptr [[PAIR]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[PAIR]], i64 4 ; CHECK-NEXT: [[PAIR_VAL1:%.*]] = load i32, ptr [[TMP1]], align 4 -; CHECK-NEXT: call void @add(i32 [[PAIR_VAL]], i32 [[PAIR_VAL1]], ptr noalias [[R]]) +; CHECK-NEXT: call void @add.argprom(i32 [[PAIR_VAL]], i32 [[PAIR_VAL1]], ptr noalias [[R]]) ; CHECK-NEXT: ret void ; %r = alloca i32 diff --git a/llvm/test/Transforms/ArgumentPromotion/store-into-inself.ll b/llvm/test/Transforms/ArgumentPromotion/store-into-inself.ll index 0db42a97841f48..ecf8eb557786dc 100644 --- a/llvm/test/Transforms/ArgumentPromotion/store-into-inself.ll +++ b/llvm/test/Transforms/ArgumentPromotion/store-into-inself.ll @@ -59,7 +59,7 @@ entry: } define internal void @l(ptr byval(ptr) align 4 %p) nounwind { -; CHECK-LABEL: define {{[^@]+}}@l +; CHECK-LABEL: define {{[^@]+}}@l.argprom.argprom ; CHECK-SAME: () #[[ATTR0]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: ret void @@ -83,7 +83,7 @@ define i32 @main() nounwind { ; CHECK-NEXT: call void @g(ptr byval(ptr) align 4 [[S]]) #[[ATTR0]] ; CHECK-NEXT: call void @h(ptr byval(ptr) align 4 [[S]]) #[[ATTR0]] ; CHECK-NEXT: call void @k(ptr byval(ptr) align 4 [[S]]) #[[ATTR0]] -; CHECK-NEXT: call void @l() #[[ATTR0]] +; CHECK-NEXT: call void @l.argprom.argprom() 
#[[ATTR0]] ; CHECK-NEXT: ret i32 0 ; entry: diff --git a/llvm/test/Transforms/ArgumentPromotion/unused-argument.ll b/llvm/test/Transforms/ArgumentPromotion/unused-argument.ll index f648d20f47311e..ec1503d3022154 100644 --- a/llvm/test/Transforms/ArgumentPromotion/unused-argument.ll +++ b/llvm/test/Transforms/ArgumentPromotion/unused-argument.ll @@ -4,7 +4,7 @@ ; while the used arguments should be promoted if they are pointers. ; The pass should not touch any unused non-pointer arguments. define internal i32 @callee(i1 %c, i1 %d, ptr %used, ptr %unused) nounwind { -; CHECK-LABEL: define {{[^@]+}}@callee +; CHECK-LABEL: define {{[^@]+}}@callee.argprom ; CHECK-SAME: (i1 [[C:%.*]], i1 [[D:%.*]], i32 [[USED_VAL:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 [[C]], label %if, label %else @@ -28,7 +28,7 @@ else: ; while the used arguments should be promoted if they are pointers. ; The pass should not touch any unused non-pointer arguments. define internal i32 @callee_byval(i1 %c, i1 %d, ptr byval(i32) align 4 %used, ptr byval(i32) align 4 %unused) nounwind { -; CHECK-LABEL: define {{[^@]+}}@callee_byval +; CHECK-LABEL: define {{[^@]+}}@callee_byval.argprom ; CHECK-SAME: (i1 [[C:%.*]], i1 [[D:%.*]], i32 [[USED_VAL:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 [[C]], label %if, label %else @@ -53,9 +53,9 @@ define i32 @caller(i1 %c, i1 %d, ptr %arg) nounwind { ; CHECK-SAME: (i1 [[C:%.*]], i1 [[D:%.*]], ptr [[ARG:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ARG_VAL_0:%.*]] = load i32, ptr [[ARG]], align 4 -; CHECK-NEXT: [[RES_0:%.*]] = call i32 @callee_byval(i1 [[C]], i1 [[D]], i32 [[ARG_VAL_0]]) #[[ATTR0]] +; CHECK-NEXT: [[RES_0:%.*]] = call i32 @callee_byval.argprom(i1 [[C]], i1 [[D]], i32 [[ARG_VAL_0]]) #[[ATTR0]] ; CHECK-NEXT: [[ARG_VAL_1:%.*]] = load i32, ptr [[ARG]], align 4 -; CHECK-NEXT: [[RES_1:%.*]] = call i32 @callee(i1 [[C]], i1 [[D]], i32 [[ARG_VAL_1]]) #[[ATTR0]] +; CHECK-NEXT: [[RES_1:%.*]] = call i32 @callee.argprom(i1 [[C]], i1 [[D]], i32 [[ARG_VAL_1]]) #[[ATTR0]] ; CHECK-NEXT: ret i32 1 ; entry: diff --git a/llvm/test/Transforms/Attributor/reduced/clear_cached_analysis_for_deleted_functions.ll b/llvm/test/Transforms/Attributor/reduced/clear_cached_analysis_for_deleted_functions.ll index b3e3b2497194cb..db8f86ea1b06cc 100644 --- a/llvm/test/Transforms/Attributor/reduced/clear_cached_analysis_for_deleted_functions.ll +++ b/llvm/test/Transforms/Attributor/reduced/clear_cached_analysis_for_deleted_functions.ll @@ -19,7 +19,7 @@ define i32 @clause_LiteralComputeWeight(ptr %call23) { ; CGSCC-NEXT: [[TERM_0:%.*]] = phi ptr [ null, [[ENTRY:%.*]] ], [ [[CALL24:%.*]], [[DO_BODY]] ] ; CGSCC-NEXT: [[CALL2:%.*]] = load volatile i32, ptr [[TERM_0]], align 4 ; CGSCC-NEXT: [[CALL23_VAL:%.*]] = load ptr, ptr [[CALL23]], align 8 -; CGSCC-NEXT: [[CALL24]] = call ptr @list_Car(ptr nofree readonly [[CALL23_VAL]]) #[[ATTR3:[0-9]+]] +; CGSCC-NEXT: [[CALL24]] = call ptr @list_Car.argprom(ptr nofree readonly [[CALL23_VAL]]) #[[ATTR3:[0-9]+]] ; CGSCC-NEXT: br label [[DO_BODY]] ; entry: @@ -46,7 +46,7 @@ entry: define internal ptr @list_Car(ptr %L) #0 { ; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: read) -; CGSCC-LABEL: define {{[^@]+}}@list_Car +; CGSCC-LABEL: define {{[^@]+}}@list_Car.argprom ; CGSCC-SAME: (ptr nofree [[L_0_VAL:%.*]]) #[[ATTR2:[0-9]+]] { ; CGSCC-NEXT: entry: ; CGSCC-NEXT: ret ptr [[L_0_VAL]] diff --git a/llvm/test/Transforms/DeadArgElim/2007-02-07-FuncRename.ll 
b/llvm/test/Transforms/DeadArgElim/2007-02-07-FuncRename.ll index 5096aff3eb0298..ee6a16e834718a 100644 --- a/llvm/test/Transforms/DeadArgElim/2007-02-07-FuncRename.ll +++ b/llvm/test/Transforms/DeadArgElim/2007-02-07-FuncRename.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -passes=deadargelim -S | grep "@test(" +; RUN: opt < %s -passes=deadargelim -S | grep "@test.argelim(" ; RUN: opt < %s -passes=deadargelim -S | not grep dead define internal i32 @test(i32 %X, i32 %dead) { diff --git a/llvm/test/Transforms/DeadArgElim/2007-12-20-ParamAttrs.ll b/llvm/test/Transforms/DeadArgElim/2007-12-20-ParamAttrs.ll index c3f7d7df90bc4f..79fe6eb81bd077 100644 --- a/llvm/test/Transforms/DeadArgElim/2007-12-20-ParamAttrs.ll +++ b/llvm/test/Transforms/DeadArgElim/2007-12-20-ParamAttrs.ll @@ -5,7 +5,7 @@ @g = global i8 0 -; CHECK: define internal void @foo(i8 signext %y) [[NUW:#[0-9]+]] +; CHECK: define internal void @foo.argelim(i8 signext %y) [[NUW:#[0-9]+]] ; ; REMARK-LABEL: Function: foo ; REMARK: Args: @@ -21,7 +21,7 @@ define internal zeroext i8 @foo(ptr inreg %p, i8 signext %y, ... ) nounwind { } define i32 @bar() { -; CHECK: call void @foo(i8 signext 1) [[NUW]] +; CHECK: call void @foo.argelim(i8 signext 1) [[NUW]] %A = call zeroext i8(ptr, i8, ...) @foo(ptr inreg null, i8 signext 1, ptr byval(%struct) null ) nounwind ret i32 0 } diff --git a/llvm/test/Transforms/DeadArgElim/2010-04-30-DbgInfo.ll b/llvm/test/Transforms/DeadArgElim/2010-04-30-DbgInfo.ll index 485275b11160ff..8d6b1d13c52a34 100644 --- a/llvm/test/Transforms/DeadArgElim/2010-04-30-DbgInfo.ll +++ b/llvm/test/Transforms/DeadArgElim/2010-04-30-DbgInfo.ll @@ -15,7 +15,7 @@ define ptr @vfs_addname(ptr %name, i32 %len, i32 %hash, i32 %flags) nounwind ssp ; CHECK-NEXT: #dbg_value(i32 [[LEN]], [[META13:![0-9]+]], !DIExpression(), [[META12]]) ; CHECK-NEXT: #dbg_value(i32 [[HASH]], [[META14:![0-9]+]], !DIExpression(), [[META12]]) ; CHECK-NEXT: #dbg_value(i32 [[FLAGS]], [[META15:![0-9]+]], !DIExpression(), [[META12]]) -; CHECK-NEXT: [[TMP0:%.*]] = call fastcc ptr @add_name_internal(ptr [[NAME]], i32 [[HASH]]) #[[ATTR2:[0-9]+]], !dbg [[DBG16:![0-9]+]] +; CHECK-NEXT: [[TMP0:%.*]] = call fastcc ptr @add_name_internal.argelim(ptr [[NAME]], i32 [[HASH]]) #[[ATTR2:[0-9]+]], !dbg [[DBG16:![0-9]+]] ; CHECK-NEXT: ret ptr [[TMP0]], !dbg [[DBG16]] ; entry: @@ -31,7 +31,7 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone define internal fastcc ptr @add_name_internal(ptr %name, i32 %len, i32 %hash, i8 zeroext %extra, i32 %flags) noinline nounwind ssp !dbg !16 { ; -; CHECK-LABEL: define {{[^@]+}}@add_name_internal +; CHECK-LABEL: define {{[^@]+}}@add_name_internal.argelim ; CHECK-SAME: (ptr [[NAME:%.*]], i32 [[HASH:%.*]]) #[[ATTR1:[0-9]+]] !dbg [[DBG18:![0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: #dbg_value(ptr [[NAME]], [[META22:![0-9]+]], !DIExpression(), [[META23:![0-9]+]]) diff --git a/llvm/test/Transforms/DeadArgElim/aggregates.ll b/llvm/test/Transforms/DeadArgElim/aggregates.ll index 784ac3af64c754..4671fa63c5c918 100644 --- a/llvm/test/Transforms/DeadArgElim/aggregates.ll +++ b/llvm/test/Transforms/DeadArgElim/aggregates.ll @@ -4,7 +4,7 @@ ; actually only used in ways we can eliminate. We gain benefit from analysing ; the "use" and applying its results to all sub-values. 
-; CHECK-LABEL: define internal void @agguse_dead() +; CHECK-LABEL: define internal void @agguse_dead.retelim() define internal { i32, i32 } @agguse_dead() { ret { i32, i32 } { i32 0, i32 1 } @@ -20,7 +20,7 @@ define internal { i32, i32 } @test_agguse_dead() { ; Case 1: an opaque use of the aggregate exists (in this case dead). Otherwise ; only one value is used, so function can be simplified. -; CHECK-LABEL: define internal i32 @rets_independent_if_agguse_dead() +; CHECK-LABEL: define internal i32 @rets_independent_if_agguse_dead.retelim() ; CHECK: [[RET:%.*]] = extractvalue { i32, i32 } { i32 0, i32 1 }, 1 ; CHECK: ret i32 [[RET]] @@ -89,7 +89,7 @@ define [2 x i32] @test_array_rets_have_multiple_slots() { ; Case 4: we can remove some retvals from the array. It's nice to produce an ; array again having done so (rather than converting it to a struct). -; CHECK-LABEL: define internal [2 x i32] @can_shrink_arrays() +; CHECK-LABEL: define internal [2 x i32] @can_shrink_arrays.retelim() ; CHECK: [[VAL0:%.*]] = extractvalue [3 x i32] [i32 42, i32 43, i32 44], 0 ; CHECK: [[RESTMP:%.*]] = insertvalue [2 x i32] poison, i32 [[VAL0]], 0 ; CHECK: [[VAL2:%.*]] = extractvalue [3 x i32] [i32 42, i32 43, i32 44], 2 @@ -117,7 +117,7 @@ define void @test_can_shrink_arrays() { ; Case 5: %in gets passed directly to the return. It should mark be marked as ; used if *any* of the return values are, not just if value 0 is. -; CHECK-LABEL: define internal i32 @ret_applies_to_all({ i32, i32 } %in) +; CHECK-LABEL: define internal i32 @ret_applies_to_all.retelim({ i32, i32 } %in) ; CHECK: [[RET:%.*]] = extractvalue { i32, i32 } %in, 1 ; CHECK: ret i32 [[RET]] @@ -167,7 +167,7 @@ entry: } ; CHECK-LABEL: define void @PR24906 -; CHECK: %[[invoke:.*]] = invoke i32 @agg_ret() +; CHECK: %[[invoke:.*]] = invoke i32 @agg_ret.retelim() ; CHECK: %[[oldret:.*]] = insertvalue { i32 } poison, i32 %[[invoke]], 0 ; CHECK: phi { i32 } [ %[[oldret]], define void @PR24906() personality ptr poison { diff --git a/llvm/test/Transforms/DeadArgElim/call_profile.ll b/llvm/test/Transforms/DeadArgElim/call_profile.ll index 94dbbef6a6e983..93572a3c540b89 100644 --- a/llvm/test/Transforms/DeadArgElim/call_profile.ll +++ b/llvm/test/Transforms/DeadArgElim/call_profile.ll @@ -3,8 +3,8 @@ ; Checks if !prof metadata is corret in deadargelim. define void @caller() #0 { -; CHECK: call void @test_vararg(), !prof ![[PROF:[0-9]]] -; CHECK: call void @test(), !prof ![[PROF]] +; CHECK: call void @test_vararg.argelim(), !prof ![[PROF:[0-9]]] +; CHECK: call void @test.argelim(), !prof ![[PROF]] call void (i32, ...) @test_vararg(i32 1), !prof !0 call void @test(i32 1), !prof !0 ret void diff --git a/llvm/test/Transforms/DeadArgElim/comdat.ll b/llvm/test/Transforms/DeadArgElim/comdat.ll index ea80d0dec0d1e9..0175ffe436e2d2 100644 --- a/llvm/test/Transforms/DeadArgElim/comdat.ll +++ b/llvm/test/Transforms/DeadArgElim/comdat.ll @@ -11,4 +11,4 @@ define internal void @g(i32 %dead) comdat($f) { ret void } -; CHECK: define internal void @g() comdat($f) { +; CHECK: define internal void @g.argelim() comdat($f) { diff --git a/llvm/test/Transforms/DeadArgElim/dbginfo-update-dbgval-local.ll b/llvm/test/Transforms/DeadArgElim/dbginfo-update-dbgval-local.ll index 0e834013fe40b4..514bfd72d48b3f 100644 --- a/llvm/test/Transforms/DeadArgElim/dbginfo-update-dbgval-local.ll +++ b/llvm/test/Transforms/DeadArgElim/dbginfo-update-dbgval-local.ll @@ -5,7 +5,7 @@ ; Reproducer for PR23260. 
-; CHECK-LABEL: define internal void @bar() +; CHECK-LABEL: define internal void @bar.argelim() ; CHECK: #dbg_value(i32 poison, ![[LOCAL1:[0-9]+]] ; CHECK: call void @sink() @@ -18,9 +18,9 @@ entry: } ; CHECK-LABEL: define void @foo() -; CHECK: call void @bar() +; CHECK: call void @bar.argelim() ; CHECK: #dbg_value(i32 poison, ![[LOCAL2:[0-9]+]] -; CHECK: call void @bar() +; CHECK: call void @bar.argelim() ; Function Attrs: nounwind uwtable define void @foo() #0 !dbg !6 { diff --git a/llvm/test/Transforms/DeadArgElim/dbginfo.ll b/llvm/test/Transforms/DeadArgElim/dbginfo.ll index a27ca9dd70c245..c86fc457860519 100644 --- a/llvm/test/Transforms/DeadArgElim/dbginfo.ll +++ b/llvm/test/Transforms/DeadArgElim/dbginfo.ll @@ -14,7 +14,7 @@ ; the function->debug info mapping on update to ensure it's accurate when used ; again for the next removal. -; CHECK: define internal void @_ZL2f1iz({{.*}} !dbg [[SP:![0-9]+]] +; CHECK: define internal void @_ZL2f1iz.argelim({{.*}} !dbg [[SP:![0-9]+]] ; CHECK: [[SP]] = distinct !DISubprogram(name: "f1" ; Check that debug info metadata for subprograms stores pointers to diff --git a/llvm/test/Transforms/DeadArgElim/deadretval.ll b/llvm/test/Transforms/DeadArgElim/deadretval.ll index 910aa7b9bd2238..74359f29ccbd2b 100644 --- a/llvm/test/Transforms/DeadArgElim/deadretval.ll +++ b/llvm/test/Transforms/DeadArgElim/deadretval.ll @@ -23,7 +23,7 @@ define i32 @test3() { ; The callee function's return type shouldn't be changed if the call result is ; used. -; CHECK-LABEL: define internal ptr @callee4() +; CHECK-LABEL: define internal ptr @callee4.argelim() define internal ptr @callee4(ptr %a0) { ret ptr @g0; @@ -32,7 +32,7 @@ define internal ptr @callee4(ptr %a0) { declare void @llvm.objc.clang.arc.noop.use(...) ; CHECK-LABEL: define ptr @test4( -; CHECK: tail call ptr @callee4() [ "clang.arc.attachedcall"(ptr @llvm.objc.retainAutoreleasedReturnValue) ] +; CHECK: tail call ptr @callee4.argelim() [ "clang.arc.attachedcall"(ptr @llvm.objc.retainAutoreleasedReturnValue) ] define ptr @test4() { %call = tail call ptr @callee4(ptr @g0) [ "clang.arc.attachedcall"(ptr @llvm.objc.retainAutoreleasedReturnValue) ] diff --git a/llvm/test/Transforms/DeadArgElim/fct_ptr.ll b/llvm/test/Transforms/DeadArgElim/fct_ptr.ll index bf54fb2e8b7286..6c02bd5ee9c3c3 100644 --- a/llvm/test/Transforms/DeadArgElim/fct_ptr.ll +++ b/llvm/test/Transforms/DeadArgElim/fct_ptr.ll @@ -22,7 +22,7 @@ define i32 @call_indirect(ptr readnone %fct_ptr, i32 %arg1, i32 %arg2, i32 %arg3 ; CHECK-NEXT: [[RES2:%.*]] = tail call i32 @internal_fct(i32 poison, i32 [[ARG2]], i32 poison) ; CHECK-NEXT: br label [[END]] ; CHECK: call_other: -; CHECK-NEXT: [[RES3:%.*]] = tail call i32 @other_fct(i32 [[ARG2]]) +; CHECK-NEXT: [[RES3:%.*]] = tail call i32 @other_fct.argelim(i32 [[ARG2]]) ; CHECK-NEXT: br label [[END]] ; CHECK: end: ; CHECK-NEXT: [[FINAL_RES:%.*]] = phi i32 [ [[RES1]], [[CALL_EXT]] ], [ [[RES2]], [[CALL_INT]] ], [ [[RES3]], [[CALL_OTHER]] ] diff --git a/llvm/test/Transforms/DeadArgElim/func_metadata.ll b/llvm/test/Transforms/DeadArgElim/func_metadata.ll index 4922798b3aaa35..2d25d916b24426 100644 --- a/llvm/test/Transforms/DeadArgElim/func_metadata.ll +++ b/llvm/test/Transforms/DeadArgElim/func_metadata.ll @@ -8,7 +8,7 @@ target triple = "x86_64-unknown-linux-gnu" @s = common dso_local local_unnamed_addr global i32 0, align 4 define internal i32 @va_func(i32 %num, ...) 
!prof !28 !PGOFuncName !29{ -; CHECK: define internal void @va_func(i32 %num) !prof ![[ENTRYCOUNT:[0-9]+]] !PGOFuncName ![[PGOFUNCNAME1:[0-9]+]] { +; CHECK: define internal void @va_func.retelim(i32 %num) !prof ![[ENTRYCOUNT:[0-9]+]] !PGOFuncName ![[PGOFUNCNAME1:[0-9]+]] { entry: %0 = load i32, ptr @s, align 4, !tbaa !31 %add = add nsw i32 %0, %num @@ -17,7 +17,7 @@ entry: } define internal fastcc i32 @foo() unnamed_addr !prof !28 !PGOFuncName !30 { -; CHECK: define internal fastcc void @foo() unnamed_addr !prof ![[ENTRYCOUNT:[0-9]+]] !PGOFuncName ![[PGOFUNCNAME2:[0-9]+]] { +; CHECK: define internal fastcc void @foo.retelim() unnamed_addr !prof ![[ENTRYCOUNT:[0-9]+]] !PGOFuncName ![[PGOFUNCNAME2:[0-9]+]] { entry: %0 = load i32, ptr @s, align 4, !tbaa !31 %add = add nsw i32 %0, 8 diff --git a/llvm/test/Transforms/DeadArgElim/funclet.ll b/llvm/test/Transforms/DeadArgElim/funclet.ll index 3115c8b341415f..d56720f96379da 100644 --- a/llvm/test/Transforms/DeadArgElim/funclet.ll +++ b/llvm/test/Transforms/DeadArgElim/funclet.ll @@ -22,7 +22,7 @@ bad1: ; preds = %entry-block } ; CHECK-LABEL: define void @test1( ; CHECK: %[[pad:.*]] = cleanuppad within none [] -; CHECK-NEXT: call void @callee() [ "funclet"(token %[[pad]]) ] +; CHECK-NEXT: call void @callee.argelim() [ "funclet"(token %[[pad]]) ] declare void @thunk() diff --git a/llvm/test/Transforms/DeadArgElim/keepalive.ll b/llvm/test/Transforms/DeadArgElim/keepalive.ll index bcb9f1d5f302cf..43dd8791ff4565 100644 --- a/llvm/test/Transforms/DeadArgElim/keepalive.ll +++ b/llvm/test/Transforms/DeadArgElim/keepalive.ll @@ -10,7 +10,7 @@ declare ptr @llvm.call.preallocated.arg(token, i32) ; the function and then changing too much. ; This checks if the return value attributes are not removed -; CHECK: define internal zeroext i32 @test1() #1 +; CHECK: define internal zeroext i32 @test1.argelim() #1 define internal zeroext i32 @test1(i32 %DEADARG1) nounwind { ; ; @@ -18,7 +18,7 @@ define internal zeroext i32 @test1(i32 %DEADARG1) nounwind { } ; This checks if the struct doesn't get non-packed -; CHECK-LABEL: define internal <{ i32, i32 }> @test2( +; CHECK-LABEL: define internal <{ i32, i32 }> @test2.argelim( define internal <{ i32, i32 }> @test2(i32 %DEADARG1) { ; ; diff --git a/llvm/test/Transforms/DeadArgElim/nonzero-address-spaces.ll b/llvm/test/Transforms/DeadArgElim/nonzero-address-spaces.ll index ddd9aaac628d54..fd9d4e0c411072 100644 --- a/llvm/test/Transforms/DeadArgElim/nonzero-address-spaces.ll +++ b/llvm/test/Transforms/DeadArgElim/nonzero-address-spaces.ll @@ -5,14 +5,14 @@ target datalayout = "e-P1-p:16:8-i8:8-i16:8-i32:8-i64:8-f32:8-f64:8-n8-a:8" -; CHECK: define internal i32 @foo() addrspace(1) +; CHECK: define internal i32 @foo.argelim() addrspace(1) define internal i32 @foo(i32 %x) #0 { tail call void asm sideeffect inteldialect "mov eax, [esp + $$4]\0A\09ret", "~{eax},~{dirflag},~{fpsr},~{flags}"() unreachable } define i32 @f(i32 %x, i32 %y) { - ; CHECK: %r = call addrspace(1) i32 @foo() + ; CHECK: %r = call addrspace(1) i32 @foo.argelim() %r = call i32 @foo(i32 %x) ret i32 %r diff --git a/llvm/test/Transforms/DeadArgElim/returned.ll b/llvm/test/Transforms/DeadArgElim/returned.ll index 73f23ffa725eff..94b1c9654d4d64 100644 --- a/llvm/test/Transforms/DeadArgElim/returned.ll +++ b/llvm/test/Transforms/DeadArgElim/returned.ll @@ -3,14 +3,14 @@ %Ty = type { i32, i32 } ; Validate that the argument and return value are both dead -; CHECK-LABEL: define internal void @test1() +; CHECK-LABEL: define internal void @test1.argelim() define internal 
ptr @test1(ptr %this) { ret ptr %this } ; do not keep alive the return value of a function with a dead 'returned' argument -; CHECK-LABEL: define internal void @test2() +; CHECK-LABEL: define internal void @test2.argelim() define internal ptr @test2(ptr returned %this) { ret ptr %this @@ -20,7 +20,7 @@ define internal ptr @test2(ptr returned %this) { @dummy = global ptr null ; Validate that return value is dead -; CHECK-LABEL: define internal void @test3(ptr %this) +; CHECK-LABEL: define internal void @test3.argelim(ptr %this) define internal ptr @test3(ptr %this) { store volatile ptr %this, ptr @dummy @@ -36,7 +36,7 @@ define internal ptr @test4(ptr returned %this) { } ; don't do this if 'returned' is on the call site... -; CHECK-LABEL: define internal void @test5(ptr %this) +; CHECK-LABEL: define internal void @test5.argelim(ptr %this) define internal ptr @test5(ptr %this) { store volatile ptr %this, ptr @dummy @@ -55,7 +55,7 @@ define ptr @caller(ptr %this) { %3 = call ptr @test3(ptr %this) %4 = call ptr @test4(ptr %this) ; ...instead, drop 'returned' form the call site -; CHECK: call void @test5(ptr %this) +; CHECK: call void @test5.argelim(ptr %this) %5 = call ptr @test5(ptr returned %this) %6 = call ptr @test6() ret ptr %this diff --git a/llvm/test/Transforms/DeadArgElim/variadic_safety.ll b/llvm/test/Transforms/DeadArgElim/variadic_safety.ll index 2147e4d0b8372d..d9fc4a1c822056 100644 --- a/llvm/test/Transforms/DeadArgElim/variadic_safety.ll +++ b/llvm/test/Transforms/DeadArgElim/variadic_safety.ll @@ -34,5 +34,5 @@ define void @call_deadret(i32 %in) { store i32 42, ptr %stacked call i32 (i32, i32, ...) @va_deadret_func(i32 poison, i32 %in, [6 x i32] poison, ptr byval(i32) %stacked) ret void -; CHECK: call void (i32, i32, ...) @va_deadret_func(i32 poison, i32 poison, [6 x i32] poison, ptr byval(i32) %stacked) +; CHECK: call void (i32, i32, ...) 
@va_deadret_func.retelim(i32 poison, i32 poison, [6 x i32] poison, ptr byval(i32) %stacked) } diff --git a/llvm/test/Transforms/FunctionSpecialization/function-specialization2.ll b/llvm/test/Transforms/FunctionSpecialization/function-specialization2.ll index b6cdcf18eea429..6e92084625294d 100644 --- a/llvm/test/Transforms/FunctionSpecialization/function-specialization2.ll +++ b/llvm/test/Transforms/FunctionSpecialization/function-specialization2.ll @@ -42,15 +42,15 @@ define internal void @decrement(ptr nocapture %0) { } define i32 @main(ptr %0, i32 %1) { -; CHECK: call void @func.specialized.2(ptr [[TMP0:%.*]], i32 [[TMP1:%.*]]) +; CHECK: call void @func.specialized.2.argelim(ptr [[TMP0:%.*]], i32 [[TMP1:%.*]]) %3 = call i32 @func(ptr %0, i32 %1, ptr nonnull @increment) -; CHECK: call void @func.specialized.1(ptr [[TMP0]], i32 0) +; CHECK: call void @func.specialized.1.argelim(ptr [[TMP0]], i32 0) %4 = call i32 @func(ptr %0, i32 %3, ptr nonnull @decrement) ; CHECK: ret i32 0 ret i32 %4 } -; CHECK: @func.specialized.1( +; CHECK: @func.specialized.1.argelim( ; CHECK: [[TMP3:%.*]] = alloca i32, align 4 ; CHECK: store i32 [[TMP1:%.*]], ptr [[TMP3]], align 4 ; CHECK: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4 @@ -63,13 +63,13 @@ define i32 @main(ptr %0, i32 %1) { ; CHECK: call void @decrement(ptr [[TMP9]]) ; CHECK: [[TMP10:%.*]] = load i32, ptr [[TMP3]], align 4 ; CHECK: [[TMP11:%.*]] = add nsw i32 [[TMP10]], -1 -; CHECK: call void @func.specialized.1(ptr [[TMP0]], i32 [[TMP11]]) +; CHECK: call void @func.specialized.1.argelim(ptr [[TMP0]], i32 [[TMP11]]) ; CHECK: br label [[TMP12:%.*]] ; CHECK: 12: ; CHECK: ret void ; ; -; CHECK: @func.specialized.2( +; CHECK: @func.specialized.2.argelim( ; CHECK: [[TMP3:%.*]] = alloca i32, align 4 ; CHECK: store i32 [[TMP1:%.*]], ptr [[TMP3]], align 4 ; CHECK: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4 @@ -82,7 +82,7 @@ define i32 @main(ptr %0, i32 %1) { ; CHECK: call void @increment(ptr [[TMP9]]) ; CHECK: [[TMP10:%.*]] = load i32, ptr [[TMP3]], align 4 ; CHECK: [[TMP11:%.*]] = add nsw i32 [[TMP10]], -1 -; CHECK: call void @func.specialized.2(ptr [[TMP0]], i32 [[TMP11]]) +; CHECK: call void @func.specialized.2.argelim(ptr [[TMP0]], i32 [[TMP11]]) ; CHECK: br label [[TMP12:%.*]] ; CHECK: 12: ; CHECK: ret void diff --git a/llvm/test/Transforms/FunctionSpecialization/global-var-constants.ll b/llvm/test/Transforms/FunctionSpecialization/global-var-constants.ll index a576d9aa32e140..65b859b3a44801 100644 --- a/llvm/test/Transforms/FunctionSpecialization/global-var-constants.ll +++ b/llvm/test/Transforms/FunctionSpecialization/global-var-constants.ll @@ -49,11 +49,11 @@ entry: ; Check if specialisation on the address of a non-const global variable ; is not allowed, then it is not performed. -; NO-GLOBALS-LABEL: define internal range(i32 -2147483646, -2147483648) i32 @g() +; NO-GLOBALS-LABEL: define internal range(i32 -2147483646, -2147483648) i32 @g.argelim() ; NO-GLOBALS: call i32 @f(ptr @G) ; NO-GLOBALS-LABEL: define range(i32 -2147483646, -2147483648) i32 @h0(ptr %p) -; NO-GLOBALS:call i32 @g() +; NO-GLOBALS:call i32 @g.argelim() ; NO-GLOBALS-LABEL: define i32 @h1() ; NO-GLOBALS: call i32 @f(ptr @G) @@ -64,15 +64,15 @@ entry: ; Check if specialisation on the address of a non-const global variable ; is allowed, then it is performed where possible. 
-; GLOBALS-LABEL: define internal range(i32 -2147483646, -2147483648) i32 @g() -; GLOBALS: call i32 @f.specialized.2() +; GLOBALS-LABEL: define internal range(i32 -2147483646, -2147483648) i32 @g.argelim() +; GLOBALS: call i32 @f.specialized.2.argelim() ; GLOBALS-LABEL: define range(i32 -2147483646, -2147483648) i32 @h0(ptr %p) -; GLOBALS: call i32 @g() +; GLOBALS: call i32 @g.argelim() ; GLOBALS-LABEL: define i32 @h1() -; GLOBALS: call i32 @f.specialized.2() +; GLOBALS: call i32 @f.specialized.2.argelim() ; GLOBALS-LABEL: define i32 @h2() -; GLOBALS: call i32 @f.specialized.1() +; GLOBALS: call i32 @f.specialized.1.argelim() diff --git a/llvm/test/Transforms/FunctionSpecialization/non-argument-tracked.ll b/llvm/test/Transforms/FunctionSpecialization/non-argument-tracked.ll index 9446e557da7581..85ff084e90b198 100644 --- a/llvm/test/Transforms/FunctionSpecialization/non-argument-tracked.ll +++ b/llvm/test/Transforms/FunctionSpecialization/non-argument-tracked.ll @@ -29,9 +29,9 @@ define internal i32 @f2(i32 %i) { ;; All calls are to specilisation instances. ; CHECK-LABEL: define i32 @g0 -; CHECK: call void @f0.specialized.[[#A:]]() -; CHECK-NEXT: call void @f1.specialized.[[#B:]]() -; CHECK-NEXT: call void @f2.specialized.[[#C:]]() +; CHECK: call void @f0.specialized.[[#A:]].argelim() +; CHECK-NEXT: call void @f1.specialized.[[#B:]].argelim() +; CHECK-NEXT: call void @f2.specialized.[[#C:]].argelim() ; CHECK-NEXT: ret i32 9 define i32 @g0(i32 %i) { %u0 = call i32 @f0(i32 1) @@ -43,9 +43,9 @@ define i32 @g0(i32 %i) { } ; CHECK-LABEL: define i32 @g1 -; CHECK: call void @f0.specialized.[[#D:]]() -; CHECK-NEXT: call void @f1.specialized.[[#E:]]() -; CHECK-NEXT: call void @f2.specialized.[[#F:]]() +; CHECK: call void @f0.specialized.[[#D:]].argelim() +; CHECK-NEXT: call void @f1.specialized.[[#E:]].argelim() +; CHECK-NEXT: call void @f2.specialized.[[#F:]].argelim() ; CHECK-NEXT: ret i32 12 define i32 @g1(i32 %i) { %u0 = call i32 @f0(i32 2) @@ -58,9 +58,9 @@ define i32 @g1(i32 %i) { ; All of the function are specialized and all clones are with internal linkage. 
-; CHECK-DAG: define internal void @f0.specialized.[[#A]]() { -; CHECK-DAG: define internal void @f1.specialized.[[#B]]() { -; CHECK-DAG: define internal void @f2.specialized.[[#C]]() { -; CHECK-DAG: define internal void @f0.specialized.[[#D]]() { -; CHECK-DAG: define internal void @f1.specialized.[[#E]]() { -; CHECK-DAG: define internal void @f2.specialized.[[#F]]() { +; CHECK-DAG: define internal void @f0.specialized.[[#A]].argelim() { +; CHECK-DAG: define internal void @f1.specialized.[[#B]].argelim() { +; CHECK-DAG: define internal void @f2.specialized.[[#C]].argelim() { +; CHECK-DAG: define internal void @f0.specialized.[[#D]].argelim() { +; CHECK-DAG: define internal void @f1.specialized.[[#E]].argelim() { +; CHECK-DAG: define internal void @f2.specialized.[[#F]].argelim() { diff --git a/llvm/test/Transforms/FunctionSpecialization/specialization-order.ll b/llvm/test/Transforms/FunctionSpecialization/specialization-order.ll index da4cb40fb6dc50..1e81f2ebc409a0 100644 --- a/llvm/test/Transforms/FunctionSpecialization/specialization-order.ll +++ b/llvm/test/Transforms/FunctionSpecialization/specialization-order.ll @@ -21,7 +21,7 @@ entry: define dso_local i32 @g0(i32 %x, i32 %y) { ; CHECK-LABEL: @g0 -; CHECK: call i32 @f.specialized.3(i32 [[X:%.*]], i32 [[Y:%.*]]) +; CHECK: call i32 @f.specialized.3.argelim(i32 [[X:%.*]], i32 [[Y:%.*]]) entry: %call = tail call i32 @f(i32 %x, i32 %y, ptr @add, ptr @add) ret i32 %call @@ -30,7 +30,7 @@ entry: define dso_local i32 @g1(i32 %x, i32 %y) { ; CHECK-LABEL: @g1( -; CHECK: call i32 @f.specialized.2(i32 [[X:%.*]], i32 [[Y:%.*]]) +; CHECK: call i32 @f.specialized.2.argelim(i32 [[X:%.*]], i32 [[Y:%.*]]) entry: %call = tail call i32 @f(i32 %x, i32 %y, ptr @sub, ptr @add) ret i32 %call @@ -38,21 +38,21 @@ entry: define dso_local i32 @g2(i32 %x, i32 %y, ptr %v) { ; CHECK-LABEL: @g2 -; CHECK: call i32 @f.specialized.1(i32 [[X:%.*]], i32 [[Y:%.*]], ptr [[V:%.*]]) +; CHECK: call i32 @f.specialized.1.argelim(i32 [[X:%.*]], i32 [[Y:%.*]], ptr [[V:%.*]]) entry: %call = tail call i32 @f(i32 %x, i32 %y, ptr @sub, ptr %v) ret i32 %call } -; CHECK-LABEL: define {{.*}} i32 @f.specialized.1 +; CHECK-LABEL: define {{.*}} i32 @f.specialized.1.argelim ; CHECK: call i32 @sub(i32 %x, i32 %y) ; CHECK-NEXT: call i32 %v(i32 %x, i32 %y) -; CHECK-LABEL: define {{.*}} i32 @f.specialized.2 +; CHECK-LABEL: define {{.*}} i32 @f.specialized.2.argelim ; CHECK: call i32 @sub(i32 %x, i32 %y) ; CHECK-NEXT: call i32 @add(i32 %x, i32 %y) -; CHECK-LABEL: define {{.*}} i32 @f.specialized.3 +; CHECK-LABEL: define {{.*}} i32 @f.specialized.3.argelim ; CHECK: call i32 @add(i32 %x, i32 %y) ; CHECK-NEXT: call i32 @add(i32 %x, i32 %y) diff --git a/llvm/test/Transforms/PhaseOrdering/dae-dce.ll b/llvm/test/Transforms/PhaseOrdering/dae-dce.ll index 7ff3c5dc5536f7..389c2f5612488d 100644 --- a/llvm/test/Transforms/PhaseOrdering/dae-dce.ll +++ b/llvm/test/Transforms/PhaseOrdering/dae-dce.ll @@ -14,7 +14,8 @@ define void @do_trap(ptr %ptr) { } define internal void @capture_and_trap(ptr %ptr) noinline { -; CHECK-LABEL: @capture_and_trap( +; DEFAULT-LABEL: @capture_and_trap.argelim( +; LTO-LABEL: @capture_and_trap.argprom( ; CHECK-NEXT: tail call void @llvm.trap() ; CHECK-NEXT: unreachable ; @@ -34,7 +35,8 @@ define internal void @dead_fn2() { define void @test(i1 %c) { ; CHECK-LABEL: @test( -; CHECK-NEXT: tail call fastcc void @capture_and_trap() +; DEFAULT-NEXT: tail call fastcc void @capture_and_trap.argelim() +; LTO-NEXT: tail call fastcc void @capture_and_trap.argprom() ; CHECK-NEXT: unreachable 
 ;
   br i1 %c, label %if, label %else
 
diff --git a/llvm/test/Transforms/PhaseOrdering/dce-after-argument-promotion.ll b/llvm/test/Transforms/PhaseOrdering/dce-after-argument-promotion.ll
index c33fcfbe6ed973..9361ec16d23d55 100644
--- a/llvm/test/Transforms/PhaseOrdering/dce-after-argument-promotion.ll
+++ b/llvm/test/Transforms/PhaseOrdering/dce-after-argument-promotion.ll
@@ -8,7 +8,7 @@
 ; CHECK: [[DUMMY:@.*]] = local_unnamed_addr global i32 0
 
 define internal void @f(ptr byval(%struct.ss) align 8 %b, ptr byval(i32) align 4 %X) noinline nounwind {
-; CHECK-LABEL: define {{[^@]+}}@f
+; CHECK-LABEL: define {{[^@]+}}@f.argprom.argelim
 ; CHECK-SAME: (i32 [[B_0:%.*]]){{[^#]*}} #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TEMP:%.*]] = add i32 [[B_0]], 1
@@ -27,7 +27,7 @@ define i32 @test(ptr %X) {
 ; CHECK-LABEL: define {{[^@]+}}@test
 ; CHECK-SAME: (ptr {{[^%]*}} [[X:%.*]]){{[^#]*}} #[[ATTR1:[0-9]+]] {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    tail call {{.*}}void @f(i32 1)
+; CHECK-NEXT:    tail call {{.*}}void @f.argprom.argelim(i32 1)
 ; CHECK-NEXT:    ret i32 0
 ;
 entry:
diff --git a/llvm/test/Transforms/SCCP/recursion.ll b/llvm/test/Transforms/SCCP/recursion.ll
index f6556bee3eaba1..bc036f71d0c7e2 100644
--- a/llvm/test/Transforms/SCCP/recursion.ll
+++ b/llvm/test/Transforms/SCCP/recursion.ll
@@ -4,8 +4,8 @@
 ; CHECK-NOT: %X
 
 define internal i32 @foo(i32 %X) {
-; CHECK-LABEL: @foo(
-; CHECK-NEXT:    [[Y:%.*]] = call i32 @foo()
+; CHECK-LABEL: @foo.argelim(
+; CHECK-NEXT:    [[Y:%.*]] = call i32 @foo.argelim()
 ; CHECK-NEXT:    [[Z:%.*]] = add i32 [[Y]], 1
 ; CHECK-NEXT:    ret i32 [[Z]]
 ;
@@ -16,7 +16,7 @@ define internal i32 @foo(i32 %X) {
 
 define void @bar() {
 ; CHECK-LABEL: @bar(
-; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @foo()
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @foo.argelim()
 ; CHECK-NEXT:    ret void
 ;
   call i32 @foo( i32 17 )              ; <i32>:1 [#uses=0]

From 60a8b2b1d0842e257e2add6fb1b27cf45699b641 Mon Sep 17 00:00:00 2001
From: Aditi Medhane
Date: Thu, 19 Sep 2024 13:57:44 +0530
Subject: [PATCH 175/321] [AMDGPU] Add MachineVerifier check to detect illegal
 copies from vector register to SGPR (#105494)

Add a check to the MachineVerifier to detect and report illegal copies
from vector registers to SGPRs in the AMDGPU backend, ensuring correct
code generation. We can enforce this check only after the
SIFixSGPRCopies pass. This is a half-fix in the pipeline: with the help
of the isSSA MachineFunction property, the check runs for passes after
phi-node-elimination.
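
As an illustrative aside (not part of the patch, and using made-up
virtual register names), a minimal MIR sketch of the kind of copy the
verifier now rejects, followed by one legal way to move a uniform VGPR
value into an SGPR by reading a single lane:

  ; Reported after SIFixSGPRCopies: an SGPR defined by a COPY from a VGPR.
  %1:sgpr_32 = COPY %0:vgpr_32
  ; A legal alternative reads one lane of the VGPR instead.
  %2:sreg_32_xm0 = V_READFIRSTLANE_B32 %0:vgpr_32, implicit $exec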
--- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 29 ++++++++- llvm/lib/Target/AMDGPU/SIInstrInfo.h | 3 + .../CodeGen/AMDGPU/phi-vgpr-input-moveimm.mir | 1 - llvm/test/CodeGen/AMDGPU/wqm.mir | 9 ++- .../AMDGPU/fix-illegal-vector-copies.mir | 59 +++++++++++++++++++ 5 files changed, 92 insertions(+), 9 deletions(-) create mode 100644 llvm/test/MachineVerifier/AMDGPU/fix-illegal-vector-copies.mir diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 30aa36be99c95f..c6a9a627d457e7 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -4600,15 +4600,38 @@ static bool isSubRegOf(const SIRegisterInfo &TRI, SubReg.getReg() == SuperVec.getReg(); } +// Verify the illegal copy from vector register to SGPR for generic opcode COPY +bool SIInstrInfo::verifyCopy(const MachineInstr &MI, + const MachineRegisterInfo &MRI, + StringRef &ErrInfo) const { + Register DstReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(1).getReg(); + // This is a check for copy from vector register to SGPR + if (RI.isVectorRegister(MRI, SrcReg) && RI.isSGPRReg(MRI, DstReg)) { + ErrInfo = "illegal copy from vector register to SGPR"; + return false; + } + return true; +} + bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, StringRef &ErrInfo) const { uint16_t Opcode = MI.getOpcode(); - if (SIInstrInfo::isGenericOpcode(MI.getOpcode())) - return true; - const MachineFunction *MF = MI.getParent()->getParent(); const MachineRegisterInfo &MRI = MF->getRegInfo(); + // FIXME: At this point the COPY verify is done only for non-ssa forms. + // Find a better property to recognize the point where instruction selection + // is just done. + // We can only enforce this check after SIFixSGPRCopies pass so that the + // illegal copies are legalized and thereafter we don't expect a pass + // inserting similar copies. + if (!MRI.isSSA() && MI.isCopy()) + return verifyCopy(MI, MRI, ErrInfo); + + if (SIInstrInfo::isGenericOpcode(MI.getOpcode())) + return true; + int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0); int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1); int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 4fd9b4366159be..d560792aa1a894 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -178,6 +178,9 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { Register findUsedSGPR(const MachineInstr &MI, int OpIndices[3]) const; + bool verifyCopy(const MachineInstr &MI, const MachineRegisterInfo &MRI, + StringRef &ErrInfo) const; + protected: /// If the specific machine instruction is a instruction that moves/copies /// value from one register to another register return destination and source diff --git a/llvm/test/CodeGen/AMDGPU/phi-vgpr-input-moveimm.mir b/llvm/test/CodeGen/AMDGPU/phi-vgpr-input-moveimm.mir index d21dbd290accea..c2c5340639a16b 100644 --- a/llvm/test/CodeGen/AMDGPU/phi-vgpr-input-moveimm.mir +++ b/llvm/test/CodeGen/AMDGPU/phi-vgpr-input-moveimm.mir @@ -112,7 +112,6 @@ body: | S_BRANCH %bb.2 ... 
- --- name: phi_moveimm_bad_opcode_input tracksRegLiveness: true diff --git a/llvm/test/CodeGen/AMDGPU/wqm.mir b/llvm/test/CodeGen/AMDGPU/wqm.mir index 3013aabbd3bd42..4762760c4ba24b 100644 --- a/llvm/test/CodeGen/AMDGPU/wqm.mir +++ b/llvm/test/CodeGen/AMDGPU/wqm.mir @@ -190,9 +190,9 @@ body: | # Ensure that strict_wwm is not put around an EXEC copy #CHECK-LABEL: name: copy_exec #CHECK: %7:sreg_64 = COPY $exec -#CHECK-NEXT: %14:sreg_64 = ENTER_STRICT_WWM -1, implicit-def $exec, implicit-def $scc, implicit $exec +#CHECK-NEXT: %13:sreg_64 = ENTER_STRICT_WWM -1, implicit-def $exec, implicit-def $scc, implicit $exec #CHECK-NEXT: %8:vgpr_32 = V_MOV_B32_e32 0, implicit $exec -#CHECK-NEXT: $exec = EXIT_STRICT_WWM %14 +#CHECK-NEXT: $exec = EXIT_STRICT_WWM %13 #CHECK-NEXT: %9:vgpr_32 = V_MBCNT_LO_U32_B32_e64 %7.sub0, 0, implicit $exec name: copy_exec tracksRegLiveness: true @@ -213,10 +213,9 @@ body: | %10:vgpr_32 = V_MBCNT_LO_U32_B32_e64 %8.sub0:sreg_64, 0, implicit $exec %11:vgpr_32 = V_MOV_B32_dpp %9:vgpr_32, %10:vgpr_32, 312, 15, 15, 0, implicit $exec %12:sreg_32 = V_READLANE_B32 %11:vgpr_32, 63 - early-clobber %13:sreg_32 = STRICT_WWM %9:vgpr_32, implicit $exec + early-clobber %13:vgpr_32 = STRICT_WWM %9:vgpr_32, implicit $exec - %14:vgpr_32 = COPY %13 - BUFFER_STORE_DWORD_OFFSET_exact killed %14, %4, %5, 4, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET_exact killed %13, %4, %5, 4, 0, 0, implicit $exec S_ENDPGM 0 ... diff --git a/llvm/test/MachineVerifier/AMDGPU/fix-illegal-vector-copies.mir b/llvm/test/MachineVerifier/AMDGPU/fix-illegal-vector-copies.mir new file mode 100644 index 00000000000000..edafd3825374f7 --- /dev/null +++ b/llvm/test/MachineVerifier/AMDGPU/fix-illegal-vector-copies.mir @@ -0,0 +1,59 @@ +# RUN: not --crash llc -march=amdgcn -mcpu=gfx1200 -run-pass=none -o /dev/null %s 2>&1 | FileCheck %s + +--- +name: fix-illegal-vector-copies +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + bb.0: + %0:vgpr_32 = IMPLICIT_DEF + %0:vgpr_32 = IMPLICIT_DEF ; Break SSA format + %1:vgpr_32 = IMPLICIT_DEF + %2:sgpr_32 = IMPLICIT_DEF + %3:sgpr_32 = IMPLICIT_DEF + %4:agpr_32 = IMPLICIT_DEF + %5:agpr_32 = IMPLICIT_DEF + + ; copy from virtual VGPR to virtual SGPR + ; CHECK: *** Bad machine code: illegal copy from vector register to SGPR *** + ; CHECK: - instruction: %6:sgpr_32 = COPY %0:vgpr_32 + %6:sgpr_32 = COPY %0:vgpr_32 + + ; copy from virtual VGPR to physical SGPR + ; CHECK: *** Bad machine code: illegal copy from vector register to SGPR *** + ; CHECK: - instruction: $sgpr0 = COPY %0:vgpr_32 + $sgpr0 = COPY %0:vgpr_32 + + ; copy from physical VGPR to physical SGPR + ; CHECK: *** Bad machine code: illegal copy from vector register to SGPR *** + ; CHECK: - instruction: $sgpr1 = COPY $vgpr0 + $sgpr1 = COPY $vgpr0 + + ; copy from virtual AGPR to virtual SGPR + ; CHECK: *** Bad machine code: illegal copy from vector register to SGPR *** + ; CHECK: - instruction: %7:sgpr_32 = COPY %4:agpr_32 + %7:sgpr_32 = COPY %4:agpr_32 + + ; copy from virtual AGPR to physical SGPR + ; CHECK: *** Bad machine code: illegal copy from vector register to SGPR *** + ; CHECK: - instruction: $sgpr2 = COPY %4:agpr_32 + $sgpr2 = COPY %4:agpr_32 + + ; copy from physical AGPR to physical SGPR + ; CHECK: *** Bad machine code: illegal copy from vector register to SGPR *** + ; CHECK: - instruction: $sgpr3 = COPY $agpr0 + $sgpr3 = COPY $agpr0 + + ; copy from tuple of physical VGPRs to tuple of physical SGPRs + ; CHECK: *** Bad machine code: illegal copy from vector register to SGPR *** + ; 
CHECK: - instruction: $sgpr4_sgpr5 = COPY $vgpr0_vgpr1
+    $sgpr4_sgpr5 = COPY $vgpr0_vgpr1
+
+    ; copy from tuple of physical AGPRs to tuple of physical SGPRs
+    ; CHECK: *** Bad machine code: illegal copy from vector register to SGPR ***
+    ; CHECK: - instruction: $sgpr6_sgpr7 = COPY $agpr0_agpr1
+    $sgpr6_sgpr7 = COPY $agpr0_agpr1
+
+    S_ENDPGM 0
+...

From e762d4dac762a3fc27c6e251086b6645d7543bb2 Mon Sep 17 00:00:00 2001
From: David Sherwood
Date: Thu, 19 Sep 2024 09:41:25 +0100
Subject: [PATCH 176/321] [LoopVectorize] Teach LoopVectorizationLegality about
 more early exits (#107004)

This patch is split off from PR #88385 and concerns only the code
related to the legality of vectorising early exit loops. It is the
first step in adding support for vectorisation of a simple class of
loops that typically involves searching for something, i.e.

  for (int i = 0; i < n; i++) {
    if (p[i] == val)
      return i;
  }
  return n;

or

  for (int i = 0; i < n; i++) {
    if (p1[i] != p2[i])
      return i;
  }
  return n;

In this initial commit LoopVectorizationLegality will only consider
early exit loops legal for vectorising if they follow these criteria:

1. There are no stores in the loop.
2. The loop must have only one early exit like those shown in the
   above example. I have referred to such exits as speculative early
   exits, to distinguish from existing support for early exits where
   the exit-not-taken count is known exactly at compile time.
3. The early exit block dominates the latch block.
4. The latch block must have an exact exit count.
5. There are no loads after the early exit block.
6. The loop must not contain reductions or recurrences. I don't see
   anything fundamental blocking vectorisation of such loops, but I
   just haven't done the work to support them yet.
7. We must be able to prove at compile-time that loops will not
   contain faulting loads.

Tests have been added here:

  Transforms/LoopVectorize/AArch64/simple_early_exit.ll
---
 .../Vectorize/LoopVectorizationLegality.h     |   62 +
 .../Vectorize/LoopVectorizationLegality.cpp   |  160 +-
 .../Transforms/Vectorize/LoopVectorize.cpp    |    8 +
 .../X86/vectorization-remarks-missed.ll       |   10 +-
 .../Transforms/LoopVectorize/control-flow.ll  |    2 +-
 .../LoopVectorize/remarks-multi-exit-loops.ll |    2 +-
 .../LoopVectorize/simple_early_exit.ll        | 1941 +++++++++++++++++
 7 files changed, 2168 insertions(+), 17 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopVectorize/simple_early_exit.ll

diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
index 0f4d1355dd2bfe..091061442ae120 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -377,6 +377,24 @@ class LoopVectorizationLegality {
     return LAI->getDepChecker().getMaxSafeVectorWidthInBits();
   }
 
+  /// Returns true if the loop has a speculative early exit, i.e. an
+  /// uncountable exit that isn't the latch block.
+  bool hasSpeculativeEarlyExit() const { return HasSpeculativeEarlyExit; }
+
+  /// Returns the speculative early exiting block.
+  BasicBlock *getSpeculativeEarlyExitingBlock() const {
+    assert(getUncountableExitingBlocks().size() == 1 &&
+           "Expected only a single uncountable exiting block");
+    return getUncountableExitingBlocks()[0];
+  }
+
+  /// Returns the destination of a speculative early exiting block.
+  BasicBlock *getSpeculativeEarlyExitBlock() const {
+    assert(getUncountableExitBlocks().size() == 1 &&
+           "Expected only a single uncountable exit block");
+    return getUncountableExitBlocks()[0];
+  }
+
   /// Returns true if vector representation of the instruction \p I
   /// requires mask.
   bool isMaskRequired(const Instruction *I) const {
@@ -404,6 +422,22 @@ class LoopVectorizationLegality {
 
   DominatorTree *getDominatorTree() const { return DT; }
 
+  /// Returns all exiting blocks with a countable exit, i.e. the
+  /// exit-not-taken count is known exactly at compile time.
+  const SmallVector<BasicBlock *, 4> &getCountableExitingBlocks() const {
+    return CountableExitingBlocks;
+  }
+
+  /// Returns all the exiting blocks with an uncountable exit.
+  const SmallVector<BasicBlock *, 4> &getUncountableExitingBlocks() const {
+    return UncountableExitingBlocks;
+  }
+
+  /// Returns all the exit blocks from uncountable exiting blocks.
+  SmallVector<BasicBlock *, 4> getUncountableExitBlocks() const {
+    return UncountableExitBlocks;
+  }
+
 private:
   /// Return true if the pre-header, exiting and latch blocks of \p Lp and all
   /// its nested loops are considered legal for vectorization. These legal
@@ -446,6 +480,23 @@ class LoopVectorizationLegality {
   /// specific checks for outer loop vectorization.
   bool canVectorizeOuterLoop();
 
+  /// Returns true if this is an early exit loop that can be vectorized.
+  /// Currently, a loop with an uncountable early exit is considered
+  /// vectorizable if:
+  /// 1. There are no writes to memory in the loop.
+  /// 2. The loop has only one early uncountable exit
+  /// 3. The early exit block dominates the latch block.
+  /// 4. The latch block has an exact exit count.
+  /// 5. The loop does not contain reductions or recurrences.
+  /// 6. We can prove at compile-time that loops will not contain faulting
+  ///    loads.
+  /// 7. It is safe to speculatively execute instructions such as divide or
+  ///    call instructions.
+  /// The list above is not based on theoretical limitations of vectorization,
+  /// but simply a statement that more work is needed to support these
+  /// additional cases safely.
+  bool isVectorizableEarlyExitLoop();
+
   /// Return true if all of the instructions in the block can be speculatively
   /// executed, and record the loads/stores that require masking.
   /// \p SafePtrs is a list of addresses that are known to be legal and we know
@@ -551,6 +602,17 @@ class LoopVectorizationLegality {
   /// (potentially) make a better decision on the maximum VF and enable
   /// the use of those function variants.
   bool VecCallVariantsFound = false;
+
+  /// Indicates whether this loop has a speculative early exit, i.e. an
+  /// uncountable exiting block that is not the latch.
+  bool HasSpeculativeEarlyExit = false;
+
+  /// Keep track of all the loop exiting blocks.
+  SmallVector<BasicBlock *, 4> CountableExitingBlocks;
+  SmallVector<BasicBlock *, 4> UncountableExitingBlocks;
+
+  /// Keep track of the destinations of all uncountable exits.
+  SmallVector<BasicBlock *, 4> UncountableExitBlocks;
 };
 
 } // namespace llvm
 
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 7062e21383a5fc..9645bd877fbf5e 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -1445,6 +1445,145 @@ bool LoopVectorizationLegality::canVectorizeLoopNestCFG(
   return Result;
 }
 
+bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
+  BasicBlock *LatchBB = TheLoop->getLoopLatch();
+  if (!LatchBB) {
+    reportVectorizationFailure("Loop does not have a latch",
+                               "Cannot vectorize early exit loop",
+                               "NoLatchEarlyExit", ORE, TheLoop);
+    return false;
+  }
+
+  if (Reductions.size() || FixedOrderRecurrences.size()) {
+    reportVectorizationFailure(
+        "Found reductions or recurrences in early-exit loop",
+        "Cannot vectorize early exit loop with reductions or recurrences",
+        "RecurrencesInEarlyExitLoop", ORE, TheLoop);
+    return false;
+  }
+
+  SmallVector<BasicBlock *, 8> ExitingBlocks;
+  TheLoop->getExitingBlocks(ExitingBlocks);
+
+  // Keep a record of all the exiting blocks.
+  SmallVector<const SCEVPredicate *, 4> Predicates;
+  for (BasicBlock *BB1 : ExitingBlocks) {
+    const SCEV *EC =
+        PSE.getSE()->getPredicatedExitCount(TheLoop, BB1, &Predicates);
+    if (isa<SCEVCouldNotCompute>(EC)) {
+      UncountableExitingBlocks.push_back(BB1);
+
+      SmallVector<BasicBlock *, 2> Succs(successors(BB1));
+      if (Succs.size() != 2) {
+        reportVectorizationFailure(
+            "Early exiting block does not have exactly two successors",
+            "Incorrect number of successors from early exiting block",
+            "EarlyExitTooManySuccessors", ORE, TheLoop);
+        return false;
+      }
+
+      BasicBlock *BB2;
+      if (!TheLoop->contains(Succs[0]))
+        BB2 = Succs[0];
+      else {
+        assert(!TheLoop->contains(Succs[1]));
+        BB2 = Succs[1];
+      }
+      UncountableExitBlocks.push_back(BB2);
+    } else
+      CountableExitingBlocks.push_back(BB1);
+  }
+  Predicates.clear();
+
+  // We only support one uncountable early exit.
+  if (getUncountableExitingBlocks().size() != 1) {
+    reportVectorizationFailure(
+        "Loop has too many uncountable exits",
+        "Cannot vectorize early exit loop with more than one early exit",
+        "TooManyUncountableEarlyExits", ORE, TheLoop);
+    return false;
+  }
+
+  // The only supported early exit loops so far are ones where the early
+  // exiting block is a unique predecessor of the latch block.
+  BasicBlock *LatchPredBB = LatchBB->getUniquePredecessor();
+  if (LatchPredBB != getSpeculativeEarlyExitingBlock()) {
+    reportVectorizationFailure("Early exit is not the latch predecessor",
+                               "Cannot vectorize early exit loop",
+                               "EarlyExitNotLatchPredecessor", ORE, TheLoop);
+    return false;
+  }
+
+  // Check to see if there are instructions that could potentially generate
+  // exceptions or have side-effects.
+  auto IsSafeOperation = [](Instruction *I) -> bool {
+    switch (I->getOpcode()) {
+    case Instruction::Load:
+    case Instruction::Store:
+    case Instruction::PHI:
+    case Instruction::Br:
+      // These are checked separately.
+      return true;
+    default:
+      return isSafeToSpeculativelyExecute(I);
+    }
+  };
+
+  for (auto *BB : TheLoop->blocks())
+    for (auto &I : *BB) {
+      if (I.mayWriteToMemory()) {
+        // We don't support writes to memory.
+        reportVectorizationFailure(
+            "Writes to memory unsupported in early exit loops",
+            "Cannot vectorize early exit loop with writes to memory",
+            "WritesInEarlyExitLoop", ORE, TheLoop);
+        return false;
+      } else if (!IsSafeOperation(&I)) {
+        reportVectorizationFailure("Early exit loop contains operations that "
+                                   "cannot be speculatively executed",
+                                   "Early exit loop contains operations that "
+                                   "cannot be speculatively executed",
+                                   "UnsafeOperationsEarlyExitLoop", ORE,
+                                   TheLoop);
+        return false;
+      }
+    }
+
+  // The latch block must have a countable exit.
+  if (isa<SCEVCouldNotCompute>(
+          PSE.getSE()->getPredicatedExitCount(TheLoop, LatchBB, &Predicates))) {
+    reportVectorizationFailure(
+        "Cannot determine exact exit count for latch block",
+        "Cannot vectorize early exit loop",
+        "UnknownLatchExitCountEarlyExitLoop", ORE, TheLoop);
+    return false;
+  }
+
+  // The vectoriser cannot handle loads that occur after the early exit block.
+  assert(LatchBB->getUniquePredecessor() == getSpeculativeEarlyExitingBlock() &&
+         "Expected latch predecessor to be the early exiting block");
+
+  // TODO: Handle loops that may fault.
+  if (!isDereferenceableReadOnlyLoop(TheLoop, PSE.getSE(), DT, AC)) {
+    reportVectorizationFailure(
+        "Loop may fault",
+        "Cannot vectorize potentially faulting early exit loop",
+        "PotentiallyFaultingEarlyExitLoop", ORE, TheLoop);
+    return false;
+  }
+
+  LLVM_DEBUG(
+      dbgs()
+      << "LV: Found an early exit. Retrying with speculative exit count.\n");
+  const SCEV *SpecExitCount = PSE.getSymbolicMaxBackedgeTakenCount();
+  assert(!isa<SCEVCouldNotCompute>(SpecExitCount) &&
+         "Failed to get symbolic expression for backedge taken count");
+
+  LLVM_DEBUG(dbgs() << "LV: Found speculative backedge taken count: "
+                    << *SpecExitCount << '\n');
+  return true;
+}
+
 bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) {
   // Store the result and return it at the end instead of exiting early, in case
   // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
@@ -1505,6 +1644,17 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) {
     return false;
   }
 
+  HasSpeculativeEarlyExit = false;
+  if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
+    if (!isVectorizableEarlyExitLoop()) {
+      if (DoExtraAnalysis)
+        Result = false;
+      else
+        return false;
+    } else
+      HasSpeculativeEarlyExit = true;
+  }
+
   // Go over each instruction and look at memory deps.
   if (!canVectorizeMemory()) {
     LLVM_DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n");
     if (DoExtraAnalysis)
       Result = false;
     else
       return false;
   }
 
-  if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
-    reportVectorizationFailure("could not determine number of loop iterations",
-                               "could not determine number of loop iterations",
-                               "CantComputeNumberOfIterations", ORE, TheLoop);
-    if (DoExtraAnalysis)
-      Result = false;
-    else
-      return false;
-  }
-
   if (Result) {
     LLVM_DEBUG(dbgs() << "LV: We can vectorize this loop"
                       << (LAI->getRuntimePointerChecking()->Need
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 9fb684427cfe9d..5ee8f9db32aac8 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -9807,6 +9807,14 @@ bool LoopVectorizePass::processLoop(Loop *L) {
     return false;
   }
 
+  if (LVL.hasSpeculativeEarlyExit()) {
+    reportVectorizationFailure(
+        "Auto-vectorization of early exit loops is not yet supported.",
+        "Auto-vectorization of early exit loops is not yet supported.",
+        "EarlyExitLoopsUnsupported", ORE, L);
+    return false;
+  }
+
   // Entrance to the VPlan-native vectorization path. Outer loops are processed
   // here. They may require CFG and instruction level transformations before
   // even evaluating whether vectorization is profitable. Since we cannot modify
diff --git a/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-missed.ll b/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-missed.ll
index ac33f6e3e6f728..99911b251c81e1 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-missed.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-missed.ll
@@ -12,7 +12,7 @@
 ; }
 ; }
 ; File, line, and column should match those specified in the metadata
-; CHECK: remark: source.cpp:5:9: loop not vectorized: could not determine number of loop iterations
+; CHECK: remark: source.cpp:5:9: loop not vectorized: Cannot vectorize early exit loop
 ; CHECK: remark: source.cpp:5:9: loop not vectorized
 
 ; void test_disabled(int *A, int Length) {
@@ -46,12 +46,12 @@
 
 ; YAML:       --- !Analysis
 ; YAML-NEXT: Pass:            loop-vectorize
-; YAML-NEXT: Name:            CantComputeNumberOfIterations
+; YAML-NEXT: Name:            EarlyExitNotLatchPredecessor
 ; YAML-NEXT: DebugLoc:        { File: source.cpp, Line: 5, Column: 9 }
 ; YAML-NEXT: Function:        _Z4testPii
 ; YAML-NEXT: Args:
 ; YAML-NEXT:   - String:          'loop not vectorized: '
-; YAML-NEXT:   - String:          could not determine number of loop iterations
+; YAML-NEXT:   - String:          Cannot vectorize early exit loop
 ; YAML-NEXT: ...
 ; YAML-NEXT: --- !Missed
 ; YAML-NEXT: Pass:            loop-vectorize
@@ -117,12 +117,12 @@
 ; YAML-NEXT: ...
 ; YAML-NEXT: --- !Analysis
 ; YAML-NEXT: Pass:            loop-vectorize
-; YAML-NEXT: Name:            CantComputeNumberOfIterations
+; YAML-NEXT: Name:            EarlyExitNotLatchPredecessor
 ; YAML-NEXT: DebugLoc:        { File: source.cpp, Line: 27, Column: 3 }
 ; YAML-NEXT: Function:        test_multiple_failures
 ; YAML-NEXT: Args:
 ; YAML-NEXT:   - String:          'loop not vectorized: '
-; YAML-NEXT:   - String:          could not determine number of loop iterations
+; YAML-NEXT:   - String:          Cannot vectorize early exit loop
 ; YAML-NEXT: ...
; YAML: --- !Missed ; YAML-NEXT: Pass: loop-vectorize diff --git a/llvm/test/Transforms/LoopVectorize/control-flow.ll b/llvm/test/Transforms/LoopVectorize/control-flow.ll index a27f2f0841bca8..3a8aec34dfe43e 100644 --- a/llvm/test/Transforms/LoopVectorize/control-flow.ll +++ b/llvm/test/Transforms/LoopVectorize/control-flow.ll @@ -10,7 +10,7 @@ ; return 0; ; } -; CHECK: remark: source.cpp:5:9: loop not vectorized: could not determine number of loop iterations +; CHECK: remark: source.cpp:5:9: loop not vectorized: Cannot vectorize early exit loop with writes to memory ; CHECK: remark: source.cpp:5:9: loop not vectorized ; CHECK: _Z4testPii diff --git a/llvm/test/Transforms/LoopVectorize/remarks-multi-exit-loops.ll b/llvm/test/Transforms/LoopVectorize/remarks-multi-exit-loops.ll index 2a5240e73c6f8b..46a4592d0208ee 100644 --- a/llvm/test/Transforms/LoopVectorize/remarks-multi-exit-loops.ll +++ b/llvm/test/Transforms/LoopVectorize/remarks-multi-exit-loops.ll @@ -3,7 +3,7 @@ ; Make sure LV does not crash when generating remarks for loops with non-unique ; exit blocks. define i32 @test_non_unique_exit_blocks(ptr nocapture readonly align 4 dereferenceable(1024) %data, i32 %x) { -; CHECK: loop not vectorized: could not determine number of loop iterations +; CHECK: loop not vectorized: Cannot vectorize early exit loop ; entry: br label %for.header diff --git a/llvm/test/Transforms/LoopVectorize/simple_early_exit.ll b/llvm/test/Transforms/LoopVectorize/simple_early_exit.ll new file mode 100644 index 00000000000000..f905bcd73dc952 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/simple_early_exit.ll @@ -0,0 +1,1941 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -S < %s -p loop-vectorize -debug-only=loop-vectorize 2>%t | FileCheck %s --check-prefixes=CHECK +; RUN: cat %t | FileCheck %s --check-prefix=DEBUG + +declare void @init_mem(ptr, i64); + +define i64 @same_exit_block_pre_inc_use1() { +; DEBUG-LABEL: LV: Checking a loop in 'same_exit_block_pre_inc_use1' +; DEBUG: LV: Found an early exit. Retrying with speculative exit count. +; DEBUG-NEXT: LV: Found speculative backedge taken count: 63 +; DEBUG-NEXT: LV: We can vectorize this loop! +; DEBUG-NEXT: LV: Not vectorizing: Auto-vectorization of early exit loops is not yet supported. 
+; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use1() { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br label [[LAND_RHS:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP39:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[TMP38]], [[TMP39]] +; CHECK-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[FOR_END_LOOPEXIT:%.*]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]] +; CHECK: loop.end: +; CHECK-NEXT: [[START_0_LCSSA:%.*]] = phi i64 [ [[INDEX]], [[LAND_RHS]] ], [ 67, [[FOR_INC]] ] +; CHECK-NEXT: ret i64 [[START_0_LCSSA]] +; +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld2 = load i8, ptr %arrayidx1, align 1 + %cmp3 = icmp eq i8 %ld1, %ld2 + br i1 %cmp3, label %loop.inc, label %loop.end + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] + ret i64 %retval +} + + +define i64 @same_exit_block_pre_inc_use1_gep_two_indices() { +; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use1_gep_two_indices() { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1024 x i8], ptr [[P1]], i64 0, i64 [[INDEX]] +; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [1024 x i8], ptr [[P2]], i64 0, i64 [[INDEX]] +; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK: loop.end: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ] +; CHECK-NEXT: ret i64 [[RETVAL]] +; +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + 
br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds [1024 x i8], ptr %p1, i64 0, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds [1024 x i8], ptr %p2, i64 0, i64 %index + %ld2 = load i8, ptr %arrayidx1, align 1 + %cmp3 = icmp eq i8 %ld1, %ld2 + br i1 %cmp3, label %loop.inc, label %loop.end + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] + ret i64 %retval +} + + +define i64 @same_exit_block_pre_inc_use1_alloca_diff_type() { +; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use1_alloca_diff_type() { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [40 x i32], align 4 +; CHECK-NEXT: [[P2:%.*]] = alloca [40 x i32], align 4 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br label [[LAND_RHS:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP39:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[TMP38]], [[TMP39]] +; CHECK-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[FOR_END_LOOPEXIT:%.*]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]] +; CHECK: loop.end: +; CHECK-NEXT: [[START_0_LCSSA:%.*]] = phi i64 [ [[INDEX]], [[LAND_RHS]] ], [ 67, [[FOR_INC]] ] +; CHECK-NEXT: ret i64 [[START_0_LCSSA]] +; +entry: + %p1 = alloca [40 x i32] + %p2 = alloca [40 x i32] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld2 = load i8, ptr %arrayidx1, align 1 + %cmp3 = icmp eq i8 %ld1, %ld2 + br i1 %cmp3, label %loop.inc, label %loop.end + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] + ret i64 %retval +} + + +define i64 @same_exit_block_pre_inc_use2() { +; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use2() { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr 
[[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK: loop.end: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ 67, [[LOOP]] ], [ [[INDEX]], [[LOOP_INC]] ] +; CHECK-NEXT: ret i64 [[RETVAL]] +; +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld2 = load i8, ptr %arrayidx1, align 1 + %cmp3 = icmp eq i8 %ld1, %ld2 + br i1 %cmp3, label %loop.inc, label %loop.end + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ 67, %loop ], [ %index, %loop.inc ] + ret i64 %retval +} + +define i64 @same_exit_block_pre_inc_use3() { +; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use3() { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK: loop.end: +; CHECK-NEXT: [[INDEX_LCSSA:%.*]] = phi i64 [ [[INDEX]], [[LOOP_INC]] ], [ [[INDEX]], [[LOOP]] ] +; CHECK-NEXT: ret i64 [[INDEX_LCSSA]] +; +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld2 = load i8, ptr %arrayidx1, align 1 + %cmp3 = icmp eq i8 %ld1, %ld2 + br i1 %cmp3, label %loop.inc, label %loop.end + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + ret i64 %index +} + + +; In this example the early exit block appears in the list of ExitNotTaken +; SCEVs, but is not computable. 
+define i64 @same_exit_block_pre_inc_use4() { +; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use4() { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i64], align 8 +; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i64], align 8 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[LD1:%.*]] = load i64, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[CMP3:%.*]] = icmp ult i64 [[INDEX]], [[LD1]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK: loop.end: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ] +; CHECK-NEXT: ret i64 [[RETVAL]] +; +entry: + %p1 = alloca [1024 x i64] + %p2 = alloca [1024 x i64] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i64, ptr %p1, i64 %index + %ld1 = load i64, ptr %arrayidx, align 1 + %cmp3 = icmp ult i64 %index, %ld1 + br i1 %cmp3, label %loop.inc, label %loop.end + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] + ret i64 %retval +} + + + +define i64 @same_exit_block_post_inc_use() { +; CHECK-LABEL: define i64 @same_exit_block_post_inc_use() { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK: loop.end: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ [[INDEX_NEXT]], [[LOOP_INC]] ] +; CHECK-NEXT: ret i64 [[RETVAL]] +; +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld2 = load i8, ptr %arrayidx1, align 1 + 
%cmp3 = icmp eq i8 %ld1, %ld2 + br i1 %cmp3, label %loop.inc, label %loop.end + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ %index, %loop ], [ %index.next, %loop.inc ] + ret i64 %retval +} + +define i64 @same_exit_block_post_inc_use2() { +; CHECK-LABEL: define i64 @same_exit_block_post_inc_use2() { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; CHECK: loop.inc: +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK: loop.end: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX_NEXT]], [[LOOP]] ], [ [[INDEX]], [[LOOP_INC]] ] +; CHECK-NEXT: ret i64 [[RETVAL]] +; +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld2 = load i8, ptr %arrayidx1, align 1 + %index.next = add i64 %index, 1 + %cmp3 = icmp eq i8 %ld1, %ld2 + br i1 %cmp3, label %loop.inc, label %loop.end + +loop.inc: + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ %index.next, %loop ], [ %index, %loop.inc ] + ret i64 %retval +} + +define i64 @same_exit_block_phi_of_consts() { +; CHECK-LABEL: define i64 @same_exit_block_phi_of_consts() { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK: 
loop.end: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ 0, [[LOOP]] ], [ 1, [[LOOP_INC]] ] +; CHECK-NEXT: ret i64 [[RETVAL]] +; +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld2 = load i8, ptr %arrayidx1, align 1 + %cmp3 = icmp eq i8 %ld1, %ld2 + br i1 %cmp3, label %loop.inc, label %loop.end + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ 0, %loop ], [ 1, %loop.inc ] + ret i64 %retval +} + + +define i64 @diff_exit_block_pre_inc_use1() { +; CHECK-LABEL: define i64 @diff_exit_block_pre_inc_use1() { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_EARLY_EXIT:%.*]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END:%.*]] +; CHECK: loop.early.exit: +; CHECK-NEXT: [[RETVAL1:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ] +; CHECK-NEXT: ret i64 [[RETVAL1]] +; CHECK: loop.end: +; CHECK-NEXT: [[RETVAL2:%.*]] = phi i64 [ 67, [[LOOP_INC]] ] +; CHECK-NEXT: ret i64 [[RETVAL2]] +; +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld2 = load i8, ptr %arrayidx1, align 1 + %cmp3 = icmp eq i8 %ld1, %ld2 + br i1 %cmp3, label %loop.inc, label %loop.early.exit + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.early.exit: + %retval1 = phi i64 [ %index, %loop ] + ret i64 %retval1 + +loop.end: + %retval2 = phi i64 [ 67, %loop.inc ] + ret i64 %retval2 +} + +define i64 @diff_exit_block_pre_inc_use2() { +; CHECK-LABEL: define i64 @diff_exit_block_pre_inc_use2() { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 
[[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_EARLY_EXIT:%.*]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END:%.*]] +; CHECK: loop.early.exit: +; CHECK-NEXT: [[RETVAL1:%.*]] = phi i64 [ 67, [[LOOP]] ] +; CHECK-NEXT: ret i64 [[RETVAL1]] +; CHECK: loop.end: +; CHECK-NEXT: [[RETVAL2:%.*]] = phi i64 [ [[INDEX]], [[LOOP_INC]] ] +; CHECK-NEXT: ret i64 [[RETVAL2]] +; +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld2 = load i8, ptr %arrayidx1, align 1 + %cmp3 = icmp eq i8 %ld1, %ld2 + br i1 %cmp3, label %loop.inc, label %loop.early.exit + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.early.exit: + %retval1 = phi i64 [ 67, %loop ] + ret i64 %retval1 + +loop.end: + %retval2 = phi i64 [ %index, %loop.inc ] + ret i64 %retval2 +} + +define i64 @diff_exit_block_pre_inc_use3() { +; CHECK-LABEL: define i64 @diff_exit_block_pre_inc_use3() { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_EARLY_EXIT:%.*]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END:%.*]] +; CHECK: loop.early.exit: +; CHECK-NEXT: [[INDEX_LCSSA:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ] +; CHECK-NEXT: ret i64 [[INDEX_LCSSA]] +; CHECK: loop.end: +; CHECK-NEXT: [[INDEX_LCSSA1:%.*]] = phi i64 [ [[INDEX]], [[LOOP_INC]] ] +; CHECK-NEXT: ret i64 [[INDEX_LCSSA1]] +; +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + 
%arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld2 = load i8, ptr %arrayidx1, align 1 + %cmp3 = icmp eq i8 %ld1, %ld2 + br i1 %cmp3, label %loop.inc, label %loop.early.exit + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.early.exit: + ret i64 %index + +loop.end: + ret i64 %index +} + + +define i64 @diff_exit_block_phi_of_consts() { +; CHECK-LABEL: define i64 @diff_exit_block_phi_of_consts() { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_EARLY_EXIT:%.*]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END:%.*]] +; CHECK: loop.early.exit: +; CHECK-NEXT: ret i64 0 +; CHECK: loop.end: +; CHECK-NEXT: ret i64 1 +; +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld2 = load i8, ptr %arrayidx1, align 1 + %cmp3 = icmp eq i8 %ld1, %ld2 + br i1 %cmp3, label %loop.inc, label %loop.early.exit + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.early.exit: + ret i64 0 + +loop.end: + ret i64 1 +} + + +define i64 @diff_exit_block_post_inc_use1() { +; CHECK-LABEL: define i64 @diff_exit_block_post_inc_use1() { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_EARLY_EXIT:%.*]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label 
[[LOOP_END:%.*]] +; CHECK: loop.early.exit: +; CHECK-NEXT: [[RETVAL1:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ] +; CHECK-NEXT: ret i64 [[RETVAL1]] +; CHECK: loop.end: +; CHECK-NEXT: [[RETVAL2:%.*]] = phi i64 [ [[INDEX_NEXT]], [[LOOP_INC]] ] +; CHECK-NEXT: ret i64 [[RETVAL2]] +; +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld2 = load i8, ptr %arrayidx1, align 1 + %cmp3 = icmp eq i8 %ld1, %ld2 + br i1 %cmp3, label %loop.inc, label %loop.early.exit + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.early.exit: + %retval1 = phi i64 [ %index, %loop ] + ret i64 %retval1 + +loop.end: + %retval2 = phi i64 [ %index.next, %loop.inc ] + ret i64 %retval2 +} + + +define i64 @diff_exit_block_post_inc_use2() { +; CHECK-LABEL: define i64 @diff_exit_block_post_inc_use2() { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_EARLY_EXIT:%.*]] +; CHECK: loop.inc: +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END:%.*]] +; CHECK: loop.early.exit: +; CHECK-NEXT: [[RETVAL1:%.*]] = phi i64 [ [[INDEX_NEXT]], [[LOOP]] ] +; CHECK-NEXT: ret i64 [[RETVAL1]] +; CHECK: loop.end: +; CHECK-NEXT: [[RETVAL2:%.*]] = phi i64 [ [[INDEX]], [[LOOP_INC]] ] +; CHECK-NEXT: ret i64 [[RETVAL2]] +; +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %index.next = add i64 %index, 1 + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld2 = load i8, ptr %arrayidx1, align 1 + %cmp3 = icmp eq i8 %ld1, %ld2 + br i1 %cmp3, label %loop.inc, label %loop.early.exit + +loop.inc: + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.early.exit: + %retval1 = phi i64 [ %index.next, %loop ] + ret i64 %retval1 + +loop.end: + %retval2 = phi i64 [ %index, %loop.inc ] + ret i64 %retval2 +} + + +; The early exit (i.e. unknown exit-not-taken count) is the latch - we don't +; support this yet. 
+define i64 @early_exit_on_last_block() { +; DEBUG-LABEL: LV: Checking a loop in 'early_exit_on_last_block' +; DEBUG: LV: Not vectorizing: Early exit is not the latch predecessor. +; CHECK-LABEL: define i64 @early_exit_on_last_block() { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br label [[LAND_RHS:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[SEARCH:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[CMP1:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; CHECK-NEXT: br i1 [[CMP1]], label [[SEARCH]], label [[FOR_END_LOOPEXIT:%.*]] +; CHECK: search: +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP41:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP42:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[TMP41]], [[TMP42]] +; CHECK-NEXT: br i1 [[CMP3]], label [[FOR_END_LOOPEXIT]], label [[LAND_RHS]] +; CHECK: loop.end: +; CHECK-NEXT: [[START_0_LCSSA:%.*]] = phi i64 [ 64, [[LAND_RHS]] ], [ [[INDEX]], [[SEARCH]] ] +; CHECK-NEXT: ret i64 [[START_0_LCSSA]] +; +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %search ], [ 3, %entry ] + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %search, label %loop.end + +search: + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld2 = load i8, ptr %arrayidx1, align 1 + %cmp3 = icmp eq i8 %ld1, %ld2 + br i1 %cmp3, label %loop.end, label %loop + +loop.end: + %retval = phi i64 [ 64, %loop ], [ %index, %search ] + ret i64 %retval +} + + +; There are multiple exit blocks - two of them have an exact representation for the +; exit-not-taken counts and the other is unknown, i.e. the "early exit". 
+define i64 @multiple_exits_one_early() { +; CHECK-LABEL: define i64 @multiple_exits_one_early() { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[CMP1:%.*]] = icmp ne i64 [[INDEX]], 64 +; CHECK-NEXT: br i1 [[CMP1]], label [[SEARCH:%.*]], label [[LOOP_END:%.*]] +; CHECK: search: +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_END]], label [[LOOP_INC]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 128 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK: loop.end: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ 64, [[LOOP]] ], [ [[INDEX]], [[SEARCH]] ], [ 128, [[LOOP_INC]] ] +; CHECK-NEXT: ret i64 [[RETVAL]] +; +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %cmp1 = icmp ne i64 %index, 64 + br i1 %cmp1, label %search, label %loop.end + +search: + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld2 = load i8, ptr %arrayidx1, align 1 + %cmp3 = icmp eq i8 %ld1, %ld2 + br i1 %cmp3, label %loop.end, label %loop.inc + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 128 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ 64, %loop ], [ %index, %search ], [ 128, %loop.inc ] + ret i64 %retval +} + + +; We don't currently support multiple early exits. +define i64 @multiple_early_exits() { +; DEBUG-LABEL: LV: Checking a loop in 'multiple_early_exits' +; DEBUG: LV: Not vectorizing: Loop has too many uncountable exits. 
+; CHECK-LABEL: define i64 @multiple_early_exits() { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br label [[LAND_RHS:%.*]] +; CHECK: search1: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[FOR_INC1:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP41:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP42:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[TMP41]], [[TMP42]] +; CHECK-NEXT: br i1 [[CMP3]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_INC:%.*]] +; CHECK: search2: +; CHECK-NEXT: [[CMP2:%.*]] = icmp ult i8 [[TMP41]], 34 +; CHECK-NEXT: br i1 [[CMP2]], label [[FOR_END_LOOPEXIT]], label [[FOR_INC1]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]] +; CHECK: loop.end: +; CHECK-NEXT: [[START_0_LCSSA:%.*]] = phi i64 [ [[INDEX]], [[LAND_RHS]] ], [ 100, [[FOR_INC]] ], [ 43, [[FOR_INC1]] ] +; CHECK-NEXT: ret i64 [[START_0_LCSSA]] +; +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %search1 + +search1: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld2 = load i8, ptr %arrayidx1, align 1 + %cmp1 = icmp eq i8 %ld1, %ld2 + br i1 %cmp1, label %loop.end, label %search2 + +search2: + %cmp2 = icmp ult i8 %ld1, 34 + br i1 %cmp2, label %loop.end, label %loop.inc + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %search1, label %loop.end + +loop.end: + %retval = phi i64 [ %index, %search1 ], [ 100, %search2 ], [ 43, %loop.inc ] + ret i64 %retval +} + + +define i64 @early_exit_infinite_loop() { +; DEBUG-LABEL: LV: Checking a loop in 'early_exit_infinite_loop' +; DEBUG: LV: Not vectorizing: Cannot determine exact exit count for latch block. 
+; CHECK-LABEL: define i64 @early_exit_infinite_loop() { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br label [[LAND_RHS:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP39:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[TMP38]], [[TMP39]] +; CHECK-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[FOR_END_LOOPEXIT:%.*]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; CHECK-NEXT: br label [[LAND_RHS]] +; CHECK: loop.end: +; CHECK-NEXT: [[START_0_LCSSA:%.*]] = phi i64 [ [[INDEX]], [[LAND_RHS]] ] +; CHECK-NEXT: ret i64 [[START_0_LCSSA]] +; +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld2 = load i8, ptr %arrayidx1, align 1 + %cmp3 = icmp eq i8 %ld1, %ld2 + br i1 %cmp3, label %loop.inc, label %loop.end + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br label %loop + +loop.end: + %retval = phi i64 [ %index, %loop ] + ret i64 %retval +} + + +define i64 @same_exit_block_pre_inc_use_inv_cond(i1 %cond) { +; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use_inv_cond( +; CHECK-SAME: i1 [[COND:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] +; CHECK-NEXT: [[CMP4:%.*]] = select i1 [[COND]], i1 [[CMP3]], i1 false +; CHECK-NEXT: br i1 [[CMP4]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK: loop.end: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ] +; CHECK-NEXT: ret i64 [[RETVAL]] +; +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, 
%loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld2 = load i8, ptr %arrayidx1, align 1 + %cmp3 = icmp eq i8 %ld1, %ld2 + %cmp4 = select i1 %cond, i1 %cmp3, i1 false + br i1 %cmp4, label %loop.inc, label %loop.end + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] + ret i64 %retval +} + + +define i64 @loop_contains_safe_call() { +; DEBUG-LABEL: LV: Checking a loop in 'loop_contains_safe_call' +; DEBUG: LV: Found an early exit. Retrying with speculative exit count. +; DEBUG-NEXT: LV: Found speculative backedge taken count: 63 +; DEBUG-NEXT: LV: We can vectorize this loop! +; CHECK-LABEL: define i64 @loop_contains_safe_call() { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[LD1:%.*]] = load float, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[SQRT:%.*]] = tail call fast float @llvm.sqrt.f32(float [[LD1]]) +; CHECK-NEXT: [[CMP:%.*]] = fcmp fast ult float [[SQRT]], 3.000000e+00 +; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK: loop.end: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ] +; CHECK-NEXT: ret i64 [[RETVAL]] +; +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds float, ptr %p1, i64 %index + %ld1 = load float, ptr %arrayidx, align 1 + %sqrt = tail call fast float @llvm.sqrt.f32(float %ld1) + %cmp = fcmp fast ult float %sqrt, 3.0e+00 + br i1 %cmp, label %loop.inc, label %loop.end + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] + ret i64 %retval +} + + +define i64 @loop_contains_unsafe_call() { +; DEBUG-LABEL: LV: Checking a loop in 'loop_contains_unsafe_call' +; DEBUG: LV: Not vectorizing: Early exit loop contains operations that cannot be speculatively executed. 
+; CHECK-LABEL: define i64 @loop_contains_unsafe_call() { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[LD1:%.*]] = load i32, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[BAD_CALL:%.*]] = call i32 @foo(i32 [[LD1]]) #[[ATTR2:[0-9]+]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[BAD_CALL]], 34 +; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK: loop.end: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ] +; CHECK-NEXT: ret i64 [[RETVAL]] +; +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i32, ptr %p1, i64 %index + %ld1 = load i32, ptr %arrayidx, align 1 + %bad_call = call i32 @foo(i32 %ld1) #0 + %cmp = icmp eq i32 %bad_call, 34 + br i1 %cmp, label %loop.inc, label %loop.end + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] + ret i64 %retval +} + + +define i64 @loop_contains_safe_div() { +; DEBUG-LABEL: LV: Checking a loop in 'loop_contains_safe_div' +; DEBUG: LV: Found an early exit. Retrying with speculative exit count. +; DEBUG-NEXT: LV: Found speculative backedge taken count: 63 +; DEBUG-NEXT: LV: We can vectorize this loop! 
+; CHECK-LABEL: define i64 @loop_contains_safe_div() { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br label [[LOOP1:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX2:%.*]] = phi i64 [ [[INDEX_NEXT1:%.*]], [[LOOP_INC1:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[INDEX2]] +; CHECK-NEXT: [[LD1:%.*]] = load i32, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[DIV:%.*]] = udiv i32 [[LD1]], 20000 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[DIV]], 1 +; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_INC1]], label [[LOOP_END:%.*]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT1]] = add i64 [[INDEX2]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT1]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP1]], label [[LOOP_END]] +; CHECK: loop.end: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX2]], [[LOOP1]] ], [ 67, [[LOOP_INC1]] ] +; CHECK-NEXT: ret i64 [[RETVAL]] +; +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i32, ptr %p1, i64 %index + %ld1 = load i32, ptr %arrayidx, align 1 + %div = udiv i32 %ld1, 20000 + %cmp = icmp eq i32 %div, 1 + br i1 %cmp, label %loop.inc, label %loop.end + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] + ret i64 %retval +} + + +define i64 @loop_contains_unsafe_div() { +; DEBUG-LABEL: LV: Checking a loop in 'loop_contains_unsafe_div' +; DEBUG: LV: Not vectorizing: Early exit loop contains operations that cannot be speculatively executed. 
+; CHECK-LABEL: define i64 @loop_contains_unsafe_div() { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[LD1:%.*]] = load i32, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[DIV:%.*]] = udiv i32 20000, [[LD1]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[DIV]], 1 +; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK: loop.end: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ] +; CHECK-NEXT: ret i64 [[RETVAL]] +; +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i32, ptr %arrayidx, align 1 + %div = udiv i32 20000, %ld1 + %cmp = icmp eq i32 %div, 1 + br i1 %cmp, label %loop.inc, label %loop.end + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] + ret i64 %retval +} + + +define i64 @loop_contains_store(ptr %dest) { +; DEBUG-LABEL: LV: Checking a loop in 'loop_contains_store' +; DEBUG: LV: Not vectorizing: Writes to memory unsupported in early exit loops +; CHECK-LABEL: define i64 @loop_contains_store( +; CHECK-SAME: ptr [[DEST:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[LD1:%.*]] = load i32, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DEST]], i64 [[INDEX]] +; CHECK-NEXT: store i32 [[LD1]], ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[LD1]], 1 +; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK: loop.end: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ] +; CHECK-NEXT: ret i64 [[RETVAL]] +; +entry: + %p1 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i32, ptr %p1, i64 %index + %ld1 = load i32, ptr %arrayidx, align 1 + %arrayidx2 = getelementptr inbounds i32, ptr %dest, i64 %index + store i32 %ld1, ptr %arrayidx2, align 4 + %cmp = icmp eq i32 
%ld1, 1 + br i1 %cmp, label %loop.inc, label %loop.end + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] + ret i64 %retval +} + + +define i64 @loop_contains_load_after_early_exit(ptr dereferenceable(1024) align(8) %p2) { +; DEBUG-LABEL: LV: Checking a loop in 'loop_contains_load_after_early_exit' +; DEBUG: LV: Found an early exit. Retrying with speculative exit count. +; DEBUG-NEXT: LV: Found speculative backedge taken count: 63 +; DEBUG-NEXT: LV: We can vectorize this loop! +; DEBUG-NEXT: LV: Not vectorizing: Auto-vectorization of early exit loops is not yet supported. +; CHECK-LABEL: define i64 @loop_contains_load_after_early_exit( +; CHECK-SAME: ptr align 8 dereferenceable(1024) [[P2:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[LD1:%.*]] = load i32, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[LD1]], 1 +; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; CHECK: loop.inc: +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[LD2:%.*]] = load i64, ptr [[ARRAYIDX2]], align 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK: loop.end: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ [[LD2]], [[LOOP_INC]] ] +; CHECK-NEXT: ret i64 [[RETVAL]] +; +entry: + %p1 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i32, ptr %p1, i64 %index + %ld1 = load i32, ptr %arrayidx, align 1 + %cmp = icmp eq i32 %ld1, 1 + br i1 %cmp, label %loop.inc, label %loop.end + +loop.inc: + %arrayidx2 = getelementptr inbounds i64, ptr %p2, i64 %index + %ld2 = load i64, ptr %arrayidx2, align 8 + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ %index, %loop ], [ %ld2, %loop.inc ] + ret i64 %retval +} + + +define i64 @early_exit_in_conditional_block(ptr %mask) { +; DEBUG-LABEL: LV: Checking a loop in 'early_exit_in_conditional_block' +; DEBUG: LV: Not vectorizing: Early exit is not the latch predecessor. 
+; CHECK-LABEL: define i64 @early_exit_in_conditional_block( +; CHECK-SAME: ptr [[MASK:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[MASK]], i64 [[INDEX]] +; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[CMP1:%.*]] = icmp ne i8 [[LD1]], 0 +; CHECK-NEXT: br i1 [[CMP1]], label [[LOOP_SEARCH:%.*]], label [[LOOP_INC]] +; CHECK: loop.search: +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[LD3:%.*]] = load i8, ptr [[ARRAYIDX3]], align 1 +; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i8 [[LD2]], [[LD3]] +; CHECK-NEXT: br i1 [[CMP2]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK: loop.end: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP_SEARCH]] ], [ 67, [[LOOP_INC]] ] +; CHECK-NEXT: ret i64 [[RETVAL]] +; +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx1 = getelementptr inbounds i8, ptr %mask, i64 %index + %ld1 = load i8, ptr %arrayidx1, align 1 + %cmp1 = icmp ne i8 %ld1, 0 + br i1 %cmp1, label %loop.search, label %loop.inc + +loop.search: + %arrayidx2 = getelementptr inbounds i8, ptr %p1, i64 %index + %ld2 = load i8, ptr %arrayidx2, align 1 + %arrayidx3 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld3 = load i8, ptr %arrayidx3, align 1 + %cmp2 = icmp eq i8 %ld2, %ld3 + br i1 %cmp2, label %loop.inc, label %loop.end + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ %index, %loop.search ], [ 67, %loop.inc ] + ret i64 %retval +} + + +define i64 @same_exit_block_pre_inc_use1_reverse() { +; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use1_reverse() { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 1023, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; CHECK: loop.inc: +; 
CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], -1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP_END]], label [[LOOP]] +; CHECK: loop.end: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 1024, [[LOOP_INC]] ] +; CHECK-NEXT: ret i64 [[RETVAL]] +; +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 1023, %entry ] + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld2 = load i8, ptr %arrayidx1, align 1 + %cmp3 = icmp eq i8 %ld1, %ld2 + br i1 %cmp3, label %loop.inc, label %loop.end + +loop.inc: + %index.next = add i64 %index, -1 + %exitcond = icmp eq i64 %index.next, 0 + br i1 %exitcond, label %loop.end, label %loop + +loop.end: + %retval = phi i64 [ %index, %loop ], [ 1024, %loop.inc ] + ret i64 %retval +} + + +define i64 @same_exit_block_pre_inc_use1_with_reduction() { +; DEBUG-LABEL: LV: Checking a loop in 'same_exit_block_pre_inc_use1_with_reduction' +; DEBUG: LV: Not vectorizing: Found reductions or recurrences in early-exit loop. +; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use1_with_reduction() { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br label [[LAND_RHS:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[RED:%.*]] = phi i64 [ [[RED_NEXT:%.*]], [[FOR_INC]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP39:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[LD2_ZEXT:%.*]] = zext i8 [[TMP39]] to i64 +; CHECK-NEXT: [[RED_NEXT]] = add i64 [[RED]], [[LD2_ZEXT]] +; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[TMP38]], [[TMP39]] +; CHECK-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[FOR_END_LOOPEXIT:%.*]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]] +; CHECK: loop.end: +; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i64 [ [[RED_NEXT]], [[FOR_INC]] ], [ [[RED_NEXT]], [[LAND_RHS]] ] +; CHECK-NEXT: [[FINAL_IND:%.*]] = phi i64 [ [[INDEX]], [[LAND_RHS]] ], [ 67, [[FOR_INC]] ] +; CHECK-NEXT: [[START_0_LCSSA:%.*]] = add i64 [[RED_NEXT_LCSSA]], [[FINAL_IND]] +; CHECK-NEXT: ret i64 [[START_0_LCSSA]] +; +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %red = phi i64 [ %red.next, %loop.inc ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld2 = load i8, ptr %arrayidx1, align 1 + %ld2.zext = zext i8 %ld2 to i64 + 
%red.next = add i64 %red, %ld2.zext + %cmp3 = icmp eq i8 %ld1, %ld2 + br i1 %cmp3, label %loop.inc, label %loop.end + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %final.ind = phi i64 [ %index, %loop ], [ 67, %loop.inc ] + %retval = add i64 %red.next, %final.ind + ret i64 %retval +} + + +define i64 @same_exit_block_pre_inc_use1_deref_ptrs(ptr dereferenceable(1024) %p1, ptr dereferenceable(1024) %p2) { +; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use1_deref_ptrs( +; CHECK-SAME: ptr dereferenceable(1024) [[P1:%.*]], ptr dereferenceable(1024) [[P2:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK: loop.end: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ] +; CHECK-NEXT: ret i64 [[RETVAL]] +; +entry: + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld2 = load i8, ptr %arrayidx1, align 1 + %cmp3 = icmp eq i8 %ld1, %ld2 + br i1 %cmp3, label %loop.inc, label %loop.end + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] + ret i64 %retval +} + + +; The form of the induction variables requires SCEV predicates. +; TODO: We should fix isDereferenceableAndAlignedInLoop and +; getSmallConstantMaxTripCount to cope with SCEV predicates when +; requesting the small constant max trip count. +define i32 @diff_exit_block_needs_scev_check(i32 %end) { +; DEBUG-LABEL: LV: Checking a loop in 'diff_exit_block_needs_scev_check' +; DEBUG: LV: Not vectorizing: Loop may fault. 
+; CHECK-LABEL: define i32 @diff_exit_block_needs_scev_check( +; CHECK-SAME: i32 [[END:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i32], align 4 +; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i32], align 4 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: [[END_CLAMPED:%.*]] = and i32 [[END]], 1023 +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[IND:%.*]] = phi i8 [ [[IND_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[GEP_IND:%.*]] = phi i64 [ [[GEP_IND_NEXT:%.*]], [[FOR_INC]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[GEP_IND]] +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[P2]], i64 [[GEP_IND]] +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[CMP_EARLY:%.*]] = icmp eq i32 [[TMP0]], [[TMP1]] +; CHECK-NEXT: br i1 [[CMP_EARLY]], label [[FOUND:%.*]], label [[FOR_INC]] +; CHECK: for.inc: +; CHECK-NEXT: [[IND_NEXT]] = add i8 [[IND]], 1 +; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[IND_NEXT]] to i32 +; CHECK-NEXT: [[GEP_IND_NEXT]] = add i64 [[GEP_IND]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[CONV]], [[END_CLAMPED]] +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[EXIT:%.*]] +; CHECK: found: +; CHECK-NEXT: ret i32 1 +; CHECK: exit: +; CHECK-NEXT: ret i32 0 +; +entry: + %p1 = alloca [1024 x i32] + %p2 = alloca [1024 x i32] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + %end.clamped = and i32 %end, 1023 + br label %for.body + +for.body: + %ind = phi i8 [ %ind.next, %for.inc ], [ 0, %entry ] + %gep.ind = phi i64 [ %gep.ind.next, %for.inc ], [ 0, %entry ] + %arrayidx1 = getelementptr inbounds i32, ptr %p1, i64 %gep.ind + %0 = load i32, ptr %arrayidx1, align 4 + %arrayidx2 = getelementptr inbounds i32, ptr %p2, i64 %gep.ind + %1 = load i32, ptr %arrayidx2, align 4 + %cmp.early = icmp eq i32 %0, %1 + br i1 %cmp.early, label %found, label %for.inc + +for.inc: + %ind.next = add i8 %ind, 1 + %conv = zext i8 %ind.next to i32 + %gep.ind.next = add i64 %gep.ind, 1 + %cmp = icmp ult i32 %conv, %end.clamped + br i1 %cmp, label %for.body, label %exit + +found: + ret i32 1 + +exit: + ret i32 0 +} + + +declare void @abort() + +; This is a variant of an early exit loop where the condition for leaving +; early is loop invariant. +define i32 @diff_blocks_invariant_early_exit_cond(ptr %s) { +; DEBUG-LABEL: LV: Checking a loop in 'diff_blocks_invariant_early_exit_cond' +; DEBUG: LV: Found an early exit. Retrying with speculative exit count. +; DEBUG-NEXT: LV: Found speculative backedge taken count: 275 +; DEBUG: LV: Not vectorizing: Auto-vectorization of early exit loops is not yet supported. 
+; CHECK-LABEL: define i32 @diff_blocks_invariant_early_exit_cond( +; CHECK-SAME: ptr [[S:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SVAL:%.*]] = load i32, ptr [[S]], align 4 +; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[SVAL]], 0 +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[IND:%.*]] = phi i32 [ -10, [[ENTRY:%.*]] ], [ [[IND_NEXT:%.*]], [[FOR_INC:%.*]] ] +; CHECK-NEXT: br i1 [[COND]], label [[FOR_INC]], label [[EARLY_EXIT:%.*]] +; CHECK: for.inc: +; CHECK-NEXT: [[IND_NEXT]] = add nsw i32 [[IND]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[IND_NEXT]], 266 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]] +; CHECK: early.exit: +; CHECK-NEXT: tail call void @abort() +; CHECK-NEXT: unreachable +; CHECK: for.end: +; CHECK-NEXT: ret i32 0 +; +entry: + %sval = load i32, ptr %s, align 4 + %cond = icmp eq i32 %sval, 0 + br label %for.body + +for.body: + %ind = phi i32 [ -10, %entry ], [ %ind.next, %for.inc ] + br i1 %cond, label %for.inc, label %early.exit + +for.inc: + %ind.next = add nsw i32 %ind, 1 + %exitcond.not = icmp eq i32 %ind.next, 266 + br i1 %exitcond.not, label %for.end, label %for.body + +early.exit: + tail call void @abort() + unreachable + +for.end: + ret i32 0 +} + + +define i64 @early_exit_has_multiple_outside_successors() { +; DEBUG-LABEL: LV: Checking a loop in 'early_exit_has_multiple_outside_successors' +; DEBUG: LV: Not vectorizing: Loop contains an unsupported switch +; CHECK-LABEL: define i64 @early_exit_has_multiple_outside_successors() { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: switch i8 [[LD1]], label [[LOOP_INC]] [ +; CHECK-NEXT: i8 2, label [[LOOP_END:%.*]] +; CHECK-NEXT: i8 3, label [[LOOP_SURPRISE:%.*]] +; CHECK-NEXT: ] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK: loop.surprise: +; CHECK-NEXT: ret i64 3 +; CHECK: loop.end: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ] +; CHECK-NEXT: ret i64 [[RETVAL]] +; +entry: + %p1 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + switch i8 %ld1, label %loop.inc [ + i8 2, label %loop.end + i8 3, label %loop.surprise + ] + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.surprise: + ret i64 3 + +loop.end: + %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] + ret i64 %retval +} + + +define i64 @same_exit_block_pre_inc_use1_too_small_allocas() { +; DEBUG-LABEL: LV: Checking a loop in 'same_exit_block_pre_inc_use1_too_small_allocas' +; DEBUG: LV: Not vectorizing: Loop may fault. 
+; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use1_too_small_allocas() { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [42 x i8], align 1 +; CHECK-NEXT: [[P2:%.*]] = alloca [42 x i8], align 1 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK: loop.end: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ] +; CHECK-NEXT: ret i64 [[RETVAL]] +; +entry: + %p1 = alloca [42 x i8] + %p2 = alloca [42 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld2 = load i8, ptr %arrayidx1, align 1 + %cmp3 = icmp eq i8 %ld1, %ld2 + br i1 %cmp3, label %loop.inc, label %loop.end + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] + ret i64 %retval +} + + +define i64 @same_exit_block_pre_inc_use1_too_small_deref_ptrs(ptr dereferenceable(42) %p1, ptr dereferenceable(42) %p2) { +; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use1_too_small_deref_ptrs( +; CHECK-SAME: ptr dereferenceable(42) [[P1:%.*]], ptr dereferenceable(42) [[P2:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK: loop.end: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ] +; CHECK-NEXT: ret i64 [[RETVAL]] +; +entry: + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld2 = load i8, ptr 
%arrayidx1, align 1
+  %cmp3 = icmp eq i8 %ld1, %ld2
+  br i1 %cmp3, label %loop.inc, label %loop.end
+
+loop.inc:
+  %index.next = add i64 %index, 1
+  %exitcond = icmp ne i64 %index.next, 67
+  br i1 %exitcond, label %loop, label %loop.end
+
+loop.end:
+  %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ]
+  ret i64 %retval
+}
+
+
+define i64 @same_exit_block_pre_inc_use1_unknown_ptrs(ptr %p1, ptr %p2) {
+; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use1_unknown_ptrs(
+; CHECK-SAME: ptr [[P1:%.*]], ptr [[P2:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
+; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]]
+; CHECK: loop.inc:
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]]
+; CHECK: loop.end:
+; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ]
+; CHECK-NEXT: ret i64 [[RETVAL]]
+;
+entry:
+  br label %loop
+
+loop:
+  %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ]
+  %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index
+  %ld1 = load i8, ptr %arrayidx, align 1
+  %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index
+  %ld2 = load i8, ptr %arrayidx1, align 1
+  %cmp3 = icmp eq i8 %ld1, %ld2
+  br i1 %cmp3, label %loop.inc, label %loop.end
+
+loop.inc:
+  %index.next = add i64 %index, 1
+  %exitcond = icmp ne i64 %index.next, 67
+  br i1 %exitcond, label %loop, label %loop.end
+
+loop.end:
+  %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ]
+  ret i64 %retval
+}
+
+
+declare i32 @foo(i32) readonly
+declare <vscale x 4 x i32> @foo_vec(<vscale x 4 x i32>)
+
+attributes #0 = { "vector-function-abi-variant"="_ZGVsNxv_foo(foo_vec)" }

From 57777a5066a6b872f7576a81f021d18899595e38 Mon Sep 17 00:00:00 2001
From: Benjamin Kramer
Date: Thu, 19 Sep 2024 11:01:58 +0200
Subject: [PATCH 177/321] [LoopVectorize] Silence unused variable warning

---
 llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 9645bd877fbf5e..a4787483813a9a 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -1575,7 +1575,8 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
     LLVM_DEBUG(
         dbgs() << "LV: Found an early exit.
Retrying with speculative exit count.\n");
-    const SCEV *SpecExitCount = PSE.getSymbolicMaxBackedgeTakenCount();
+    [[maybe_unused]] const SCEV *SpecExitCount =
+        PSE.getSymbolicMaxBackedgeTakenCount();
     assert(!isa<SCEVCouldNotCompute>(SpecExitCount) &&
            "Failed to get symbolic expression for backedge taken count");

From bca507387ae1945137214ec7fb80b709927ee6e8 Mon Sep 17 00:00:00 2001
From: Michael Buch
Date: Thu, 19 Sep 2024 10:06:28 +0100
Subject: [PATCH 178/321] [lldb][FrameRecognizer] Display the first non-std frame on verbose_trap (#108825)

This attempts to improve the user experience when LLDB stops on a verbose_trap.
Currently, if a `__builtin_verbose_trap` triggers, we display the first frame
above the call to the verbose_trap. So in the newly added test case, we
would've previously stopped here:
```
(lldb) run
Process 28095 launched: '/Users/michaelbuch/a.out' (arm64)
Process 28095 stopped
* thread #1, queue = 'com.apple.main-thread', stop reason = Bounds error: out-of-bounds access
    frame #1: 0x0000000100003f5c a.out`std::__1::vector<int>::operator[](this=0x000000016fdfebef size=0, (null)=10) at verbose_trap.cpp:6:9
   3    template <typename T>
   4    struct vector {
   5        void operator[](unsigned) {
-> 6            __builtin_verbose_trap("Bounds error", "out-of-bounds access");
   7        }
   8    };
```
After this patch, we would stop in the first non-`std` frame:
```
(lldb) run
Process 27843 launched: '/Users/michaelbuch/a.out' (arm64)
Process 27843 stopped
* thread #1, queue = 'com.apple.main-thread', stop reason = Bounds error: out-of-bounds access
    frame #2: 0x0000000100003f44 a.out`g() at verbose_trap.cpp:14:5
   11
   12   void g() {
   13       std::vector<int> v;
-> 14       v[10];
   15   }
   16
```

rdar://134490328
---
 .../Target/VerboseTrapFrameRecognizer.cpp | 35 ++++++++++++++++++-
 ...verbose_trap-in-stl-callback-user-leaf.cpp | 22 ++++++++++++
 .../Inputs/verbose_trap-in-stl-callback.cpp | 22 ++++++++++++
 .../Inputs/verbose_trap-in-stl-max-depth.cpp | 13 +++++++
 .../Inputs/verbose_trap-in-stl-nested.cpp | 21 +++++++++++
 .../Recognizer/Inputs/verbose_trap-in-stl.cpp | 17 +++++++++
 ...erbose_trap-in-stl-callback-user-leaf.test | 22 ++++++++++++
 .../verbose_trap-in-stl-callback.test | 21 +++++++++++
 .../verbose_trap-in-stl-max-depth.test | 16 +++++++++
 .../verbose_trap-in-stl-nested.test | 13 +++++++
 .../Shell/Recognizer/verbose_trap-in-stl.test | 13 +++++++
 11 files changed, 214 insertions(+), 1 deletion(-)
 create mode 100644 lldb/test/Shell/Recognizer/Inputs/verbose_trap-in-stl-callback-user-leaf.cpp
 create mode 100644 lldb/test/Shell/Recognizer/Inputs/verbose_trap-in-stl-callback.cpp
 create mode 100644 lldb/test/Shell/Recognizer/Inputs/verbose_trap-in-stl-max-depth.cpp
 create mode 100644 lldb/test/Shell/Recognizer/Inputs/verbose_trap-in-stl-nested.cpp
 create mode 100644 lldb/test/Shell/Recognizer/Inputs/verbose_trap-in-stl.cpp
 create mode 100644 lldb/test/Shell/Recognizer/verbose_trap-in-stl-callback-user-leaf.test
 create mode 100644 lldb/test/Shell/Recognizer/verbose_trap-in-stl-callback.test
 create mode 100644 lldb/test/Shell/Recognizer/verbose_trap-in-stl-max-depth.test
 create mode 100644 lldb/test/Shell/Recognizer/verbose_trap-in-stl-nested.test
 create mode 100644 lldb/test/Shell/Recognizer/verbose_trap-in-stl.test

diff --git a/lldb/source/Target/VerboseTrapFrameRecognizer.cpp b/lldb/source/Target/VerboseTrapFrameRecognizer.cpp
index de710fcda54064..03ab58b8c59a9b 100644
--- a/lldb/source/Target/VerboseTrapFrameRecognizer.cpp
+++ b/lldb/source/Target/VerboseTrapFrameRecognizer.cpp
@@ -16,6 +16,39 @@ using namespace llvm;
 using namespace lldb;
 using namespace lldb_private;
 
+///
The 0th frame is the artificial inline frame generated to store +/// the verbose_trap message. So, starting with the current parent frame, +/// find the first frame that's not inside of the STL. +static StackFrameSP FindMostRelevantFrame(Thread &selected_thread) { + // Defensive upper-bound of when we stop walking up the frames in + // case we somehow ended up looking at an infinite recursion. + const size_t max_stack_depth = 128; + + // Start at parent frame. + size_t stack_idx = 1; + StackFrameSP most_relevant_frame_sp = + selected_thread.GetStackFrameAtIndex(stack_idx); + + while (most_relevant_frame_sp && stack_idx <= max_stack_depth) { + auto const &sc = + most_relevant_frame_sp->GetSymbolContext(eSymbolContextEverything); + ConstString frame_name = sc.GetFunctionName(); + if (!frame_name) + return nullptr; + + // Found a frame outside of the `std` namespace. That's the + // first frame in user-code that ended up triggering the + // verbose_trap. Hence that's the one we want to display. + if (!frame_name.GetStringRef().starts_with("std::")) + return most_relevant_frame_sp; + + ++stack_idx; + most_relevant_frame_sp = selected_thread.GetStackFrameAtIndex(stack_idx); + } + + return nullptr; +} + VerboseTrapRecognizedStackFrame::VerboseTrapRecognizedStackFrame( StackFrameSP most_relevant_frame_sp, std::string stop_desc) : m_most_relevant_frame(most_relevant_frame_sp) { @@ -30,7 +63,7 @@ VerboseTrapFrameRecognizer::RecognizeFrame(lldb::StackFrameSP frame_sp) { ThreadSP thread_sp = frame_sp->GetThread(); ProcessSP process_sp = thread_sp->GetProcess(); - StackFrameSP most_relevant_frame_sp = thread_sp->GetStackFrameAtIndex(1); + StackFrameSP most_relevant_frame_sp = FindMostRelevantFrame(*thread_sp); if (!most_relevant_frame_sp) { Log *log = GetLog(LLDBLog::Unwind); diff --git a/lldb/test/Shell/Recognizer/Inputs/verbose_trap-in-stl-callback-user-leaf.cpp b/lldb/test/Shell/Recognizer/Inputs/verbose_trap-in-stl-callback-user-leaf.cpp new file mode 100644 index 00000000000000..6c36682626a6ef --- /dev/null +++ b/lldb/test/Shell/Recognizer/Inputs/verbose_trap-in-stl-callback-user-leaf.cpp @@ -0,0 +1,22 @@ +void definitely_aborts() { __builtin_verbose_trap("User", "Invariant violated"); } + +namespace std { +void aborts_soon() { definitely_aborts(); } +} // namespace std + +void g() { std::aborts_soon(); } + +namespace std { +namespace detail { +void eventually_aborts() { g(); } +} // namespace detail + +inline namespace __1 { +void eventually_aborts() { detail::eventually_aborts(); } +} // namespace __1 +} // namespace std + +int main() { + std::eventually_aborts(); + return 0; +} diff --git a/lldb/test/Shell/Recognizer/Inputs/verbose_trap-in-stl-callback.cpp b/lldb/test/Shell/Recognizer/Inputs/verbose_trap-in-stl-callback.cpp new file mode 100644 index 00000000000000..23beed4c62c3b3 --- /dev/null +++ b/lldb/test/Shell/Recognizer/Inputs/verbose_trap-in-stl-callback.cpp @@ -0,0 +1,22 @@ +namespace std { +void definitely_aborts() { __builtin_verbose_trap("Failed", "Invariant violated"); } + +void aborts_soon() { definitely_aborts(); } +} // namespace std + +void g() { std::aborts_soon(); } + +namespace std { +namespace detail { +void eventually_aborts() { g(); } +} // namespace detail + +inline namespace __1 { +void eventually_aborts() { detail::eventually_aborts(); } +} // namespace __1 +} // namespace std + +int main() { + std::eventually_aborts(); + return 0; +} diff --git a/lldb/test/Shell/Recognizer/Inputs/verbose_trap-in-stl-max-depth.cpp 
b/lldb/test/Shell/Recognizer/Inputs/verbose_trap-in-stl-max-depth.cpp
new file mode 100644
index 00000000000000..48f564ce674e4d
--- /dev/null
+++ b/lldb/test/Shell/Recognizer/Inputs/verbose_trap-in-stl-max-depth.cpp
@@ -0,0 +1,13 @@
+namespace std {
+void recursively_aborts(int depth) {
+  if (depth == 0)
+    __builtin_verbose_trap("Error", "max depth");
+
+  recursively_aborts(--depth);
+}
+} // namespace std
+
+int main() {
+  std::recursively_aborts(256);
+  return 0;
+}
diff --git a/lldb/test/Shell/Recognizer/Inputs/verbose_trap-in-stl-nested.cpp b/lldb/test/Shell/Recognizer/Inputs/verbose_trap-in-stl-nested.cpp
new file mode 100644
index 00000000000000..67fa65c9ceae22
--- /dev/null
+++ b/lldb/test/Shell/Recognizer/Inputs/verbose_trap-in-stl-nested.cpp
@@ -0,0 +1,21 @@
+namespace std {
+namespace detail {
+void function_that_aborts() { __builtin_verbose_trap("Bounds error", "out-of-bounds access"); }
+} // namespace detail
+
+inline namespace __1 {
+template <typename T> struct vector {
+  void operator[](unsigned) { detail::function_that_aborts(); }
+};
+} // namespace __1
+} // namespace std
+
+void g() {
+  std::vector<int> v;
+  v[10];
+}
+
+int main() {
+  g();
+  return 0;
+}
diff --git a/lldb/test/Shell/Recognizer/Inputs/verbose_trap-in-stl.cpp b/lldb/test/Shell/Recognizer/Inputs/verbose_trap-in-stl.cpp
new file mode 100644
index 00000000000000..4f01827944e166
--- /dev/null
+++ b/lldb/test/Shell/Recognizer/Inputs/verbose_trap-in-stl.cpp
@@ -0,0 +1,17 @@
+namespace std {
+inline namespace __1 {
+template <typename T> struct vector {
+  void operator[](unsigned) { __builtin_verbose_trap("Bounds error", "out-of-bounds access"); }
+};
+} // namespace __1
+} // namespace std
+
+void g() {
+  std::vector<int> v;
+  v[10];
+}
+
+int main() {
+  g();
+  return 0;
+}
diff --git a/lldb/test/Shell/Recognizer/verbose_trap-in-stl-callback-user-leaf.test b/lldb/test/Shell/Recognizer/verbose_trap-in-stl-callback-user-leaf.test
new file mode 100644
index 00000000000000..5a84c163453ccd
--- /dev/null
+++ b/lldb/test/Shell/Recognizer/verbose_trap-in-stl-callback-user-leaf.test
@@ -0,0 +1,22 @@
+# Tests that we show the first non-STL frame when
+# a verbose_trap triggers from within the STL.
+#
+# Specifically tests that we correctly handle backtraces
+# of the form:
+# #0 __builtin_verbose_trap
+# #1 user-code
+# #2 STL
+# #3 user-code
+# #4 STL
+# #5 user-code
+
+# UNSUPPORTED: system-windows
+#
+# RUN: %clang_host -g -O0 %S/Inputs/verbose_trap-in-stl-callback-user-leaf.cpp -o %t.out
+# RUN: %lldb -b -s %s %t.out | FileCheck %s --check-prefixes=CHECK
+
+run
+# CHECK: thread #{{.*}}stop reason = User: Invariant violated
+frame info
+# CHECK: frame #{{.*}}`definitely_aborts() at verbose_trap-in-stl-callback-user-leaf.cpp
+q
diff --git a/lldb/test/Shell/Recognizer/verbose_trap-in-stl-callback.test b/lldb/test/Shell/Recognizer/verbose_trap-in-stl-callback.test
new file mode 100644
index 00000000000000..b15bcb3a384f98
--- /dev/null
+++ b/lldb/test/Shell/Recognizer/verbose_trap-in-stl-callback.test
@@ -0,0 +1,21 @@
+# Tests that we show the first non-STL frame when
+# a verbose_trap triggers from within the STL.
+# +# Specifically tests that we correctly handle backtraces +# of the form: +# #0 __builtin_verbose_trap +# #1 STL +# #2 user-code +# #3 STL +# #4 user-code + +# UNSUPPORTED: system-windows +# +# RUN: %clang_host -g -O0 %S/Inputs/verbose_trap-in-stl-callback.cpp -o %t.out +# RUN: %lldb -b -s %s %t.out | FileCheck %s --check-prefixes=CHECK + +run +# CHECK: thread #{{.*}}stop reason = Failed: Invariant violated +frame info +# CHECK: frame #{{.*}}`g() at verbose_trap-in-stl-callback.cpp +q diff --git a/lldb/test/Shell/Recognizer/verbose_trap-in-stl-max-depth.test b/lldb/test/Shell/Recognizer/verbose_trap-in-stl-max-depth.test new file mode 100644 index 00000000000000..0c3275c571b3d9 --- /dev/null +++ b/lldb/test/Shell/Recognizer/verbose_trap-in-stl-max-depth.test @@ -0,0 +1,16 @@ +# Tests that the VerboseTrapFrameRecognizer stops +# walking the stack once a certain implementation-defined +# threshold is reached. + +# UNSUPPORTED: system-windows +# +# RUN: %clang_host -g -O0 %S/Inputs/verbose_trap-in-stl-max-depth.cpp -o %t.out +# RUN: %lldb -b -s %s %t.out | FileCheck %s --check-prefixes=CHECK + +run +# CHECK: thread #{{.*}}stop reason = +frame recognizer info 0 +# CHECK: frame 0 is recognized by Verbose Trap StackFrame Recognizer +frame info +# CHECK: frame #0: {{.*}}`std::recursively_aborts(int) {{.*}} at verbose_trap-in-stl-max-depth.cpp +q diff --git a/lldb/test/Shell/Recognizer/verbose_trap-in-stl-nested.test b/lldb/test/Shell/Recognizer/verbose_trap-in-stl-nested.test new file mode 100644 index 00000000000000..81a492d1ed5791 --- /dev/null +++ b/lldb/test/Shell/Recognizer/verbose_trap-in-stl-nested.test @@ -0,0 +1,13 @@ +# Tests that we show the first non-STL frame when +# a verbose_trap triggers from within the STL. + +# UNSUPPORTED: system-windows +# +# RUN: %clang_host -g -O0 %S/Inputs/verbose_trap-in-stl-nested.cpp -o %t.out +# RUN: %lldb -b -s %s %t.out | FileCheck %s --check-prefixes=CHECK + +run +# CHECK: thread #{{.*}}stop reason = Bounds error: out-of-bounds access +frame info +# CHECK: frame #{{.*}}`g() at verbose_trap-in-stl-nested.cpp +q diff --git a/lldb/test/Shell/Recognizer/verbose_trap-in-stl.test b/lldb/test/Shell/Recognizer/verbose_trap-in-stl.test new file mode 100644 index 00000000000000..dd08290174e3af --- /dev/null +++ b/lldb/test/Shell/Recognizer/verbose_trap-in-stl.test @@ -0,0 +1,13 @@ +# Tests that we show the first non-STL frame when +# a verbose_trap triggers from within the STL. 
+
+# UNSUPPORTED: system-windows
+#
+# RUN: %clang_host -g -O0 %S/Inputs/verbose_trap-in-stl.cpp -o %t.out
+# RUN: %lldb -b -s %s %t.out | FileCheck %s --check-prefixes=CHECK
+
+run
+# CHECK: thread #{{.*}}stop reason = Bounds error: out-of-bounds access
+frame info
+# CHECK: frame #{{.*}}`g() at verbose_trap-in-stl.cpp
+q

From d4536bf5c9d5fa3afae4e6cbb34f9bd0859ab2ea Mon Sep 17 00:00:00 2001
From: David Sherwood
Date: Thu, 19 Sep 2024 10:06:48 +0100
Subject: [PATCH 179/321] Fix test issue introduced by e762d4dac762a3fc27c6e251086b6645d7543bb2 (#109254)

---
 llvm/test/Transforms/LoopVectorize/simple_early_exit.ll | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/test/Transforms/LoopVectorize/simple_early_exit.ll b/llvm/test/Transforms/LoopVectorize/simple_early_exit.ll
index f905bcd73dc952..dcf5c9d8ac64d1 100644
--- a/llvm/test/Transforms/LoopVectorize/simple_early_exit.ll
+++ b/llvm/test/Transforms/LoopVectorize/simple_early_exit.ll
@@ -1,4 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; REQUIRES: asserts
 ; RUN: opt -S < %s -p loop-vectorize -debug-only=loop-vectorize 2>%t | FileCheck %s --check-prefixes=CHECK
 ; RUN: cat %t | FileCheck %s --check-prefix=DEBUG

From bb5e66e31b2a5dbb2930728ff94281fd805f2d14 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?kadir=20=C3=A7etinkaya?=
Date: Thu, 19 Sep 2024 11:16:49 +0200
Subject: [PATCH 180/321] [include-cleaner] Suppress all clang warnings (#109099)

This patch disables all clang warnings when running include-cleaner, as
users aren't interested in other findings and in-development code might
have them temporarily. This ensures the tool can keep working even in the
presence of such issues.
---
 .../include-cleaner/test/tool-ignores-warnings.cpp | 5 +++++
 .../include-cleaner/tool/IncludeCleaner.cpp | 12 +++++++++++-
 2 files changed, 16 insertions(+), 1 deletion(-)
 create mode 100644 clang-tools-extra/include-cleaner/test/tool-ignores-warnings.cpp

diff --git a/clang-tools-extra/include-cleaner/test/tool-ignores-warnings.cpp b/clang-tools-extra/include-cleaner/test/tool-ignores-warnings.cpp
new file mode 100644
index 00000000000000..e207a32c950d5d
--- /dev/null
+++ b/clang-tools-extra/include-cleaner/test/tool-ignores-warnings.cpp
@@ -0,0 +1,5 @@
+// RUN: clang-include-cleaner %s -- -Wunused 2>&1 | FileCheck --allow-empty %s
+static void foo() {}
+
+// Make sure that we don't get an unused warning
+// CHECK-NOT: unused function
diff --git a/clang-tools-extra/include-cleaner/tool/IncludeCleaner.cpp b/clang-tools-extra/include-cleaner/tool/IncludeCleaner.cpp
index d8a44ab9b6e12e..afae4365587aea 100644
--- a/clang-tools-extra/include-cleaner/tool/IncludeCleaner.cpp
+++ b/clang-tools-extra/include-cleaner/tool/IncludeCleaner.cpp
@@ -139,7 +139,17 @@ class Action : public clang::ASTFrontendAction {
   }
 
   void ExecuteAction() override {
-    auto &P = getCompilerInstance().getPreprocessor();
+    const auto &CI = getCompilerInstance();
+
+    // Disable all warnings when running include-cleaner, as we are only
+    // interested in include-cleaner related findings. This makes the tool both
+    // more resilient around in-development code, and possibly faster as we
+    // skip some extra analysis.
+    auto &Diags = CI.getDiagnostics();
+    Diags.setEnableAllWarnings(false);
+    Diags.setSeverityForAll(clang::diag::Flavor::WarningOrError,
+                            clang::diag::Severity::Ignored);
+    auto &P = CI.getPreprocessor();
     P.addPPCallbacks(PP.record(P));
     PI.record(getCompilerInstance());
     ASTFrontendAction::ExecuteAction();

From 3d5e8e4693a51cd3ba336cec0c1a17fe389828a7 Mon Sep 17 00:00:00 2001
From: Daniil Kovalev
Date: Thu, 19 Sep 2024 12:17:58 +0300
Subject: [PATCH 181/321] [PAC][CodeGen] Do not emit trivial 'mov xN, xN' on tail call (#109100)

Under some conditions, a trivial `mov xN, xN` instruction was emitted on
tail calls. Consider the following code:
```
class Test {
public:
  virtual void f() {}
};

void call_f(Test *t) { t->f(); }
```
Corresponding assembly:
```
_Z6call_fP4Test:
  ldr x16, [x0]
  mov x17, x0
  movk x17, #6503, lsl #48
  autda x16, x17
  ldr x1, [x16]
=====> mov x16, x16
  movk x16, #54167, lsl #48
  braa x1, x16
```
This patch omits such trivial movs.

Co-authored-by: Anatoly Trosinenko
---
 llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp | 11 +++++-----
 llvm/test/CodeGen/AArch64/ptrauth-call.ll | 21 +++++++++++++++++++
 2 files changed, 27 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
index b8f9b58a216446..c6e88131d5a343 100644
--- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -2510,11 +2510,12 @@ void AArch64AsmPrinter::emitInstruction(const MachineInstr *MI) {
     unsigned DiscReg = AddrDisc;
     if (Disc) {
       if (AddrDisc != AArch64::NoRegister) {
-        EmitToStreamer(*OutStreamer, MCInstBuilder(AArch64::ORRXrs)
-                                         .addReg(ScratchReg)
-                                         .addReg(AArch64::XZR)
-                                         .addReg(AddrDisc)
-                                         .addImm(0));
+        if (ScratchReg != AddrDisc)
+          EmitToStreamer(*OutStreamer, MCInstBuilder(AArch64::ORRXrs)
+                                           .addReg(ScratchReg)
+                                           .addReg(AArch64::XZR)
+                                           .addReg(AddrDisc)
+                                           .addImm(0));
         EmitToStreamer(*OutStreamer, MCInstBuilder(AArch64::MOVKXi)
                                          .addReg(ScratchReg)
                                          .addReg(ScratchReg)
diff --git a/llvm/test/CodeGen/AArch64/ptrauth-call.ll b/llvm/test/CodeGen/AArch64/ptrauth-call.ll
index 9f211b6e1796e6..5fd6116285122f 100644
--- a/llvm/test/CodeGen/AArch64/ptrauth-call.ll
+++ b/llvm/test/CodeGen/AArch64/ptrauth-call.ll
@@ -167,6 +167,27 @@ define i32 @test_tailcall_ib_var(ptr %arg0, ptr %arg1) #0 {
   ret i32 %tmp1
 }
 
+define void @test_tailcall_omit_mov_x16_x16(ptr %objptr) #0 {
+; CHECK-LABEL: test_tailcall_omit_mov_x16_x16:
+; CHECK: ldr x16, [x0]
+; CHECK: mov x17, x0
+; CHECK: movk x17, #6503, lsl #48
+; CHECK: autda x16, x17
+; CHECK: ldr x1, [x16]
+; CHECK: movk x16, #54167, lsl #48
+; CHECK: braa x1, x16
+  %vtable.signed = load ptr, ptr %objptr, align 8
+  %objptr.int = ptrtoint ptr %objptr to i64
+  %vtable.discr = tail call i64 @llvm.ptrauth.blend(i64 %objptr.int, i64 6503)
+  %vtable.signed.int = ptrtoint ptr %vtable.signed to i64
+  %vtable.unsigned.int = tail call i64 @llvm.ptrauth.auth(i64 %vtable.signed.int, i32 2, i64 %vtable.discr)
+  %vtable.unsigned = inttoptr i64 %vtable.unsigned.int to ptr
+  %virt.func.signed = load ptr, ptr %vtable.unsigned, align 8
+  %virt.func.discr = tail call i64 @llvm.ptrauth.blend(i64 %vtable.unsigned.int, i64 54167)
+  tail call void %virt.func.signed(ptr %objptr) [ "ptrauth"(i32 0, i64 %virt.func.discr) ]
+  ret void
+}
+
 define i32 @test_call_ia_arg(ptr %arg0, i64 %arg1) #0 {
 ; DARWIN-LABEL: test_call_ia_arg:
 ; DARWIN-NEXT: stp x29, x30, [sp, #-16]!
From d267daa9eb517b2e040a59121c15dba59223ebba Mon Sep 17 00:00:00 2001
From: Timm Baeder
Date: Thu, 19 Sep 2024 11:59:38 +0200
Subject: [PATCH 182/321] [clang][bytecode] Diagnose loads from weak variables (#109256)

---
 clang/lib/AST/ByteCode/Interp.cpp | 16 +++++++++++++++-
 clang/test/AST/ByteCode/weak.cpp | 8 ++++++++
 clang/test/SemaCXX/weak-init.cpp | 1 +
 3 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/clang/lib/AST/ByteCode/Interp.cpp b/clang/lib/AST/ByteCode/Interp.cpp
index 827a177f9bf830..17cf3ccdeb6a94 100644
--- a/clang/lib/AST/ByteCode/Interp.cpp
+++ b/clang/lib/AST/ByteCode/Interp.cpp
@@ -560,13 +560,25 @@ bool CheckGlobalInitialized(InterpState &S, CodePtr OpPC, const Pointer &Ptr) {
   return false;
 }
 
+static bool CheckWeak(InterpState &S, CodePtr OpPC, const Pointer &Ptr) {
+  if (!Ptr.isWeak())
+    return true;
+
+  const auto *VD = Ptr.getDeclDesc()->asVarDecl();
+  assert(VD);
+  S.FFDiag(S.Current->getLocation(OpPC), diag::note_constexpr_var_init_weak)
+      << VD;
+  S.Note(VD->getLocation(), diag::note_declared_at);
+
+  return false;
+}
+
 bool CheckLoad(InterpState &S, CodePtr OpPC, const Pointer &Ptr,
                AccessKinds AK) {
   if (!CheckLive(S, OpPC, Ptr, AK))
     return false;
   if (!CheckConstant(S, OpPC, Ptr))
     return false;
-
   if (!CheckDummy(S, OpPC, Ptr, AK))
     return false;
   if (!CheckExtern(S, OpPC, Ptr))
@@ -579,6 +591,8 @@ bool CheckLoad(InterpState &S, CodePtr OpPC, const Pointer &Ptr,
     return false;
   if (!CheckTemporary(S, OpPC, Ptr, AK))
     return false;
+  if (!CheckWeak(S, OpPC, Ptr))
+    return false;
   if (!CheckMutable(S, OpPC, Ptr))
     return false;
   if (!CheckVolatile(S, OpPC, Ptr, AK))
diff --git a/clang/test/AST/ByteCode/weak.cpp b/clang/test/AST/ByteCode/weak.cpp
index 0322241beef83b..52c75229d9ec7a 100644
--- a/clang/test/AST/ByteCode/weak.cpp
+++ b/clang/test/AST/ByteCode/weak.cpp
@@ -7,3 +7,11 @@ int ha[(bool)&a]; // both-warning {{variable length arrays in C++ are a Clang ex
 int ha2[&a == nullptr]; // both-warning {{variable length arrays in C++ are a Clang extension}} \
                         // both-note {{comparison against address of weak declaration '&a' can only be performed at runtime}} \
                         // both-error {{variable length array declaration not allowed at file scope}}
+
+extern const int W1 __attribute__((weak)) = 10; // both-note {{declared here}}
+static_assert(W1 == 10, ""); // both-error {{static assertion expression is not an integral constant expression}} \
+                             // both-note {{initializer of weak variable 'W1' is not considered constant because it may be different at runtime}}
+
+extern const int W2 __attribute__((weak)); // both-note {{declared here}}
+static_assert(W2 == 10, ""); // both-error {{static assertion expression is not an integral constant expression}} \
+                             // both-note {{initializer of 'W2' is unknown}}
diff --git a/clang/test/SemaCXX/weak-init.cpp b/clang/test/SemaCXX/weak-init.cpp
index a88d32bdca5a17..0147b80f382402 100644
--- a/clang/test/SemaCXX/weak-init.cpp
+++ b/clang/test/SemaCXX/weak-init.cpp
@@ -1,4 +1,5 @@
 // RUN: %clang_cc1 %s -verify -fsyntax-only
+// RUN: %clang_cc1 %s -verify -fsyntax-only -fexperimental-new-constant-interpreter
 
 extern const int W1 __attribute__((weak)) = 10; // expected-note {{declared here}}

From f3250858780b37a188875c87e57133f1192b2e60 Mon Sep 17 00:00:00 2001
From: Ivan Butygin
Date: Thu, 19 Sep 2024 13:12:32 +0300
Subject: [PATCH 183/321] [mlir][vector] Relax strides check for 1-element vector load/stores (#108998)

Single-element vector loads/stores are equivalent to scalar loads/stores,
so they don't require the memref to be contiguous.
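For illustration, a load/store like the following — previously rejected because
the innermost memref dimension has a non-unit (dynamic) stride — is now accepted.
This is only a sketch mirroring the new tests below; the function and value
names are illustrative:
```mlir
// A 1-element vector load/store touches exactly one scalar element, so
// dynamic strides on the memref are harmless.
func.func @unit_vec_on_strided_memref(%mem : memref<200x100xf32, strided<[?, ?], offset: ?>>,
                                      %i : index, %j : index) {
  %v = vector.load %mem[%i, %j]
      : memref<200x100xf32, strided<[?, ?], offset: ?>>, vector<1xf32>
  vector.store %v, %mem[%i, %j]
      : memref<200x100xf32, strided<[?, ?], offset: ?>>, vector<1xf32>
  return
}
```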
---
 mlir/lib/Dialect/Vector/IR/VectorOps.cpp | 11 +++++++++--
 mlir/test/Dialect/Vector/ops.mlir | 20 ++++++++++++++++++++
 2 files changed, 29 insertions(+), 2 deletions(-)

diff --git a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
index d3aef4ac38af03..816447713de417 100644
--- a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
+++ b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
@@ -4769,7 +4769,14 @@ void TransferWriteOp::getCanonicalizationPatterns(RewritePatternSet &results,
 //===----------------------------------------------------------------------===//
 
 static LogicalResult verifyLoadStoreMemRefLayout(Operation *op,
+                                                 VectorType vecTy,
                                                  MemRefType memRefTy) {
+  // If rank==0 or size==1 it's equivalent to scalar load/store, so we don't
+  // need any strides limitations.
+  if (!vecTy.isScalable() &&
+      (vecTy.getRank() == 0 || vecTy.getNumElements() == 1))
+    return success();
+
   if (!isLastMemrefDimUnitStride(memRefTy))
     return op->emitOpError("most minor memref dim must have unit stride");
   return success();
@@ -4779,7 +4786,7 @@ LogicalResult vector::LoadOp::verify() {
   VectorType resVecTy = getVectorType();
   MemRefType memRefTy = getMemRefType();
 
-  if (failed(verifyLoadStoreMemRefLayout(*this, memRefTy)))
+  if (failed(verifyLoadStoreMemRefLayout(*this, resVecTy, memRefTy)))
     return failure();
 
   // Checks for vector memrefs.
@@ -4811,7 +4818,7 @@ LogicalResult vector::StoreOp::verify() {
   VectorType valueVecTy = getVectorType();
   MemRefType memRefTy = getMemRefType();
 
-  if (failed(verifyLoadStoreMemRefLayout(*this, memRefTy)))
+  if (failed(verifyLoadStoreMemRefLayout(*this, valueVecTy, memRefTy)))
    return failure();
 
   // Checks for vector memrefs.
diff --git a/mlir/test/Dialect/Vector/ops.mlir b/mlir/test/Dialect/Vector/ops.mlir
index 4759fcc9511fb2..08d1a189231bcc 100644
--- a/mlir/test/Dialect/Vector/ops.mlir
+++ b/mlir/test/Dialect/Vector/ops.mlir
@@ -732,6 +732,26 @@ func.func @vector_load_and_store_0d_scalar_memref(%memref : memref<200x100xf32>,
                                                   %i : index, %j : index) {
   return
 }
 
+// CHECK-LABEL: @vector_load_and_store_0d_scalar_strided_memref
+func.func @vector_load_and_store_0d_scalar_strided_memref(%memref : memref<200x100xf32, strided<[?, ?], offset: ?>>,
+                                                          %i : index, %j : index) {
+  // CHECK: %[[ld:.*]] = vector.load %{{.*}}[%{{.*}}] : memref<200x100xf32, strided<[?, ?], offset: ?>>, vector<f32>
+  %0 = vector.load %memref[%i, %j] : memref<200x100xf32, strided<[?, ?], offset: ?>>, vector<f32>
+  // CHECK: vector.store %[[ld]], %{{.*}}[%{{.*}}] : memref<200x100xf32, strided<[?, ?], offset: ?>>, vector<f32>
+  vector.store %0, %memref[%i, %j] : memref<200x100xf32, strided<[?, ?], offset: ?>>, vector<f32>
+  return
+}
+
+// CHECK-LABEL: @vector_load_and_store_unit_vec_strided_memref
+func.func @vector_load_and_store_unit_vec_strided_memref(%memref : memref<200x100xf32, strided<[?, ?], offset: ?>>,
+                                                         %i : index, %j : index) {
+  // CHECK: %[[ld:.*]] = vector.load %{{.*}}[%{{.*}}] : memref<200x100xf32, strided<[?, ?], offset: ?>>, vector<1xf32>
+  %0 = vector.load %memref[%i, %j] : memref<200x100xf32, strided<[?, ?], offset: ?>>, vector<1xf32>
+  // CHECK: vector.store %[[ld]], %{{.*}}[%{{.*}}] : memref<200x100xf32, strided<[?, ?], offset: ?>>, vector<1xf32>
+  vector.store %0, %memref[%i, %j] : memref<200x100xf32, strided<[?, ?], offset: ?>>, vector<1xf32>
+  return
+}
+
 // CHECK-LABEL: @vector_load_and_store_1d_scalar_memref
 func.func @vector_load_and_store_1d_scalar_memref(%memref : memref<200x100xf32>,
                                                   %i : index, %j : index) {

From 256100489de2d01d21ddd9720aad3993a83864c2 Mon Sep 17 00:00:00 2001
From: Florian Hahn
Date: Thu, 19 Sep 2024 11:20:31 +0100
Subject: [PATCH 184/321] [VPlan] Rename isDefinedOutside[Vector]Regions -> [Loop] (NFC)

Clarify name of helper, split off from
https://github.com/llvm/llvm-project/pull/95842/files#r1765556732.
---
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 5 ++---
 llvm/lib/Transforms/Vectorize/VPlan.cpp | 2 +-
 llvm/lib/Transforms/Vectorize/VPlan.h | 8 ++++----
 llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 4 ++--
 llvm/lib/Transforms/Vectorize/VPlanUtils.h | 2 +-
 llvm/lib/Transforms/Vectorize/VPlanValue.h | 4 ++--
 6 files changed, 12 insertions(+), 13 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 5ee8f9db32aac8..107fb38be31969 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -9452,9 +9452,8 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
   // If the recipe is uniform across all parts (instead of just per VF), only
   // generate a single instance.
   if ((isa<LoadInst>(UI) || isa<StoreInst>(UI)) &&
-      all_of(operands(), [](VPValue *Op) {
-        return Op->isDefinedOutsideVectorRegions();
-      })) {
+      all_of(operands(),
+             [](VPValue *Op) { return Op->isDefinedOutsideLoopRegions(); })) {
     State.ILV->scalarizeInstruction(UI, this, VPIteration(0, 0), State);
     if (user_begin() != user_end()) {
       for (unsigned Part = 1; Part < State.UF; ++Part)
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 2169d78542cbaf..6a6ec363592c32 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -271,7 +271,7 @@ Value *VPTransformState::get(VPValue *Def, unsigned Part, bool NeedsScalar) {
     return Data.PerPartOutput[Def][Part];
 
   auto GetBroadcastInstrs = [this, Def](Value *V) {
-    bool SafeToHoist = Def->isDefinedOutsideVectorRegions();
+    bool SafeToHoist = Def->isDefinedOutsideLoopRegions();
     if (VF.isScalar())
      return V;
     // Place the code for broadcasting invariant variables in the new preheader.
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 9b9e710ddc88cb..73d218cdc7ac27 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1717,23 +1717,23 @@ struct VPWidenSelectRecipe : public VPSingleDefRecipe {
   }
 
   bool isInvariantCond() const {
-    return getCond()->isDefinedOutsideVectorRegions();
+    return getCond()->isDefinedOutsideLoopRegions();
   }
 };
 
 /// A recipe for handling GEP instructions.
class VPWidenGEPRecipe : public VPRecipeWithIRFlags {
   bool isPointerLoopInvariant() const {
-    return getOperand(0)->isDefinedOutsideVectorRegions();
+    return getOperand(0)->isDefinedOutsideLoopRegions();
   }
 
   bool isIndexLoopInvariant(unsigned I) const {
-    return getOperand(I + 1)->isDefinedOutsideVectorRegions();
+    return getOperand(I + 1)->isDefinedOutsideLoopRegions();
   }
 
   bool areAllOperandsInvariant() const {
     return all_of(operands(), [](VPValue *Op) {
-      return Op->isDefinedOutsideVectorRegions();
+      return Op->isDefinedOutsideLoopRegions();
     });
   }
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index c077e2b4eac5f1..81c7d1025e8a0b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -1271,7 +1271,7 @@ InstructionCost VPWidenRecipe::computeCost(ElementCount VF,
     RHSInfo = Ctx.TTI.getOperandInfo(RHS->getLiveInIRValue());
 
   if (RHSInfo.Kind == TargetTransformInfo::OK_AnyValue &&
-      getOperand(1)->isDefinedOutsideVectorRegions())
+      getOperand(1)->isDefinedOutsideLoopRegions())
     RHSInfo.Kind = TargetTransformInfo::OK_UniformValue;
   Type *VectorTy =
       ToVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF);
@@ -2093,7 +2093,7 @@ void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent,
 /// TODO: Uniformity should be associated with a VPValue and there should be a
 /// generic way to check.
 static bool isUniformAcrossVFsAndUFs(VPScalarCastRecipe *C) {
-  return C->isDefinedOutsideVectorRegions() ||
+  return C->isDefinedOutsideLoopRegions() ||
         isa<VPDerivedIVRecipe>(C->getOperand(0)) ||
         isa<VPCanonicalIVPHIRecipe>(C->getOperand(0));
 }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
index 7b5d4300655f5a..cb7a4e443176ad 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
@@ -39,7 +39,7 @@ const SCEV *getSCEVExprForVPValue(VPValue *V, ScalarEvolution &SE);
 inline bool isUniformAfterVectorization(const VPValue *VPV) {
   // A value defined outside the vector region must be uniform after
   // vectorization inside a vector region.
-  if (VPV->isDefinedOutsideVectorRegions())
+  if (VPV->isDefinedOutsideLoopRegions())
     return true;
   const VPRecipeBase *Def = VPV->getDefiningRecipe();
   assert(Def && "Must have definition for value defined inside vector region");
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index 1dd8d09ff62472..a47ce61e28c50b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -180,10 +180,10 @@ class VPValue {
     return getUnderlyingValue();
   }
 
-  /// Returns true if the VPValue is defined outside any vector regions, i.e. it
+  /// Returns true if the VPValue is defined outside any loop region, i.e. it
   /// is a live-in value.
   /// TODO: Also handle recipes defined in pre-header blocks.
-  bool isDefinedOutsideVectorRegions() const { return !hasDefiningRecipe(); }
+  bool isDefinedOutsideLoopRegions() const { return !hasDefiningRecipe(); }
 
   // Set \p Val as the underlying Value of this VPValue.
   void setUnderlyingValue(Value *Val) {

From f1ff3a279f3320d8e0d4abbc1a8357bb51de25a2 Mon Sep 17 00:00:00 2001
From: Nikita Popov
Date: Thu, 19 Sep 2024 12:27:33 +0200
Subject: [PATCH 185/321] [InstCombine] Rename TTI member for clarity (NFC)

There is already a comment on the member and documentation in the
InstCombine contributor guide, but also rename it to add an
additional speed bump.
---
 .../llvm/Transforms/InstCombine/InstCombiner.h | 4 ++--
 .../Transforms/InstCombine/InstructionCombining.cpp | 13 ++++++++-----
 2 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/llvm/include/llvm/Transforms/InstCombine/InstCombiner.h b/llvm/include/llvm/Transforms/InstCombine/InstCombiner.h
index 68d9ae862c1c23..3075b7ebae59e6 100644
--- a/llvm/include/llvm/Transforms/InstCombine/InstCombiner.h
+++ b/llvm/include/llvm/Transforms/InstCombine/InstCombiner.h
@@ -49,7 +49,7 @@ class LLVM_LIBRARY_VISIBILITY InstCombiner {
   /// Only used to call target specific intrinsic combining.
   /// It must **NOT** be used for any other purpose, as InstCombine is a
   /// target-independent canonicalization transform.
-  TargetTransformInfo &TTI;
+  TargetTransformInfo &TTIForTargetIntrinsicsOnly;
 
 public:
   /// Maximum size of array considered when transforming.
@@ -105,7 +105,7 @@ class LLVM_LIBRARY_VISIBILITY InstCombiner {
               BlockFrequencyInfo *BFI, BranchProbabilityInfo *BPI,
               ProfileSummaryInfo *PSI, const DataLayout &DL,
               ReversePostOrderTraversal<BasicBlock *> &RPOT)
-      : TTI(TTI), Builder(Builder), Worklist(Worklist),
+      : TTIForTargetIntrinsicsOnly(TTI), Builder(Builder), Worklist(Worklist),
        MinimizeSize(MinimizeSize), AA(AA), AC(AC), TLI(TLI), DT(DT), DL(DL),
        SQ(DL, &TLI, &DT, &AC, nullptr, /*UseInstrInfo*/ true,
           /*CanUseUndef*/ true, &DC),
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 1e606c51f72cdb..28c50136257e23 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -155,7 +155,7 @@ std::optional<Instruction *> InstCombiner::targetInstCombineIntrinsic(IntrinsicInst &II) {
   // Handle target specific intrinsics
   if (II.getCalledFunction()->isTargetIntrinsic()) {
-    return TTI.instCombineIntrinsic(*this, II);
+    return TTIForTargetIntrinsicsOnly.instCombineIntrinsic(*this, II);
   }
   return std::nullopt;
 }
@@ -165,8 +165,8 @@ std::optional<Value *> InstCombiner::targetSimplifyDemandedUseBitsIntrinsic(
     bool &KnownBitsComputed) {
   // Handle target specific intrinsics
   if (II.getCalledFunction()->isTargetIntrinsic()) {
-    return TTI.simplifyDemandedUseBitsIntrinsic(*this, II, DemandedMask, Known,
-                                                KnownBitsComputed);
+    return TTIForTargetIntrinsicsOnly.simplifyDemandedUseBitsIntrinsic(
+        *this, II, DemandedMask, Known, KnownBitsComputed);
   }
   return std::nullopt;
 }
@@ -178,7 +178,7 @@ std::optional<Value *> InstCombiner::targetSimplifyDemandedVectorEltsIntrinsic(
     SimplifyAndSetOp) {
   // Handle target specific intrinsics
   if (II.getCalledFunction()->isTargetIntrinsic()) {
-    return TTI.simplifyDemandedVectorEltsIntrinsic(
+    return TTIForTargetIntrinsicsOnly.simplifyDemandedVectorEltsIntrinsic(
         *this, II, DemandedElts, PoisonElts, PoisonElts2, PoisonElts3,
         SimplifyAndSetOp);
   }
@@ -186,7 +186,10 @@ std::optional<Value *> InstCombiner::targetSimplifyDemandedVectorEltsIntrinsic(
 }
 
 bool InstCombiner::isValidAddrSpaceCast(unsigned FromAS, unsigned ToAS) const {
-  return TTI.isValidAddrSpaceCast(FromAS, ToAS);
+  // Approved exception for TTI use: This queries a legality property of the
+  // target, not a profitability heuristic. Ideally this should be part of
+  // DataLayout instead.
+  return TTIForTargetIntrinsicsOnly.isValidAddrSpaceCast(FromAS, ToAS);
 }
 
 Value *InstCombinerImpl::EmitGEPOffset(GEPOperator *GEP, bool RewriteGEP) {

From becc02ce93cd14f07f444fff6f7433c1ecf13664 Mon Sep 17 00:00:00 2001
From: Yonghong Song
Date: Thu, 19 Sep 2024 03:54:13 -0700
Subject: [PATCH 186/321]
 =?UTF-8?q?Revert=20"[Transforms][IPO]=20Add=20fun?=
 =?UTF-8?q?c=20suffix=20in=20ArgumentPromotion=20and=20DeadArgume=E2=80=A6?=
 =?UTF-8?q?=20(#105742)"?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This reverts commit 959448fbd6bc6f74fb3f9655b1387d0e8a272ab8.

Reverting because of multiple test failures, e.g.
https://lab.llvm.org/buildbot/#/builders/187/builds/1290
https://lab.llvm.org/buildbot/#/builders/153/builds/9389
and maybe a few others.
---
 llvm/lib/Transforms/IPO/ArgumentPromotion.cpp | 1 -
 .../IPO/DeadArgumentElimination.cpp | 4 ---
 .../remove-dead-function-spurious-ref-edge.ll | 4 +--
 llvm/test/BugPoint/remove_arguments_test.ll | 2 +-
 llvm/test/CodeGen/AArch64/arg_promotion.ll | 16 +++++-----
 llvm/test/CodeGen/AMDGPU/internalize.ll | 2 +-
 .../ThinLTO/X86/memprof-aliased-location1.ll | 24 +++++++-------
 .../ThinLTO/X86/memprof-aliased-location2.ll | 24 +++++++-------
 llvm/test/ThinLTO/X86/memprof-basic.ll | 19 +----------
 .../X86/memprof-duplicate-context-ids.ll | 14 +-------
 .../ThinLTO/X86/memprof-funcassigncloning.ll | 19 +----------
 llvm/test/ThinLTO/X86/memprof-indirectcall.ll | 15 +--------
 llvm/test/ThinLTO/X86/memprof-inlined.ll | 15 +--------
 .../2008-02-01-ReturnAttrs.ll | 4 +--
 .../ArgumentPromotion/BPF/argpromotion.ll | 2 +-
 .../ArgumentPromotion/X86/attributes.ll | 4 +--
 .../X86/min-legal-vector-width.ll | 32 +++++++++----------
 .../ArgumentPromotion/X86/thiscall.ll | 4 +--
 .../ArgumentPromotion/actual-arguments.ll | 10 +++---
 .../aggregate-promote-dead-gep.ll | 4 +--
 .../ArgumentPromotion/aggregate-promote.ll | 4 +--
 .../Transforms/ArgumentPromotion/align.ll | 16 +++++-----
 .../Transforms/ArgumentPromotion/allocsize.ll | 16 +++++-----
 .../Transforms/ArgumentPromotion/attrs.ll | 4 +--
 .../Transforms/ArgumentPromotion/basictest.ll | 8 ++---
 .../Transforms/ArgumentPromotion/bitcasts.ll | 8 ++---
 .../Transforms/ArgumentPromotion/byval-2.ll | 4 +--
 .../ArgumentPromotion/byval-with-padding.ll | 4 +--
 .../Transforms/ArgumentPromotion/byval.ll | 20 ++++++------
 .../Transforms/ArgumentPromotion/chained.ll | 4 +--
 .../ArgumentPromotion/control-flow2.ll | 4 +--
 .../Transforms/ArgumentPromotion/crash.ll | 2 +-
 llvm/test/Transforms/ArgumentPromotion/dbg.ll | 4 +--
 .../test/Transforms/ArgumentPromotion/fp80.ll | 12 +++----
 .../Transforms/ArgumentPromotion/inalloca.ll | 4 +--
 .../ArgumentPromotion/invalidation.ll | 6 ++--
 ...lignment-value-overflows-addrspace-size.ll | 8 ++---
 .../ArgumentPromotion/max-elements-limit.ll | 4 +--
 .../Transforms/ArgumentPromotion/metadata.ll | 8 ++---
 .../min-legal-vector-width.ll | 4 +--
 .../nonzero-address-spaces.ll | 4 +--
 .../ArgumentPromotion/opaque-ptr.ll | 4 +--
 .../Transforms/ArgumentPromotion/pr27568.ll | 4 +--
 .../Transforms/ArgumentPromotion/pr32917.ll | 4 +--
 .../pr33641_remove_arg_dbgvalue.ll | 2 +-
 .../Transforms/ArgumentPromotion/profile.ll | 4 +--
 .../propagate-remove-dead-args.ll | 18 +++++------
 .../recursion/aggregate-promote-recursive.ll | 6 ++--
 .../argpromotion-recursion-pr1259.ll | 8 ++---
 .../recursion/recursion-mixed-calls.ll | 12 +++----
 .../recursion/recursion-non-zero-offset.ll | 8 ++---
 .../ArgumentPromotion/reserve-tbaa.ll | 4 +--
 .../test/Transforms/ArgumentPromotion/sret.ll | 4
+-- .../ArgumentPromotion/store-into-inself.ll | 4 +-- .../ArgumentPromotion/unused-argument.ll | 8 ++--- ...r_cached_analysis_for_deleted_functions.ll | 4 +-- .../DeadArgElim/2007-02-07-FuncRename.ll | 2 +- .../DeadArgElim/2007-12-20-ParamAttrs.ll | 4 +-- .../DeadArgElim/2010-04-30-DbgInfo.ll | 4 +-- .../test/Transforms/DeadArgElim/aggregates.ll | 10 +++--- .../Transforms/DeadArgElim/call_profile.ll | 4 +-- llvm/test/Transforms/DeadArgElim/comdat.ll | 2 +- .../dbginfo-update-dbgval-local.ll | 6 ++-- llvm/test/Transforms/DeadArgElim/dbginfo.ll | 2 +- .../test/Transforms/DeadArgElim/deadretval.ll | 4 +-- llvm/test/Transforms/DeadArgElim/fct_ptr.ll | 2 +- .../Transforms/DeadArgElim/func_metadata.ll | 4 +-- llvm/test/Transforms/DeadArgElim/funclet.ll | 2 +- llvm/test/Transforms/DeadArgElim/keepalive.ll | 4 +-- .../DeadArgElim/nonzero-address-spaces.ll | 4 +-- llvm/test/Transforms/DeadArgElim/returned.ll | 10 +++--- .../Transforms/DeadArgElim/variadic_safety.ll | 2 +- .../function-specialization2.ll | 12 +++---- .../global-var-constants.ll | 14 ++++---- .../non-argument-tracked.ll | 24 +++++++------- .../specialization-order.ll | 12 +++---- llvm/test/Transforms/PhaseOrdering/dae-dce.ll | 6 ++-- .../dce-after-argument-promotion.ll | 4 +-- llvm/test/Transforms/SCCP/recursion.ll | 6 ++-- 79 files changed, 263 insertions(+), 342 deletions(-) diff --git a/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp b/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp index c8b75dd475ae44..1f9b546ed29996 100644 --- a/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp +++ b/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp @@ -215,7 +215,6 @@ doPromotion(Function *F, FunctionAnalysisManager &FAM, F->getParent()->getFunctionList().insert(F->getIterator(), NF); NF->takeName(F); - NF->setName(NF->getName() + ".argprom"); // Loop over all the callers of the function, transforming the call sites to // pass in the loaded pointers. diff --git a/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp b/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp index b912cc66d19db5..d1548592b1ce26 100644 --- a/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp +++ b/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp @@ -889,10 +889,6 @@ bool DeadArgumentEliminationPass::removeDeadStuffFromFunction(Function *F) { // it again. 
F->getParent()->getFunctionList().insert(F->getIterator(), NF); NF->takeName(F); - if (NumArgumentsEliminated) - NF->setName(NF->getName() + ".argelim"); - else - NF->setName(NF->getName() + ".retelim"); NF->IsNewDbgInfoFormat = F->IsNewDbgInfoFormat; // Loop over all the callers of the function, transforming the call sites to diff --git a/llvm/test/Analysis/LazyCallGraph/remove-dead-function-spurious-ref-edge.ll b/llvm/test/Analysis/LazyCallGraph/remove-dead-function-spurious-ref-edge.ll index 4f16c02b1473ff..2bc486f541c71f 100644 --- a/llvm/test/Analysis/LazyCallGraph/remove-dead-function-spurious-ref-edge.ll +++ b/llvm/test/Analysis/LazyCallGraph/remove-dead-function-spurious-ref-edge.ll @@ -9,7 +9,7 @@ define internal void @a() alwaysinline { } define internal void @b(ptr) noinline { -; CHECK-LABEL: @b.argprom( +; CHECK-LABEL: @b( ; CHECK-NEXT: ret void ; ret void @@ -17,7 +17,7 @@ define internal void @b(ptr) noinline { define internal void @c() noinline { ; CHECK-LABEL: @c( -; CHECK-NEXT: call void @b.argprom() +; CHECK-NEXT: call void @b() ; CHECK-NEXT: ret void ; call void @b(ptr @a) diff --git a/llvm/test/BugPoint/remove_arguments_test.ll b/llvm/test/BugPoint/remove_arguments_test.ll index bb93e45e4b46ef..9e9c51eaafc383 100644 --- a/llvm/test/BugPoint/remove_arguments_test.ll +++ b/llvm/test/BugPoint/remove_arguments_test.ll @@ -11,7 +11,7 @@ declare i32 @test2() -; CHECK: define void @test.argelim() { +; CHECK: define void @test() { define i32 @test(i32 %A, ptr %B, float %C) { call i32 @test2() ret i32 %1 diff --git a/llvm/test/CodeGen/AArch64/arg_promotion.ll b/llvm/test/CodeGen/AArch64/arg_promotion.ll index 724a7f109f1e29..cc37d230c6cbe4 100644 --- a/llvm/test/CodeGen/AArch64/arg_promotion.ll +++ b/llvm/test/CodeGen/AArch64/arg_promotion.ll @@ -38,16 +38,16 @@ define dso_local void @caller_4xi32(ptr noalias %src, ptr noalias %dst) #1 { ; CHECK-LABEL: define dso_local void @caller_4xi32( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[SRC_VAL:%.*]] = load <4 x i32>, ptr [[SRC:%.*]], align 16 -; CHECK-NEXT: call fastcc void @callee_4xi32.argprom.argprom(<4 x i32> [[SRC_VAL]], ptr noalias [[DST:%.*]]) +; CHECK-NEXT: call fastcc void @callee_4xi32(<4 x i32> [[SRC_VAL]], ptr noalias [[DST:%.*]]) ; CHECK-NEXT: ret void ; entry: - call fastcc void @callee_4xi32.argprom(ptr noalias %src, ptr noalias %dst) + call fastcc void @callee_4xi32(ptr noalias %src, ptr noalias %dst) ret void } -define internal fastcc void @callee_4xi32.argprom(ptr noalias %src, ptr noalias %dst) #1 { -; CHECK-LABEL: define internal fastcc void @callee_4xi32.argprom.argprom( +define internal fastcc void @callee_4xi32(ptr noalias %src, ptr noalias %dst) #1 { +; CHECK-LABEL: define internal fastcc void @callee_4xi32( ; CHECK-NEXT: entry: ; CHECK-NEXT: store <4 x i32> [[SRC_0_VAL:%.*]], ptr [[DST:%.*]], align 16 ; CHECK-NEXT: ret void @@ -65,7 +65,7 @@ define dso_local void @caller_i256(ptr noalias %src, ptr noalias %dst) #0 { ; CHECK-LABEL: define dso_local void @caller_i256( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[SRC_VAL:%.*]] = load i256, ptr [[SRC:%.*]], align 16 -; CHECK-NEXT: call fastcc void @callee_i256.argprom(i256 [[SRC_VAL]], ptr noalias [[DST:%.*]]) +; CHECK-NEXT: call fastcc void @callee_i256(i256 [[SRC_VAL]], ptr noalias [[DST:%.*]]) ; CHECK-NEXT: ret void ; entry: @@ -74,7 +74,7 @@ entry: } define internal fastcc void @callee_i256(ptr noalias %src, ptr noalias %dst) #0 { -; CHECK-LABEL: define internal fastcc void @callee_i256.argprom( +; CHECK-LABEL: define internal fastcc void @callee_i256( ; CHECK-NEXT: entry: 
; CHECK-NEXT: store i256 [[SRC_0_VAL:%.*]], ptr [[DST:%.*]], align 16 ; CHECK-NEXT: ret void @@ -159,7 +159,7 @@ define dso_local void @caller_struct4xi32(ptr noalias %src, ptr noalias %dst) #1 ; CHECK-NEXT: [[SRC_VAL:%.*]] = load <4 x i32>, ptr [[SRC:%.*]], align 16 ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[SRC]], i64 16 ; CHECK-NEXT: [[SRC_VAL1:%.*]] = load <4 x i32>, ptr [[TMP0]], align 16 -; CHECK-NEXT: call fastcc void @callee_struct4xi32.argprom(<4 x i32> [[SRC_VAL]], <4 x i32> [[SRC_VAL1]], ptr noalias [[DST:%.*]]) +; CHECK-NEXT: call fastcc void @callee_struct4xi32(<4 x i32> [[SRC_VAL]], <4 x i32> [[SRC_VAL1]], ptr noalias [[DST:%.*]]) ; CHECK-NEXT: ret void ; entry: @@ -168,7 +168,7 @@ entry: } define internal fastcc void @callee_struct4xi32(ptr noalias %src, ptr noalias %dst) #1 { -; CHECK-LABEL: define internal fastcc void @callee_struct4xi32.argprom( +; CHECK-LABEL: define internal fastcc void @callee_struct4xi32( ; CHECK-NEXT: entry: ; CHECK-NEXT: store <4 x i32> [[SRC_0_VAL:%.*]], ptr [[DST:%.*]], align 16 ; CHECK-NEXT: [[DST2:%.*]] = getelementptr inbounds [[STRUCT_4XI32:%.*]], ptr [[DST]], i64 0, i32 1 diff --git a/llvm/test/CodeGen/AMDGPU/internalize.ll b/llvm/test/CodeGen/AMDGPU/internalize.ll index 08b42f93bf5f47..6b2a4d5fc328b4 100644 --- a/llvm/test/CodeGen/AMDGPU/internalize.ll +++ b/llvm/test/CodeGen/AMDGPU/internalize.ll @@ -10,7 +10,7 @@ ; ALL: gvar_used @gvar_used = addrspace(1) global i32 undef, align 4 -; OPT: define internal fastcc void @func_used_noinline.argelim( +; OPT: define internal fastcc void @func_used_noinline( ; OPT-NONE: define fastcc void @func_used_noinline( define fastcc void @func_used_noinline(ptr addrspace(1) %out, i32 %tid) #1 { entry: diff --git a/llvm/test/ThinLTO/X86/memprof-aliased-location1.ll b/llvm/test/ThinLTO/X86/memprof-aliased-location1.ll index 8be9727b316d28..42819d5421ca0f 100644 --- a/llvm/test/ThinLTO/X86/memprof-aliased-location1.ll +++ b/llvm/test/ThinLTO/X86/memprof-aliased-location1.ll @@ -84,22 +84,22 @@ attributes #0 = { noinline optnone } ;; The first call to foo does not allocate cold memory. It should call the ;; original functions, which ultimately call the original allocation decorated ;; with a "notcold" attribute. -; IR: call {{.*}} @_Z3foov.retelim() +; IR: call {{.*}} @_Z3foov() ;; The second call to foo allocates cold memory. It should call cloned functions ;; which ultimately call a cloned allocation decorated with a "cold" attribute. 
-; IR: call {{.*}} @_Z3foov.memprof.1.retelim() -; IR: define internal {{.*}} @_Z3barv.retelim() +; IR: call {{.*}} @_Z3foov.memprof.1() +; IR: define internal {{.*}} @_Z3barv() ; IR: call {{.*}} @_Znam(i64 0) #[[NOTCOLD:[0-9]+]] -; IR: define internal {{.*}} @_Z3bazv.retelim() -; IR: call {{.*}} @_Z3barv.retelim() -; IR: define internal {{.*}} @_Z3foov.retelim() -; IR: call {{.*}} @_Z3bazv.retelim() -; IR: define internal {{.*}} @_Z3barv.memprof.1.retelim() +; IR: define internal {{.*}} @_Z3bazv() +; IR: call {{.*}} @_Z3barv() +; IR: define internal {{.*}} @_Z3foov() +; IR: call {{.*}} @_Z3bazv() +; IR: define internal {{.*}} @_Z3barv.memprof.1() ; IR: call {{.*}} @_Znam(i64 0) #[[COLD:[0-9]+]] -; IR: define internal {{.*}} @_Z3bazv.memprof.1.retelim() -; IR: call {{.*}} @_Z3barv.memprof.1.retelim() -; IR: define internal {{.*}} @_Z3foov.memprof.1.retelim() -; IR: call {{.*}} @_Z3bazv.memprof.1.retelim() +; IR: define internal {{.*}} @_Z3bazv.memprof.1() +; IR: call {{.*}} @_Z3barv.memprof.1() +; IR: define internal {{.*}} @_Z3foov.memprof.1() +; IR: call {{.*}} @_Z3bazv.memprof.1() ; IR: attributes #[[NOTCOLD]] = { "memprof"="notcold" } ; IR: attributes #[[COLD]] = { "memprof"="cold" } diff --git a/llvm/test/ThinLTO/X86/memprof-aliased-location2.ll b/llvm/test/ThinLTO/X86/memprof-aliased-location2.ll index 4c18cf8226c8bb..663f8525043c2f 100644 --- a/llvm/test/ThinLTO/X86/memprof-aliased-location2.ll +++ b/llvm/test/ThinLTO/X86/memprof-aliased-location2.ll @@ -84,22 +84,22 @@ attributes #0 = { noinline optnone } ;; The first call to foo does not allocate cold memory. It should call the ;; original functions, which ultimately call the original allocation decorated ;; with a "notcold" attribute. -; IR: call {{.*}} @_Z3foov.retelim() +; IR: call {{.*}} @_Z3foov() ;; The second call to foo allocates cold memory. It should call cloned functions ;; which ultimately call a cloned allocation decorated with a "cold" attribute. -; IR: call {{.*}} @_Z3foov.memprof.1.retelim() -; IR: define internal {{.*}} @_Z3barv.retelim() +; IR: call {{.*}} @_Z3foov.memprof.1() +; IR: define internal {{.*}} @_Z3barv() ; IR: call {{.*}} @_Znam(i64 0) #[[NOTCOLD:[0-9]+]] -; IR: define internal {{.*}} @_Z3bazv.retelim() -; IR: call {{.*}} @_Z3barv.retelim() -; IR: define internal {{.*}} @_Z3foov.retelim() -; IR: call {{.*}} @_Z3bazv.retelim() -; IR: define internal {{.*}} @_Z3barv.memprof.1.retelim() +; IR: define internal {{.*}} @_Z3bazv() +; IR: call {{.*}} @_Z3barv() +; IR: define internal {{.*}} @_Z3foov() +; IR: call {{.*}} @_Z3bazv() +; IR: define internal {{.*}} @_Z3barv.memprof.1() ; IR: call {{.*}} @_Znam(i64 0) #[[COLD:[0-9]+]] -; IR: define internal {{.*}} @_Z3bazv.memprof.1.retelim() -; IR: call {{.*}} @_Z3barv.memprof.1.retelim() -; IR: define internal {{.*}} @_Z3foov.memprof.1.retelim() -; IR: call {{.*}} @_Z3bazv.memprof.1.retelim() +; IR: define internal {{.*}} @_Z3bazv.memprof.1() +; IR: call {{.*}} @_Z3barv.memprof.1() +; IR: define internal {{.*}} @_Z3foov.memprof.1() +; IR: call {{.*}} @_Z3bazv.memprof.1() ; IR: attributes #[[NOTCOLD]] = { "memprof"="notcold" } ; IR: attributes #[[COLD]] = { "memprof"="cold" } diff --git a/llvm/test/ThinLTO/X86/memprof-basic.ll b/llvm/test/ThinLTO/X86/memprof-basic.ll index b7aadf8e32a771..6922dbfd368467 100644 --- a/llvm/test/ThinLTO/X86/memprof-basic.ll +++ b/llvm/test/ThinLTO/X86/memprof-basic.ll @@ -53,7 +53,7 @@ ;; We should have cloned bar, baz, and foo, for the cold memory allocation. 
; RUN: cat %t.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED -; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --check-prefix=IRNODIST +; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --check-prefix=IR ;; Try again but with distributed ThinLTO @@ -303,23 +303,6 @@ attributes #0 = { noinline optnone } ; IR: attributes #[[NOTCOLD]] = { "memprof"="notcold" } ; IR: attributes #[[COLD]] = { "memprof"="cold" } -; IRNODIST: define {{.*}} @main -; IRNODIST: call {{.*}} @_Z3foov.retelim() -; IRNODIST: call {{.*}} @_Z3foov.memprof.1.retelim() -; IRNODIST: define internal {{.*}} @_Z3barv.retelim() -; IRNODIST: call {{.*}} @_Znam(i64 0) #[[NOTCOLD:[0-9]+]] -; IRNODIST: define internal {{.*}} @_Z3bazv.retelim() -; IRNODIST: call {{.*}} @_Z3barv.retelim() -; IRNODIST: define internal {{.*}} @_Z3foov.retelim() -; IRNODIST: call {{.*}} @_Z3bazv.retelim() -; IRNODIST: define internal {{.*}} @_Z3barv.memprof.1.retelim() -; IRNODIST: call {{.*}} @_Znam(i64 0) #[[COLD:[0-9]+]] -; IRNODIST: define internal {{.*}} @_Z3bazv.memprof.1.retelim() -; IRNODIST: call {{.*}} @_Z3barv.memprof.1.retelim() -; IRNODIST: define internal {{.*}} @_Z3foov.memprof.1.retelim() -; IRNODIST: call {{.*}} @_Z3bazv.memprof.1.retelim() -; IRNODIST: attributes #[[NOTCOLD]] = { "memprof"="notcold" } -; IRNODIST: attributes #[[COLD]] = { "memprof"="cold" } ; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) ; STATS-BE: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) during ThinLTO backend diff --git a/llvm/test/ThinLTO/X86/memprof-duplicate-context-ids.ll b/llvm/test/ThinLTO/X86/memprof-duplicate-context-ids.ll index bfc7b02a956c6f..65d794e9cba87c 100644 --- a/llvm/test/ThinLTO/X86/memprof-duplicate-context-ids.ll +++ b/llvm/test/ThinLTO/X86/memprof-duplicate-context-ids.ll @@ -68,7 +68,7 @@ ; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP \ ; RUN: --check-prefix=STATS --check-prefix=STATS-BE --check-prefix=REMARKS -; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --check-prefix=IRNODIST +; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --check-prefix=IR ;; Try again but with distributed ThinLTO @@ -247,18 +247,6 @@ attributes #0 = { noinline optnone} ; IR: attributes #[[NOTCOLD]] = { "memprof"="notcold" } ; IR: attributes #[[COLD]] = { "memprof"="cold" } -; IRNODIST: define internal {{.*}} @_Z1Dv.retelim() -; IRNODIST: call {{.*}} @_Znam(i64 0) #[[NOTCOLD:[0-9]+]] -; IRNODIST: define internal {{.*}} @_Z1Fv.retelim() -; IRNODIST: call {{.*}} @_Z1Dv.retelim() -; IRNODIST: define internal {{.*}} @_Z1Bv.retelim() -; IRNODIST: call {{.*}} @_Z1Dv.memprof.1.retelim() -; IRNODIST: define internal {{.*}} @_Z1Ev.retelim() -; IRNODIST: call {{.*}} @_Z1Dv.memprof.1.retelim() -; IRNODIST: define internal {{.*}} @_Z1Dv.memprof.1.retelim() -; IRNODIST: call {{.*}} @_Znam(i64 0) #[[COLD:[0-9]+]] -; IRNODIST: attributes #[[NOTCOLD]] = { "memprof"="notcold" } -; IRNODIST: attributes #[[COLD]] = { "memprof"="cold" } ; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) ; STATS-BE: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) during ThinLTO backend diff --git a/llvm/test/ThinLTO/X86/memprof-funcassigncloning.ll b/llvm/test/ThinLTO/X86/memprof-funcassigncloning.ll index 4153524bf44706..f1a494d077fefc 100644 --- a/llvm/test/ThinLTO/X86/memprof-funcassigncloning.ll +++ b/llvm/test/ThinLTO/X86/memprof-funcassigncloning.ll @@ -61,7 +61,7 @@ ; RUN: -o %t.out 
2>&1 | FileCheck %s --check-prefix=DUMP \ ; RUN: --check-prefix=STATS --check-prefix=STATS-BE --check-prefix=REMARKS -; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --check-prefix=IRNODIST +; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --check-prefix=IR ;; Try again but with distributed ThinLTO @@ -283,23 +283,6 @@ attributes #0 = { noinline optnone } ; IR: attributes #[[NOTCOLD]] = { "memprof"="notcold" } ; IR: attributes #[[COLD]] = { "memprof"="cold" } -; IRNODIST: define internal {{.*}} @_Z1EPPcS0_.argelim( -; IRNODIST: call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD:[0-9]+]] -; IRNODIST: call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD]] -; IRNODIST: define internal {{.*}} @_Z1BPPcS0_( -; IRNODIST: call {{.*}} @_Z1EPPcS0_.argelim( -; IRNODIST: define internal {{.*}} @_Z1CPPcS0_( -; IRNODIST: call {{.*}} @_Z1EPPcS0_.memprof.3.argelim( -; IRNODIST: define internal {{.*}} @_Z1DPPcS0_( -; IRNODIST: call {{.*}} @_Z1EPPcS0_.memprof.2.argelim( -; IRNODIST: define internal {{.*}} @_Z1EPPcS0_.memprof.2.argelim( -; IRNODIST: call {{.*}} @_Znam(i64 noundef 10) #[[COLD:[0-9]+]] -; IRNODIST: call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD]] -; IRNODIST: define internal {{.*}} @_Z1EPPcS0_.memprof.3.argelim( -; IRNODIST: call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD]] -; IRNODIST: call {{.*}} @_Znam(i64 noundef 10) #[[COLD]] -; IRNODIST: attributes #[[NOTCOLD]] = { "memprof"="notcold" } -; IRNODIST: attributes #[[COLD]] = { "memprof"="cold" } ; STATS: 2 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) ; STATS-BE: 2 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) during ThinLTO backend diff --git a/llvm/test/ThinLTO/X86/memprof-indirectcall.ll b/llvm/test/ThinLTO/X86/memprof-indirectcall.ll index ba8811b46175e3..07a52f441ca278 100644 --- a/llvm/test/ThinLTO/X86/memprof-indirectcall.ll +++ b/llvm/test/ThinLTO/X86/memprof-indirectcall.ll @@ -74,7 +74,7 @@ ;; from main allocating cold memory. 
; RUN: cat %t.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED -; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --check-prefix=IRNODIST +; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --check-prefix=IR ;; Try again but with distributed ThinLTO @@ -419,19 +419,6 @@ attributes #0 = { noinline optnone } ; IR: attributes #[[NOTCOLD]] = { "memprof"="notcold" } ; IR: attributes #[[COLD]] = { "memprof"="cold" } -; IRNODIST: define {{.*}} @main( -; IRNODIST: call {{.*}} @_Z3foov.argelim() -; IRNODIST: call {{.*}} @_Z3foov.memprof.1.argelim() -; IRNODIST: call {{.*}} @_Z3barP1A.argelim( -; IRNODIST: call {{.*}} @_Z3barP1A.argelim( -; IRNODIST: call {{.*}} @_Z3barP1A.argelim( -; IRNODIST: call {{.*}} @_Z3barP1A.argelim( -; IRNODIST: define internal {{.*}} @_Z3foov.argelim() -; IRNODIST: call {{.*}} @_Znam(i64 0) #[[NOTCOLD:[0-9]+]] -; IRNODIST: define internal {{.*}} @_Z3foov.memprof.1.argelim() -; IRNODIST: call {{.*}} @_Znam(i64 0) #[[COLD:[0-9]+]] -; IRNODIST: attributes #[[NOTCOLD]] = { "memprof"="notcold" } -; IRNODIST: attributes #[[COLD]] = { "memprof"="cold" } ; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) ; STATS-BE: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) during ThinLTO backend diff --git a/llvm/test/ThinLTO/X86/memprof-inlined.ll b/llvm/test/ThinLTO/X86/memprof-inlined.ll index 7111a536a3110a..89df345b220423 100644 --- a/llvm/test/ThinLTO/X86/memprof-inlined.ll +++ b/llvm/test/ThinLTO/X86/memprof-inlined.ll @@ -63,7 +63,7 @@ ;; cold memory. ; RUN: cat %t.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED -; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --check-prefix=IRNODIST +; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --check-prefix=IR ;; Try again but with distributed ThinLTO @@ -323,19 +323,6 @@ attributes #0 = { noinline optnone } ; IR: attributes #[[NOTCOLD]] = { "memprof"="notcold" } ; IR: attributes #[[COLD]] = { "memprof"="cold" } -; IRNODIST: define internal {{.*}} @_Z3barv.retelim() -; IRNODIST: call {{.*}} @_Znam(i64 0) #[[NOTCOLD:[0-9]+]] -; IRNODIST: define internal {{.*}} @_Z3foov.retelim() -; IRNODIST: call {{.*}} @_Z3barv.retelim() -; IRNODIST: define {{.*}} @main() -; IRNODIST: call {{.*}} @_Z3foov.retelim() -; IRNODIST: call {{.*}} @_Z3foov.memprof.1.retelim() -; IRNODIST: define internal {{.*}} @_Z3barv.memprof.1.retelim() -; IRNODIST: call {{.*}} @_Znam(i64 0) #[[COLD:[0-9]+]] -; IRNODIST: define internal {{.*}} @_Z3foov.memprof.1.retelim() -; IRNODIST: call {{.*}} @_Z3barv.memprof.1.retelim() -; IRNODIST: attributes #[[NOTCOLD]] = { "memprof"="notcold" } -; IRNODIST: attributes #[[COLD]] = { "memprof"="cold" } ; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) ; STATS-BE: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) during ThinLTO backend diff --git a/llvm/test/Transforms/ArgumentPromotion/2008-02-01-ReturnAttrs.ll b/llvm/test/Transforms/ArgumentPromotion/2008-02-01-ReturnAttrs.ll index 51839033177034..daa4e1fb757d21 100644 --- a/llvm/test/Transforms/ArgumentPromotion/2008-02-01-ReturnAttrs.ll +++ b/llvm/test/Transforms/ArgumentPromotion/2008-02-01-ReturnAttrs.ll @@ -3,7 +3,7 @@ ; RUN: cat %t | FileCheck -check-prefix=REMARK %s define internal i32 @deref(ptr %x) nounwind { -; CHECK-LABEL: define {{[^@]+}}@deref.argprom +; CHECK-LABEL: define {{[^@]+}}@deref ; CHECK-SAME: (i32 [[X_0_VAL:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: 
ret i32 [[X_0_VAL]] @@ -29,7 +29,7 @@ define i32 @f(i32 %x) { ; CHECK-NEXT: [[X_ADDR:%.*]] = alloca i32, align 4 ; CHECK-NEXT: store i32 [[X]], ptr [[X_ADDR]], align 4 ; CHECK-NEXT: [[X_ADDR_VAL:%.*]] = load i32, ptr [[X_ADDR]], align 4 -; CHECK-NEXT: [[TEMP1:%.*]] = call i32 @deref.argprom(i32 [[X_ADDR_VAL]]) +; CHECK-NEXT: [[TEMP1:%.*]] = call i32 @deref(i32 [[X_ADDR_VAL]]) ; CHECK-NEXT: ret i32 [[TEMP1]] ; entry: diff --git a/llvm/test/Transforms/ArgumentPromotion/BPF/argpromotion.ll b/llvm/test/Transforms/ArgumentPromotion/BPF/argpromotion.ll index f317a5a4533484..6c39f27115ada4 100644 --- a/llvm/test/Transforms/ArgumentPromotion/BPF/argpromotion.ll +++ b/llvm/test/Transforms/ArgumentPromotion/BPF/argpromotion.ll @@ -85,4 +85,4 @@ entry: ; Without number-of-argument constraint, argpromotion will create a function signature with 5 arguments, which equals ; the maximum number of argument permitted by bpf backend, so argpromotion result code does work. ; -; CHECK: i32 @foo2.argprom(i32 %p1.0.val, i32 %p1.4.val, i32 %p2.8.val, i32 %p2.16.val, i32 %p3.20.val) +; CHECK: i32 @foo2(i32 %p1.0.val, i32 %p1.4.val, i32 %p2.8.val, i32 %p2.16.val, i32 %p3.20.val) diff --git a/llvm/test/Transforms/ArgumentPromotion/X86/attributes.ll b/llvm/test/Transforms/ArgumentPromotion/X86/attributes.ll index 6d34fb57c9dcf7..a64b7346d83618 100644 --- a/llvm/test/Transforms/ArgumentPromotion/X86/attributes.ll +++ b/llvm/test/Transforms/ArgumentPromotion/X86/attributes.ll @@ -42,7 +42,7 @@ bb: } define internal fastcc void @promote_avx2(ptr %arg, ptr readonly %arg1) #0 { -; CHECK-LABEL: define {{[^@]+}}@promote_avx2.argprom +; CHECK-LABEL: define {{[^@]+}}@promote_avx2 ; CHECK-SAME: (ptr [[ARG:%.*]], <4 x i64> [[ARG1_VAL:%.*]]) ; CHECK-NEXT: bb: ; CHECK-NEXT: store <4 x i64> [[ARG1_VAL]], ptr [[ARG]] @@ -62,7 +62,7 @@ define void @promote(ptr %arg) #0 { ; CHECK-NEXT: [[TMP2:%.*]] = alloca <4 x i64>, align 32 ; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 32 [[TMP]], i8 0, i64 32, i1 false) ; CHECK-NEXT: [[TMP_VAL:%.*]] = load <4 x i64>, ptr [[TMP]] -; CHECK-NEXT: call fastcc void @promote_avx2.argprom(ptr [[TMP2]], <4 x i64> [[TMP_VAL]]) +; CHECK-NEXT: call fastcc void @promote_avx2(ptr [[TMP2]], <4 x i64> [[TMP_VAL]]) ; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i64>, ptr [[TMP2]], align 32 ; CHECK-NEXT: store <4 x i64> [[TMP4]], ptr [[ARG]], align 2 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/ArgumentPromotion/X86/min-legal-vector-width.ll b/llvm/test/Transforms/ArgumentPromotion/X86/min-legal-vector-width.ll index 99aa19e72371fb..3373c09d5f91aa 100644 --- a/llvm/test/Transforms/ArgumentPromotion/X86/min-legal-vector-width.ll +++ b/llvm/test/Transforms/ArgumentPromotion/X86/min-legal-vector-width.ll @@ -7,7 +7,7 @@ target triple = "x86_64-unknown-linux-gnu" ; This should promote define internal fastcc void @callee_avx512_legal512_prefer512_call_avx512_legal512_prefer512(ptr %arg, ptr readonly %arg1) #0 { -; CHECK-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer512_call_avx512_legal512_prefer512.argprom +; CHECK-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer512_call_avx512_legal512_prefer512 ; CHECK-SAME: (ptr [[ARG:%.*]], <8 x i64> [[ARG1_VAL:%.*]]) ; CHECK-NEXT: bb: ; CHECK-NEXT: store <8 x i64> [[ARG1_VAL]], ptr [[ARG]] @@ -27,7 +27,7 @@ define void @avx512_legal512_prefer512_call_avx512_legal512_prefer512(ptr %arg) ; CHECK-NEXT: [[TMP2:%.*]] = alloca <8 x i64>, align 32 ; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 32 [[TMP]], i8 0, i64 32, i1 false) ; CHECK-NEXT: 
[[TMP_VAL:%.*]] = load <8 x i64>, ptr [[TMP]] -; CHECK-NEXT: call fastcc void @callee_avx512_legal512_prefer512_call_avx512_legal512_prefer512.argprom(ptr [[TMP2]], <8 x i64> [[TMP_VAL]]) +; CHECK-NEXT: call fastcc void @callee_avx512_legal512_prefer512_call_avx512_legal512_prefer512(ptr [[TMP2]], <8 x i64> [[TMP_VAL]]) ; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr [[TMP2]], align 32 ; CHECK-NEXT: store <8 x i64> [[TMP4]], ptr [[ARG]], align 2 ; CHECK-NEXT: ret void @@ -44,7 +44,7 @@ bb: ; This should promote define internal fastcc void @callee_avx512_legal512_prefer256_call_avx512_legal512_prefer256(ptr %arg, ptr readonly %arg1) #1 { -; CHECK-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer256_call_avx512_legal512_prefer256.argprom +; CHECK-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer256_call_avx512_legal512_prefer256 ; CHECK-SAME: (ptr [[ARG:%.*]], <8 x i64> [[ARG1_VAL:%.*]]) ; CHECK-NEXT: bb: ; CHECK-NEXT: store <8 x i64> [[ARG1_VAL]], ptr [[ARG]] @@ -64,7 +64,7 @@ define void @avx512_legal512_prefer256_call_avx512_legal512_prefer256(ptr %arg) ; CHECK-NEXT: [[TMP2:%.*]] = alloca <8 x i64>, align 32 ; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 32 [[TMP]], i8 0, i64 32, i1 false) ; CHECK-NEXT: [[TMP_VAL:%.*]] = load <8 x i64>, ptr [[TMP]] -; CHECK-NEXT: call fastcc void @callee_avx512_legal512_prefer256_call_avx512_legal512_prefer256.argprom(ptr [[TMP2]], <8 x i64> [[TMP_VAL]]) +; CHECK-NEXT: call fastcc void @callee_avx512_legal512_prefer256_call_avx512_legal512_prefer256(ptr [[TMP2]], <8 x i64> [[TMP_VAL]]) ; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr [[TMP2]], align 32 ; CHECK-NEXT: store <8 x i64> [[TMP4]], ptr [[ARG]], align 2 ; CHECK-NEXT: ret void @@ -81,7 +81,7 @@ bb: ; This should promote define internal fastcc void @callee_avx512_legal512_prefer512_call_avx512_legal512_prefer256(ptr %arg, ptr readonly %arg1) #1 { -; CHECK-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer512_call_avx512_legal512_prefer256.argprom +; CHECK-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer512_call_avx512_legal512_prefer256 ; CHECK-SAME: (ptr [[ARG:%.*]], <8 x i64> [[ARG1_VAL:%.*]]) ; CHECK-NEXT: bb: ; CHECK-NEXT: store <8 x i64> [[ARG1_VAL]], ptr [[ARG]] @@ -101,7 +101,7 @@ define void @avx512_legal512_prefer512_call_avx512_legal512_prefer256(ptr %arg) ; CHECK-NEXT: [[TMP2:%.*]] = alloca <8 x i64>, align 32 ; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 32 [[TMP]], i8 0, i64 32, i1 false) ; CHECK-NEXT: [[TMP_VAL:%.*]] = load <8 x i64>, ptr [[TMP]] -; CHECK-NEXT: call fastcc void @callee_avx512_legal512_prefer512_call_avx512_legal512_prefer256.argprom(ptr [[TMP2]], <8 x i64> [[TMP_VAL]]) +; CHECK-NEXT: call fastcc void @callee_avx512_legal512_prefer512_call_avx512_legal512_prefer256(ptr [[TMP2]], <8 x i64> [[TMP_VAL]]) ; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr [[TMP2]], align 32 ; CHECK-NEXT: store <8 x i64> [[TMP4]], ptr [[ARG]], align 2 ; CHECK-NEXT: ret void @@ -118,7 +118,7 @@ bb: ; This should promote define internal fastcc void @callee_avx512_legal512_prefer256_call_avx512_legal512_prefer512(ptr %arg, ptr readonly %arg1) #0 { -; CHECK-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer256_call_avx512_legal512_prefer512.argprom +; CHECK-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer256_call_avx512_legal512_prefer512 ; CHECK-SAME: (ptr [[ARG:%.*]], <8 x i64> [[ARG1_VAL:%.*]]) ; CHECK-NEXT: bb: ; CHECK-NEXT: store <8 x i64> [[ARG1_VAL]], ptr [[ARG]] @@ -138,7 +138,7 @@ define void 
@avx512_legal512_prefer256_call_avx512_legal512_prefer512(ptr %arg) ; CHECK-NEXT: [[TMP2:%.*]] = alloca <8 x i64>, align 32 ; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 32 [[TMP]], i8 0, i64 32, i1 false) ; CHECK-NEXT: [[TMP_VAL:%.*]] = load <8 x i64>, ptr [[TMP]] -; CHECK-NEXT: call fastcc void @callee_avx512_legal512_prefer256_call_avx512_legal512_prefer512.argprom(ptr [[TMP2]], <8 x i64> [[TMP_VAL]]) +; CHECK-NEXT: call fastcc void @callee_avx512_legal512_prefer256_call_avx512_legal512_prefer512(ptr [[TMP2]], <8 x i64> [[TMP_VAL]]) ; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr [[TMP2]], align 32 ; CHECK-NEXT: store <8 x i64> [[TMP4]], ptr [[ARG]], align 2 ; CHECK-NEXT: ret void @@ -229,7 +229,7 @@ bb: ; This should promote define internal fastcc void @callee_avx2_legal256_prefer256_call_avx2_legal512_prefer256(ptr %arg, ptr readonly %arg1) #3 { -; CHECK-LABEL: define {{[^@]+}}@callee_avx2_legal256_prefer256_call_avx2_legal512_prefer256.argprom +; CHECK-LABEL: define {{[^@]+}}@callee_avx2_legal256_prefer256_call_avx2_legal512_prefer256 ; CHECK-SAME: (ptr [[ARG:%.*]], <8 x i64> [[ARG1_VAL:%.*]]) ; CHECK-NEXT: bb: ; CHECK-NEXT: store <8 x i64> [[ARG1_VAL]], ptr [[ARG]] @@ -249,7 +249,7 @@ define void @avx2_legal256_prefer256_call_avx2_legal512_prefer256(ptr %arg) #4 { ; CHECK-NEXT: [[TMP2:%.*]] = alloca <8 x i64>, align 32 ; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 32 [[TMP]], i8 0, i64 32, i1 false) ; CHECK-NEXT: [[TMP_VAL:%.*]] = load <8 x i64>, ptr [[TMP]] -; CHECK-NEXT: call fastcc void @callee_avx2_legal256_prefer256_call_avx2_legal512_prefer256.argprom(ptr [[TMP2]], <8 x i64> [[TMP_VAL]]) +; CHECK-NEXT: call fastcc void @callee_avx2_legal256_prefer256_call_avx2_legal512_prefer256(ptr [[TMP2]], <8 x i64> [[TMP_VAL]]) ; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr [[TMP2]], align 32 ; CHECK-NEXT: store <8 x i64> [[TMP4]], ptr [[ARG]], align 2 ; CHECK-NEXT: ret void @@ -266,7 +266,7 @@ bb: ; This should promote define internal fastcc void @callee_avx2_legal512_prefer256_call_avx2_legal256_prefer256(ptr %arg, ptr readonly %arg1) #4 { -; CHECK-LABEL: define {{[^@]+}}@callee_avx2_legal512_prefer256_call_avx2_legal256_prefer256.argprom +; CHECK-LABEL: define {{[^@]+}}@callee_avx2_legal512_prefer256_call_avx2_legal256_prefer256 ; CHECK-SAME: (ptr [[ARG:%.*]], <8 x i64> [[ARG1_VAL:%.*]]) ; CHECK-NEXT: bb: ; CHECK-NEXT: store <8 x i64> [[ARG1_VAL]], ptr [[ARG]] @@ -286,7 +286,7 @@ define void @avx2_legal512_prefer256_call_avx2_legal256_prefer256(ptr %arg) #3 { ; CHECK-NEXT: [[TMP2:%.*]] = alloca <8 x i64>, align 32 ; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 32 [[TMP]], i8 0, i64 32, i1 false) ; CHECK-NEXT: [[TMP_VAL:%.*]] = load <8 x i64>, ptr [[TMP]] -; CHECK-NEXT: call fastcc void @callee_avx2_legal512_prefer256_call_avx2_legal256_prefer256.argprom(ptr [[TMP2]], <8 x i64> [[TMP_VAL]]) +; CHECK-NEXT: call fastcc void @callee_avx2_legal512_prefer256_call_avx2_legal256_prefer256(ptr [[TMP2]], <8 x i64> [[TMP_VAL]]) ; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr [[TMP2]], align 32 ; CHECK-NEXT: store <8 x i64> [[TMP4]], ptr [[ARG]], align 2 ; CHECK-NEXT: ret void @@ -303,7 +303,7 @@ bb: ; If the arguments are scalar, its ok to promote. 
define internal i32 @scalar_callee_avx512_legal256_prefer256_call_avx512_legal512_prefer256(ptr %X, ptr %Y) #2 { -; CHECK-LABEL: define {{[^@]+}}@scalar_callee_avx512_legal256_prefer256_call_avx512_legal512_prefer256.argprom +; CHECK-LABEL: define {{[^@]+}}@scalar_callee_avx512_legal256_prefer256_call_avx512_legal512_prefer256 ; CHECK-SAME: (i32 [[X_VAL:%.*]], i32 [[Y_VAL:%.*]]) ; CHECK-NEXT: [[C:%.*]] = add i32 [[X_VAL]], [[Y_VAL]] ; CHECK-NEXT: ret i32 [[C]] @@ -321,7 +321,7 @@ define i32 @scalar_avx512_legal256_prefer256_call_avx512_legal512_prefer256(ptr ; CHECK-NEXT: store i32 1, ptr [[A]] ; CHECK-NEXT: [[A_VAL:%.*]] = load i32, ptr [[A]] ; CHECK-NEXT: [[B_VAL:%.*]] = load i32, ptr [[B]] -; CHECK-NEXT: [[C:%.*]] = call i32 @scalar_callee_avx512_legal256_prefer256_call_avx512_legal512_prefer256.argprom(i32 [[A_VAL]], i32 [[B_VAL]]) +; CHECK-NEXT: [[C:%.*]] = call i32 @scalar_callee_avx512_legal256_prefer256_call_avx512_legal512_prefer256(i32 [[A_VAL]], i32 [[B_VAL]]) ; CHECK-NEXT: ret i32 [[C]] ; %A = alloca i32 @@ -332,7 +332,7 @@ define i32 @scalar_avx512_legal256_prefer256_call_avx512_legal512_prefer256(ptr ; If the arguments are scalar, its ok to promote. define internal i32 @scalar_callee_avx512_legal512_prefer256_call_avx512_legal256_prefer256(ptr %X, ptr %Y) #2 { -; CHECK-LABEL: define {{[^@]+}}@scalar_callee_avx512_legal512_prefer256_call_avx512_legal256_prefer256.argprom +; CHECK-LABEL: define {{[^@]+}}@scalar_callee_avx512_legal512_prefer256_call_avx512_legal256_prefer256 ; CHECK-SAME: (i32 [[X_VAL:%.*]], i32 [[Y_VAL:%.*]]) ; CHECK-NEXT: [[C:%.*]] = add i32 [[X_VAL]], [[Y_VAL]] ; CHECK-NEXT: ret i32 [[C]] @@ -350,7 +350,7 @@ define i32 @scalar_avx512_legal512_prefer256_call_avx512_legal256_prefer256(ptr ; CHECK-NEXT: store i32 1, ptr [[A]] ; CHECK-NEXT: [[A_VAL:%.*]] = load i32, ptr [[A]] ; CHECK-NEXT: [[B_VAL:%.*]] = load i32, ptr [[B]] -; CHECK-NEXT: [[C:%.*]] = call i32 @scalar_callee_avx512_legal512_prefer256_call_avx512_legal256_prefer256.argprom(i32 [[A_VAL]], i32 [[B_VAL]]) +; CHECK-NEXT: [[C:%.*]] = call i32 @scalar_callee_avx512_legal512_prefer256_call_avx512_legal256_prefer256(i32 [[A_VAL]], i32 [[B_VAL]]) ; CHECK-NEXT: ret i32 [[C]] ; %A = alloca i32 diff --git a/llvm/test/Transforms/ArgumentPromotion/X86/thiscall.ll b/llvm/test/Transforms/ArgumentPromotion/X86/thiscall.ll index 22e2c92617182f..2195e437bc8637 100644 --- a/llvm/test/Transforms/ArgumentPromotion/X86/thiscall.ll +++ b/llvm/test/Transforms/ArgumentPromotion/X86/thiscall.ll @@ -23,7 +23,7 @@ define internal x86_thiscallcc void @internalfun(ptr %this, ptr inalloca(<{ %str ; ARGPROMOTION-NEXT: call void @ext(ptr inalloca(<{ [[STRUCT_A]] }>) [[ARGMEM]]) ; ARGPROMOTION-NEXT: ret void ; -; GLOBALOPT_ARGPROMOTION-LABEL: define {{[^@]+}}@internalfun.argprom +; GLOBALOPT_ARGPROMOTION-LABEL: define {{[^@]+}}@internalfun ; GLOBALOPT_ARGPROMOTION-SAME: (ptr [[TMP0:%.*]]) unnamed_addr { ; GLOBALOPT_ARGPROMOTION-NEXT: entry: ; GLOBALOPT_ARGPROMOTION-NEXT: [[A:%.*]] = getelementptr inbounds <{ [[STRUCT_A:%.*]] }>, ptr [[TMP0]], i32 0, i32 0 @@ -56,7 +56,7 @@ define void @exportedfun(ptr %a) { ; GLOBALOPT_ARGPROMOTION-SAME: (ptr [[A:%.*]]) local_unnamed_addr { ; GLOBALOPT_ARGPROMOTION-NEXT: [[INALLOCA_SAVE:%.*]] = tail call ptr @llvm.stacksave.p0() ; GLOBALOPT_ARGPROMOTION-NEXT: [[ARGMEM:%.*]] = alloca inalloca <{ [[STRUCT_A:%.*]] }>, align 4 -; GLOBALOPT_ARGPROMOTION-NEXT: call fastcc void @internalfun.argprom(ptr [[ARGMEM]]) +; GLOBALOPT_ARGPROMOTION-NEXT: call fastcc void @internalfun(ptr [[ARGMEM]]) ; 
GLOBALOPT_ARGPROMOTION-NEXT: call void @llvm.stackrestore.p0(ptr [[INALLOCA_SAVE]]) ; GLOBALOPT_ARGPROMOTION-NEXT: ret void ; diff --git a/llvm/test/Transforms/ArgumentPromotion/actual-arguments.ll b/llvm/test/Transforms/ArgumentPromotion/actual-arguments.ll index 54e1727b5bca6a..63366ba998c7bb 100644 --- a/llvm/test/Transforms/ArgumentPromotion/actual-arguments.ll +++ b/llvm/test/Transforms/ArgumentPromotion/actual-arguments.ll @@ -12,7 +12,7 @@ define internal i32 @test_cannot_promote_1(ptr %p, ptr nocapture readonly %test_ ; CHECK-LABEL: define {{[^@]+}}@test_cannot_promote_1 ; CHECK-SAME: (ptr [[P:%.*]], ptr nocapture readonly [[TEST_C:%.*]]) { ; CHECK-NEXT: [[TEST_C_VAL:%.*]] = load i32, ptr [[TEST_C]], align 4 -; CHECK-NEXT: [[RES:%.*]] = call i32 @callee.argprom(ptr [[P]], i32 [[TEST_C_VAL]]) +; CHECK-NEXT: [[RES:%.*]] = call i32 @callee(ptr [[P]], i32 [[TEST_C_VAL]]) ; CHECK-NEXT: [[LTEST_C:%.*]] = load i32, ptr [[TEST_C]], align 4 ; CHECK-NEXT: [[SUM:%.*]] = add i32 [[LTEST_C]], [[RES]] ; CHECK-NEXT: ret i32 [[SUM]] @@ -33,7 +33,7 @@ define internal i32 @test_cannot_promote_2(ptr %p, ptr nocapture readonly %test_ ; CHECK-LABEL: define {{[^@]+}}@test_cannot_promote_2 ; CHECK-SAME: (ptr [[P:%.*]], ptr nocapture readonly [[TEST_C:%.*]]) { ; CHECK-NEXT: [[TEST_C_VAL:%.*]] = load i32, ptr [[TEST_C]], align 4 -; CHECK-NEXT: [[RES:%.*]] = call i32 @callee.argprom(ptr [[P]], i32 [[TEST_C_VAL]]) +; CHECK-NEXT: [[RES:%.*]] = call i32 @callee(ptr [[P]], i32 [[TEST_C_VAL]]) ; CHECK-NEXT: [[LTEST_C:%.*]] = load i32, ptr [[TEST_C]], align 4 ; CHECK-NEXT: [[SUM:%.*]] = add i32 [[LTEST_C]], [[RES]] ; CHECK-NEXT: ret i32 [[SUM]] @@ -54,7 +54,7 @@ define internal i32 @test_cannot_promote_3(ptr %p, ptr nocapture readonly %test_ ; CHECK-LABEL: define {{[^@]+}}@test_cannot_promote_3 ; CHECK-SAME: (ptr [[P:%.*]], ptr nocapture readonly [[TEST_C:%.*]]) { ; CHECK-NEXT: [[TEST_C_VAL:%.*]] = load i32, ptr [[TEST_C]], align 4 -; CHECK-NEXT: [[RES:%.*]] = call i32 @callee.argprom(ptr [[P]], i32 [[TEST_C_VAL]]) +; CHECK-NEXT: [[RES:%.*]] = call i32 @callee(ptr [[P]], i32 [[TEST_C_VAL]]) ; CHECK-NEXT: [[LTEST_C:%.*]] = load i32, ptr [[TEST_C]], align 4 ; CHECK-NEXT: [[SUM:%.*]] = add i32 [[LTEST_C]], [[RES]] ; CHECK-NEXT: ret i32 [[SUM]] @@ -77,7 +77,7 @@ define internal i32 @test_can_promote_1(ptr %p, ptr nocapture readonly %test_c) ; CHECK-LABEL: define {{[^@]+}}@test_can_promote_1 ; CHECK-SAME: (ptr [[P:%.*]], ptr nocapture readonly [[TEST_C:%.*]]) { ; CHECK-NEXT: [[TEST_C_VAL:%.*]] = load i32, ptr [[TEST_C]], align 4 -; CHECK-NEXT: [[RES:%.*]] = call i32 @callee.argprom(ptr [[P]], i32 [[TEST_C_VAL]]) +; CHECK-NEXT: [[RES:%.*]] = call i32 @callee(ptr [[P]], i32 [[TEST_C_VAL]]) ; CHECK-NEXT: [[LTEST_C:%.*]] = load i32, ptr [[TEST_C]], align 4 ; CHECK-NEXT: [[SUM:%.*]] = add i32 [[LTEST_C]], [[RES]] ; CHECK-NEXT: ret i32 [[SUM]] @@ -101,7 +101,7 @@ define internal i32 @test_can_promote_2(ptr %p, ptr nocapture readonly %test_c) ; CHECK-LABEL: define {{[^@]+}}@test_can_promote_2 ; CHECK-SAME: (ptr [[P:%.*]], ptr nocapture readonly [[TEST_C:%.*]]) { ; CHECK-NEXT: [[TEST_C_VAL:%.*]] = load i32, ptr [[TEST_C]], align 4 -; CHECK-NEXT: [[RES:%.*]] = call i32 @callee.argprom(ptr [[P]], i32 [[TEST_C_VAL]]) +; CHECK-NEXT: [[RES:%.*]] = call i32 @callee(ptr [[P]], i32 [[TEST_C_VAL]]) ; CHECK-NEXT: [[LTEST_C:%.*]] = load i32, ptr [[TEST_C]], align 4 ; CHECK-NEXT: [[SUM:%.*]] = add i32 [[LTEST_C]], [[RES]] ; CHECK-NEXT: ret i32 [[SUM]] diff --git a/llvm/test/Transforms/ArgumentPromotion/aggregate-promote-dead-gep.ll 
b/llvm/test/Transforms/ArgumentPromotion/aggregate-promote-dead-gep.ll index 3ff3ac7ac61d75..75e802b1510c56 100644 --- a/llvm/test/Transforms/ArgumentPromotion/aggregate-promote-dead-gep.ll +++ b/llvm/test/Transforms/ArgumentPromotion/aggregate-promote-dead-gep.ll @@ -5,7 +5,7 @@ @G = constant %T { i32 0, i32 0, i32 17, i32 25 } define internal i32 @test(ptr %p) { -; CHECK-LABEL: define {{[^@]+}}@test.argprom +; CHECK-LABEL: define {{[^@]+}}@test ; CHECK-SAME: (i32 [[P_12_VAL:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[V:%.*]] = add i32 [[P_12_VAL]], 10 @@ -24,7 +24,7 @@ define i32 @caller() { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr @G, i64 12 ; CHECK-NEXT: [[G_VAL:%.*]] = load i32, ptr [[TMP0]], align 4 -; CHECK-NEXT: [[V:%.*]] = call i32 @test.argprom(i32 [[G_VAL]]) +; CHECK-NEXT: [[V:%.*]] = call i32 @test(i32 [[G_VAL]]) ; CHECK-NEXT: ret i32 [[V]] ; entry: diff --git a/llvm/test/Transforms/ArgumentPromotion/aggregate-promote.ll b/llvm/test/Transforms/ArgumentPromotion/aggregate-promote.ll index cbc3d07efc5e9b..dc5b376850f08c 100644 --- a/llvm/test/Transforms/ArgumentPromotion/aggregate-promote.ll +++ b/llvm/test/Transforms/ArgumentPromotion/aggregate-promote.ll @@ -5,7 +5,7 @@ @G = constant %T { i32 0, i32 0, i32 17, i32 25 } define internal i32 @test(ptr %p) { -; CHECK-LABEL: define {{[^@]+}}@test.argprom +; CHECK-LABEL: define {{[^@]+}}@test ; CHECK-SAME: (i32 [[P_8_VAL:%.*]], i32 [[P_12_VAL:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[V:%.*]] = add i32 [[P_12_VAL]], [[P_8_VAL]] @@ -27,7 +27,7 @@ define i32 @caller() { ; CHECK-NEXT: [[G_VAL:%.*]] = load i32, ptr [[TMP0]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr @G, i64 12 ; CHECK-NEXT: [[G_VAL1:%.*]] = load i32, ptr [[TMP1]], align 4 -; CHECK-NEXT: [[V:%.*]] = call i32 @test.argprom(i32 [[G_VAL]], i32 [[G_VAL1]]) +; CHECK-NEXT: [[V:%.*]] = call i32 @test(i32 [[G_VAL]], i32 [[G_VAL1]]) ; CHECK-NEXT: ret i32 [[V]] ; entry: diff --git a/llvm/test/Transforms/ArgumentPromotion/align.ll b/llvm/test/Transforms/ArgumentPromotion/align.ll index 251f43b2ae7286..656c7c9da5b4af 100644 --- a/llvm/test/Transforms/ArgumentPromotion/align.ll +++ b/llvm/test/Transforms/ArgumentPromotion/align.ll @@ -2,7 +2,7 @@ ; RUN: opt -S -passes=argpromotion < %s | FileCheck %s define internal i32 @callee_must_exec(ptr %p) { -; CHECK-LABEL: define {{[^@]+}}@callee_must_exec.argprom +; CHECK-LABEL: define {{[^@]+}}@callee_must_exec ; CHECK-SAME: (i32 [[P_0_VAL:%.*]]) { ; CHECK-NEXT: ret i32 [[P_0_VAL]] ; @@ -14,7 +14,7 @@ define void @caller_must_exec(ptr %p) { ; CHECK-LABEL: define {{[^@]+}}@caller_must_exec ; CHECK-SAME: (ptr [[P:%.*]]) { ; CHECK-NEXT: [[P_VAL:%.*]] = load i32, ptr [[P]], align 16 -; CHECK-NEXT: [[TMP1:%.*]] = call i32 @callee_must_exec.argprom(i32 [[P_VAL]]) +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @callee_must_exec(i32 [[P_VAL]]) ; CHECK-NEXT: ret void ; call i32 @callee_must_exec(ptr %p) @@ -22,7 +22,7 @@ define void @caller_must_exec(ptr %p) { } define internal i32 @callee_guaranteed_aligned_1(i1 %c, ptr %p) { -; CHECK-LABEL: define {{[^@]+}}@callee_guaranteed_aligned_1.argprom +; CHECK-LABEL: define {{[^@]+}}@callee_guaranteed_aligned_1 ; CHECK-SAME: (i1 [[C:%.*]], i32 [[P_0_VAL:%.*]]) { ; CHECK-NEXT: br i1 [[C]], label [[IF:%.*]], label [[ELSE:%.*]] ; CHECK: if: @@ -44,7 +44,7 @@ define void @caller_guaranteed_aligned_1(i1 %c, ptr align 16 dereferenceable(4) ; CHECK-LABEL: define {{[^@]+}}@caller_guaranteed_aligned_1 ; CHECK-SAME: (i1 [[C:%.*]], ptr align 16 dereferenceable(4) [[P:%.*]]) 
{ ; CHECK-NEXT: [[P_VAL:%.*]] = load i32, ptr [[P]], align 16 -; CHECK-NEXT: [[TMP1:%.*]] = call i32 @callee_guaranteed_aligned_1.argprom(i1 [[C]], i32 [[P_VAL]]) +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @callee_guaranteed_aligned_1(i1 [[C]], i32 [[P_VAL]]) ; CHECK-NEXT: ret void ; call i32 @callee_guaranteed_aligned_1(i1 %c, ptr %p) @@ -52,7 +52,7 @@ define void @caller_guaranteed_aligned_1(i1 %c, ptr align 16 dereferenceable(4) } define internal i32 @callee_guaranteed_aligned_2(i1 %c, ptr align 16 dereferenceable(4) %p) { -; CHECK-LABEL: define {{[^@]+}}@callee_guaranteed_aligned_2.argprom +; CHECK-LABEL: define {{[^@]+}}@callee_guaranteed_aligned_2 ; CHECK-SAME: (i1 [[C:%.*]], i32 [[P_0_VAL:%.*]]) { ; CHECK-NEXT: br i1 [[C]], label [[IF:%.*]], label [[ELSE:%.*]] ; CHECK: if: @@ -74,7 +74,7 @@ define void @caller_guaranteed_aligned_2(i1 %c, ptr %p) { ; CHECK-LABEL: define {{[^@]+}}@caller_guaranteed_aligned_2 ; CHECK-SAME: (i1 [[C:%.*]], ptr [[P:%.*]]) { ; CHECK-NEXT: [[P_VAL:%.*]] = load i32, ptr [[P]], align 16 -; CHECK-NEXT: [[TMP1:%.*]] = call i32 @callee_guaranteed_aligned_2.argprom(i1 [[C]], i32 [[P_VAL]]) +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @callee_guaranteed_aligned_2(i1 [[C]], i32 [[P_VAL]]) ; CHECK-NEXT: ret void ; call i32 @callee_guaranteed_aligned_2(i1 %c, ptr %p) @@ -83,7 +83,7 @@ define void @caller_guaranteed_aligned_2(i1 %c, ptr %p) { ; We have seen the offset before but with a lower alignment define internal i32 @callee_guaranteed_aligned_3(i1 %c, ptr align 16 dereferenceable(4) %p) { -; CHECK-LABEL: define {{[^@]+}}@callee_guaranteed_aligned_3.argprom +; CHECK-LABEL: define {{[^@]+}}@callee_guaranteed_aligned_3 ; CHECK-SAME: (i1 [[C:%.*]], i32 [[P_0_VAL:%.*]]) { ; CHECK-NEXT: br i1 [[C]], label [[IF:%.*]], label [[ELSE:%.*]] ; CHECK: if: @@ -106,7 +106,7 @@ define void @caller_guaranteed_aligned_3(i1 %c, ptr %p) { ; CHECK-LABEL: define {{[^@]+}}@caller_guaranteed_aligned_3 ; CHECK-SAME: (i1 [[C:%.*]], ptr [[P:%.*]]) { ; CHECK-NEXT: [[P_VAL:%.*]] = load i32, ptr [[P]], align 16 -; CHECK-NEXT: [[TMP1:%.*]] = call i32 @callee_guaranteed_aligned_3.argprom(i1 [[C]], i32 [[P_VAL]]) +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @callee_guaranteed_aligned_3(i1 [[C]], i32 [[P_VAL]]) ; CHECK-NEXT: ret void ; call i32 @callee_guaranteed_aligned_3(i1 %c, ptr %p) diff --git a/llvm/test/Transforms/ArgumentPromotion/allocsize.ll b/llvm/test/Transforms/ArgumentPromotion/allocsize.ll index ca648f5a012cc4..36271e17c9d76d 100644 --- a/llvm/test/Transforms/ArgumentPromotion/allocsize.ll +++ b/llvm/test/Transforms/ArgumentPromotion/allocsize.ll @@ -6,7 +6,7 @@ declare ptr @calloc(i64, i64) define internal ptr @my_alloc1(i64 %unchanged, ptr %unused, i64 %size, ptr %unused2) allocsize(2) { ; CHECK: Function Attrs: allocsize(1) -; CHECK-LABEL: define internal ptr @my_alloc1.argprom( +; CHECK-LABEL: define internal ptr @my_alloc1( ; CHECK-SAME: i64 [[UNCHANGED:%.*]], i64 [[SIZE:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[PTR:%.*]] = call ptr @malloc(i64 [[SIZE]]) ; CHECK-NEXT: ret ptr [[PTR]] @@ -17,7 +17,7 @@ define internal ptr @my_alloc1(i64 %unchanged, ptr %unused, i64 %size, ptr %unus define internal ptr @my_alloc2(i64 %unchanged, ptr %unused, i64 %size, i64 %size2, ptr %unused2) allocsize(2,3) { ; CHECK: Function Attrs: allocsize(1,2) -; CHECK-LABEL: define internal ptr @my_alloc2.argprom( +; CHECK-LABEL: define internal ptr @my_alloc2( ; CHECK-SAME: i64 [[UNCHANGED:%.*]], i64 [[SIZE:%.*]], i64 [[SIZE2:%.*]]) #[[ATTR1:[0-9]+]] { ; CHECK-NEXT: [[PTR:%.*]] = call ptr @calloc(i64 [[SIZE]], i64 
[[SIZE2]]) ; CHECK-NEXT: ret ptr [[PTR]] @@ -28,7 +28,7 @@ define internal ptr @my_alloc2(i64 %unchanged, ptr %unused, i64 %size, i64 %size define internal ptr @my_alloc3(i64 %unchanged, ptr %promoted, ptr %promoted2, i64 %size) allocsize(3) { ; CHECK: Function Attrs: allocsize(5) -; CHECK-LABEL: define internal ptr @my_alloc3.argprom( +; CHECK-LABEL: define internal ptr @my_alloc3( ; CHECK-SAME: i64 [[UNCHANGED:%.*]], i32 [[PROMOTED_0_VAL:%.*]], i32 [[PROMOTED_4_VAL:%.*]], i32 [[PROMOTED2_0_VAL:%.*]], i32 [[PROMOTED2_4_VAL:%.*]], i64 [[SIZE:%.*]]) #[[ATTR2:[0-9]+]] { ; CHECK-NEXT: [[PTR:%.*]] = call ptr @malloc(i64 [[SIZE]]) ; CHECK-NEXT: ret ptr [[PTR]] @@ -47,7 +47,7 @@ define internal ptr @my_alloc3(i64 %unchanged, ptr %promoted, ptr %promoted2, i6 define internal ptr @my_alloc4(i64 %unchanged, ptr %promoted, ptr %promoted2, i64 %size, i64 %size2) allocsize(3,4) { ; CHECK: Function Attrs: allocsize(5,6) -; CHECK-LABEL: define internal ptr @my_alloc4.argprom( +; CHECK-LABEL: define internal ptr @my_alloc4( ; CHECK-SAME: i64 [[UNCHANGED:%.*]], i32 [[PROMOTED_0_VAL:%.*]], i32 [[PROMOTED_4_VAL:%.*]], i32 [[PROMOTED2_0_VAL:%.*]], i32 [[PROMOTED2_4_VAL:%.*]], i64 [[SIZE:%.*]], i64 [[SIZE2:%.*]]) #[[ATTR3:[0-9]+]] { ; CHECK-NEXT: [[PTR:%.*]] = call ptr @calloc(i64 [[SIZE]], i64 [[SIZE2]]) ; CHECK-NEXT: ret ptr [[PTR]] @@ -67,22 +67,22 @@ define internal ptr @my_alloc4(i64 %unchanged, ptr %promoted, ptr %promoted2, i6 define void @call_my_alloc(ptr %arg, ptr %arg2) { ; CHECK-LABEL: define void @call_my_alloc( ; CHECK-SAME: ptr [[ARG:%.*]], ptr [[ARG2:%.*]]) { -; CHECK-NEXT: [[TMP1:%.*]] = call ptr @my_alloc1.argprom(i64 0, i64 2) -; CHECK-NEXT: [[TMP2:%.*]] = call ptr @my_alloc2.argprom(i64 0, i64 2, i64 2) +; CHECK-NEXT: [[TMP1:%.*]] = call ptr @my_alloc1(i64 0, i64 2) +; CHECK-NEXT: [[TMP2:%.*]] = call ptr @my_alloc2(i64 0, i64 2, i64 2) ; CHECK-NEXT: [[ARG_VAL:%.*]] = load i32, ptr [[ARG]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[ARG]], i64 4 ; CHECK-NEXT: [[ARG_VAL1:%.*]] = load i32, ptr [[TMP3]], align 4 ; CHECK-NEXT: [[ARG2_VAL:%.*]] = load i32, ptr [[ARG2]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[ARG2]], i64 4 ; CHECK-NEXT: [[ARG2_VAL2:%.*]] = load i32, ptr [[TMP4]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = call ptr @my_alloc3.argprom(i64 0, i32 [[ARG_VAL]], i32 [[ARG_VAL1]], i32 [[ARG2_VAL]], i32 [[ARG2_VAL2]], i64 2) +; CHECK-NEXT: [[TMP5:%.*]] = call ptr @my_alloc3(i64 0, i32 [[ARG_VAL]], i32 [[ARG_VAL1]], i32 [[ARG2_VAL]], i32 [[ARG2_VAL2]], i64 2) ; CHECK-NEXT: [[ARG_VAL3:%.*]] = load i32, ptr [[ARG]], align 4 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[ARG]], i64 4 ; CHECK-NEXT: [[ARG_VAL4:%.*]] = load i32, ptr [[TMP6]], align 4 ; CHECK-NEXT: [[ARG2_VAL5:%.*]] = load i32, ptr [[ARG2]], align 4 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[ARG2]], i64 4 ; CHECK-NEXT: [[ARG2_VAL6:%.*]] = load i32, ptr [[TMP7]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = call ptr @my_alloc4.argprom(i64 0, i32 [[ARG_VAL3]], i32 [[ARG_VAL4]], i32 [[ARG2_VAL5]], i32 [[ARG2_VAL6]], i64 2, i64 2) +; CHECK-NEXT: [[TMP8:%.*]] = call ptr @my_alloc4(i64 0, i32 [[ARG_VAL3]], i32 [[ARG_VAL4]], i32 [[ARG2_VAL5]], i32 [[ARG2_VAL6]], i64 2, i64 2) ; CHECK-NEXT: ret void ; %ptr = call ptr @my_alloc1(i64 0, ptr null, i64 2, ptr null) diff --git a/llvm/test/Transforms/ArgumentPromotion/attrs.ll b/llvm/test/Transforms/ArgumentPromotion/attrs.ll index 665065b3c35096..2b68ef2e403ba0 100644 --- a/llvm/test/Transforms/ArgumentPromotion/attrs.ll +++ 
b/llvm/test/Transforms/ArgumentPromotion/attrs.ll @@ -4,7 +4,7 @@ %struct.ss = type { i32, i64 } define internal void @f(ptr byval(%struct.ss) align 4 %b, ptr byval(i32) align 4 %X, i32 %i) nounwind { -; CHECK-LABEL: define {{[^@]+}}@f.argprom +; CHECK-LABEL: define {{[^@]+}}@f ; CHECK-SAME: (i32 [[B_0:%.*]], i32 [[X:%.*]], i32 [[I:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TEMP:%.*]] = add i32 [[B_0]], 1 @@ -30,7 +30,7 @@ define i32 @test(ptr %X) { ; CHECK-NEXT: store i64 2, ptr [[TEMP4]], align 4 ; CHECK-NEXT: [[S_0_VAL:%.*]] = load i32, ptr [[S]], align 4 ; CHECK-NEXT: [[X_VAL:%.*]] = load i32, ptr [[X]], align 4 -; CHECK-NEXT: call void @f.argprom(i32 [[S_0_VAL]], i32 [[X_VAL]], i32 zeroext 0) +; CHECK-NEXT: call void @f(i32 [[S_0_VAL]], i32 [[X_VAL]], i32 zeroext 0) ; CHECK-NEXT: ret i32 0 ; entry: diff --git a/llvm/test/Transforms/ArgumentPromotion/basictest.ll b/llvm/test/Transforms/ArgumentPromotion/basictest.ll index 47518f73ec07a7..ba84ac126fe49b 100644 --- a/llvm/test/Transforms/ArgumentPromotion/basictest.ll +++ b/llvm/test/Transforms/ArgumentPromotion/basictest.ll @@ -3,7 +3,7 @@ target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128" define internal i32 @test(ptr %X, ptr %Y) { -; CHECK-LABEL: define {{[^@]+}}@test.argprom +; CHECK-LABEL: define {{[^@]+}}@test ; CHECK-SAME: (i32 [[X_0_VAL:%.*]], i32 [[Y_0_VAL:%.*]]) { ; CHECK-NEXT: [[C:%.*]] = add i32 [[X_0_VAL]], [[Y_0_VAL]] ; CHECK-NEXT: ret i32 [[C]] @@ -15,9 +15,9 @@ define internal i32 @test(ptr %X, ptr %Y) { } define internal i32 @caller(ptr %B) { -; CHECK-LABEL: define {{[^@]+}}@caller.argprom +; CHECK-LABEL: define {{[^@]+}}@caller ; CHECK-SAME: (i32 [[B_0_VAL:%.*]]) { -; CHECK-NEXT: [[C:%.*]] = call i32 @test.argprom(i32 1, i32 [[B_0_VAL]]) +; CHECK-NEXT: [[C:%.*]] = call i32 @test(i32 1, i32 [[B_0_VAL]]) ; CHECK-NEXT: ret i32 [[C]] ; %A = alloca i32 @@ -28,7 +28,7 @@ define internal i32 @caller(ptr %B) { define i32 @callercaller() { ; CHECK-LABEL: define {{[^@]+}}@callercaller() { -; CHECK-NEXT: [[X:%.*]] = call i32 @caller.argprom(i32 2) +; CHECK-NEXT: [[X:%.*]] = call i32 @caller(i32 2) ; CHECK-NEXT: ret i32 [[X]] ; %B = alloca i32 diff --git a/llvm/test/Transforms/ArgumentPromotion/bitcasts.ll b/llvm/test/Transforms/ArgumentPromotion/bitcasts.ll index bc4e5cc13b160a..6f2c322d7877be 100644 --- a/llvm/test/Transforms/ArgumentPromotion/bitcasts.ll +++ b/llvm/test/Transforms/ArgumentPromotion/bitcasts.ll @@ -6,7 +6,7 @@ %opaque = type opaque define internal i32 @callee_basic(ptr %p) { -; CHECK-LABEL: define {{[^@]+}}@callee_basic.argprom +; CHECK-LABEL: define {{[^@]+}}@callee_basic ; CHECK-SAME: (i32 [[P_0_VAL:%.*]], i32 [[P_4_VAL:%.*]]) { ; CHECK-NEXT: [[Z:%.*]] = add i32 [[P_0_VAL]], [[P_4_VAL]] ; CHECK-NEXT: ret i32 [[Z]] @@ -24,7 +24,7 @@ define void @caller_basic(ptr %p) { ; CHECK-NEXT: [[P_VAL:%.*]] = load i32, ptr [[P]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[P]], i64 4 ; CHECK-NEXT: [[P_VAL1:%.*]] = load i32, ptr [[TMP2]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = call i32 @callee_basic.argprom(i32 [[P_VAL]], i32 [[P_VAL1]]) +; CHECK-NEXT: [[TMP4:%.*]] = call i32 @callee_basic(i32 [[P_VAL]], i32 [[P_VAL1]]) ; CHECK-NEXT: ret void ; call i32 @callee_basic(ptr %p) @@ -32,7 +32,7 @@ define void @caller_basic(ptr %p) { } define internal i32 @callee_opaque(ptr %p) { -; CHECK-LABEL: define {{[^@]+}}@callee_opaque.argprom +; CHECK-LABEL: define {{[^@]+}}@callee_opaque ; CHECK-SAME: (i32 [[P_0_VAL:%.*]], 
i32 [[P_4_VAL:%.*]]) { ; CHECK-NEXT: [[Z:%.*]] = add i32 [[P_0_VAL]], [[P_4_VAL]] ; CHECK-NEXT: ret i32 [[Z]] @@ -50,7 +50,7 @@ define void @caller_opaque(ptr %p) { ; CHECK-NEXT: [[P_VAL:%.*]] = load i32, ptr [[P]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[P]], i64 4 ; CHECK-NEXT: [[P_VAL1:%.*]] = load i32, ptr [[TMP3]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = call i32 @callee_opaque.argprom(i32 [[P_VAL]], i32 [[P_VAL1]]) +; CHECK-NEXT: [[TMP5:%.*]] = call i32 @callee_opaque(i32 [[P_VAL]], i32 [[P_VAL1]]) ; CHECK-NEXT: ret void ; call i32 @callee_opaque(ptr %p) diff --git a/llvm/test/Transforms/ArgumentPromotion/byval-2.ll b/llvm/test/Transforms/ArgumentPromotion/byval-2.ll index 9147a42fc7fc6b..3d0e9f2958444f 100644 --- a/llvm/test/Transforms/ArgumentPromotion/byval-2.ll +++ b/llvm/test/Transforms/ArgumentPromotion/byval-2.ll @@ -6,7 +6,7 @@ %struct.ss = type { i32, i64 } define internal void @f(ptr byval(%struct.ss) align 8 %b, ptr byval(i32) align 4 %X) nounwind { -; CHECK-LABEL: define {{[^@]+}}@f.argprom +; CHECK-LABEL: define {{[^@]+}}@f ; CHECK-SAME: (i32 [[B_0:%.*]], i32 [[X:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TEMP:%.*]] = add i32 [[B_0]], 1 @@ -31,7 +31,7 @@ define i32 @test(ptr %X) { ; CHECK-NEXT: store i64 2, ptr [[TEMP4]], align 4 ; CHECK-NEXT: [[S_0_VAL:%.*]] = load i32, ptr [[S]], align 4 ; CHECK-NEXT: [[X_VAL:%.*]] = load i32, ptr [[X]], align 4 -; CHECK-NEXT: call void @f.argprom(i32 [[S_0_VAL]], i32 [[X_VAL]]) +; CHECK-NEXT: call void @f(i32 [[S_0_VAL]], i32 [[X_VAL]]) ; CHECK-NEXT: ret i32 0 ; entry: diff --git a/llvm/test/Transforms/ArgumentPromotion/byval-with-padding.ll b/llvm/test/Transforms/ArgumentPromotion/byval-with-padding.ll index fe3820617e2d77..9089470b7d3853 100644 --- a/llvm/test/Transforms/ArgumentPromotion/byval-with-padding.ll +++ b/llvm/test/Transforms/ArgumentPromotion/byval-with-padding.ll @@ -4,7 +4,7 @@ %struct.A = type { float, [12 x i8], i64, [8 x i8] } define internal float @callee(ptr byval(%struct.A) align 32 %0) { -; CHECK-LABEL: define {{[^@]+}}@callee.argprom +; CHECK-LABEL: define {{[^@]+}}@callee ; CHECK-SAME: (float [[DOT0_VAL:%.*]], i64 [[DOT16_VAL:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = fadd float 0.000000e+00, [[DOT0_VAL]] ; CHECK-NEXT: [[TMP2:%.*]] = uitofp i64 [[DOT16_VAL]] to float @@ -30,7 +30,7 @@ define float @caller(float %0) { ; CHECK-NEXT: [[DOTVAL:%.*]] = load float, ptr [[TMP2]], align 32 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[TMP2]], i64 16 ; CHECK-NEXT: [[DOTVAL1:%.*]] = load i64, ptr [[TMP4]], align 16 -; CHECK-NEXT: [[TMP5:%.*]] = call noundef float @callee.argprom(float [[DOTVAL]], i64 [[DOTVAL1]]) +; CHECK-NEXT: [[TMP5:%.*]] = call noundef float @callee(float [[DOTVAL]], i64 [[DOTVAL1]]) ; CHECK-NEXT: ret float [[TMP5]] ; %2 = alloca %struct.A, align 32 diff --git a/llvm/test/Transforms/ArgumentPromotion/byval.ll b/llvm/test/Transforms/ArgumentPromotion/byval.ll index 424425b30767ed..13a60a96359212 100644 --- a/llvm/test/Transforms/ArgumentPromotion/byval.ll +++ b/llvm/test/Transforms/ArgumentPromotion/byval.ll @@ -6,7 +6,7 @@ target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:1 %struct.ss = type { i32, i64 } define internal void @f(ptr byval(%struct.ss) align 4 %b) nounwind { -; CHECK-LABEL: define {{[^@]+}}@f.argprom +; CHECK-LABEL: define {{[^@]+}}@f ; CHECK-SAME: (i32 [[B_0_VAL:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TEMP2:%.*]] = add i32 [[B_0_VAL]], 1 @@ -20,7 +20,7 @@ entry: } define internal void @g(ptr 
byval(%struct.ss) align 32 %b) nounwind { -; CHECK-LABEL: define {{[^@]+}}@g.argprom +; CHECK-LABEL: define {{[^@]+}}@g ; CHECK-SAME: (i32 [[B_0_VAL:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TEMP2:%.*]] = add i32 [[B_0_VAL]], 1 @@ -56,7 +56,7 @@ entry: ; Transform even if an argument is written to and then is loaded from. define internal void @k(ptr byval(%struct.ss) align 4 %b) nounwind { -; CHECK-LABEL: define {{[^@]+}}@k.argprom +; CHECK-LABEL: define {{[^@]+}}@k ; CHECK-SAME: (i32 [[B_0_VAL:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TEMP2:%.*]] = add i32 [[B_0_VAL]], 1 @@ -72,7 +72,7 @@ entry: ; Transform even if a store instruction is the single user. define internal void @l(ptr byval(%struct.ss) align 4 %b) nounwind { -; CHECK-LABEL: define {{[^@]+}}@l.argprom +; CHECK-LABEL: define {{[^@]+}}@l ; CHECK-SAME: (i32 [[B_0_VAL:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: ret void @@ -85,7 +85,7 @@ entry: ; Transform all the arguments creating the required number of 'alloca's and ; then optimize them out. define internal void @m(ptr byval(%struct.ss) align 4 %b, ptr byval(%struct.ss) align 4 %c) nounwind { -; CHECK-LABEL: define {{[^@]+}}@m.argprom +; CHECK-LABEL: define {{[^@]+}}@m ; CHECK-SAME: (i32 [[B_0_VAL:%.*]], i32 [[C_0_VAL:%.*]], i64 [[C_4_VAL:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TEMP2:%.*]] = add i32 [[B_0_VAL]], 1 @@ -116,19 +116,19 @@ define i32 @main() nounwind { ; CHECK-NEXT: [[TEMP4:%.*]] = getelementptr [[STRUCT_SS]], ptr [[S]], i32 0, i32 1 ; CHECK-NEXT: store i64 2, ptr [[TEMP4]], align 4 ; CHECK-NEXT: [[S_VAL:%.*]] = load i32, ptr [[S]], align 4 -; CHECK-NEXT: call void @f.argprom(i32 [[S_VAL]]) +; CHECK-NEXT: call void @f(i32 [[S_VAL]]) ; CHECK-NEXT: [[S_VAL1:%.*]] = load i32, ptr [[S]], align 4 -; CHECK-NEXT: call void @g.argprom(i32 [[S_VAL1]]) +; CHECK-NEXT: call void @g(i32 [[S_VAL1]]) ; CHECK-NEXT: call void @h(ptr byval([[STRUCT_SS]]) [[S]]) ; CHECK-NEXT: [[S_VAL2:%.*]] = load i32, ptr [[S]], align 4 -; CHECK-NEXT: call void @k.argprom(i32 [[S_VAL2]]) +; CHECK-NEXT: call void @k(i32 [[S_VAL2]]) ; CHECK-NEXT: [[S_VAL3:%.*]] = load i32, ptr [[S]], align 4 -; CHECK-NEXT: call void @l.argprom(i32 [[S_VAL3]]) +; CHECK-NEXT: call void @l(i32 [[S_VAL3]]) ; CHECK-NEXT: [[S_VAL4:%.*]] = load i32, ptr [[S]], align 4 ; CHECK-NEXT: [[S_VAL5:%.*]] = load i32, ptr [[S]], align 4 ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[S]], i64 4 ; CHECK-NEXT: [[S_VAL6:%.*]] = load i64, ptr [[TMP0]], align 8 -; CHECK-NEXT: call void @m.argprom(i32 [[S_VAL4]], i32 [[S_VAL5]], i64 [[S_VAL6]]) +; CHECK-NEXT: call void @m(i32 [[S_VAL4]], i32 [[S_VAL5]], i64 [[S_VAL6]]) ; CHECK-NEXT: ret i32 0 ; entry: diff --git a/llvm/test/Transforms/ArgumentPromotion/chained.ll b/llvm/test/Transforms/ArgumentPromotion/chained.ll index dba6726ea9b1f7..2fb80a39875688 100644 --- a/llvm/test/Transforms/ArgumentPromotion/chained.ll +++ b/llvm/test/Transforms/ArgumentPromotion/chained.ll @@ -5,7 +5,7 @@ @G2 = constant ptr @G1 define internal i32 @test(ptr %x) { -; CHECK-LABEL: define {{[^@]+}}@test.argprom.argprom +; CHECK-LABEL: define {{[^@]+}}@test ; CHECK-SAME: (i32 [[X_0_VAL_0_VAL:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: ret i32 [[X_0_VAL_0_VAL]] @@ -21,7 +21,7 @@ define i32 @caller() { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[G2_VAL:%.*]] = load ptr, ptr @G2, align 8 ; CHECK-NEXT: [[G2_VAL_VAL:%.*]] = load i32, ptr [[G2_VAL]], align 4 -; CHECK-NEXT: [[X:%.*]] = call i32 @test.argprom.argprom(i32 [[G2_VAL_VAL]]) +; CHECK-NEXT: [[X:%.*]] = call 
i32 @test(i32 [[G2_VAL_VAL]]) ; CHECK-NEXT: ret i32 [[X]] ; entry: diff --git a/llvm/test/Transforms/ArgumentPromotion/control-flow2.ll b/llvm/test/Transforms/ArgumentPromotion/control-flow2.ll index 7fb572551b0cfa..8df89033c0d8da 100644 --- a/llvm/test/Transforms/ArgumentPromotion/control-flow2.ll +++ b/llvm/test/Transforms/ArgumentPromotion/control-flow2.ll @@ -4,7 +4,7 @@ target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128" define internal i32 @callee(i1 %C, ptr %P) { -; CHECK-LABEL: define {{[^@]+}}@callee.argprom +; CHECK-LABEL: define {{[^@]+}}@callee ; CHECK-SAME: (i1 [[C:%.*]], i32 [[P_0_VAL:%.*]]) { ; CHECK-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]] ; CHECK: T: @@ -27,7 +27,7 @@ define i32 @foo() { ; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 ; CHECK-NEXT: store i32 17, ptr [[A]], align 4 ; CHECK-NEXT: [[A_VAL:%.*]] = load i32, ptr [[A]], align 4 -; CHECK-NEXT: [[X:%.*]] = call i32 @callee.argprom(i1 false, i32 [[A_VAL]]) +; CHECK-NEXT: [[X:%.*]] = call i32 @callee(i1 false, i32 [[A_VAL]]) ; CHECK-NEXT: ret i32 [[X]] ; %A = alloca i32 ; [#uses=2] diff --git a/llvm/test/Transforms/ArgumentPromotion/crash.ll b/llvm/test/Transforms/ArgumentPromotion/crash.ll index 0d15d7876dae61..12caae4dbef8df 100644 --- a/llvm/test/Transforms/ArgumentPromotion/crash.ll +++ b/llvm/test/Transforms/ArgumentPromotion/crash.ll @@ -44,7 +44,7 @@ bb: } define internal i1 @eggs(ptr %arg) { -; ARGPROMOTION-LABEL: define {{[^@]+}}@eggs.argprom() { +; ARGPROMOTION-LABEL: define {{[^@]+}}@eggs() { ; ARGPROMOTION-NEXT: bb: ; ARGPROMOTION-NEXT: unreachable ; diff --git a/llvm/test/Transforms/ArgumentPromotion/dbg.ll b/llvm/test/Transforms/ArgumentPromotion/dbg.ll index 15ed2cc2d20a6a..6a14facfb36a22 100644 --- a/llvm/test/Transforms/ArgumentPromotion/dbg.ll +++ b/llvm/test/Transforms/ArgumentPromotion/dbg.ll @@ -4,7 +4,7 @@ declare void @sink(i32) define internal void @test(ptr %X) !dbg !2 { -; CHECK-LABEL: define {{[^@]+}}@test.argprom.argprom +; CHECK-LABEL: define {{[^@]+}}@test ; CHECK-SAME: (i32 [[X_0_VAL_0_VAL:%.*]]) !dbg [[DBG3:![0-9]+]] { ; CHECK-NEXT: call void @sink(i32 [[X_0_VAL_0_VAL]]) ; CHECK-NEXT: ret void @@ -37,7 +37,7 @@ define void @caller(ptr %Y, ptr %P) { ; CHECK-SAME: (ptr [[Y:%.*]], ptr [[P:%.*]]) { ; CHECK-NEXT: [[Y_VAL:%.*]] = load ptr, ptr [[Y]], align 8, !dbg [[DBG4:![0-9]+]] ; CHECK-NEXT: [[Y_VAL_VAL:%.*]] = load i32, ptr [[Y_VAL]], align 8, !dbg [[DBG4]] -; CHECK-NEXT: call void @test.argprom.argprom(i32 [[Y_VAL_VAL]]), !dbg [[DBG4]] +; CHECK-NEXT: call void @test(i32 [[Y_VAL_VAL]]), !dbg [[DBG4]] ; CHECK-NEXT: call void @test_byval(ptr byval([[STRUCT_PAIR:%.*]]) align 4 [[P]]), !dbg [[DBG5:![0-9]+]] ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/ArgumentPromotion/fp80.ll b/llvm/test/Transforms/ArgumentPromotion/fp80.ll index 7e2a595b8f95f8..1e3d01a2361b92 100644 --- a/llvm/test/Transforms/ArgumentPromotion/fp80.ll +++ b/llvm/test/Transforms/ArgumentPromotion/fp80.ll @@ -16,12 +16,12 @@ define void @run() { ; CHECK-LABEL: define {{[^@]+}}@run() { ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr @b, i64 10 ; CHECK-NEXT: [[B_VAL:%.*]] = load i8, ptr [[TMP1]], align 1 -; CHECK-NEXT: [[TMP2:%.*]] = tail call i8 @UseLongDoubleUnsafely.argprom(i8 [[B_VAL]]) +; CHECK-NEXT: [[TMP2:%.*]] = tail call i8 @UseLongDoubleUnsafely(i8 [[B_VAL]]) ; CHECK-NEXT: [[B_VAL1:%.*]] = load x86_fp80, ptr @b, align 16 -; CHECK-NEXT: [[TMP3:%.*]] = tail call x86_fp80 @UseLongDoubleSafely.argprom(x86_fp80 
[[B_VAL1]]) +; CHECK-NEXT: [[TMP3:%.*]] = tail call x86_fp80 @UseLongDoubleSafely(x86_fp80 [[B_VAL1]]) ; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_fp80 @UseLongDoubleSafelyNoPromotion(ptr byval([[UNION_U:%.*]]) align 16 @b) ; CHECK-NEXT: [[A_VAL:%.*]] = load i64, ptr @a, align 8 -; CHECK-NEXT: [[TMP5:%.*]] = call i64 @AccessPaddingOfStruct.argprom(i64 [[A_VAL]]) +; CHECK-NEXT: [[TMP5:%.*]] = call i64 @AccessPaddingOfStruct(i64 [[A_VAL]]) ; CHECK-NEXT: [[TMP6:%.*]] = call i64 @CaptureAStruct(ptr byval([[STRUCT_FOO:%.*]]) @a) ; CHECK-NEXT: ret void ; @@ -34,7 +34,7 @@ define void @run() { } define internal i8 @UseLongDoubleUnsafely(ptr byval(%union.u) align 16 %arg) { -; CHECK-LABEL: define {{[^@]+}}@UseLongDoubleUnsafely.argprom +; CHECK-LABEL: define {{[^@]+}}@UseLongDoubleUnsafely ; CHECK-SAME: (i8 [[ARG_10_VAL:%.*]]) { ; CHECK-NEXT: ret i8 [[ARG_10_VAL]] ; @@ -44,7 +44,7 @@ define internal i8 @UseLongDoubleUnsafely(ptr byval(%union.u) align 16 %arg) { } define internal x86_fp80 @UseLongDoubleSafely(ptr byval(%union.u) align 16 %arg) { -; CHECK-LABEL: define {{[^@]+}}@UseLongDoubleSafely.argprom +; CHECK-LABEL: define {{[^@]+}}@UseLongDoubleSafely ; CHECK-SAME: (x86_fp80 [[ARG_0_VAL:%.*]]) { ; CHECK-NEXT: ret x86_fp80 [[ARG_0_VAL]] ; @@ -71,7 +71,7 @@ define internal x86_fp80 @UseLongDoubleSafelyNoPromotion(ptr byval(%union.u) ali } define internal i64 @AccessPaddingOfStruct(ptr byval(%struct.Foo) %a) { -; CHECK-LABEL: define {{[^@]+}}@AccessPaddingOfStruct.argprom +; CHECK-LABEL: define {{[^@]+}}@AccessPaddingOfStruct ; CHECK-SAME: (i64 [[A_0_VAL:%.*]]) { ; CHECK-NEXT: ret i64 [[A_0_VAL]] ; diff --git a/llvm/test/Transforms/ArgumentPromotion/inalloca.ll b/llvm/test/Transforms/ArgumentPromotion/inalloca.ll index 6a5cf841f99440..f6a101cf38a47a 100644 --- a/llvm/test/Transforms/ArgumentPromotion/inalloca.ll +++ b/llvm/test/Transforms/ArgumentPromotion/inalloca.ll @@ -7,7 +7,7 @@ target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:1 ; Argpromote + sroa should change this to passing the two integers by value. 
define internal i32 @f(ptr inalloca(%struct.ss) %s) { -; CHECK-LABEL: define {{[^@]+}}@f.argprom +; CHECK-LABEL: define {{[^@]+}}@f ; CHECK-SAME: (i32 [[S_0_VAL:%.*]], i32 [[S_4_VAL:%.*]]) unnamed_addr { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[R:%.*]] = add i32 [[S_0_VAL]], [[S_4_VAL]] @@ -24,7 +24,7 @@ entry: define i32 @main() { ; CHECK-LABEL: define {{[^@]+}}@main() local_unnamed_addr { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[R:%.*]] = call fastcc i32 @f.argprom(i32 1, i32 2) +; CHECK-NEXT: [[R:%.*]] = call fastcc i32 @f(i32 1, i32 2) ; CHECK-NEXT: ret i32 [[R]] ; entry: diff --git a/llvm/test/Transforms/ArgumentPromotion/invalidation.ll b/llvm/test/Transforms/ArgumentPromotion/invalidation.ll index 66de6bdfec503d..fe8f3b52f8dc5f 100644 --- a/llvm/test/Transforms/ArgumentPromotion/invalidation.ll +++ b/llvm/test/Transforms/ArgumentPromotion/invalidation.ll @@ -12,7 +12,7 @@ @G = constant i32 0 define internal i32 @a(ptr %x) { -; CHECK-LABEL: define {{[^@]+}}@a.argprom +; CHECK-LABEL: define {{[^@]+}}@a ; CHECK-SAME: (i32 [[X_0_VAL:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: ret i32 [[X_0_VAL]] @@ -26,7 +26,7 @@ define i32 @b() { ; CHECK-LABEL: define {{[^@]+}}@b() { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[G_VAL:%.*]] = load i32, ptr @G, align 4 -; CHECK-NEXT: [[V:%.*]] = call i32 @a.argprom(i32 [[G_VAL]]) +; CHECK-NEXT: [[V:%.*]] = call i32 @a(i32 [[G_VAL]]) ; CHECK-NEXT: ret i32 [[V]] ; entry: @@ -38,7 +38,7 @@ define i32 @c() { ; CHECK-LABEL: define {{[^@]+}}@c() { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[G_VAL:%.*]] = load i32, ptr @G, align 4 -; CHECK-NEXT: [[V1:%.*]] = call i32 @a.argprom(i32 [[G_VAL]]) +; CHECK-NEXT: [[V1:%.*]] = call i32 @a(i32 [[G_VAL]]) ; CHECK-NEXT: [[V2:%.*]] = call i32 @b() ; CHECK-NEXT: [[RESULT:%.*]] = add i32 [[V1]], [[V2]] ; CHECK-NEXT: ret i32 [[RESULT]] diff --git a/llvm/test/Transforms/ArgumentPromotion/load-alignment-value-overflows-addrspace-size.ll b/llvm/test/Transforms/ArgumentPromotion/load-alignment-value-overflows-addrspace-size.ll index e263330caaf06f..659d1331700a0f 100644 --- a/llvm/test/Transforms/ArgumentPromotion/load-alignment-value-overflows-addrspace-size.ll +++ b/llvm/test/Transforms/ArgumentPromotion/load-alignment-value-overflows-addrspace-size.ll @@ -66,7 +66,7 @@ define internal void @call_load_maxalign_alloca_maxalign() { ; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [13 x i16], align 4294967296, addrspace(5) ; CHECK-NEXT: [[ADDRSPACECAST:%.*]] = addrspacecast ptr addrspace(5) [[ALLOCA]] to ptr ; CHECK-NEXT: [[ADDRSPACECAST_VAL:%.*]] = load i32, ptr [[ADDRSPACECAST]], align 4294967296 -; CHECK-NEXT: call void @load_maxalign1.argprom(i32 [[ADDRSPACECAST_VAL]]) +; CHECK-NEXT: call void @load_maxalign1(i32 [[ADDRSPACECAST_VAL]]) ; CHECK-NEXT: ret void ; bb: @@ -77,7 +77,7 @@ bb: } define internal void @load_maxalign1(ptr %arg) { -; CHECK-LABEL: define internal void @load_maxalign1.argprom +; CHECK-LABEL: define internal void @load_maxalign1 ; CHECK-SAME: (i32 [[ARG_0_VAL:%.*]]) { ; CHECK-NEXT: bb: ; CHECK-NEXT: br label [[BB1:%.*]] @@ -110,7 +110,7 @@ define internal void @call_load_maxalign_alloca_ptr128() { ; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [13 x i16], align 4294967296, addrspace(6) ; CHECK-NEXT: [[ADDRSPACECAST:%.*]] = addrspacecast ptr addrspace(6) [[ALLOCA]] to ptr ; CHECK-NEXT: [[ADDRSPACECAST_VAL:%.*]] = load i32, ptr [[ADDRSPACECAST]], align 4294967296 -; CHECK-NEXT: call void @load_maxalign2.argprom(i32 [[ADDRSPACECAST_VAL]]) +; CHECK-NEXT: call void @load_maxalign2(i32 [[ADDRSPACECAST_VAL]]) ; CHECK-NEXT: ret void ; bb: @@ -121,7 +121,7 
@@ bb: } define internal void @load_maxalign2(ptr %arg) { -; CHECK-LABEL: define internal void @load_maxalign2.argprom +; CHECK-LABEL: define internal void @load_maxalign2 ; CHECK-SAME: (i32 [[ARG_0_VAL:%.*]]) { ; CHECK-NEXT: bb: ; CHECK-NEXT: br label [[BB1:%.*]] diff --git a/llvm/test/Transforms/ArgumentPromotion/max-elements-limit.ll b/llvm/test/Transforms/ArgumentPromotion/max-elements-limit.ll index 424238280f7fce..06293e8bbe7580 100644 --- a/llvm/test/Transforms/ArgumentPromotion/max-elements-limit.ll +++ b/llvm/test/Transforms/ArgumentPromotion/max-elements-limit.ll @@ -2,7 +2,7 @@ ; RUN: opt -passes=argpromotion -S %s | FileCheck %s define internal i32 @callee2(ptr noundef %0) { -; CHECK-LABEL: define {{[^@]+}}@callee2.argprom +; CHECK-LABEL: define {{[^@]+}}@callee2 ; CHECK-SAME: (i32 [[DOT0_VAL:%.*]], i32 [[DOT4_VAL:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = add nsw i32 [[DOT0_VAL]], [[DOT4_VAL]] ; CHECK-NEXT: ret i32 [[TMP1]] @@ -24,7 +24,7 @@ define i32 @caller2(i32 %0, i32 %1) { ; CHECK-NEXT: [[DOTVAL:%.*]] = load i32, ptr [[TMP3]], align 4 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP3]], i64 4 ; CHECK-NEXT: [[DOTVAL1:%.*]] = load i32, ptr [[TMP5]], align 4 -; CHECK-NEXT: [[TMP6:%.*]] = call i32 @callee2.argprom(i32 [[DOTVAL]], i32 [[DOTVAL1]]) +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @callee2(i32 [[DOTVAL]], i32 [[DOTVAL1]]) ; CHECK-NEXT: ret i32 [[TMP6]] ; %3 = alloca [2 x i32], align 4 diff --git a/llvm/test/Transforms/ArgumentPromotion/metadata.ll b/llvm/test/Transforms/ArgumentPromotion/metadata.ll index caac625cea30f2..b3f9fb0c5510e1 100644 --- a/llvm/test/Transforms/ArgumentPromotion/metadata.ll +++ b/llvm/test/Transforms/ArgumentPromotion/metadata.ll @@ -5,7 +5,7 @@ declare void @use.i32(i32) declare void @use.p32(ptr) define internal void @callee(ptr %p1, ptr %p2, ptr %p3, ptr %p4, ptr %p5, ptr %p6, ptr %p7, ptr %p8, ptr %p9, ptr %p10) { -; CHECK-LABEL: define {{[^@]+}}@callee.argprom +; CHECK-LABEL: define {{[^@]+}}@callee ; CHECK-SAME: (i32 [[P1_0_VAL:%.*]], i32 [[P2_0_VAL:%.*]], ptr [[P3_0_VAL:%.*]], ptr [[P4_0_VAL:%.*]], ptr [[P5_0_VAL:%.*]], ptr [[P6_0_VAL:%.*]], ptr [[P7_0_VAL:%.*]], ptr [[P8_0_VAL:%.*]], ptr [[P9_0_VAL:%.*]], ptr [[P10_0_VAL:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = icmp ne ptr [[P4_0_VAL]], null ; CHECK-NEXT: call void @llvm.assume(i1 [[TMP1]]) @@ -57,7 +57,7 @@ define void @caller(ptr %p1, ptr %p2, ptr %p3, ptr %p4, ptr %p5, ptr %p6, ptr %p ; CHECK-NEXT: [[P8_VAL:%.*]] = load ptr, ptr [[P8]], align 8, !align !3, !noundef !1 ; CHECK-NEXT: [[P9_VAL:%.*]] = load ptr, ptr [[P9]], align 8, !noundef !1 ; CHECK-NEXT: [[P10_VAL:%.*]] = load ptr, ptr [[P10]], align 8, !nontemporal !4 -; CHECK-NEXT: call void @callee.argprom(i32 [[P1_VAL]], i32 [[P2_VAL]], ptr [[P3_VAL]], ptr [[P4_VAL]], ptr [[P5_VAL]], ptr [[P6_VAL]], ptr [[P7_VAL]], ptr [[P8_VAL]], ptr [[P9_VAL]], ptr [[P10_VAL]]) +; CHECK-NEXT: call void @callee(i32 [[P1_VAL]], i32 [[P2_VAL]], ptr [[P3_VAL]], ptr [[P4_VAL]], ptr [[P5_VAL]], ptr [[P6_VAL]], ptr [[P7_VAL]], ptr [[P8_VAL]], ptr [[P9_VAL]], ptr [[P10_VAL]]) ; CHECK-NEXT: ret void ; call void @callee(ptr %p1, ptr %p2, ptr %p3, ptr %p4, ptr %p5, ptr %p6, ptr %p7, ptr %p8, ptr %p9, ptr %p10) @@ -65,7 +65,7 @@ define void @caller(ptr %p1, ptr %p2, ptr %p3, ptr %p4, ptr %p5, ptr %p6, ptr %p } define internal ptr @callee_conditional(i1 %c, ptr dereferenceable(8) align 8 %p) { -; CHECK-LABEL: define {{[^@]+}}@callee_conditional.argprom +; CHECK-LABEL: define {{[^@]+}}@callee_conditional ; CHECK-SAME: (i1 [[C:%.*]], ptr [[P_0_VAL:%.*]]) { ; 
CHECK-NEXT: br i1 [[C]], label [[IF:%.*]], label [[ELSE:%.*]] ; CHECK: if: @@ -89,7 +89,7 @@ define void @caller_conditional(i1 %c, ptr %p) { ; CHECK-LABEL: define {{[^@]+}}@caller_conditional ; CHECK-SAME: (i1 [[C:%.*]], ptr [[P:%.*]]) { ; CHECK-NEXT: [[P_VAL:%.*]] = load ptr, ptr [[P]], align 8 -; CHECK-NEXT: [[TMP1:%.*]] = call ptr @callee_conditional.argprom(i1 [[C]], ptr [[P_VAL]]) +; CHECK-NEXT: [[TMP1:%.*]] = call ptr @callee_conditional(i1 [[C]], ptr [[P_VAL]]) ; CHECK-NEXT: ret void ; call ptr @callee_conditional(i1 %c, ptr %p) diff --git a/llvm/test/Transforms/ArgumentPromotion/min-legal-vector-width.ll b/llvm/test/Transforms/ArgumentPromotion/min-legal-vector-width.ll index 8812dc2104feb2..a5a0fc0cf186b7 100644 --- a/llvm/test/Transforms/ArgumentPromotion/min-legal-vector-width.ll +++ b/llvm/test/Transforms/ArgumentPromotion/min-legal-vector-width.ll @@ -2,11 +2,11 @@ ; CHECK-LABEL: define i32 @foo() #0 { ; CHECK-NEXT: %.val = load <32 x half>, ptr undef, align 4 -; CHECK-NEXT: call void @bar.argprom(<32 x half> %.val) +; CHECK-NEXT: call void @bar(<32 x half> %.val) ; CHECK-NEXT: ret i32 0 ; CHECK-NEXT: } -; CHECK-LABEL: define internal void @bar.argprom(<32 x half> %.0.val) #0 { +; CHECK-LABEL: define internal void @bar(<32 x half> %.0.val) #0 { ; CHECK-NEXT: ret void ; CHECK-NEXT: } diff --git a/llvm/test/Transforms/ArgumentPromotion/nonzero-address-spaces.ll b/llvm/test/Transforms/ArgumentPromotion/nonzero-address-spaces.ll index 335275380c11b0..6cabc5bb8f3a90 100644 --- a/llvm/test/Transforms/ArgumentPromotion/nonzero-address-spaces.ll +++ b/llvm/test/Transforms/ArgumentPromotion/nonzero-address-spaces.ll @@ -11,7 +11,7 @@ target datalayout = "e-P1-p:16:8-i8:8-i16:8-i32:8-i64:8-f32:8-f64:8-n8-a:8" define i32 @bar() { ; CHECK-LABEL: define {{[^@]+}}@bar() addrspace(1) { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[CALL:%.*]] = call addrspace(1) i32 @foo.argprom() +; CHECK-NEXT: [[CALL:%.*]] = call addrspace(1) i32 @foo() ; CHECK-NEXT: ret i32 [[CALL]] ; @@ -21,7 +21,7 @@ entry: } define internal i32 @foo(ptr) { -; CHECK-LABEL: define {{[^@]+}}@foo.argprom() addrspace(1) { +; CHECK-LABEL: define {{[^@]+}}@foo() addrspace(1) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 ; CHECK-NEXT: call addrspace(0) void asm sideeffect "ldr r0, [r0] \0Abx lr \0A", ""() diff --git a/llvm/test/Transforms/ArgumentPromotion/opaque-ptr.ll b/llvm/test/Transforms/ArgumentPromotion/opaque-ptr.ll index 5ca798b3a9d918..59699675577cfe 100644 --- a/llvm/test/Transforms/ArgumentPromotion/opaque-ptr.ll +++ b/llvm/test/Transforms/ArgumentPromotion/opaque-ptr.ll @@ -2,7 +2,7 @@ ; RUN: opt -S -passes=argpromotion < %s | FileCheck %s define internal i32 @callee_basic(ptr %p) { -; CHECK-LABEL: define {{[^@]+}}@callee_basic.argprom +; CHECK-LABEL: define {{[^@]+}}@callee_basic ; CHECK-SAME: (i32 [[P_0_VAL:%.*]], i32 [[P_4_VAL:%.*]]) { ; CHECK-NEXT: [[Z:%.*]] = add i32 [[P_0_VAL]], [[P_4_VAL]] ; CHECK-NEXT: ret i32 [[Z]] @@ -20,7 +20,7 @@ define void @caller_basic(ptr %p) { ; CHECK-NEXT: [[P_VAL:%.*]] = load i32, ptr [[P]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[P]], i64 4 ; CHECK-NEXT: [[P_VAL1:%.*]] = load i32, ptr [[TMP1]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = call i32 @callee_basic.argprom(i32 [[P_VAL]], i32 [[P_VAL1]]) +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @callee_basic(i32 [[P_VAL]], i32 [[P_VAL1]]) ; CHECK-NEXT: ret void ; call i32 @callee_basic(ptr %p) diff --git a/llvm/test/Transforms/ArgumentPromotion/pr27568.ll 
b/llvm/test/Transforms/ArgumentPromotion/pr27568.ll index 1164bcf4141ed9..cc25088edf52f9 100644 --- a/llvm/test/Transforms/ArgumentPromotion/pr27568.ll +++ b/llvm/test/Transforms/ArgumentPromotion/pr27568.ll @@ -5,7 +5,7 @@ target triple = "x86_64-pc-windows-msvc" define internal void @callee(ptr) { -; CHECK-LABEL: define {{[^@]+}}@callee.argprom() { +; CHECK-LABEL: define {{[^@]+}}@callee() { ; CHECK-NEXT: entry: ; CHECK-NEXT: call void @thunk() ; CHECK-NEXT: ret void @@ -24,7 +24,7 @@ define void @test1() personality ptr @__CxxFrameHandler3 { ; CHECK-NEXT: ret void ; CHECK: cpad: ; CHECK-NEXT: [[PAD:%.*]] = cleanuppad within none [] -; CHECK-NEXT: call void @callee.argprom() [ "funclet"(token [[PAD]]) ] +; CHECK-NEXT: call void @callee() [ "funclet"(token [[PAD]]) ] ; CHECK-NEXT: cleanupret from [[PAD]] unwind to caller ; entry: diff --git a/llvm/test/Transforms/ArgumentPromotion/pr32917.ll b/llvm/test/Transforms/ArgumentPromotion/pr32917.ll index ac0d30999ce0e9..dd089a910f5a31 100644 --- a/llvm/test/Transforms/ArgumentPromotion/pr32917.ll +++ b/llvm/test/Transforms/ArgumentPromotion/pr32917.ll @@ -12,7 +12,7 @@ define i32 @fn2() local_unnamed_addr { ; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[TMP3]], i64 -4 ; CHECK-NEXT: [[DOTVAL:%.*]] = load i32, ptr [[TMP4]], align 4 -; CHECK-NEXT: call fastcc void @fn1.argprom(i32 [[DOTVAL]]) +; CHECK-NEXT: call fastcc void @fn1(i32 [[DOTVAL]]) ; CHECK-NEXT: ret i32 undef ; %1 = load i32, ptr @b, align 4 @@ -23,7 +23,7 @@ define i32 @fn2() local_unnamed_addr { } define internal fastcc void @fn1(ptr nocapture readonly) unnamed_addr { -; CHECK-LABEL: define {{[^@]+}}@fn1.argprom +; CHECK-LABEL: define {{[^@]+}}@fn1 ; CHECK-SAME: (i32 [[DOT_4_VAL:%.*]]) unnamed_addr { ; CHECK-NEXT: store i32 [[DOT_4_VAL]], ptr @a, align 4 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/ArgumentPromotion/pr33641_remove_arg_dbgvalue.ll b/llvm/test/Transforms/ArgumentPromotion/pr33641_remove_arg_dbgvalue.ll index 42728abb81e722..8db0a28e680587 100644 --- a/llvm/test/Transforms/ArgumentPromotion/pr33641_remove_arg_dbgvalue.ll +++ b/llvm/test/Transforms/ArgumentPromotion/pr33641_remove_arg_dbgvalue.ll @@ -14,7 +14,7 @@ define void @foo() { } define internal void @bar(ptr %p) { -; CHECK-LABEL: define {{.*}}void @bar.argprom() +; CHECK-LABEL: define {{.*}}void @bar() ; CHECK-NEXT: #dbg_value(ptr undef, !3, !DIExpression(), !5 call void @llvm.dbg.value(metadata ptr %p, metadata !3, metadata !DIExpression()), !dbg !5 ret void diff --git a/llvm/test/Transforms/ArgumentPromotion/profile.ll b/llvm/test/Transforms/ArgumentPromotion/profile.ll index b932f7c762431e..58d7376b8b7da0 100644 --- a/llvm/test/Transforms/ArgumentPromotion/profile.ll +++ b/llvm/test/Transforms/ArgumentPromotion/profile.ll @@ -6,7 +6,7 @@ target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:1 define void @caller() #0 { ; CHECK-LABEL: define {{[^@]+}}@caller() { -; CHECK-NEXT: call void @promote_i32_ptr.argprom(i32 42), !prof [[PROF0:![0-9]+]] +; CHECK-NEXT: call void @promote_i32_ptr(i32 42), !prof [[PROF0:![0-9]+]] ; CHECK-NEXT: ret void ; %x = alloca i32 @@ -16,7 +16,7 @@ define void @caller() #0 { } define internal void @promote_i32_ptr(ptr %xp) !prof !1 { -; CHECK-LABEL: define {{[^@]+}}@promote_i32_ptr.argprom +; CHECK-LABEL: define {{[^@]+}}@promote_i32_ptr ; CHECK-SAME: (i32 [[XP_0_VAL:%.*]]) !prof [[PROF1:![0-9]+]] { ; CHECK-NEXT: call void @use_i32(i32 [[XP_0_VAL]]) ; CHECK-NEXT: ret void diff --git 
a/llvm/test/Transforms/ArgumentPromotion/propagate-remove-dead-args.ll b/llvm/test/Transforms/ArgumentPromotion/propagate-remove-dead-args.ll index 584ec42ae995c5..87a14533cfda26 100644 --- a/llvm/test/Transforms/ArgumentPromotion/propagate-remove-dead-args.ll +++ b/llvm/test/Transforms/ArgumentPromotion/propagate-remove-dead-args.ll @@ -4,7 +4,7 @@ %ptr.struct = type { ptr, ptr, ptr } define internal void @child(ptr %this, ptr %y, ptr %x) { -; CHECK-LABEL: define internal void @child.argprom +; CHECK-LABEL: define internal void @child ; CHECK-SAME: (ptr [[Y:%.*]], half [[X_0_VAL:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: store half [[X_0_VAL]], ptr [[Y]], align 2 @@ -17,15 +17,15 @@ entry: } define internal void @parent(ptr %this, ptr %p1, ptr %p2) { -; CHECK-LABEL: define internal void @parent.argprom +; CHECK-LABEL: define internal void @parent ; CHECK-SAME: (ptr [[P1:%.*]], ptr [[P2:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[P2_VAL2:%.*]] = load half, ptr [[P2]], align 2 -; CHECK-NEXT: call void @child.argprom(ptr [[P1]], half [[P2_VAL2]]) +; CHECK-NEXT: call void @child(ptr [[P1]], half [[P2_VAL2]]) ; CHECK-NEXT: [[P2_VAL1:%.*]] = load half, ptr [[P2]], align 2 -; CHECK-NEXT: call void @child.argprom(ptr [[P1]], half [[P2_VAL1]]) +; CHECK-NEXT: call void @child(ptr [[P1]], half [[P2_VAL1]]) ; CHECK-NEXT: [[P2_VAL:%.*]] = load half, ptr [[P2]], align 2 -; CHECK-NEXT: call void @child.argprom(ptr [[P1]], half [[P2_VAL]]) +; CHECK-NEXT: call void @child(ptr [[P1]], half [[P2_VAL]]) ; CHECK-NEXT: ret void ; entry: @@ -46,7 +46,7 @@ define void @grandparent() { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[XPTR:%.*]] = alloca i32, align 4 ; CHECK-NEXT: [[YPTR:%.*]] = alloca i32, align 4 -; CHECK-NEXT: call void @parent.argprom(ptr [[XPTR]], ptr [[YPTR]]) +; CHECK-NEXT: call void @parent(ptr [[XPTR]], ptr [[YPTR]]) ; CHECK-NEXT: ret void ; entry: @@ -58,7 +58,7 @@ entry: } define internal ptr @callee(ptr %dead) { -; CHECK-LABEL: define internal ptr @callee.argprom() { +; CHECK-LABEL: define internal ptr @callee() { ; CHECK-NEXT: ret ptr null ; ret ptr null @@ -66,8 +66,8 @@ define internal ptr @callee(ptr %dead) { define void @caller() { ; CHECK-LABEL: define void @caller() { -; CHECK-NEXT: [[TMP1:%.*]] = call ptr @callee.argprom() -; CHECK-NEXT: [[TMP2:%.*]] = call ptr @callee.argprom() +; CHECK-NEXT: [[TMP1:%.*]] = call ptr @callee() +; CHECK-NEXT: [[TMP2:%.*]] = call ptr @callee() ; CHECK-NEXT: ret void ; %ret = call ptr @callee(ptr null) diff --git a/llvm/test/Transforms/ArgumentPromotion/recursion/aggregate-promote-recursive.ll b/llvm/test/Transforms/ArgumentPromotion/recursion/aggregate-promote-recursive.ll index b1d5898a9a1c7b..011ebe4eee76e7 100644 --- a/llvm/test/Transforms/ArgumentPromotion/recursion/aggregate-promote-recursive.ll +++ b/llvm/test/Transforms/ArgumentPromotion/recursion/aggregate-promote-recursive.ll @@ -5,11 +5,11 @@ @G = constant %T { i32 0, i32 0, i32 17, i32 25 } define internal i32 @test(ptr %p) { -; CHECK-LABEL: define {{[^@]+}}@test.argprom +; CHECK-LABEL: define {{[^@]+}}@test ; CHECK-SAME: (i32 [[P_8_VAL:%.*]], i32 [[P_12_VAL:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[V:%.*]] = add i32 [[P_12_VAL]], [[P_8_VAL]] -; CHECK-NEXT: [[RET:%.*]] = call i32 @test.argprom(i32 [[P_8_VAL]], i32 [[P_12_VAL]]) +; CHECK-NEXT: [[RET:%.*]] = call i32 @test(i32 [[P_8_VAL]], i32 [[P_12_VAL]]) ; CHECK-NEXT: [[ARET:%.*]] = add i32 [[V]], [[RET]] ; CHECK-NEXT: ret i32 [[ARET]] ; @@ -31,7 +31,7 @@ define i32 @caller() { ; CHECK-NEXT: [[G_VAL:%.*]] = load i32, ptr [[TMP0]], align 4 
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr @G, i64 12 ; CHECK-NEXT: [[G_VAL1:%.*]] = load i32, ptr [[TMP1]], align 4 -; CHECK-NEXT: [[V:%.*]] = call i32 @test.argprom(i32 [[G_VAL]], i32 [[G_VAL1]]) +; CHECK-NEXT: [[V:%.*]] = call i32 @test(i32 [[G_VAL]], i32 [[G_VAL1]]) ; CHECK-NEXT: ret i32 [[V]] ; entry: diff --git a/llvm/test/Transforms/ArgumentPromotion/recursion/argpromotion-recursion-pr1259.ll b/llvm/test/Transforms/ArgumentPromotion/recursion/argpromotion-recursion-pr1259.ll index 28bdc8fc45050f..e160dbad92e7b1 100644 --- a/llvm/test/Transforms/ArgumentPromotion/recursion/argpromotion-recursion-pr1259.ll +++ b/llvm/test/Transforms/ArgumentPromotion/recursion/argpromotion-recursion-pr1259.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; RUN: opt -S -passes=argpromotion < %s | FileCheck %s define internal i32 @foo(ptr %x, i32 %n, i32 %m) { -; CHECK-LABEL: define internal i32 @foo.argprom( +; CHECK-LABEL: define internal i32 @foo( ; CHECK-SAME: i32 [[X_0_VAL:%.*]], i32 [[N:%.*]], i32 [[M:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[N]], 0 @@ -10,9 +10,9 @@ define internal i32 @foo(ptr %x, i32 %n, i32 %m) { ; CHECK-NEXT: br label %[[RETURN:.*]] ; CHECK: [[COND_FALSE]]: ; CHECK-NEXT: [[SUBVAL:%.*]] = sub i32 [[N]], 1 -; CHECK-NEXT: [[CALLRET:%.*]] = call i32 @foo.argprom(i32 [[X_0_VAL]], i32 [[SUBVAL]], i32 [[X_0_VAL]]) +; CHECK-NEXT: [[CALLRET:%.*]] = call i32 @foo(i32 [[X_0_VAL]], i32 [[SUBVAL]], i32 [[X_0_VAL]]) ; CHECK-NEXT: [[SUBVAL2:%.*]] = sub i32 [[N]], 2 -; CHECK-NEXT: [[CALLRET2:%.*]] = call i32 @foo.argprom(i32 [[X_0_VAL]], i32 [[SUBVAL2]], i32 [[M]]) +; CHECK-NEXT: [[CALLRET2:%.*]] = call i32 @foo(i32 [[X_0_VAL]], i32 [[SUBVAL2]], i32 [[M]]) ; CHECK-NEXT: [[CMP2:%.*]] = add i32 [[CALLRET]], [[CALLRET2]] ; CHECK-NEXT: br label %[[RETURN]] ; CHECK: [[COND_NEXT:.*]]: @@ -51,7 +51,7 @@ define i32 @bar(ptr align(4) dereferenceable(4) %x, i32 %n, i32 %m) { ; CHECK-SAME: ptr align 4 dereferenceable(4) [[X:%.*]], i32 [[N:%.*]], i32 [[M:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[X_VAL:%.*]] = load i32, ptr [[X]], align 4 -; CHECK-NEXT: [[CALLRET3:%.*]] = call i32 @foo.argprom(i32 [[X_VAL]], i32 [[N]], i32 [[M]]) +; CHECK-NEXT: [[CALLRET3:%.*]] = call i32 @foo(i32 [[X_VAL]], i32 [[N]], i32 [[M]]) ; CHECK-NEXT: br label %[[RETURN:.*]] ; CHECK: [[RETURN]]: ; CHECK-NEXT: ret i32 [[CALLRET3]] diff --git a/llvm/test/Transforms/ArgumentPromotion/recursion/recursion-mixed-calls.ll b/llvm/test/Transforms/ArgumentPromotion/recursion/recursion-mixed-calls.ll index 0e048c2726a312..0ec4137aadeb4c 100644 --- a/llvm/test/Transforms/ArgumentPromotion/recursion/recursion-mixed-calls.ll +++ b/llvm/test/Transforms/ArgumentPromotion/recursion/recursion-mixed-calls.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; RUN: opt -S -passes=argpromotion < %s | FileCheck %s define internal i32 @zoo(ptr %x, i32 %m) { -; CHECK-LABEL: define internal i32 @zoo.argprom( +; CHECK-LABEL: define internal i32 @zoo( ; CHECK-SAME: i32 [[X_0_VAL:%.*]], i32 [[M:%.*]]) { ; CHECK-NEXT: [[RESZOO:%.*]] = add i32 [[X_0_VAL]], [[M]] ; CHECK-NEXT: ret i32 [[X_0_VAL]] @@ -12,7 +12,7 @@ define internal i32 @zoo(ptr %x, i32 %m) { } define internal i32 @foo(ptr %x, ptr %y, i32 %n, i32 %m) { -; CHECK-LABEL: define internal i32 @foo.argprom( +; CHECK-LABEL: define internal i32 @foo( ; CHECK-SAME: ptr [[X:%.*]], i32 [[Y_0_VAL:%.*]], i32 [[N:%.*]], i32 
[[M:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[N]], 0 @@ -23,12 +23,12 @@ define internal i32 @foo(ptr %x, ptr %y, i32 %n, i32 %m) { ; CHECK: [[COND_FALSE]]: ; CHECK-NEXT: [[VAL2:%.*]] = load i32, ptr [[X]], align 4 ; CHECK-NEXT: [[SUBVAL:%.*]] = sub i32 [[N]], [[Y_0_VAL]] -; CHECK-NEXT: [[CALLRET:%.*]] = call i32 @foo.argprom(ptr [[X]], i32 [[Y_0_VAL]], i32 [[SUBVAL]], i32 [[VAL2]]) +; CHECK-NEXT: [[CALLRET:%.*]] = call i32 @foo(ptr [[X]], i32 [[Y_0_VAL]], i32 [[SUBVAL]], i32 [[VAL2]]) ; CHECK-NEXT: [[SUBVAL2:%.*]] = sub i32 [[N]], 2 -; CHECK-NEXT: [[CALLRET2:%.*]] = call i32 @foo.argprom(ptr [[X]], i32 [[Y_0_VAL]], i32 [[SUBVAL2]], i32 [[M]]) +; CHECK-NEXT: [[CALLRET2:%.*]] = call i32 @foo(ptr [[X]], i32 [[Y_0_VAL]], i32 [[SUBVAL2]], i32 [[M]]) ; CHECK-NEXT: [[CMP1:%.*]] = add i32 [[CALLRET]], [[CALLRET2]] ; CHECK-NEXT: [[X_VAL:%.*]] = load i32, ptr [[X]], align 4 -; CHECK-NEXT: [[CALLRETFINAL:%.*]] = call i32 @zoo.argprom(i32 [[X_VAL]], i32 [[M]]) +; CHECK-NEXT: [[CALLRETFINAL:%.*]] = call i32 @zoo(i32 [[X_VAL]], i32 [[M]]) ; CHECK-NEXT: [[CMP2:%.*]] = add i32 [[CMP1]], [[CALLRETFINAL]] ; CHECK-NEXT: br label %[[RETURN]] ; CHECK: [[COND_NEXT:.*]]: @@ -70,7 +70,7 @@ define i32 @bar(ptr align(4) dereferenceable(4) %x, ptr align(4) dereferenceable ; CHECK-SAME: ptr align 4 dereferenceable(4) [[X:%.*]], ptr align 4 dereferenceable(4) [[Y:%.*]], i32 [[N:%.*]], i32 [[M:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[Y_VAL:%.*]] = load i32, ptr [[Y]], align 4 -; CHECK-NEXT: [[CALLRET3:%.*]] = call i32 @foo.argprom(ptr [[X]], i32 [[Y_VAL]], i32 [[N]], i32 [[M]]) +; CHECK-NEXT: [[CALLRET3:%.*]] = call i32 @foo(ptr [[X]], i32 [[Y_VAL]], i32 [[N]], i32 [[M]]) ; CHECK-NEXT: br label %[[RETURN:.*]] ; CHECK: [[RETURN]]: ; CHECK-NEXT: ret i32 [[CALLRET3]] diff --git a/llvm/test/Transforms/ArgumentPromotion/recursion/recursion-non-zero-offset.ll b/llvm/test/Transforms/ArgumentPromotion/recursion/recursion-non-zero-offset.ll index 1ec8ab1edca669..805414de17f133 100644 --- a/llvm/test/Transforms/ArgumentPromotion/recursion/recursion-non-zero-offset.ll +++ b/llvm/test/Transforms/ArgumentPromotion/recursion/recursion-non-zero-offset.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; RUN: opt -S -passes=argpromotion < %s | FileCheck %s define internal i32 @foo(ptr %x, i32 %n, i32 %m) { -; CHECK-LABEL: define internal i32 @foo.argprom( +; CHECK-LABEL: define internal i32 @foo( ; CHECK-SAME: i32 [[X_0_VAL:%.*]], i32 [[N:%.*]], i32 [[M:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[N]], 0 @@ -10,9 +10,9 @@ define internal i32 @foo(ptr %x, i32 %n, i32 %m) { ; CHECK-NEXT: br label %[[RETURN:.*]] ; CHECK: [[COND_FALSE]]: ; CHECK-NEXT: [[SUBVAL:%.*]] = sub i32 [[N]], 1 -; CHECK-NEXT: [[CALLRET:%.*]] = call i32 @foo.argprom(i32 [[X_0_VAL]], i32 [[SUBVAL]], i32 [[X_0_VAL]]) +; CHECK-NEXT: [[CALLRET:%.*]] = call i32 @foo(i32 [[X_0_VAL]], i32 [[SUBVAL]], i32 [[X_0_VAL]]) ; CHECK-NEXT: [[SUBVAL2:%.*]] = sub i32 [[N]], 2 -; CHECK-NEXT: [[CALLRET2:%.*]] = call i32 @foo.argprom(i32 [[X_0_VAL]], i32 [[SUBVAL2]], i32 [[M]]) +; CHECK-NEXT: [[CALLRET2:%.*]] = call i32 @foo(i32 [[X_0_VAL]], i32 [[SUBVAL2]], i32 [[M]]) ; CHECK-NEXT: [[CMP2:%.*]] = add i32 [[CALLRET]], [[CALLRET2]] ; CHECK-NEXT: br label %[[RETURN]] ; CHECK: [[COND_NEXT:.*]]: @@ -52,7 +52,7 @@ define i32 @bar(ptr align(4) dereferenceable(4) %x, i32 %n, i32 %m) { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[GEPVAL:%.*]] = 
getelementptr ptr, ptr [[X]], i32 0 ; CHECK-NEXT: [[GEPVAL_VAL:%.*]] = load i32, ptr [[GEPVAL]], align 4 -; CHECK-NEXT: [[CALLRET3:%.*]] = call i32 @foo.argprom(i32 [[GEPVAL_VAL]], i32 [[N]], i32 [[M]]) +; CHECK-NEXT: [[CALLRET3:%.*]] = call i32 @foo(i32 [[GEPVAL_VAL]], i32 [[N]], i32 [[M]]) ; CHECK-NEXT: br label %[[RETURN:.*]] ; CHECK: [[RETURN]]: ; CHECK-NEXT: ret i32 [[CALLRET3]] diff --git a/llvm/test/Transforms/ArgumentPromotion/reserve-tbaa.ll b/llvm/test/Transforms/ArgumentPromotion/reserve-tbaa.ll index 2f13767c643184..f60dd48a464d22 100644 --- a/llvm/test/Transforms/ArgumentPromotion/reserve-tbaa.ll +++ b/llvm/test/Transforms/ArgumentPromotion/reserve-tbaa.ll @@ -14,7 +14,7 @@ @d = global i8 0, align 1 define internal fastcc void @fn(ptr nocapture readonly %p1, ptr nocapture readonly %p2) { -; CHECK-LABEL: define {{[^@]+}}@fn.argprom +; CHECK-LABEL: define {{[^@]+}}@fn ; CHECK-SAME: (i32 [[P1_0_VAL:%.*]], i64 [[P2_0_VAL:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CONV:%.*]] = trunc i64 [[P2_0_VAL]] to i32 @@ -40,7 +40,7 @@ define i32 @main() { ; CHECK-NEXT: store i32 1, ptr [[TMP1]], align 4, !tbaa [[TBAA5:![0-9]+]] ; CHECK-NEXT: [[G_VAL:%.*]] = load i32, ptr @g, align 4, !tbaa [[TBAA5]] ; CHECK-NEXT: [[C_VAL:%.*]] = load i64, ptr @c, align 8, !tbaa [[TBAA7:![0-9]+]] -; CHECK-NEXT: call fastcc void @fn.argprom(i32 [[G_VAL]], i64 [[C_VAL]]) +; CHECK-NEXT: call fastcc void @fn(i32 [[G_VAL]], i64 [[C_VAL]]) ; CHECK-NEXT: ret i32 0 ; entry: diff --git a/llvm/test/Transforms/ArgumentPromotion/sret.ll b/llvm/test/Transforms/ArgumentPromotion/sret.ll index 80403e1d92d527..fcc868954bc951 100644 --- a/llvm/test/Transforms/ArgumentPromotion/sret.ll +++ b/llvm/test/Transforms/ArgumentPromotion/sret.ll @@ -5,7 +5,7 @@ target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-pc-windows-msvc" define internal void @add(ptr %this, ptr sret(i32) %r) { -; CHECK-LABEL: define {{[^@]+}}@add.argprom +; CHECK-LABEL: define {{[^@]+}}@add ; CHECK-SAME: (i32 [[THIS_0_VAL:%.*]], i32 [[THIS_4_VAL:%.*]], ptr noalias [[R:%.*]]) { ; CHECK-NEXT: [[AB:%.*]] = add i32 [[THIS_0_VAL]], [[THIS_4_VAL]] ; CHECK-NEXT: store i32 [[AB]], ptr [[R]], align 4 @@ -27,7 +27,7 @@ define void @f() { ; CHECK-NEXT: [[PAIR_VAL:%.*]] = load i32, ptr [[PAIR]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[PAIR]], i64 4 ; CHECK-NEXT: [[PAIR_VAL1:%.*]] = load i32, ptr [[TMP1]], align 4 -; CHECK-NEXT: call void @add.argprom(i32 [[PAIR_VAL]], i32 [[PAIR_VAL1]], ptr noalias [[R]]) +; CHECK-NEXT: call void @add(i32 [[PAIR_VAL]], i32 [[PAIR_VAL1]], ptr noalias [[R]]) ; CHECK-NEXT: ret void ; %r = alloca i32 diff --git a/llvm/test/Transforms/ArgumentPromotion/store-into-inself.ll b/llvm/test/Transforms/ArgumentPromotion/store-into-inself.ll index ecf8eb557786dc..0db42a97841f48 100644 --- a/llvm/test/Transforms/ArgumentPromotion/store-into-inself.ll +++ b/llvm/test/Transforms/ArgumentPromotion/store-into-inself.ll @@ -59,7 +59,7 @@ entry: } define internal void @l(ptr byval(ptr) align 4 %p) nounwind { -; CHECK-LABEL: define {{[^@]+}}@l.argprom.argprom +; CHECK-LABEL: define {{[^@]+}}@l ; CHECK-SAME: () #[[ATTR0]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: ret void @@ -83,7 +83,7 @@ define i32 @main() nounwind { ; CHECK-NEXT: call void @g(ptr byval(ptr) align 4 [[S]]) #[[ATTR0]] ; CHECK-NEXT: call void @h(ptr byval(ptr) align 4 [[S]]) #[[ATTR0]] ; CHECK-NEXT: call void @k(ptr byval(ptr) align 4 [[S]]) #[[ATTR0]] -; CHECK-NEXT: call void @l.argprom.argprom() #[[ATTR0]] +; CHECK-NEXT: call void @l() 
#[[ATTR0]] ; CHECK-NEXT: ret i32 0 ; entry: diff --git a/llvm/test/Transforms/ArgumentPromotion/unused-argument.ll b/llvm/test/Transforms/ArgumentPromotion/unused-argument.ll index ec1503d3022154..f648d20f47311e 100644 --- a/llvm/test/Transforms/ArgumentPromotion/unused-argument.ll +++ b/llvm/test/Transforms/ArgumentPromotion/unused-argument.ll @@ -4,7 +4,7 @@ ; while the used arguments should be promoted if they are pointers. ; The pass should not touch any unused non-pointer arguments. define internal i32 @callee(i1 %c, i1 %d, ptr %used, ptr %unused) nounwind { -; CHECK-LABEL: define {{[^@]+}}@callee.argprom +; CHECK-LABEL: define {{[^@]+}}@callee ; CHECK-SAME: (i1 [[C:%.*]], i1 [[D:%.*]], i32 [[USED_VAL:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 [[C]], label %if, label %else @@ -28,7 +28,7 @@ else: ; while the used arguments should be promoted if they are pointers. ; The pass should not touch any unused non-pointer arguments. define internal i32 @callee_byval(i1 %c, i1 %d, ptr byval(i32) align 4 %used, ptr byval(i32) align 4 %unused) nounwind { -; CHECK-LABEL: define {{[^@]+}}@callee_byval.argprom +; CHECK-LABEL: define {{[^@]+}}@callee_byval ; CHECK-SAME: (i1 [[C:%.*]], i1 [[D:%.*]], i32 [[USED_VAL:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 [[C]], label %if, label %else @@ -53,9 +53,9 @@ define i32 @caller(i1 %c, i1 %d, ptr %arg) nounwind { ; CHECK-SAME: (i1 [[C:%.*]], i1 [[D:%.*]], ptr [[ARG:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ARG_VAL_0:%.*]] = load i32, ptr [[ARG]], align 4 -; CHECK-NEXT: [[RES_0:%.*]] = call i32 @callee_byval.argprom(i1 [[C]], i1 [[D]], i32 [[ARG_VAL_0]]) #[[ATTR0]] +; CHECK-NEXT: [[RES_0:%.*]] = call i32 @callee_byval(i1 [[C]], i1 [[D]], i32 [[ARG_VAL_0]]) #[[ATTR0]] ; CHECK-NEXT: [[ARG_VAL_1:%.*]] = load i32, ptr [[ARG]], align 4 -; CHECK-NEXT: [[RES_1:%.*]] = call i32 @callee.argprom(i1 [[C]], i1 [[D]], i32 [[ARG_VAL_1]]) #[[ATTR0]] +; CHECK-NEXT: [[RES_1:%.*]] = call i32 @callee(i1 [[C]], i1 [[D]], i32 [[ARG_VAL_1]]) #[[ATTR0]] ; CHECK-NEXT: ret i32 1 ; entry: diff --git a/llvm/test/Transforms/Attributor/reduced/clear_cached_analysis_for_deleted_functions.ll b/llvm/test/Transforms/Attributor/reduced/clear_cached_analysis_for_deleted_functions.ll index db8f86ea1b06cc..b3e3b2497194cb 100644 --- a/llvm/test/Transforms/Attributor/reduced/clear_cached_analysis_for_deleted_functions.ll +++ b/llvm/test/Transforms/Attributor/reduced/clear_cached_analysis_for_deleted_functions.ll @@ -19,7 +19,7 @@ define i32 @clause_LiteralComputeWeight(ptr %call23) { ; CGSCC-NEXT: [[TERM_0:%.*]] = phi ptr [ null, [[ENTRY:%.*]] ], [ [[CALL24:%.*]], [[DO_BODY]] ] ; CGSCC-NEXT: [[CALL2:%.*]] = load volatile i32, ptr [[TERM_0]], align 4 ; CGSCC-NEXT: [[CALL23_VAL:%.*]] = load ptr, ptr [[CALL23]], align 8 -; CGSCC-NEXT: [[CALL24]] = call ptr @list_Car.argprom(ptr nofree readonly [[CALL23_VAL]]) #[[ATTR3:[0-9]+]] +; CGSCC-NEXT: [[CALL24]] = call ptr @list_Car(ptr nofree readonly [[CALL23_VAL]]) #[[ATTR3:[0-9]+]] ; CGSCC-NEXT: br label [[DO_BODY]] ; entry: @@ -46,7 +46,7 @@ entry: define internal ptr @list_Car(ptr %L) #0 { ; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: read) -; CGSCC-LABEL: define {{[^@]+}}@list_Car.argprom +; CGSCC-LABEL: define {{[^@]+}}@list_Car ; CGSCC-SAME: (ptr nofree [[L_0_VAL:%.*]]) #[[ATTR2:[0-9]+]] { ; CGSCC-NEXT: entry: ; CGSCC-NEXT: ret ptr [[L_0_VAL]] diff --git a/llvm/test/Transforms/DeadArgElim/2007-02-07-FuncRename.ll 
b/llvm/test/Transforms/DeadArgElim/2007-02-07-FuncRename.ll index ee6a16e834718a..5096aff3eb0298 100644 --- a/llvm/test/Transforms/DeadArgElim/2007-02-07-FuncRename.ll +++ b/llvm/test/Transforms/DeadArgElim/2007-02-07-FuncRename.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -passes=deadargelim -S | grep "@test.argelim(" +; RUN: opt < %s -passes=deadargelim -S | grep "@test(" ; RUN: opt < %s -passes=deadargelim -S | not grep dead define internal i32 @test(i32 %X, i32 %dead) { diff --git a/llvm/test/Transforms/DeadArgElim/2007-12-20-ParamAttrs.ll b/llvm/test/Transforms/DeadArgElim/2007-12-20-ParamAttrs.ll index 79fe6eb81bd077..c3f7d7df90bc4f 100644 --- a/llvm/test/Transforms/DeadArgElim/2007-12-20-ParamAttrs.ll +++ b/llvm/test/Transforms/DeadArgElim/2007-12-20-ParamAttrs.ll @@ -5,7 +5,7 @@ @g = global i8 0 -; CHECK: define internal void @foo.argelim(i8 signext %y) [[NUW:#[0-9]+]] +; CHECK: define internal void @foo(i8 signext %y) [[NUW:#[0-9]+]] ; ; REMARK-LABEL: Function: foo ; REMARK: Args: @@ -21,7 +21,7 @@ define internal zeroext i8 @foo(ptr inreg %p, i8 signext %y, ... ) nounwind { } define i32 @bar() { -; CHECK: call void @foo.argelim(i8 signext 1) [[NUW]] +; CHECK: call void @foo(i8 signext 1) [[NUW]] %A = call zeroext i8(ptr, i8, ...) @foo(ptr inreg null, i8 signext 1, ptr byval(%struct) null ) nounwind ret i32 0 } diff --git a/llvm/test/Transforms/DeadArgElim/2010-04-30-DbgInfo.ll b/llvm/test/Transforms/DeadArgElim/2010-04-30-DbgInfo.ll index 8d6b1d13c52a34..485275b11160ff 100644 --- a/llvm/test/Transforms/DeadArgElim/2010-04-30-DbgInfo.ll +++ b/llvm/test/Transforms/DeadArgElim/2010-04-30-DbgInfo.ll @@ -15,7 +15,7 @@ define ptr @vfs_addname(ptr %name, i32 %len, i32 %hash, i32 %flags) nounwind ssp ; CHECK-NEXT: #dbg_value(i32 [[LEN]], [[META13:![0-9]+]], !DIExpression(), [[META12]]) ; CHECK-NEXT: #dbg_value(i32 [[HASH]], [[META14:![0-9]+]], !DIExpression(), [[META12]]) ; CHECK-NEXT: #dbg_value(i32 [[FLAGS]], [[META15:![0-9]+]], !DIExpression(), [[META12]]) -; CHECK-NEXT: [[TMP0:%.*]] = call fastcc ptr @add_name_internal.argelim(ptr [[NAME]], i32 [[HASH]]) #[[ATTR2:[0-9]+]], !dbg [[DBG16:![0-9]+]] +; CHECK-NEXT: [[TMP0:%.*]] = call fastcc ptr @add_name_internal(ptr [[NAME]], i32 [[HASH]]) #[[ATTR2:[0-9]+]], !dbg [[DBG16:![0-9]+]] ; CHECK-NEXT: ret ptr [[TMP0]], !dbg [[DBG16]] ; entry: @@ -31,7 +31,7 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone define internal fastcc ptr @add_name_internal(ptr %name, i32 %len, i32 %hash, i8 zeroext %extra, i32 %flags) noinline nounwind ssp !dbg !16 { ; -; CHECK-LABEL: define {{[^@]+}}@add_name_internal.argelim +; CHECK-LABEL: define {{[^@]+}}@add_name_internal ; CHECK-SAME: (ptr [[NAME:%.*]], i32 [[HASH:%.*]]) #[[ATTR1:[0-9]+]] !dbg [[DBG18:![0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: #dbg_value(ptr [[NAME]], [[META22:![0-9]+]], !DIExpression(), [[META23:![0-9]+]]) diff --git a/llvm/test/Transforms/DeadArgElim/aggregates.ll b/llvm/test/Transforms/DeadArgElim/aggregates.ll index 4671fa63c5c918..784ac3af64c754 100644 --- a/llvm/test/Transforms/DeadArgElim/aggregates.ll +++ b/llvm/test/Transforms/DeadArgElim/aggregates.ll @@ -4,7 +4,7 @@ ; actually only used in ways we can eliminate. We gain benefit from analysing ; the "use" and applying its results to all sub-values. 
-; CHECK-LABEL: define internal void @agguse_dead() define internal { i32, i32 } @agguse_dead() { ret { i32, i32 } { i32 0, i32 1 } @@ -20,7 +20,7 @@ define internal { i32, i32 } @test_agguse_dead() { ; Case 1: an opaque use of the aggregate exists (in this case dead). Otherwise ; only one value is used, so function can be simplified. -; CHECK-LABEL: define internal i32 @rets_independent_if_agguse_dead.retelim() +; CHECK-LABEL: define internal i32 @rets_independent_if_agguse_dead() ; CHECK: [[RET:%.*]] = extractvalue { i32, i32 } { i32 0, i32 1 }, 1 ; CHECK: ret i32 [[RET]] @@ -89,7 +89,7 @@ define [2 x i32] @test_array_rets_have_multiple_slots() { ; Case 4: we can remove some retvals from the array. It's nice to produce an ; array again having done so (rather than converting it to a struct). -; CHECK-LABEL: define internal [2 x i32] @can_shrink_arrays.retelim() +; CHECK-LABEL: define internal [2 x i32] @can_shrink_arrays() ; CHECK: [[VAL0:%.*]] = extractvalue [3 x i32] [i32 42, i32 43, i32 44], 0 ; CHECK: [[RESTMP:%.*]] = insertvalue [2 x i32] poison, i32 [[VAL0]], 0 ; CHECK: [[VAL2:%.*]] = extractvalue [3 x i32] [i32 42, i32 43, i32 44], 2 @@ -117,7 +117,7 @@ define void @test_can_shrink_arrays() { ; Case 5: %in gets passed directly to the return. It should be marked as ; used if *any* of the return values are, not just if value 0 is. -; CHECK-LABEL: define internal i32 @ret_applies_to_all.retelim({ i32, i32 } %in) +; CHECK-LABEL: define internal i32 @ret_applies_to_all({ i32, i32 } %in) ; CHECK: [[RET:%.*]] = extractvalue { i32, i32 } %in, 1 ; CHECK: ret i32 [[RET]] @@ -167,7 +167,7 @@ entry: } ; CHECK-LABEL: define void @PR24906 -; CHECK: %[[invoke:.*]] = invoke i32 @agg_ret.retelim() +; CHECK: %[[invoke:.*]] = invoke i32 @agg_ret() ; CHECK: %[[oldret:.*]] = insertvalue { i32 } poison, i32 %[[invoke]], 0 ; CHECK: phi { i32 } [ %[[oldret]], define void @PR24906() personality ptr poison { diff --git a/llvm/test/Transforms/DeadArgElim/call_profile.ll b/llvm/test/Transforms/DeadArgElim/call_profile.ll index 93572a3c540b89..94dbbef6a6e983 100644 --- a/llvm/test/Transforms/DeadArgElim/call_profile.ll +++ b/llvm/test/Transforms/DeadArgElim/call_profile.ll @@ -3,8 +3,8 @@ ; Checks if !prof metadata is correct in deadargelim. define void @caller() #0 { -; CHECK: call void @test_vararg.argelim(), !prof ![[PROF:[0-9]]] -; CHECK: call void @test.argelim(), !prof ![[PROF]] +; CHECK: call void @test_vararg(), !prof ![[PROF:[0-9]]] +; CHECK: call void @test(), !prof ![[PROF]] call void (i32, ...) @test_vararg(i32 1), !prof !0 call void @test(i32 1), !prof !0 ret void diff --git a/llvm/test/Transforms/DeadArgElim/comdat.ll b/llvm/test/Transforms/DeadArgElim/comdat.ll index 0175ffe436e2d2..ea80d0dec0d1e9 100644 --- a/llvm/test/Transforms/DeadArgElim/comdat.ll +++ b/llvm/test/Transforms/DeadArgElim/comdat.ll @@ -11,4 +11,4 @@ define internal void @g(i32 %dead) comdat($f) { ret void } -; CHECK: define internal void @g.argelim() comdat($f) { +; CHECK: define internal void @g() comdat($f) { diff --git a/llvm/test/Transforms/DeadArgElim/dbginfo-update-dbgval-local.ll b/llvm/test/Transforms/DeadArgElim/dbginfo-update-dbgval-local.ll index 514bfd72d48b3f..0e834013fe40b4 100644 --- a/llvm/test/Transforms/DeadArgElim/dbginfo-update-dbgval-local.ll +++ b/llvm/test/Transforms/DeadArgElim/dbginfo-update-dbgval-local.ll @@ -5,7 +5,7 @@ ; Reproducer for PR23260.
-; CHECK-LABEL: define internal void @bar.argelim() +; CHECK-LABEL: define internal void @bar() ; CHECK: #dbg_value(i32 poison, ![[LOCAL1:[0-9]+]] ; CHECK: call void @sink() @@ -18,9 +18,9 @@ entry: } ; CHECK-LABEL: define void @foo() -; CHECK: call void @bar.argelim() +; CHECK: call void @bar() ; CHECK: #dbg_value(i32 poison, ![[LOCAL2:[0-9]+]] -; CHECK: call void @bar.argelim() +; CHECK: call void @bar() ; Function Attrs: nounwind uwtable define void @foo() #0 !dbg !6 { diff --git a/llvm/test/Transforms/DeadArgElim/dbginfo.ll b/llvm/test/Transforms/DeadArgElim/dbginfo.ll index c86fc457860519..a27ca9dd70c245 100644 --- a/llvm/test/Transforms/DeadArgElim/dbginfo.ll +++ b/llvm/test/Transforms/DeadArgElim/dbginfo.ll @@ -14,7 +14,7 @@ ; the function->debug info mapping on update to ensure it's accurate when used ; again for the next removal. -; CHECK: define internal void @_ZL2f1iz.argelim({{.*}} !dbg [[SP:![0-9]+]] +; CHECK: define internal void @_ZL2f1iz({{.*}} !dbg [[SP:![0-9]+]] ; CHECK: [[SP]] = distinct !DISubprogram(name: "f1" ; Check that debug info metadata for subprograms stores pointers to diff --git a/llvm/test/Transforms/DeadArgElim/deadretval.ll b/llvm/test/Transforms/DeadArgElim/deadretval.ll index 74359f29ccbd2b..910aa7b9bd2238 100644 --- a/llvm/test/Transforms/DeadArgElim/deadretval.ll +++ b/llvm/test/Transforms/DeadArgElim/deadretval.ll @@ -23,7 +23,7 @@ define i32 @test3() { ; The callee function's return type shouldn't be changed if the call result is ; used. -; CHECK-LABEL: define internal ptr @callee4.argelim() +; CHECK-LABEL: define internal ptr @callee4() define internal ptr @callee4(ptr %a0) { ret ptr @g0; @@ -32,7 +32,7 @@ define internal ptr @callee4(ptr %a0) { declare void @llvm.objc.clang.arc.noop.use(...) ; CHECK-LABEL: define ptr @test4( -; CHECK: tail call ptr @callee4.argelim() [ "clang.arc.attachedcall"(ptr @llvm.objc.retainAutoreleasedReturnValue) ] +; CHECK: tail call ptr @callee4() [ "clang.arc.attachedcall"(ptr @llvm.objc.retainAutoreleasedReturnValue) ] define ptr @test4() { %call = tail call ptr @callee4(ptr @g0) [ "clang.arc.attachedcall"(ptr @llvm.objc.retainAutoreleasedReturnValue) ] diff --git a/llvm/test/Transforms/DeadArgElim/fct_ptr.ll b/llvm/test/Transforms/DeadArgElim/fct_ptr.ll index 6c02bd5ee9c3c3..bf54fb2e8b7286 100644 --- a/llvm/test/Transforms/DeadArgElim/fct_ptr.ll +++ b/llvm/test/Transforms/DeadArgElim/fct_ptr.ll @@ -22,7 +22,7 @@ define i32 @call_indirect(ptr readnone %fct_ptr, i32 %arg1, i32 %arg2, i32 %arg3 ; CHECK-NEXT: [[RES2:%.*]] = tail call i32 @internal_fct(i32 poison, i32 [[ARG2]], i32 poison) ; CHECK-NEXT: br label [[END]] ; CHECK: call_other: -; CHECK-NEXT: [[RES3:%.*]] = tail call i32 @other_fct.argelim(i32 [[ARG2]]) +; CHECK-NEXT: [[RES3:%.*]] = tail call i32 @other_fct(i32 [[ARG2]]) ; CHECK-NEXT: br label [[END]] ; CHECK: end: ; CHECK-NEXT: [[FINAL_RES:%.*]] = phi i32 [ [[RES1]], [[CALL_EXT]] ], [ [[RES2]], [[CALL_INT]] ], [ [[RES3]], [[CALL_OTHER]] ] diff --git a/llvm/test/Transforms/DeadArgElim/func_metadata.ll b/llvm/test/Transforms/DeadArgElim/func_metadata.ll index 2d25d916b24426..4922798b3aaa35 100644 --- a/llvm/test/Transforms/DeadArgElim/func_metadata.ll +++ b/llvm/test/Transforms/DeadArgElim/func_metadata.ll @@ -8,7 +8,7 @@ target triple = "x86_64-unknown-linux-gnu" @s = common dso_local local_unnamed_addr global i32 0, align 4 define internal i32 @va_func(i32 %num, ...) 
!prof !28 !PGOFuncName !29{ -; CHECK: define internal void @va_func.retelim(i32 %num) !prof ![[ENTRYCOUNT:[0-9]+]] !PGOFuncName ![[PGOFUNCNAME1:[0-9]+]] { +; CHECK: define internal void @va_func(i32 %num) !prof ![[ENTRYCOUNT:[0-9]+]] !PGOFuncName ![[PGOFUNCNAME1:[0-9]+]] { entry: %0 = load i32, ptr @s, align 4, !tbaa !31 %add = add nsw i32 %0, %num @@ -17,7 +17,7 @@ entry: } define internal fastcc i32 @foo() unnamed_addr !prof !28 !PGOFuncName !30 { -; CHECK: define internal fastcc void @foo.retelim() unnamed_addr !prof ![[ENTRYCOUNT:[0-9]+]] !PGOFuncName ![[PGOFUNCNAME2:[0-9]+]] { +; CHECK: define internal fastcc void @foo() unnamed_addr !prof ![[ENTRYCOUNT:[0-9]+]] !PGOFuncName ![[PGOFUNCNAME2:[0-9]+]] { entry: %0 = load i32, ptr @s, align 4, !tbaa !31 %add = add nsw i32 %0, 8 diff --git a/llvm/test/Transforms/DeadArgElim/funclet.ll b/llvm/test/Transforms/DeadArgElim/funclet.ll index d56720f96379da..3115c8b341415f 100644 --- a/llvm/test/Transforms/DeadArgElim/funclet.ll +++ b/llvm/test/Transforms/DeadArgElim/funclet.ll @@ -22,7 +22,7 @@ bad1: ; preds = %entry-block } ; CHECK-LABEL: define void @test1( ; CHECK: %[[pad:.*]] = cleanuppad within none [] -; CHECK-NEXT: call void @callee.argelim() [ "funclet"(token %[[pad]]) ] +; CHECK-NEXT: call void @callee() [ "funclet"(token %[[pad]]) ] declare void @thunk() diff --git a/llvm/test/Transforms/DeadArgElim/keepalive.ll b/llvm/test/Transforms/DeadArgElim/keepalive.ll index 43dd8791ff4565..bcb9f1d5f302cf 100644 --- a/llvm/test/Transforms/DeadArgElim/keepalive.ll +++ b/llvm/test/Transforms/DeadArgElim/keepalive.ll @@ -10,7 +10,7 @@ declare ptr @llvm.call.preallocated.arg(token, i32) ; the function and then changing too much. ; This checks if the return value attributes are not removed -; CHECK: define internal zeroext i32 @test1.argelim() #1 +; CHECK: define internal zeroext i32 @test1() #1 define internal zeroext i32 @test1(i32 %DEADARG1) nounwind { ; ; @@ -18,7 +18,7 @@ define internal zeroext i32 @test1(i32 %DEADARG1) nounwind { } ; This checks if the struct doesn't get non-packed -; CHECK-LABEL: define internal <{ i32, i32 }> @test2.argelim( +; CHECK-LABEL: define internal <{ i32, i32 }> @test2( define internal <{ i32, i32 }> @test2(i32 %DEADARG1) { ; ; diff --git a/llvm/test/Transforms/DeadArgElim/nonzero-address-spaces.ll b/llvm/test/Transforms/DeadArgElim/nonzero-address-spaces.ll index fd9d4e0c411072..ddd9aaac628d54 100644 --- a/llvm/test/Transforms/DeadArgElim/nonzero-address-spaces.ll +++ b/llvm/test/Transforms/DeadArgElim/nonzero-address-spaces.ll @@ -5,14 +5,14 @@ target datalayout = "e-P1-p:16:8-i8:8-i16:8-i32:8-i64:8-f32:8-f64:8-n8-a:8" -; CHECK: define internal i32 @foo.argelim() addrspace(1) +; CHECK: define internal i32 @foo() addrspace(1) define internal i32 @foo(i32 %x) #0 { tail call void asm sideeffect inteldialect "mov eax, [esp + $$4]\0A\09ret", "~{eax},~{dirflag},~{fpsr},~{flags}"() unreachable } define i32 @f(i32 %x, i32 %y) { - ; CHECK: %r = call addrspace(1) i32 @foo.argelim() + ; CHECK: %r = call addrspace(1) i32 @foo() %r = call i32 @foo(i32 %x) ret i32 %r diff --git a/llvm/test/Transforms/DeadArgElim/returned.ll b/llvm/test/Transforms/DeadArgElim/returned.ll index 94b1c9654d4d64..73f23ffa725eff 100644 --- a/llvm/test/Transforms/DeadArgElim/returned.ll +++ b/llvm/test/Transforms/DeadArgElim/returned.ll @@ -3,14 +3,14 @@ %Ty = type { i32, i32 } ; Validate that the argument and return value are both dead -; CHECK-LABEL: define internal void @test1.argelim() +; CHECK-LABEL: define internal void @test1() define internal 
ptr @test1(ptr %this) { ret ptr %this } ; do not keep alive the return value of a function with a dead 'returned' argument -; CHECK-LABEL: define internal void @test2.argelim() +; CHECK-LABEL: define internal void @test2() define internal ptr @test2(ptr returned %this) { ret ptr %this @@ -20,7 +20,7 @@ define internal ptr @test2(ptr returned %this) { @dummy = global ptr null ; Validate that return value is dead -; CHECK-LABEL: define internal void @test3.argelim(ptr %this) +; CHECK-LABEL: define internal void @test3(ptr %this) define internal ptr @test3(ptr %this) { store volatile ptr %this, ptr @dummy @@ -36,7 +36,7 @@ define internal ptr @test4(ptr returned %this) { } ; don't do this if 'returned' is on the call site... -; CHECK-LABEL: define internal void @test5.argelim(ptr %this) +; CHECK-LABEL: define internal void @test5(ptr %this) define internal ptr @test5(ptr %this) { store volatile ptr %this, ptr @dummy @@ -55,7 +55,7 @@ define ptr @caller(ptr %this) { %3 = call ptr @test3(ptr %this) %4 = call ptr @test4(ptr %this) ; ...instead, drop 'returned' from the call site -; CHECK: call void @test5.argelim(ptr %this) +; CHECK: call void @test5(ptr %this) %5 = call ptr @test5(ptr returned %this) %6 = call ptr @test6() ret ptr %this diff --git a/llvm/test/Transforms/DeadArgElim/variadic_safety.ll b/llvm/test/Transforms/DeadArgElim/variadic_safety.ll index d9fc4a1c822056..2147e4d0b8372d 100644 --- a/llvm/test/Transforms/DeadArgElim/variadic_safety.ll +++ b/llvm/test/Transforms/DeadArgElim/variadic_safety.ll @@ -34,5 +34,5 @@ define void @call_deadret(i32 %in) { store i32 42, ptr %stacked call i32 (i32, i32, ...) @va_deadret_func(i32 poison, i32 %in, [6 x i32] poison, ptr byval(i32) %stacked) ret void -; CHECK: call void (i32, i32, ...) @va_deadret_func.retelim(i32 poison, i32 poison, [6 x i32] poison, ptr byval(i32) %stacked) +; CHECK: call void (i32, i32, ...)
@va_deadret_func(i32 poison, i32 poison, [6 x i32] poison, ptr byval(i32) %stacked) } diff --git a/llvm/test/Transforms/FunctionSpecialization/function-specialization2.ll b/llvm/test/Transforms/FunctionSpecialization/function-specialization2.ll index 6e92084625294d..b6cdcf18eea429 100644 --- a/llvm/test/Transforms/FunctionSpecialization/function-specialization2.ll +++ b/llvm/test/Transforms/FunctionSpecialization/function-specialization2.ll @@ -42,15 +42,15 @@ define internal void @decrement(ptr nocapture %0) { } define i32 @main(ptr %0, i32 %1) { -; CHECK: call void @func.specialized.2.argelim(ptr [[TMP0:%.*]], i32 [[TMP1:%.*]]) +; CHECK: call void @func.specialized.2(ptr [[TMP0:%.*]], i32 [[TMP1:%.*]]) %3 = call i32 @func(ptr %0, i32 %1, ptr nonnull @increment) -; CHECK: call void @func.specialized.1.argelim(ptr [[TMP0]], i32 0) +; CHECK: call void @func.specialized.1(ptr [[TMP0]], i32 0) %4 = call i32 @func(ptr %0, i32 %3, ptr nonnull @decrement) ; CHECK: ret i32 0 ret i32 %4 } -; CHECK: @func.specialized.1.argelim( +; CHECK: @func.specialized.1( ; CHECK: [[TMP3:%.*]] = alloca i32, align 4 ; CHECK: store i32 [[TMP1:%.*]], ptr [[TMP3]], align 4 ; CHECK: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4 @@ -63,13 +63,13 @@ define i32 @main(ptr %0, i32 %1) { ; CHECK: call void @decrement(ptr [[TMP9]]) ; CHECK: [[TMP10:%.*]] = load i32, ptr [[TMP3]], align 4 ; CHECK: [[TMP11:%.*]] = add nsw i32 [[TMP10]], -1 -; CHECK: call void @func.specialized.1.argelim(ptr [[TMP0]], i32 [[TMP11]]) +; CHECK: call void @func.specialized.1(ptr [[TMP0]], i32 [[TMP11]]) ; CHECK: br label [[TMP12:%.*]] ; CHECK: 12: ; CHECK: ret void ; ; -; CHECK: @func.specialized.2.argelim( +; CHECK: @func.specialized.2( ; CHECK: [[TMP3:%.*]] = alloca i32, align 4 ; CHECK: store i32 [[TMP1:%.*]], ptr [[TMP3]], align 4 ; CHECK: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4 @@ -82,7 +82,7 @@ define i32 @main(ptr %0, i32 %1) { ; CHECK: call void @increment(ptr [[TMP9]]) ; CHECK: [[TMP10:%.*]] = load i32, ptr [[TMP3]], align 4 ; CHECK: [[TMP11:%.*]] = add nsw i32 [[TMP10]], -1 -; CHECK: call void @func.specialized.2.argelim(ptr [[TMP0]], i32 [[TMP11]]) +; CHECK: call void @func.specialized.2(ptr [[TMP0]], i32 [[TMP11]]) ; CHECK: br label [[TMP12:%.*]] ; CHECK: 12: ; CHECK: ret void diff --git a/llvm/test/Transforms/FunctionSpecialization/global-var-constants.ll b/llvm/test/Transforms/FunctionSpecialization/global-var-constants.ll index 65b859b3a44801..a576d9aa32e140 100644 --- a/llvm/test/Transforms/FunctionSpecialization/global-var-constants.ll +++ b/llvm/test/Transforms/FunctionSpecialization/global-var-constants.ll @@ -49,11 +49,11 @@ entry: ; Check if specialisation on the address of a non-const global variable ; is not allowed, then it is not performed. -; NO-GLOBALS-LABEL: define internal range(i32 -2147483646, -2147483648) i32 @g.argelim() +; NO-GLOBALS-LABEL: define internal range(i32 -2147483646, -2147483648) i32 @g() ; NO-GLOBALS: call i32 @f(ptr @G) ; NO-GLOBALS-LABEL: define range(i32 -2147483646, -2147483648) i32 @h0(ptr %p) -; NO-GLOBALS:call i32 @g.argelim() +; NO-GLOBALS:call i32 @g() ; NO-GLOBALS-LABEL: define i32 @h1() ; NO-GLOBALS: call i32 @f(ptr @G) @@ -64,15 +64,15 @@ entry: ; Check if specialisation on the address of a non-const global variable ; is allowed, then it is performed where possible. 
-; GLOBALS-LABEL: define internal range(i32 -2147483646, -2147483648) i32 @g.argelim() -; GLOBALS: call i32 @f.specialized.2.argelim() +; GLOBALS-LABEL: define internal range(i32 -2147483646, -2147483648) i32 @g() +; GLOBALS: call i32 @f.specialized.2() ; GLOBALS-LABEL: define range(i32 -2147483646, -2147483648) i32 @h0(ptr %p) -; GLOBALS: call i32 @g.argelim() +; GLOBALS: call i32 @g() ; GLOBALS-LABEL: define i32 @h1() -; GLOBALS: call i32 @f.specialized.2.argelim() +; GLOBALS: call i32 @f.specialized.2() ; GLOBALS-LABEL: define i32 @h2() -; GLOBALS: call i32 @f.specialized.1.argelim() +; GLOBALS: call i32 @f.specialized.1() diff --git a/llvm/test/Transforms/FunctionSpecialization/non-argument-tracked.ll b/llvm/test/Transforms/FunctionSpecialization/non-argument-tracked.ll index 85ff084e90b198..9446e557da7581 100644 --- a/llvm/test/Transforms/FunctionSpecialization/non-argument-tracked.ll +++ b/llvm/test/Transforms/FunctionSpecialization/non-argument-tracked.ll @@ -29,9 +29,9 @@ define internal i32 @f2(i32 %i) { ;; All calls are to specialisation instances. ; CHECK-LABEL: define i32 @g0 -; CHECK: call void @f0.specialized.[[#A:]].argelim() -; CHECK-NEXT: call void @f1.specialized.[[#B:]].argelim() -; CHECK-NEXT: call void @f2.specialized.[[#C:]].argelim() +; CHECK: call void @f0.specialized.[[#A:]]() +; CHECK-NEXT: call void @f1.specialized.[[#B:]]() +; CHECK-NEXT: call void @f2.specialized.[[#C:]]() ; CHECK-NEXT: ret i32 9 define i32 @g0(i32 %i) { %u0 = call i32 @f0(i32 1) @@ -43,9 +43,9 @@ define i32 @g0(i32 %i) { } ; CHECK-LABEL: define i32 @g1 -; CHECK: call void @f0.specialized.[[#D:]].argelim() -; CHECK-NEXT: call void @f1.specialized.[[#E:]].argelim() -; CHECK-NEXT: call void @f2.specialized.[[#F:]].argelim() +; CHECK: call void @f0.specialized.[[#D:]]() +; CHECK-NEXT: call void @f1.specialized.[[#E:]]() +; CHECK-NEXT: call void @f2.specialized.[[#F:]]() ; CHECK-NEXT: ret i32 12 define i32 @g1(i32 %i) { %u0 = call i32 @f0(i32 2) @@ -58,9 +58,9 @@ define i32 @g1(i32 %i) { ; All of the functions are specialized and all clones have internal linkage.
-; CHECK-DAG: define internal void @f0.specialized.[[#A]].argelim() { -; CHECK-DAG: define internal void @f1.specialized.[[#B]].argelim() { -; CHECK-DAG: define internal void @f2.specialized.[[#C]].argelim() { -; CHECK-DAG: define internal void @f0.specialized.[[#D]].argelim() { -; CHECK-DAG: define internal void @f1.specialized.[[#E]].argelim() { -; CHECK-DAG: define internal void @f2.specialized.[[#F]].argelim() { +; CHECK-DAG: define internal void @f0.specialized.[[#A]]() { +; CHECK-DAG: define internal void @f1.specialized.[[#B]]() { +; CHECK-DAG: define internal void @f2.specialized.[[#C]]() { +; CHECK-DAG: define internal void @f0.specialized.[[#D]]() { +; CHECK-DAG: define internal void @f1.specialized.[[#E]]() { +; CHECK-DAG: define internal void @f2.specialized.[[#F]]() { diff --git a/llvm/test/Transforms/FunctionSpecialization/specialization-order.ll b/llvm/test/Transforms/FunctionSpecialization/specialization-order.ll index 1e81f2ebc409a0..da4cb40fb6dc50 100644 --- a/llvm/test/Transforms/FunctionSpecialization/specialization-order.ll +++ b/llvm/test/Transforms/FunctionSpecialization/specialization-order.ll @@ -21,7 +21,7 @@ entry: define dso_local i32 @g0(i32 %x, i32 %y) { ; CHECK-LABEL: @g0 -; CHECK: call i32 @f.specialized.3.argelim(i32 [[X:%.*]], i32 [[Y:%.*]]) +; CHECK: call i32 @f.specialized.3(i32 [[X:%.*]], i32 [[Y:%.*]]) entry: %call = tail call i32 @f(i32 %x, i32 %y, ptr @add, ptr @add) ret i32 %call @@ -30,7 +30,7 @@ entry: define dso_local i32 @g1(i32 %x, i32 %y) { ; CHECK-LABEL: @g1( -; CHECK: call i32 @f.specialized.2.argelim(i32 [[X:%.*]], i32 [[Y:%.*]]) +; CHECK: call i32 @f.specialized.2(i32 [[X:%.*]], i32 [[Y:%.*]]) entry: %call = tail call i32 @f(i32 %x, i32 %y, ptr @sub, ptr @add) ret i32 %call @@ -38,21 +38,21 @@ entry: define dso_local i32 @g2(i32 %x, i32 %y, ptr %v) { ; CHECK-LABEL: @g2 -; CHECK: call i32 @f.specialized.1.argelim(i32 [[X:%.*]], i32 [[Y:%.*]], ptr [[V:%.*]]) +; CHECK: call i32 @f.specialized.1(i32 [[X:%.*]], i32 [[Y:%.*]], ptr [[V:%.*]]) entry: %call = tail call i32 @f(i32 %x, i32 %y, ptr @sub, ptr %v) ret i32 %call } -; CHECK-LABEL: define {{.*}} i32 @f.specialized.1.argelim +; CHECK-LABEL: define {{.*}} i32 @f.specialized.1 ; CHECK: call i32 @sub(i32 %x, i32 %y) ; CHECK-NEXT: call i32 %v(i32 %x, i32 %y) -; CHECK-LABEL: define {{.*}} i32 @f.specialized.2.argelim +; CHECK-LABEL: define {{.*}} i32 @f.specialized.2 ; CHECK: call i32 @sub(i32 %x, i32 %y) ; CHECK-NEXT: call i32 @add(i32 %x, i32 %y) -; CHECK-LABEL: define {{.*}} i32 @f.specialized.3.argelim +; CHECK-LABEL: define {{.*}} i32 @f.specialized.3 ; CHECK: call i32 @add(i32 %x, i32 %y) ; CHECK-NEXT: call i32 @add(i32 %x, i32 %y) diff --git a/llvm/test/Transforms/PhaseOrdering/dae-dce.ll b/llvm/test/Transforms/PhaseOrdering/dae-dce.ll index 389c2f5612488d..7ff3c5dc5536f7 100644 --- a/llvm/test/Transforms/PhaseOrdering/dae-dce.ll +++ b/llvm/test/Transforms/PhaseOrdering/dae-dce.ll @@ -14,8 +14,7 @@ define void @do_trap(ptr %ptr) { } define internal void @capture_and_trap(ptr %ptr) noinline { -; DEFAULT-LABEL: @capture_and_trap.argelim( -; LTO-LABEL: @capture_and_trap.argprom( +; CHECK-LABEL: @capture_and_trap( ; CHECK-NEXT: tail call void @llvm.trap() ; CHECK-NEXT: unreachable ; @@ -35,8 +34,7 @@ define internal void @dead_fn2() { define void @test(i1 %c) { ; CHECK-LABEL: @test( -; DEFAULT-NEXT: tail call fastcc void @capture_and_trap.argelim() -; LTO-NEXT: tail call fastcc void @capture_and_trap.argprom() +; CHECK-NEXT: tail call fastcc void @capture_and_trap() ; CHECK-NEXT: unreachable 
; br i1 %c, label %if, label %else diff --git a/llvm/test/Transforms/PhaseOrdering/dce-after-argument-promotion.ll b/llvm/test/Transforms/PhaseOrdering/dce-after-argument-promotion.ll index 9361ec16d23d55..c33fcfbe6ed973 100644 --- a/llvm/test/Transforms/PhaseOrdering/dce-after-argument-promotion.ll +++ b/llvm/test/Transforms/PhaseOrdering/dce-after-argument-promotion.ll @@ -8,7 +8,7 @@ ; CHECK: [[DUMMY:@.*]] = local_unnamed_addr global i32 0 define internal void @f(ptr byval(%struct.ss) align 8 %b, ptr byval(i32) align 4 %X) noinline nounwind { -; CHECK-LABEL: define {{[^@]+}}@f.argprom.argelim +; CHECK-LABEL: define {{[^@]+}}@f ; CHECK-SAME: (i32 [[B_0:%.*]]){{[^#]*}} #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TEMP:%.*]] = add i32 [[B_0]], 1 @@ -27,7 +27,7 @@ define i32 @test(ptr %X) { ; CHECK-LABEL: define {{[^@]+}}@test ; CHECK-SAME: (ptr {{[^%]*}} [[X:%.*]]){{[^#]*}} #[[ATTR1:[0-9]+]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: tail call {{.*}}void @f.argprom.argelim(i32 1) +; CHECK-NEXT: tail call {{.*}}void @f(i32 1) ; CHECK-NEXT: ret i32 0 ; entry: diff --git a/llvm/test/Transforms/SCCP/recursion.ll b/llvm/test/Transforms/SCCP/recursion.ll index bc036f71d0c7e2..f6556bee3eaba1 100644 --- a/llvm/test/Transforms/SCCP/recursion.ll +++ b/llvm/test/Transforms/SCCP/recursion.ll @@ -4,8 +4,8 @@ ; CHECK-NOT: %X define internal i32 @foo(i32 %X) { -; CHECK-LABEL: @foo.argelim( -; CHECK-NEXT: [[Y:%.*]] = call i32 @foo.argelim() +; CHECK-LABEL: @foo( +; CHECK-NEXT: [[Y:%.*]] = call i32 @foo() ; CHECK-NEXT: [[Z:%.*]] = add i32 [[Y]], 1 ; CHECK-NEXT: ret i32 [[Z]] ; @@ -16,7 +16,7 @@ define internal i32 @foo(i32 %X) { define void @bar() { ; CHECK-LABEL: @bar( -; CHECK-NEXT: [[TMP1:%.*]] = call i32 @foo.argelim() +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @foo() ; CHECK-NEXT: ret void ; call i32 @foo( i32 17 ) ; :1 [#uses=0] From 04ccbe6e70cf11e846da3fbc800832c6e56b573f Mon Sep 17 00:00:00 2001 From: Hans Wennborg Date: Thu, 19 Sep 2024 13:10:28 +0200 Subject: [PATCH 187/321] Fix typos in interception_win.cpp --- compiler-rt/lib/interception/interception_win.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/compiler-rt/lib/interception/interception_win.cpp b/compiler-rt/lib/interception/interception_win.cpp index a0ff124a89c9ed..1e613816010ed7 100644 --- a/compiler-rt/lib/interception/interception_win.cpp +++ b/compiler-rt/lib/interception/interception_win.cpp @@ -27,7 +27,7 @@ // // 1) Detour // -// The Detour hooking technique is assuming the presence of an header with +// The Detour hooking technique is assuming the presence of a header with // padding and an overridable 2-bytes nop instruction (mov edi, edi). The // nop instruction can safely be replaced by a 2-bytes jump without any need // to save the instruction. A jump to the target is encoded in the function @@ -47,7 +47,7 @@ // // func: jmp