From 3a2a469c1698f49dd3aee3b006caaeac321f061f Mon Sep 17 00:00:00 2001 From: Mariusz Sikora Date: Fri, 14 Jul 2023 13:34:45 +0200 Subject: [PATCH] llpc: optimize constant local arrays to constant globals when multiple stores are used MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For allocas of array type, detect the case where the array is filled by multiple stores of constant values to constant (and disjoint) indices, and promote such allocas to constant globals. Co-authored-by: Nicolai Hähnle --- .../llpcSpirvLowerConstImmediateStore.cpp | 274 ++++++++++-------- .../lower/llpcSpirvLowerConstImmediateStore.h | 6 +- ...estCombineOfMultipleStoreInstructions.frag | 87 ++++++ 3 files changed, 244 insertions(+), 123 deletions(-) create mode 100644 llpc/test/shaderdb/general/TestCombineOfMultipleStoreInstructions.frag diff --git a/llpc/lower/llpcSpirvLowerConstImmediateStore.cpp b/llpc/lower/llpcSpirvLowerConstImmediateStore.cpp index a27cd4ad41..a4fe78af1b 100644 --- a/llpc/lower/llpcSpirvLowerConstImmediateStore.cpp +++ b/llpc/lower/llpcSpirvLowerConstImmediateStore.cpp @@ -32,6 +32,7 @@ #include "SPIRVInternal.h" #include "llpcContext.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/Support/Debug.h" @@ -51,29 +52,21 @@ namespace Llpc { // @param [in/out] module : LLVM module to be run on (empty on entry) // @param [in/out] analysisManager : Analysis manager to use for this transformation PreservedAnalyses SpirvLowerConstImmediateStore::run(Module &module, ModuleAnalysisManager &analysisManager) { - runImpl(module); - return PreservedAnalyses::none(); -} - -// ===================================================================================================================== -// Executes this SPIR-V lowering pass on the specified LLVM module. -// -// @param [in/out] module : LLVM module to be run on -bool SpirvLowerConstImmediateStore::runImpl(Module &module) { LLVM_DEBUG(dbgs() << "Run the pass Spirv-Lower-Const-Immediate-Store\n"); SpirvLower::init(&module); // Process "alloca" instructions to see if they can be optimized to a read-only global // variable. - for (auto funcIt = module.begin(), funcItEnd = module.end(); funcIt != funcItEnd; ++funcIt) { - if (auto func = dyn_cast(&*funcIt)) { - if (!func->empty()) - processAllocaInsts(func); + bool changed = false; + for (auto &func : module.functions()) { + if (!func.empty()) { + if (processAllocaInsts(&func)) + changed |= true; } } - return true; + return changed ? PreservedAnalyses::allInSet() : PreservedAnalyses::all(); } // ===================================================================================================================== @@ -81,134 +74,177 @@ bool SpirvLowerConstImmediateStore::runImpl(Module &module) { // can be optimized to a read-only global variable. // // @param func : Function to process -void SpirvLowerConstImmediateStore::processAllocaInsts(Function *func) { +bool SpirvLowerConstImmediateStore::processAllocaInsts(Function *func) { // NOTE: We only visit the entry block on the basis that SPIR-V translator puts all "alloca" // instructions there. - auto entryBlock = &func->front(); - for (auto instIt = entryBlock->begin(), instItEnd = entryBlock->end(); instIt != instItEnd; ++instIt) { - auto inst = &*instIt; - if (auto allocaInst = dyn_cast(inst)) { - if (allocaInst->getAllocatedType()->isAggregateType()) { - // Got an "alloca" instruction of aggregate type. - auto storeInst = findSingleStore(allocaInst); - if (storeInst && isa(storeInst->getValueOperand())) { - // Got an aggregate "alloca" with a single store to the whole type. - // Do the optimization. - convertAllocaToReadOnlyGlobal(storeInst); - } - } + bool changed = false; + SmallVector candidates; + for (auto &inst : func->getEntryBlock()) { + if (auto allocaInst = dyn_cast(&inst)) { + if (allocaInst->getAllocatedType()->isAggregateType()) + candidates.push_back(allocaInst); } } + for (auto *alloca : candidates) { + if (tryProcessAlloca(alloca)) + changed |= true; + } + return changed; } // ===================================================================================================================== -// Finds the single "store" instruction storing to this pointer. -// -// Returns nullptr if no "store", multiple "store", or partial "store" instructions (store only part -// of the memory) are found. -// -// NOTE: This is conservative in that it returns nullptr if the pointer escapes by being used in anything -// other than "store" (as the pointer), "load" or "getelementptr" instruction. +// For a single alloca, try to replace it by a constant global variable. // // @param allocaInst : The "alloca" instruction to process -StoreInst *SpirvLowerConstImmediateStore::findSingleStore(AllocaInst *allocaInst) { - std::vector pointers; - bool isOuterPointer = true; - StoreInst *storeInstFound = nullptr; - Instruction *pointer = allocaInst; - while (true) { - for (auto useIt = pointer->use_begin(), useItEnd = pointer->use_end(); useIt != useItEnd; ++useIt) { - auto user = cast(useIt->getUser()); +// @return true if the alloca was replaced +bool SpirvLowerConstImmediateStore::tryProcessAlloca(AllocaInst *allocaInst) { + // LLVM IR allocas can have an "arrayness" where multiple elements of the allocated type are allocated at once. + // SPIR-V doesn't have this (because it only has OpVariable and not a "true" alloca), but let's guard against it + // anyway just in case. + if (allocaInst->isArrayAllocation()) + return false; + + auto *allocatedTy = allocaInst->getAllocatedType(); + auto *arrayTy = dyn_cast(allocatedTy); + + StoreInst *aggregateStore = nullptr; + DenseMap elementStores; + + SmallVector toErase; + SmallVector geps; + + // Step 1: Determine if the alloca can be converted and find the relevant store(s) + SmallVector>> pointers; + pointers.emplace_back(allocaInst, 0); + do { + auto [pointer, index] = pointers.pop_back_val(); + for (Use &use : pointer->uses()) { + auto user = cast(use.getUser()); if (auto storeInst = dyn_cast(user)) { - if (pointer == storeInst->getValueOperand() || storeInstFound || !isOuterPointer) { - // Pointer escapes by being stored, or we have already found a "store" - // instruction, or this is a partial "store" instruction. - return nullptr; + if (use.getOperandNo() != storeInst->getPointerOperandIndex() || !index.has_value()) { + // Pointer escapes by being stored, or we store to a dynamically indexed (or otherwise complex) pointer. + return false; } - storeInstFound = storeInst; - } else if (auto getElemPtrInst = dyn_cast(user)) - pointers.push_back(getElemPtrInst); - else if (!isa(user) && !isAssumeLikeIntrinsic(user)) { - // Pointer escapes by being used in some way other than "load/store/getelementptr". - return nullptr; - } - } - if (pointers.empty()) - break; + Value *storeValue = storeInst->getValueOperand(); + if (!isa(storeValue)) + return false; + + // We already have a store of the entire variable. Multiple stores means it's not an overall constant. + if (aggregateStore) + return false; + + if (storeValue->getType() == allocatedTy) { + if (index.value() != 0) { + // This store is out-of-bounds, which makes it UB if it is ever executed (it might be in control flow that + // is unreachable for some reason). Remember the store as to-be-erased and ignore it otherwise. + toErase.push_back(storeInst); + continue; + } + + if (!elementStores.empty()) + return false; + aggregateStore = storeInst; + continue; + } - pointer = pointers.back(); - pointers.pop_back(); - isOuterPointer = false; - } + if (arrayTy && storeValue->getType() == arrayTy->getArrayElementType()) { + if (index.value() >= arrayTy->getArrayNumElements()) { + // This store is out-of-bounds, which makes it UB if it is ever executed (it might be in control flow that + // is unreachable for some reason). Remember the store as to-be-erased and ignore it otherwise. + toErase.push_back(storeInst); + continue; + } - return storeInstFound; -} + if (!elementStores.try_emplace(index.value(), storeInst).second) + return false; + continue; + } -// ===================================================================================================================== -// Converts an "alloca" instruction with a single constant store into a read-only global variable. -// -// NOTE: This erases the "store" instruction (so it will not be lowered by a later lowering pass -// any more) but not the "alloca" or replaced "getelementptr" instruction (they will be removed -// later by DCE pass). -// -// @param storeInst : The single constant store into the "alloca" -void SpirvLowerConstImmediateStore::convertAllocaToReadOnlyGlobal(StoreInst *storeInst) { - auto allocaInst = cast(storeInst->getPointerOperand()); - auto globalType = allocaInst->getAllocatedType(); - auto initVal = cast(storeInst->getValueOperand()); + return false; + } - if (globalType != initVal->getType()) - return; + if (auto gep = dyn_cast(user)) { + geps.push_back(gep); + + if (index.has_value() && arrayTy && gep->getSourceElementType() == allocatedTy && + gep->hasAllConstantIndices() && gep->getNumIndices() == 2 && + cast(gep->getOperand(1))->isNullValue()) { + int64_t gepIdx = cast(gep->getOperand(2))->getSExtValue(); + if (gepIdx >= 0) { + pointers.emplace_back(gep, index.value() + gepIdx); + continue; + } + } - auto global = new GlobalVariable(*m_module, globalType, - true, // isConstant - GlobalValue::InternalLinkage, initVal, "", nullptr, GlobalValue::NotThreadLocal, - SPIRAS_Constant); - global->takeName(allocaInst); - // Change all uses of allocaInst to use global. We need to do it manually, as there is a change - // of address space, and we also need to recreate "getelementptr"s. - std::vector> allocaToGlobalMap; - allocaToGlobalMap.push_back(std::pair(allocaInst, global)); - do { - auto allocaInst = allocaToGlobalMap.back().first; - auto global = allocaToGlobalMap.back().second; - allocaToGlobalMap.pop_back(); - while (!allocaInst->use_empty()) { - auto useIt = allocaInst->use_begin(); - if (auto origGetElemPtrInst = dyn_cast(useIt->getUser())) { - // This use is a "getelementptr" instruction. Create a replacement one with the new address space. - SmallVector indices; - for (auto idxIt = origGetElemPtrInst->idx_begin(), idxItEnd = origGetElemPtrInst->idx_end(); idxIt != idxItEnd; - ++idxIt) - indices.push_back(*idxIt); - auto newGetElemPtrInst = GetElementPtrInst::Create(globalType, global, indices, "", origGetElemPtrInst); - newGetElemPtrInst->takeName(origGetElemPtrInst); - newGetElemPtrInst->setIsInBounds(origGetElemPtrInst->isInBounds()); - newGetElemPtrInst->copyMetadata(*origGetElemPtrInst); - // Remember that we need to replace the uses of the original "getelementptr" with the new one. - allocaToGlobalMap.push_back(std::pair(origGetElemPtrInst, newGetElemPtrInst)); - // Remove the use from the original "getelementptr". - *useIt = PoisonValue::get(allocaInst->getType()); + pointers.emplace_back(gep, std::nullopt); continue; } - if (auto *intrinsic = dyn_cast(useIt->getUser())) { - switch (intrinsic->getIntrinsicID()) { - case Intrinsic::lifetime_start: - case Intrinsic::lifetime_end: - // Lifetime markers are only useful for allocas, not for globals, and if we did not erase them we would have - // to change their name mangling because of the change of address space. - intrinsic->eraseFromParent(); - continue; - } + if (isa(user)) + continue; + + if (isAssumeLikeIntrinsic(user)) { + toErase.push_back(user); + continue; } - *useIt = global; + // Pointer escapes by being used in some way other than "load/store/getelementptr". + return false; + } + } while (!pointers.empty()); + + // Step 2: Extract or build the initializer constant + Constant *initializer = nullptr; + + if (aggregateStore) { + initializer = cast(aggregateStore->getValueOperand()); + } else if (!elementStores.empty()) { + // Give up if the array is 4x larger than the number of element stores. This is a fairly arbitrary heuristic to + // prevent a super-linear blow-up of the size of IR. (Imagine input IR that defines a giant array and writes to + // only a single element of it.) + if (arrayTy->getArrayNumElements() / 4 > elementStores.size()) + return false; + + std::vector elements; + elements.resize(arrayTy->getArrayNumElements()); + + for (uint64_t index = 0; index < elements.size(); ++index) { + if (auto *store = elementStores.lookup(index)) + elements[index] = cast(store->getValueOperand()); + else + elements[index] = PoisonValue::get(arrayTy->getElementType()); } - // Visit next map pair. - } while (!allocaToGlobalMap.empty()); - storeInst->eraseFromParent(); + + initializer = ConstantArray::get(arrayTy, elements); + } else { + initializer = PoisonValue::get(allocatedTy); + } + + // Step 3: Create the global variable and replace the alloca + auto global = new GlobalVariable(*m_module, allocatedTy, + true, // isConstant + GlobalValue::InternalLinkage, initializer, "", nullptr, GlobalValue::NotThreadLocal, + SPIRAS_Constant); + global->takeName(allocaInst); + + for (Use &use : llvm::make_early_inc_range(allocaInst->uses())) + use.set(global); + + for (auto *gep : geps) + gep->mutateType(global->getType()); + + for (auto *inst : toErase) + inst->eraseFromParent(); + if (aggregateStore) + aggregateStore->eraseFromParent(); + for (auto [index, store] : elementStores) { + if (store) + store->eraseFromParent(); + } + allocaInst->eraseFromParent(); + + return true; } } // namespace Llpc diff --git a/llpc/lower/llpcSpirvLowerConstImmediateStore.h b/llpc/lower/llpcSpirvLowerConstImmediateStore.h index e65b45553d..db38ce5b38 100644 --- a/llpc/lower/llpcSpirvLowerConstImmediateStore.h +++ b/llpc/lower/llpcSpirvLowerConstImmediateStore.h @@ -45,14 +45,12 @@ namespace Llpc { class SpirvLowerConstImmediateStore : public SpirvLower, public llvm::PassInfoMixin { public: llvm::PreservedAnalyses run(llvm::Module &module, llvm::ModuleAnalysisManager &analysisManager); - bool runImpl(llvm::Module &module); static llvm::StringRef name() { return "Lower SPIR-V constant immediate store"; } private: - void processAllocaInsts(llvm::Function *func); - llvm::StoreInst *findSingleStore(llvm::AllocaInst *allocaInst); - void convertAllocaToReadOnlyGlobal(llvm::StoreInst *storeInst); + bool processAllocaInsts(llvm::Function *func); + bool tryProcessAlloca(llvm::AllocaInst *allocaInst); }; } // namespace Llpc diff --git a/llpc/test/shaderdb/general/TestCombineOfMultipleStoreInstructions.frag b/llpc/test/shaderdb/general/TestCombineOfMultipleStoreInstructions.frag new file mode 100644 index 0000000000..749a3d27a2 --- /dev/null +++ b/llpc/test/shaderdb/general/TestCombineOfMultipleStoreInstructions.frag @@ -0,0 +1,87 @@ +#version 460 + +// BEGIN_SHADERTEST +// RUN: amdllpc -verify-ir -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s + +// SHADERTEST-LABEL: {{^// LLPC}} SPIR-V lowering results +// SHADERTEST: @{{.*}} = internal unnamed_addr addrspace(4) constant [54 x float] [float 0.000000e+00, float 0.000000e+00, float 1.000000e+00, float 0.000000e+00, float 1.000000e+00, float 0.000000e+00, float -1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float -1.000000e+00, float 0.000000e+00, float 1.000000e+00, float 0.000000e+00, float 1.000000e+00, float 0.000000e+00, float 0.000000e+00, float -1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 1.000000e+00, float 0.000000e+00, float 1.000000e+00, float 0.000000e+00, float -1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float -1.000000e+00, float 0.000000e+00, float -1.000000e+00, float 0.000000e+00, float 1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 1.000000e+00, float -1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float -1.000000e+00] +// SHADERTEST: AMDLLPC SUCCESS +// END_SHADERTEST + +layout(push_constant, std430) uniform RootConstants +{ + uvec2 _m0; + uint _m1; + uint _m2; + uint _m3; + uint _m4; + uint _m5; + uint _m6; + uint _m7; +} registers; + +layout(location = 0) out vec4 SV_Target; +float _49[54]; + +void main() +{ + _49[0u] = 0.0; + _49[1u] = 0.0; + _49[2u] = 1.0; + _49[3u] = 0.0; + _49[4u] = 1.0; + _49[5u] = 0.0; + _49[6u] = -1.0; + _49[7u] = 0.0; + _49[8u] = 0.0; + _49[9u] = 0.0; + _49[10u] = 0.0; + _49[11u] = -1.0; + _49[12u] = 0.0; + _49[13u] = 1.0; + _49[14u] = 0.0; + _49[15u] = 1.0; + _49[16u] = 0.0; + _49[17u] = 0.0; + _49[18u] = -1.0; + _49[19u] = 0.0; + _49[20u] = 0.0; + _49[21u] = 0.0; + _49[22u] = 0.0; + _49[23u] = 1.0; + _49[24u] = 0.0; + _49[25u] = 1.0; + _49[26u] = 0.0; + _49[27u] = -1.0; + _49[28u] = 0.0; + _49[29u] = 0.0; + _49[30u] = 0.0; + _49[31u] = 0.0; + _49[32u] = -1.0; + _49[33u] = 0.0; + _49[34u] = -1.0; + _49[35u] = 0.0; + _49[36u] = 1.0; + _49[37u] = 0.0; + _49[38u] = 0.0; + _49[39u] = 0.0; + _49[40u] = 1.0; + _49[41u] = 0.0; + _49[42u] = 0.0; + _49[43u] = 0.0; + _49[44u] = 1.0; + _49[45u] = -1.0; + _49[46u] = 0.0; + _49[47u] = 0.0; + _49[48u] = 0.0; + _49[49u] = 1.0; + _49[50u] = 0.0; + _49[51u] = 0.0; + _49[52u] = 0.0; + _49[53u] = -1.0; + + uvec4 _160 = uvec4(registers._m1, registers._m2, registers._m3, registers._m4); + SV_Target.z = _49[2u + _160.z]; + SV_Target.w = _49[3u + _160.w]; +} +