From d5533b440c764093c04a4859b30fc78ddb0e0bbe Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Tue, 25 Jun 2019 13:03:51 -0400 Subject: [PATCH] shader_ir: Unify blocks in decompiled shaders. --- .../renderer_opengl/gl_shader_decompiler.cpp | 10 ++-- src/video_core/shader/control_flow.cpp | 47 ++++++------------- src/video_core/shader/control_flow.h | 3 +- src/video_core/shader/decode.cpp | 45 +++++++++++++----- src/video_core/shader/decode/other.cpp | 30 +++++++++--- src/video_core/shader/node.h | 12 ++--- src/video_core/shader/shader_ir.h | 6 +++ 7 files changed, 90 insertions(+), 63 deletions(-) diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp index cedfe30b1d..bfc975a04d 100644 --- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp +++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp @@ -191,10 +191,12 @@ public: // TODO(Subv): Figure out the actual depth of the flow stack, for now it seems // unlikely that shaders will use 20 nested SSYs and PBKs. - constexpr u32 FLOW_STACK_SIZE = 20; - for (const auto stack : std::array{MetaStackClass::Ssy, MetaStackClass::Pbk}) { - code.AddLine("uint {}[{}];", FlowStackName(stack), FLOW_STACK_SIZE); - code.AddLine("uint {} = 0u;", FlowStackTopName(stack)); + if (!ir.IsFlowStackDisabled()) { + constexpr u32 FLOW_STACK_SIZE = 20; + for (const auto stack : std::array{MetaStackClass::Ssy, MetaStackClass::Pbk}) { + code.AddLine("uint {}[{}];", FlowStackName(stack), FLOW_STACK_SIZE); + code.AddLine("uint {} = 0u;", FlowStackTopName(stack)); + } } code.AddLine("while (true) {{"); diff --git a/src/video_core/shader/control_flow.cpp b/src/video_core/shader/control_flow.cpp index 3af4c61902..c99d95b574 100644 --- a/src/video_core/shader/control_flow.cpp +++ b/src/video_core/shader/control_flow.cpp @@ -1,5 +1,6 @@ #include +#include #include #include #include @@ -104,28 +105,6 @@ struct BlockInfo { } }; -struct Stamp { - Stamp() = default; - Stamp(u32 address, u32 target) : address{address}, target{target} {} - u32 address{}; - u32 target{}; - bool operator==(const Stamp& sb) const { - return std::tie(address, target) == std::tie(sb.address, sb.target); - } - bool operator<(const Stamp& sb) const { - return address < sb.address; - } - bool operator>(const Stamp& sb) const { - return address > sb.address; - } - bool operator<=(const Stamp& sb) const { - return address <= sb.address; - } - bool operator>=(const Stamp& sb) const { - return address >= sb.address; - } -}; - struct CFGRebuildState { explicit CFGRebuildState(const ProgramCode& program_code, const std::size_t program_size) : program_code{program_code}, program_size{program_size} { @@ -144,8 +123,8 @@ struct CFGRebuildState { std::list queries{}; std::unordered_map registered{}; std::unordered_set labels{}; - std::set ssy_labels; - std::set pbk_labels; + std::map ssy_labels; + std::map pbk_labels; std::unordered_map stacks{}; const ProgramCode& program_code; const std::size_t program_size; @@ -393,7 +372,7 @@ bool TryInspectAddress(CFGRebuildState& state) { } case BlockCollision::Inside: { // This case is the tricky one: - // We need to Split the block in 2 sepprate blocks + // We need to Split the block in 2 sepparate blocks auto it = search_result.second; block_info = CreateBlockInfo(state, address, it->end); it->end = address - 1; @@ -428,13 +407,11 @@ bool TryInspectAddress(CFGRebuildState& state) { } bool TryQuery(CFGRebuildState& state) { - auto gather_labels = ([](ControlStack& cc, std::set labels, BlockInfo& block) { - Stamp start{block.start, 0}; - Stamp end{block.end, 0}; - auto gather_start = labels.lower_bound(start); - auto gather_end = labels.upper_bound(end); + auto gather_labels = ([](ControlStack& cc, std::map& labels, BlockInfo& block) { + auto gather_start = labels.lower_bound(block.start); + auto gather_end = labels.upper_bound(block.end); while (gather_start != gather_end) { - cc.Push(gather_start->target); + cc.Push(gather_start->second); gather_start++; } }); @@ -444,9 +421,13 @@ bool TryQuery(CFGRebuildState& state) { Query& q = state.queries.front(); u32 block_index = state.registered[q.address]; BlockInfo& block = state.block_info[block_index]; + // If the block is visted, check if the stacks match, else gather the ssy/pbk + // labels into the current stack and look if the branch at the end of the block + // consumes a label. Schedule new queries accordingly if (block.visited) { BlockStack& stack = state.stacks[q.address]; - bool all_okay = q.ssy_stack.Compare(stack.ssy_stack) && q.pbk_stack.Compare(stack.pbk_stack); + bool all_okay = (stack.ssy_stack.Size() == 0 || q.ssy_stack.Compare(stack.ssy_stack)) && + (stack.pbk_stack.Size() == 0 || q.pbk_stack.Compare(stack.pbk_stack)); state.queries.pop_front(); return all_okay; } @@ -523,8 +504,10 @@ bool ScanFlow(const ProgramCode& program_code, u32 program_size, u32 start_addre result_out.blocks.push_back(new_block); } if (result_out.decompilable) { + result_out.labels = std::move(state.labels); return true; } + // If it's not decompilable, merge the unlabelled blocks together auto back = result_out.blocks.begin(); auto next = std::next(back); while (next != result_out.blocks.end()) { diff --git a/src/video_core/shader/control_flow.h b/src/video_core/shader/control_flow.h index f5d37a231f..4a2cd622c9 100644 --- a/src/video_core/shader/control_flow.h +++ b/src/video_core/shader/control_flow.h @@ -3,7 +3,7 @@ #include #include #include -#include +#include #include "video_core/engines/shader_bytecode.h" #include "video_core/shader/shader_ir.h" @@ -48,6 +48,7 @@ struct ShaderCharacteristics { bool decompilable{}; u32 start; u32 end; + std::unordered_set labels{}; }; bool ScanFlow(const ProgramCode& program_code, u32 program_size, u32 start_address, diff --git a/src/video_core/shader/decode.cpp b/src/video_core/shader/decode.cpp index 1a74b70cb4..f9b1960daa 100644 --- a/src/video_core/shader/decode.cpp +++ b/src/video_core/shader/decode.cpp @@ -38,32 +38,47 @@ constexpr bool IsSchedInstruction(u32 offset, u32 main_offset) { void ShaderIR::Decode() { std::memcpy(&header, program_code.data(), sizeof(Tegra::Shader::Header)); + disable_flow_stack = false; ShaderCharacteristics shader_info{}; bool can_proceed = ScanFlow(program_code, program_code.size(), main_offset, shader_info); if (can_proceed) { coverage_begin = shader_info.start; coverage_end = shader_info.end; if (shader_info.decompilable) { - std::list& blocks = shader_info.blocks; - for (auto& block : blocks) { - NodeBlock nodes; - if (!block.ignore_branch) { - nodes = DecodeRange(block.start, block.end); - InsertControlFlow(nodes, block); - } else { - nodes = DecodeRange(block.start, block.end + 1); + disable_flow_stack = true; + auto insert_block = ([this](NodeBlock& nodes, u32 label) { + if (label == exit_branch) { + return; + } + basic_blocks.insert({label, nodes}); + }); + std::list& blocks = shader_info.blocks; + NodeBlock current_block; + u32 current_label = exit_branch; + for (auto& block : blocks) { + if (shader_info.labels.count(block.start) != 0) { + insert_block(current_block, current_label); + current_block.clear(); + current_label = block.start; + } + if (!block.ignore_branch) { + DecodeRangeInner(current_block, block.start, block.end); + InsertControlFlow(current_block, block); + } else { + DecodeRangeInner(current_block, block.start, block.end + 1); } - basic_blocks.insert({block.start, nodes}); } + insert_block(current_block, current_label); return; } + LOG_WARNING(HW_GPU, "Flow Stack Removing Failed! Falling back to old method"); // we can't decompile it, fallback to standard method for (const auto& block : shader_info.blocks) { basic_blocks.insert({block.start, DecodeRange(block.start, block.end + 1)}); } return; } - LOG_WARNING(HW_GPU, "Flow Analysis failed, falling back to brute force compiling"); + LOG_WARNING(HW_GPU, "Flow Analysis Failed! Falling back to brute force compiling"); // Now we need to deal with an undecompilable shader. We need to brute force // a shader that captures every position. @@ -78,12 +93,16 @@ void ShaderIR::Decode() { NodeBlock ShaderIR::DecodeRange(u32 begin, u32 end) { NodeBlock basic_block; - for (u32 pc = begin; pc < (begin > end ? MAX_PROGRAM_LENGTH : end);) { - pc = DecodeInstr(basic_block, pc); - } + DecodeRangeInner(basic_block, begin, end); return basic_block; } +void ShaderIR::DecodeRangeInner(NodeBlock& bb, u32 begin, u32 end) { + for (u32 pc = begin; pc < (begin > end ? MAX_PROGRAM_LENGTH : end);) { + pc = DecodeInstr(bb, pc); + } +} + void ShaderIR::InsertControlFlow(NodeBlock& bb, const ShaderBlock& block) { auto apply_conditions = ([&](const Condition& cond, Node n) -> Node { Node result = n; diff --git a/src/video_core/shader/decode/other.cpp b/src/video_core/shader/decode/other.cpp index ed3c63781f..42e3de02fc 100644 --- a/src/video_core/shader/decode/other.cpp +++ b/src/video_core/shader/decode/other.cpp @@ -98,9 +98,10 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) { } else { const u32 target = pc + 1; const Node op_a = GetConstBuffer(instr.cbuf36.index, instr.cbuf36.GetOffset()); - const Node convert = SignedOperation(OperationCode::IArithmeticShiftRight, - true, PRECISE, op_a, Immediate(3)); - const Node operand = Operation(OperationCode::IAdd, PRECISE, convert, Immediate(target)); + const Node convert = SignedOperation(OperationCode::IArithmeticShiftRight, true, + PRECISE, op_a, Immediate(3)); + const Node operand = + Operation(OperationCode::IAdd, PRECISE, convert, Immediate(target)); branch = Operation(OperationCode::BranchIndirect, convert); } @@ -119,14 +120,14 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) { const Node index = GetRegister(instr.gpr8); const Node op_a = GetConstBufferIndirect(instr.cbuf36.index, instr.cbuf36.GetOffset() + 0, index); - const Node convert = SignedOperation(OperationCode::IArithmeticShiftRight, - true, PRECISE, op_a, Immediate(3)); + const Node convert = SignedOperation(OperationCode::IArithmeticShiftRight, true, + PRECISE, op_a, Immediate(3)); operand = Operation(OperationCode::IAdd, PRECISE, convert, Immediate(target)); } else { const s32 target = pc + instr.brx.GetBranchExtend(); const Node op_a = GetRegister(instr.gpr8); - const Node convert = SignedOperation(OperationCode::IArithmeticShiftRight, - true, PRECISE, op_a, Immediate(3)); + const Node convert = SignedOperation(OperationCode::IArithmeticShiftRight, true, + PRECISE, op_a, Immediate(3)); operand = Operation(OperationCode::IAdd, PRECISE, convert, Immediate(target)); } const Node branch = Operation(OperationCode::BranchIndirect, operand); @@ -143,6 +144,10 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) { UNIMPLEMENTED_IF_MSG(instr.bra.constant_buffer != 0, "Constant buffer flow is not supported"); + if (disable_flow_stack) { + break; + } + // The SSY opcode tells the GPU where to re-converge divergent execution paths with SYNC. const u32 target = pc + instr.bra.GetBranchTarget(); bb.push_back( @@ -153,6 +158,10 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) { UNIMPLEMENTED_IF_MSG(instr.bra.constant_buffer != 0, "Constant buffer PBK is not supported"); + if (disable_flow_stack) { + break; + } + // PBK pushes to a stack the address where BRK will jump to. const u32 target = pc + instr.bra.GetBranchTarget(); bb.push_back( @@ -164,6 +173,10 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) { UNIMPLEMENTED_IF_MSG(cc != Tegra::Shader::ConditionCode::T, "SYNC condition code used: {}", static_cast(cc)); + if (disable_flow_stack) { + break; + } + // The SYNC opcode jumps to the address previously set by the SSY opcode bb.push_back(Operation(OperationCode::PopFlowStack, MetaStackClass::Ssy)); break; @@ -172,6 +185,9 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) { const Tegra::Shader::ConditionCode cc = instr.flow_condition_code; UNIMPLEMENTED_IF_MSG(cc != Tegra::Shader::ConditionCode::T, "BRK condition code used: {}", static_cast(cc)); + if (disable_flow_stack) { + break; + } // The BRK opcode jumps to the address previously set by the PBK opcode bb.push_back(Operation(OperationCode::PopFlowStack, MetaStackClass::Pbk)); diff --git a/src/video_core/shader/node.h b/src/video_core/shader/node.h index e468758a6d..7427ed896d 100644 --- a/src/video_core/shader/node.h +++ b/src/video_core/shader/node.h @@ -148,12 +148,12 @@ enum class OperationCode { ImageStore, /// (MetaImage, float[N] coords) -> void - Branch, /// (uint branch_target) -> void - BranchIndirect,/// (uint branch_target) -> void - PushFlowStack, /// (uint branch_target) -> void - PopFlowStack, /// () -> void - Exit, /// () -> void - Discard, /// () -> void + Branch, /// (uint branch_target) -> void + BranchIndirect, /// (uint branch_target) -> void + PushFlowStack, /// (uint branch_target) -> void + PopFlowStack, /// () -> void + Exit, /// () -> void + Discard, /// () -> void EmitVertex, /// () -> void EndPrimitive, /// () -> void diff --git a/src/video_core/shader/shader_ir.h b/src/video_core/shader/shader_ir.h index a6729064b8..928ac7cb5c 100644 --- a/src/video_core/shader/shader_ir.h +++ b/src/video_core/shader/shader_ir.h @@ -123,10 +123,15 @@ public: return header; } + bool IsFlowStackDisabled() const { + return disable_flow_stack; + } + private: void Decode(); NodeBlock DecodeRange(u32 begin, u32 end); + void DecodeRangeInner(NodeBlock& bb, u32 begin, u32 end); void InsertControlFlow(NodeBlock& bb, const ShaderBlock& block); /** @@ -320,6 +325,7 @@ private: const ProgramCode& program_code; const u32 main_offset; const std::size_t program_size; + bool disable_flow_stack{}; u32 coverage_begin{}; u32 coverage_end{};