diff --git a/src/dynarmic/src/dynarmic/backend/x64/a32_emit_x64.cpp b/src/dynarmic/src/dynarmic/backend/x64/a32_emit_x64.cpp index ef6cac302d..59d0bc13c4 100644 --- a/src/dynarmic/src/dynarmic/backend/x64/a32_emit_x64.cpp +++ b/src/dynarmic/src/dynarmic/backend/x64/a32_emit_x64.cpp @@ -102,19 +102,14 @@ A32EmitX64::BlockDescriptor A32EmitX64::Emit(IR::Block& block) { } code.EnableWriting(); - - const boost::container::static_vector gpr_order = [this] { - boost::container::static_vector gprs{any_gpr}; - if (conf.fastmem_pointer) { - gprs.erase(std::find(gprs.begin(), gprs.end(), HostLoc::R13)); - } - if (conf.page_table) { - gprs.erase(std::find(gprs.begin(), gprs.end(), HostLoc::R14)); - } + new (&this->reg_alloc) RegAlloc([this] { + std::bitset<32> gprs{any_gpr}; + if (conf.fastmem_pointer) + gprs.reset(size_t(HostLoc::R13)); + if (conf.page_table) + gprs.reset(size_t(HostLoc::R14)); return gprs; - }(); - - new (&this->reg_alloc) RegAlloc(gpr_order, any_xmm); + }(), any_xmm); A32EmitContext ctx{conf, reg_alloc, block}; // Start emitting. 
diff --git a/src/dynarmic/src/dynarmic/backend/x64/a64_emit_x64.cpp b/src/dynarmic/src/dynarmic/backend/x64/a64_emit_x64.cpp index 97faaa7ec4..825ff6f9cf 100644 --- a/src/dynarmic/src/dynarmic/backend/x64/a64_emit_x64.cpp +++ b/src/dynarmic/src/dynarmic/backend/x64/a64_emit_x64.cpp @@ -76,18 +76,14 @@ A64EmitX64::BlockDescriptor A64EmitX64::Emit(IR::Block& block) noexcept { } code.EnableWriting(); - const boost::container::static_vector gpr_order = [this] { - boost::container::static_vector gprs{any_gpr}; - if (conf.fastmem_pointer) { - gprs.erase(std::find(gprs.begin(), gprs.end(), HostLoc::R13)); - } - if (conf.page_table) { - gprs.erase(std::find(gprs.begin(), gprs.end(), HostLoc::R14)); - } + new (&this->reg_alloc) RegAlloc{[this] { + std::bitset<32> gprs = any_gpr; + if (conf.fastmem_pointer) + gprs.reset(size_t(HostLoc::R13)); + if (conf.page_table) + gprs.reset(size_t(HostLoc::R14)); return gprs; - }(); - - new (&this->reg_alloc) RegAlloc{gpr_order, any_xmm}; + }(), any_xmm}; A64EmitContext ctx{conf, reg_alloc, block}; // Start emitting. @@ -188,7 +184,7 @@ void A64EmitX64::ClearFastDispatchTable() { void A64EmitX64::GenTerminalHandlers() { // PC ends up in rcx, location_descriptor ends up in rbx - static_assert(std::find(ABI_ALL_CALLEE_SAVE.begin(), ABI_ALL_CALLEE_SAVE.end(), HostLoc::R12) != ABI_ALL_CALLEE_SAVE.end()); + static_assert(ABI_ALL_CALLEE_SAVE[size_t(HostLoc::R12)]); const auto calculate_location_descriptor = [this] { // This calculation has to match up with A64::LocationDescriptor::UniqueHash // TODO: Optimization is available here based on known state of fpcr. 
diff --git a/src/dynarmic/src/dynarmic/backend/x64/abi.cpp b/src/dynarmic/src/dynarmic/backend/x64/abi.cpp index 14f1b287ac..413af7b557 100644 --- a/src/dynarmic/src/dynarmic/backend/x64/abi.cpp +++ b/src/dynarmic/src/dynarmic/backend/x64/abi.cpp @@ -40,59 +40,53 @@ static FrameInfo CalculateFrameInfo(const size_t num_gprs, const size_t num_xmms }; } -template -void ABI_PushRegistersAndAdjustStack(BlockOfCode& code, const size_t frame_size, const RegisterArrayT& regs) { +void ABI_PushRegistersAndAdjustStack(BlockOfCode& code, const size_t frame_size, std::bitset<32> const& regs) { using namespace Xbyak::util; - const size_t num_gprs = std::count_if(regs.begin(), regs.end(), HostLocIsGPR); - const size_t num_xmms = std::count_if(regs.begin(), regs.end(), HostLocIsXMM); + const size_t num_gprs = (ABI_ALL_GPRS & regs).count(); + const size_t num_xmms = (ABI_ALL_XMMS & regs).count(); const FrameInfo frame_info = CalculateFrameInfo(num_gprs, num_xmms, frame_size); - for (auto const gpr : regs) - if (HostLocIsGPR(gpr)) - code.push(HostLocToReg64(gpr)); + for (size_t i = 0; i < regs.size(); ++i) + if (regs[i] && HostLocIsGPR(HostLoc(i))) + code.push(HostLocToReg64(HostLoc(i))); if (frame_info.stack_subtraction != 0) code.sub(rsp, u32(frame_info.stack_subtraction)); size_t xmm_offset = frame_info.xmm_offset; - for (auto const xmm : regs) { - if (HostLocIsXMM(xmm)) { + for (size_t i = 0; i < regs.size(); ++i) { + if (regs[i] && HostLocIsXMM(HostLoc(i))) { if (code.HasHostFeature(HostFeature::AVX)) { - code.vmovaps(code.xword[rsp + xmm_offset], HostLocToXmm(xmm)); + code.vmovaps(code.xword[rsp + xmm_offset], HostLocToXmm(HostLoc(i))); } else { - code.movaps(code.xword[rsp + xmm_offset], HostLocToXmm(xmm)); + code.movaps(code.xword[rsp + xmm_offset], HostLocToXmm(HostLoc(i))); } xmm_offset += XMM_SIZE; } } } -template -void ABI_PopRegistersAndAdjustStack(BlockOfCode& code, const size_t frame_size, const RegisterArrayT& regs) { +void ABI_PopRegistersAndAdjustStack(BlockOfCode& 
code, const size_t frame_size, std::bitset<32> const& regs) { using namespace Xbyak::util; - const size_t num_gprs = std::count_if(regs.begin(), regs.end(), HostLocIsGPR); - const size_t num_xmms = std::count_if(regs.begin(), regs.end(), HostLocIsXMM); + const size_t num_gprs = (ABI_ALL_GPRS & regs).count(); + const size_t num_xmms = (ABI_ALL_XMMS & regs).count(); const FrameInfo frame_info = CalculateFrameInfo(num_gprs, num_xmms, frame_size); size_t xmm_offset = frame_info.xmm_offset + (num_xmms * XMM_SIZE); - for (auto it = regs.rbegin(); it != regs.rend(); ++it) { - auto const xmm = *it; - if (HostLocIsXMM(xmm)) { + for (int32_t i = regs.size() - 1; i >= 0; --i) + if (regs[i] && HostLocIsXMM(HostLoc(i))) { xmm_offset -= XMM_SIZE; if (code.HasHostFeature(HostFeature::AVX)) { - code.vmovaps(HostLocToXmm(xmm), code.xword[rsp + xmm_offset]); + code.vmovaps(HostLocToXmm(HostLoc(i)), code.xword[rsp + xmm_offset]); } else { - code.movaps(HostLocToXmm(xmm), code.xword[rsp + xmm_offset]); + code.movaps(HostLocToXmm(HostLoc(i)), code.xword[rsp + xmm_offset]); } } - } if (frame_info.stack_subtraction != 0) code.add(rsp, u32(frame_info.stack_subtraction)); - for (auto it = regs.rbegin(); it != regs.rend(); ++it) { - auto const gpr = *it; - if (HostLocIsGPR(gpr)) - code.pop(HostLocToReg64(gpr)); - } + for (int32_t i = regs.size() - 1; i >= 0; --i) + if (regs[i] && HostLocIsGPR(HostLoc(i))) + code.pop(HostLocToReg64(HostLoc(i))); } void ABI_PushCalleeSaveRegistersAndAdjustStack(BlockOfCode& code, const std::size_t frame_size) { @@ -112,74 +106,16 @@ void ABI_PopCallerSaveRegistersAndAdjustStack(BlockOfCode& code, const std::size } // Windows ABI registers are not in the same allocation algorithm as unix's -#ifdef _MSC_VER void ABI_PushCallerSaveRegistersAndAdjustStackExcept(BlockOfCode& code, const HostLoc exception) { - std::vector regs; - std::remove_copy(ABI_ALL_CALLER_SAVE.begin(), ABI_ALL_CALLER_SAVE.end(), std::back_inserter(regs), exception); + std::bitset<32> regs = 
ABI_ALL_CALLER_SAVE; + regs.reset(size_t(exception)); ABI_PushRegistersAndAdjustStack(code, 0, regs); } void ABI_PopCallerSaveRegistersAndAdjustStackExcept(BlockOfCode& code, const HostLoc exception) { - std::vector regs; - std::remove_copy(ABI_ALL_CALLER_SAVE.begin(), ABI_ALL_CALLER_SAVE.end(), std::back_inserter(regs), exception); + std::bitset<32> regs = ABI_ALL_CALLER_SAVE; + regs.reset(size_t(exception)); ABI_PopRegistersAndAdjustStack(code, 0, regs); } -#else -static consteval size_t ABI_AllCallerSaveSize() noexcept { - return ABI_ALL_CALLER_SAVE.max_size(); -} -static consteval std::array ABI_AllCallerSaveExcept(const std::size_t except) noexcept { - std::array arr; - for(std::size_t i = 0; i < arr.size(); ++i) { - arr[i] = static_cast(i + (i >= except ? 1 : 0)); - } - return arr; -} - -alignas(64) static constinit std::array ABI_CALLER_SAVED_EXCEPT_TABLE[32] = { - ABI_AllCallerSaveExcept(0), - ABI_AllCallerSaveExcept(1), - ABI_AllCallerSaveExcept(2), - ABI_AllCallerSaveExcept(3), - ABI_AllCallerSaveExcept(4), - ABI_AllCallerSaveExcept(5), - ABI_AllCallerSaveExcept(6), - ABI_AllCallerSaveExcept(7), - ABI_AllCallerSaveExcept(8), - ABI_AllCallerSaveExcept(9), - ABI_AllCallerSaveExcept(10), - ABI_AllCallerSaveExcept(11), - ABI_AllCallerSaveExcept(12), - ABI_AllCallerSaveExcept(13), - ABI_AllCallerSaveExcept(14), - ABI_AllCallerSaveExcept(15), - ABI_AllCallerSaveExcept(16), - ABI_AllCallerSaveExcept(17), - ABI_AllCallerSaveExcept(18), - ABI_AllCallerSaveExcept(19), - ABI_AllCallerSaveExcept(20), - ABI_AllCallerSaveExcept(21), - ABI_AllCallerSaveExcept(22), - ABI_AllCallerSaveExcept(23), - ABI_AllCallerSaveExcept(24), - ABI_AllCallerSaveExcept(25), - ABI_AllCallerSaveExcept(26), - ABI_AllCallerSaveExcept(27), - ABI_AllCallerSaveExcept(28), - ABI_AllCallerSaveExcept(29), - ABI_AllCallerSaveExcept(30), - ABI_AllCallerSaveExcept(31), -}; - -void ABI_PushCallerSaveRegistersAndAdjustStackExcept(BlockOfCode& code, const HostLoc exception) { - ASSUME(size_t(exception) < 
32); - ABI_PushRegistersAndAdjustStack(code, 0, ABI_CALLER_SAVED_EXCEPT_TABLE[size_t(exception)]); -} - -void ABI_PopCallerSaveRegistersAndAdjustStackExcept(BlockOfCode& code, const HostLoc exception) { - ASSUME(size_t(exception) < 32); - ABI_PopRegistersAndAdjustStack(code, 0, ABI_CALLER_SAVED_EXCEPT_TABLE[size_t(exception)]); -} -#endif } // namespace Dynarmic::Backend::X64 diff --git a/src/dynarmic/src/dynarmic/backend/x64/abi.h b/src/dynarmic/src/dynarmic/backend/x64/abi.h index 307817a864..c37910ce22 100644 --- a/src/dynarmic/src/dynarmic/backend/x64/abi.h +++ b/src/dynarmic/src/dynarmic/backend/x64/abi.h @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project +// SPDX-FileCopyrightText: Copyright 2026 Eden Emulator Project // SPDX-License-Identifier: GPL-3.0-or-later /* This file is part of the dynarmic project. @@ -8,15 +8,52 @@ #pragma once #include +#include #include "dynarmic/common/common_types.h" - #include "dynarmic/backend/x64/hostloc.h" namespace Dynarmic::Backend::X64 { class BlockOfCode; +constexpr std::bitset<32> ABI_ALL_GPRS = BuildRegSet({ + HostLoc::RAX, + HostLoc::RBX, + HostLoc::RCX, + HostLoc::RDX, + HostLoc::RDI, + HostLoc::RSI, + HostLoc::RBP, + HostLoc::RSP, + HostLoc::R8, + HostLoc::R9, + HostLoc::R10, + HostLoc::R11, + HostLoc::R12, + HostLoc::R13, + HostLoc::R14, + HostLoc::R15, +}); +constexpr std::bitset<32> ABI_ALL_XMMS = BuildRegSet({ + HostLoc::XMM0, + HostLoc::XMM1, + HostLoc::XMM2, + HostLoc::XMM3, + HostLoc::XMM4, + HostLoc::XMM5, + HostLoc::XMM6, + HostLoc::XMM7, + HostLoc::XMM8, + HostLoc::XMM9, + HostLoc::XMM10, + HostLoc::XMM11, + HostLoc::XMM12, + HostLoc::XMM13, + HostLoc::XMM14, + HostLoc::XMM15, +}); + constexpr HostLoc ABI_JIT_PTR = HostLoc::R15; #ifdef _WIN32 @@ -29,7 +66,7 @@ constexpr HostLoc ABI_PARAM2 = HostLoc::RDX; constexpr HostLoc ABI_PARAM3 = HostLoc::R8; constexpr HostLoc ABI_PARAM4 = HostLoc::R9; -constexpr std::array ABI_ALL_CALLER_SAVE = { +constexpr std::bitset<32> 
ABI_ALL_CALLER_SAVE = BuildRegSet({ HostLoc::RAX, HostLoc::RCX, HostLoc::RDX, @@ -43,9 +80,9 @@ constexpr std::array ABI_ALL_CALLER_SAVE = { HostLoc::XMM3, HostLoc::XMM4, HostLoc::XMM5, -}; +}); -constexpr std::array ABI_ALL_CALLEE_SAVE = { +constexpr std::bitset<32> ABI_ALL_CALLEE_SAVE = BuildRegSet({ HostLoc::RBX, HostLoc::RSI, HostLoc::RDI, @@ -64,7 +101,7 @@ constexpr std::array ABI_ALL_CALLEE_SAVE = { HostLoc::XMM13, HostLoc::XMM14, HostLoc::XMM15, -}; +}); constexpr size_t ABI_SHADOW_SPACE = 32; // bytes @@ -82,7 +119,7 @@ constexpr HostLoc ABI_PARAM4 = HostLoc::RCX; constexpr HostLoc ABI_PARAM5 = HostLoc::R8; constexpr HostLoc ABI_PARAM6 = HostLoc::R9; -constexpr std::array ABI_ALL_CALLER_SAVE = { +constexpr std::bitset<32> ABI_ALL_CALLER_SAVE = BuildRegSet({ HostLoc::RAX, HostLoc::RCX, HostLoc::RDX, @@ -108,22 +145,22 @@ constexpr std::array ABI_ALL_CALLER_SAVE = { HostLoc::XMM13, HostLoc::XMM14, HostLoc::XMM15, -}; +}); -constexpr std::array ABI_ALL_CALLEE_SAVE = { +constexpr std::bitset<32> ABI_ALL_CALLEE_SAVE = BuildRegSet({ HostLoc::RBX, HostLoc::RBP, HostLoc::R12, HostLoc::R13, HostLoc::R14, HostLoc::R15, -}; +}); constexpr size_t ABI_SHADOW_SPACE = 0; // bytes #endif -static_assert(ABI_ALL_CALLER_SAVE.size() + ABI_ALL_CALLEE_SAVE.size() == 31, "Invalid total number of registers"); +static_assert([]{ size_t n = 0; for (size_t i = 0; i < 32; ++i) n += (ABI_ALL_CALLER_SAVE[i] ? 1u : 0u) + (ABI_ALL_CALLEE_SAVE[i] ? 1u : 0u); return n; }() == 31, "Invalid total number of registers"); void ABI_PushCalleeSaveRegistersAndAdjustStack(BlockOfCode& code, size_t frame_size = 0); void ABI_PopCalleeSaveRegistersAndAdjustStack(BlockOfCode& code, size_t frame_size = 0); diff --git a/src/dynarmic/src/dynarmic/backend/x64/hostloc.h b/src/dynarmic/src/dynarmic/backend/x64/hostloc.h index bd6a5cede8..2feecf5d5e 100644 --- a/src/dynarmic/src/dynarmic/backend/x64/hostloc.h +++ b/src/dynarmic/src/dynarmic/backend/x64/hostloc.h @@ -7,6 +7,9 @@ */ #pragma once +#include +#include + #include "dynarmic/common/assert.h" #include 
"dynarmic/common/common_types.h" #include "dynarmic/backend/x64/xbyak.h" @@ -106,13 +109,18 @@ constexpr size_t HostLocBitWidth(HostLoc loc) { UNREACHABLE(); } -using HostLocList = std::initializer_list; +constexpr std::bitset<32> BuildRegSet(std::initializer_list regs) { + size_t bits = 0; + for (auto const& reg : regs) + bits |= size_t{1} << size_t(reg); + return {bits}; +} // RSP is preserved for function calls // R13 contains fastmem pointer if any // R14 contains the pagetable pointer // R15 contains the JitState pointer -const HostLocList any_gpr = { +const std::bitset<32> any_gpr = BuildRegSet({ HostLoc::RAX, HostLoc::RBX, HostLoc::RCX, @@ -128,13 +136,13 @@ const HostLocList any_gpr = { HostLoc::R13, HostLoc::R14, //HostLoc::R15, -}; +}); // XMM0 is reserved for use by instructions that implicitly use it as an argument // XMM1 is used by 128 mem accessors // XMM2 is also used by that (and other stuff) // Basically dont use either XMM0, XMM1 or XMM2 ever; they're left for the regsel -const HostLocList any_xmm = { +const std::bitset<32> any_xmm = BuildRegSet({ //HostLoc::XMM1, //HostLoc::XMM2, HostLoc::XMM3, @@ -150,7 +158,7 @@ const HostLocList any_xmm = { HostLoc::XMM13, HostLoc::XMM14, HostLoc::XMM15, -}; +}); inline Xbyak::Reg64 HostLocToReg64(HostLoc loc) noexcept { ASSERT(HostLocIsGPR(loc)); diff --git a/src/dynarmic/src/dynarmic/backend/x64/reg_alloc.cpp b/src/dynarmic/src/dynarmic/backend/x64/reg_alloc.cpp index f4326204b2..5c5ed25131 100644 --- a/src/dynarmic/src/dynarmic/backend/x64/reg_alloc.cpp +++ b/src/dynarmic/src/dynarmic/backend/x64/reg_alloc.cpp @@ -6,19 +6,19 @@ * SPDX-License-Identifier: 0BSD */ -#include "dynarmic/backend/x64/reg_alloc.h" - #include #include #include #include #include +#include "dynarmic/backend/x64/hostloc.h" #include "dynarmic/common/assert.h" #include #include "dynarmic/backend/x64/xbyak.h" #include "dynarmic/backend/x64/abi.h" +#include "dynarmic/backend/x64/reg_alloc.h" #include "dynarmic/backend/x64/stack_layout.h" 
#include "dynarmic/backend/x64/verbose_debugging_output.h" @@ -185,9 +185,8 @@ bool Argument::IsInMemory(RegAlloc& reg_alloc) const noexcept { return HostLocIsSpill(*reg_alloc.ValueLocation(value.GetInst())); } -RegAlloc::RegAlloc(boost::container::static_vector gpr_order, boost::container::static_vector xmm_order) noexcept - : gpr_order(gpr_order), - xmm_order(xmm_order) +RegAlloc::RegAlloc(std::bitset<32> gpr_order, std::bitset<32> xmm_order) noexcept + : gpr_order(gpr_order), xmm_order(xmm_order) {} RegAlloc::ArgumentInfo RegAlloc::GetArgumentInfo(const IR::Inst* inst) noexcept { @@ -237,7 +236,7 @@ Xbyak::Xmm RegAlloc::UseScratchXmm(BlockOfCode& code, Argument& arg) noexcept { void RegAlloc::UseScratch(BlockOfCode& code, Argument& arg, HostLoc host_loc) noexcept { ASSERT(!arg.allocated); arg.allocated = true; - UseScratchImpl(code, arg.value, {host_loc}); + UseScratchImpl(code, arg.value, BuildRegSet({host_loc})); } void RegAlloc::DefineValue(BlockOfCode& code, IR::Inst* inst, const Xbyak::Reg& reg) noexcept { @@ -258,7 +257,7 @@ void RegAlloc::Release(const Xbyak::Reg& reg) noexcept { LocInfo(hostloc).ReleaseOne(); } -HostLoc RegAlloc::UseImpl(BlockOfCode& code, IR::Value use_value, const boost::container::static_vector& desired_locations) noexcept { +HostLoc RegAlloc::UseImpl(BlockOfCode& code, IR::Value use_value, std::bitset<32> desired_locations) noexcept { if (use_value.IsImmediate()) { return LoadImmediate(code, use_value, ScratchImpl(code, desired_locations)); } @@ -266,8 +265,7 @@ HostLoc RegAlloc::UseImpl(BlockOfCode& code, IR::Value use_value, const boost::c auto const* use_inst = use_value.GetInst(); HostLoc const current_location = *ValueLocation(use_inst); - const bool can_use_current_location = std::find(desired_locations.begin(), desired_locations.end(), current_location) != desired_locations.end(); - if (can_use_current_location) { + if (HostLocIsRegister(current_location) && desired_locations.test(size_t(current_location))) { 
LocInfo(current_location).ReadLock(); return current_location; } @@ -290,7 +288,7 @@ HostLoc RegAlloc::UseImpl(BlockOfCode& code, IR::Value use_value, const boost::c return destination_location; } -HostLoc RegAlloc::UseScratchImpl(BlockOfCode& code, IR::Value use_value, const boost::container::static_vector& desired_locations) noexcept { +HostLoc RegAlloc::UseScratchImpl(BlockOfCode& code, IR::Value use_value, std::bitset<32> desired_locations) noexcept { if (use_value.IsImmediate()) { return LoadImmediate(code, use_value, ScratchImpl(code, desired_locations)); } @@ -298,9 +296,7 @@ HostLoc RegAlloc::UseScratchImpl(BlockOfCode& code, IR::Value use_value, const b const auto* use_inst = use_value.GetInst(); const HostLoc current_location = *ValueLocation(use_inst); const size_t bit_width = GetBitWidth(use_inst->GetType()); - - const bool can_use_current_location = std::find(desired_locations.begin(), desired_locations.end(), current_location) != desired_locations.end(); - if (can_use_current_location && !LocInfo(current_location).IsLocked()) { + if (HostLocIsRegister(current_location) && desired_locations.test(size_t(current_location)) && !LocInfo(current_location).IsLocked()) { if (LocInfo(current_location).IsLastUse()) { LocInfo(current_location).is_set_last_use = true; } else { @@ -317,7 +313,7 @@ HostLoc RegAlloc::UseScratchImpl(BlockOfCode& code, IR::Value use_value, const b return destination_location; } -HostLoc RegAlloc::ScratchImpl(BlockOfCode& code, const boost::container::static_vector& desired_locations) noexcept { +HostLoc RegAlloc::ScratchImpl(BlockOfCode& code, std::bitset<32> desired_locations) noexcept { const HostLoc location = SelectARegister(desired_locations); MoveOutOfTheWay(code, location); LocInfo(location).WriteLock(); @@ -336,11 +332,11 @@ void RegAlloc::HostCall( constexpr std::array args_hostloc = {ABI_PARAM1, ABI_PARAM2, ABI_PARAM3, ABI_PARAM4}; const std::array, args_count> args = {arg0, arg1, arg2, arg3}; - static const 
boost::container::static_vector other_caller_save = [args_hostloc]() noexcept { - boost::container::static_vector ret(ABI_ALL_CALLER_SAVE.begin(), ABI_ALL_CALLER_SAVE.end()); - ret.erase(std::find(ret.begin(), ret.end(), ABI_RETURN)); + static const std::bitset<32> other_caller_save = [args_hostloc]() noexcept { + std::bitset<32> ret = ABI_ALL_CALLER_SAVE; + ret.reset(size_t(ABI_RETURN)); for (auto const hostloc : args_hostloc) - ret.erase(std::find(ret.begin(), ret.end(), hostloc)); + ret.reset(size_t(hostloc)); return ret; }(); @@ -356,9 +352,11 @@ void RegAlloc::HostCall( } } // Must match with with ScratchImpl - for (auto const gpr : other_caller_save) { - MoveOutOfTheWay(code, gpr); - LocInfo(gpr).WriteLock(); + for (size_t i = 0; i < other_caller_save.size(); ++i) { + if (other_caller_save[i]) { + MoveOutOfTheWay(code, HostLoc(i)); + LocInfo(HostLoc(i)).WriteLock(); + } } for (size_t i = 0; i < args.size(); i++) { if (args[i] && !args[i]->get().IsVoid()) { @@ -397,46 +395,48 @@ void RegAlloc::ReleaseStackSpace(BlockOfCode& code, const size_t stack_space) no code.add(code.rsp, u32(stack_space)); } -HostLoc RegAlloc::SelectARegister(const boost::container::static_vector& desired_locations) const noexcept { +HostLoc RegAlloc::SelectARegister(std::bitset<32> desired_locations) const noexcept { // TODO(lizzie): Overspill causes issues (reads to 0 and such) on some games, I need to make a testbench // to later track this down - however I just modified the LRU algo so it prefers empty registers first // we need to test high register pressure (and spills, maybe 32 regs?) - + static_assert(size_t(HostLoc::FirstSpill) >= 32); // Selects the best location out of the available locations. // NOTE: Using last is BAD because new REX prefix for each insn using the last regs // TODO: Actually do LRU or something. Currently we just try to pick something without a value if possible. 
auto min_lru_counter = size_t(-1); - auto it_candidate = desired_locations.cend(); //default fallback if everything fails - auto it_rex_candidate = desired_locations.cend(); - auto it_empty_candidate = desired_locations.cend(); - for (auto it = desired_locations.cbegin(); it != desired_locations.cend(); it++) { - auto const& loc_info = LocInfo(*it); - DEBUG_ASSERT(*it != ABI_JIT_PTR); - // Abstain from using upper registers unless absolutely nescesary - if (loc_info.IsLocked()) { - // skip, not suitable for allocation - // While R13 and R14 are technically available, we avoid allocating for them - // at all costs, because theoretically skipping them is better than spilling - // all over the place - it also fixes bugs with high reg pressure - } else if (*it >= HostLoc::R13 && *it <= HostLoc::R15) { - // skip, do not touch - // Intel recommends to reuse registers as soon as they're overwritable (DO NOT SPILL) - } else if (loc_info.IsEmpty()) { - it_empty_candidate = it; - break; - // No empty registers for some reason (very evil) - just do normal LRU - } else if (loc_info.lru_counter < min_lru_counter) { - // Otherwise a "quasi"-LRU - min_lru_counter = loc_info.lru_counter; - if (*it >= HostLoc::R8 && *it <= HostLoc::R15) { - it_rex_candidate = it; - } else { - it_candidate = it; + auto it_candidate = HostLoc::FirstSpill; //default fallback if everything fails + auto it_rex_candidate = HostLoc::FirstSpill; + auto it_empty_candidate = HostLoc::FirstSpill; + for (HostLoc i = HostLoc(0); i < HostLoc(desired_locations.size()); i = HostLoc(size_t(i) + 1)) { + if (desired_locations.test(size_t(i))) { + auto const& loc_info = LocInfo(i); + DEBUG_ASSERT(i != ABI_JIT_PTR); + // Abstain from using upper registers unless absolutely nescesary + if (loc_info.IsLocked()) { + // skip, not suitable for allocation + // While R13 and R14 are technically available, we avoid allocating for them + // at all costs, because theoretically skipping them is better than spilling + // all over 
the place - it also fixes bugs with high reg pressure } else if (i >= HostLoc::R13 && i <= HostLoc::R15) { + // skip, do not touch + // Intel recommends to reuse registers as soon as they're overwritable (DO NOT SPILL) + } else if (loc_info.IsEmpty()) { + it_empty_candidate = i; + break; + // No empty registers for some reason (very evil) - just do normal LRU + } else if (loc_info.lru_counter < min_lru_counter) { + // Otherwise a "quasi"-LRU + min_lru_counter = loc_info.lru_counter; + if (i >= HostLoc::R8 && i <= HostLoc::R15) { + it_rex_candidate = i; + } else { + it_candidate = i; + } + // There used to be a break here - DO NOT BREAK away you MUST + // evaluate ALL of the registers BEFORE making a decision on when to take + // otherwise reg pressure will get high and bugs will seep :) + // TODO(lizzie): Investigate these god awful annoying reg pressure issues } - // There used to be a break here - DO NOT BREAK away you MUST - // evaluate ALL of the registers BEFORE making a decision on when to take - // otherwise reg pressure will get high and bugs will seep :) - // TODO(lizzie): Investigate these god awful annoying reg pressure issues } } // Final resolution goes as follows: @@ -445,13 +445,13 @@ HostLoc RegAlloc::SelectARegister(const boost::container::static_vector Try using a REX prefixed one // We avoid using REX-addressable registers because they add +1 REX prefix which // do we really need? The trade-off may not be worth it. - auto const it_final = it_empty_candidate != desired_locations.cend() - ? it_empty_candidate : it_candidate != desired_locations.cend() + auto const it_final = it_empty_candidate != HostLoc::FirstSpill + ? it_empty_candidate : it_candidate != HostLoc::FirstSpill ? 
it_candidate : it_rex_candidate; - ASSERT(it_final != desired_locations.cend() && "All candidate registers have already been allocated"); + ASSERT(it_final != HostLoc::FirstSpill && "All candidate registers have already been allocated"); // Evil magic - increment LRU counter (will wrap at 256) - const_cast(this)->LocInfo(*it_final).lru_counter++; - return *it_final; + const_cast(this)->LocInfo(HostLoc(it_final)).lru_counter++; + return HostLoc(it_final); } std::optional RegAlloc::ValueLocation(const IR::Inst* value) const noexcept { diff --git a/src/dynarmic/src/dynarmic/backend/x64/reg_alloc.h b/src/dynarmic/src/dynarmic/backend/x64/reg_alloc.h index c0a896f8c3..8b872a0e9c 100644 --- a/src/dynarmic/src/dynarmic/backend/x64/reg_alloc.h +++ b/src/dynarmic/src/dynarmic/backend/x64/reg_alloc.h @@ -139,7 +139,7 @@ class RegAlloc final { public: using ArgumentInfo = std::array; RegAlloc() noexcept = default; - RegAlloc(boost::container::static_vector gpr_order, boost::container::static_vector xmm_order) noexcept; + RegAlloc(std::bitset<32> gpr_order, std::bitset<32> xmm_order) noexcept; ArgumentInfo GetArgumentInfo(const IR::Inst* inst) noexcept; void RegisterPseudoOperation(const IR::Inst* inst) noexcept; @@ -162,7 +162,7 @@ public: inline void Use(BlockOfCode& code, Argument& arg, const HostLoc host_loc) noexcept { ASSERT(!arg.allocated); arg.allocated = true; - UseImpl(code, arg.value, {host_loc}); + UseImpl(code, arg.value, BuildRegSet({host_loc})); } Xbyak::Reg64 UseScratchGpr(BlockOfCode& code, Argument& arg) noexcept; @@ -178,13 +178,13 @@ public: return HostLocToReg64(ScratchImpl(code, gpr_order)); } inline Xbyak::Reg64 ScratchGpr(BlockOfCode& code, const HostLoc desired_location) noexcept { - return HostLocToReg64(ScratchImpl(code, {desired_location})); + return HostLocToReg64(ScratchImpl(code, BuildRegSet({desired_location}))); } inline Xbyak::Xmm ScratchXmm(BlockOfCode& code) noexcept { return HostLocToXmm(ScratchImpl(code, xmm_order)); } inline Xbyak::Xmm 
ScratchXmm(BlockOfCode& code, HostLoc desired_location) noexcept { - return HostLocToXmm(ScratchImpl(code, {desired_location})); + return HostLocToXmm(ScratchImpl(code, BuildRegSet({desired_location}))); } void HostCall( @@ -216,11 +216,11 @@ public: private: friend struct Argument; - HostLoc SelectARegister(const boost::container::static_vector& desired_locations) const noexcept; + HostLoc SelectARegister(std::bitset<32> desired_locations) const noexcept; std::optional ValueLocation(const IR::Inst* value) const noexcept; - HostLoc UseImpl(BlockOfCode& code, IR::Value use_value, const boost::container::static_vector& desired_locations) noexcept; - HostLoc UseScratchImpl(BlockOfCode& code, IR::Value use_value, const boost::container::static_vector& desired_locations) noexcept; - HostLoc ScratchImpl(BlockOfCode& code, const boost::container::static_vector& desired_locations) noexcept; + HostLoc UseImpl(BlockOfCode& code, IR::Value use_value, std::bitset<32> desired_locations) noexcept; + HostLoc UseScratchImpl(BlockOfCode& code, IR::Value use_value, std::bitset<32> desired_locations) noexcept; + HostLoc ScratchImpl(BlockOfCode& code, std::bitset<32> desired_locations) noexcept; void DefineValueImpl(BlockOfCode& code, IR::Inst* def_inst, HostLoc host_loc) noexcept; void DefineValueImpl(BlockOfCode& code, IR::Inst* def_inst, const IR::Value& use_inst) noexcept; @@ -246,12 +246,10 @@ private: void EmitExchange(BlockOfCode& code, const HostLoc a, const HostLoc b) noexcept; //data - alignas(64) boost::container::static_vector gpr_order; - alignas(64) boost::container::static_vector xmm_order; alignas(64) std::array hostloc_info; + std::bitset<32> gpr_order; + std::bitset<32> xmm_order; size_t reserved_stack_space = 0; }; -// Ensure a cache line (or less) is used, this is primordial -static_assert(sizeof(boost::container::static_vector) < 64); } // namespace Dynarmic::Backend::X64