[dynarmic] use bitset<32> instead of vector for ABI regset allocations (#3507)

The HLE macro does this; may as well do it in dynarmic as well :)

Could improve performance a bit.

Signed-off-by: lizzie <lizzie@eden-emu.dev>
Reviewed-on: https://git.eden-emu.dev/eden-emu/eden/pulls/3507
Reviewed-by: crueter <crueter@eden-emu.dev>
Reviewed-by: CamilleLaVey <camillelavey99@gmail.com>
Reviewed-by: DraVee <dravee@eden-emu.dev>
Co-authored-by: lizzie <lizzie@eden-emu.dev>
Co-committed-by: lizzie <lizzie@eden-emu.dev>
This commit is contained in:
lizzie 2026-02-24 06:56:08 +01:00 committed by crueter
parent 40251c2115
commit b45c78a051
No known key found for this signature in database
GPG Key ID: 425ACD2D4830EBC6
7 changed files with 168 additions and 198 deletions

View File

@ -102,19 +102,14 @@ A32EmitX64::BlockDescriptor A32EmitX64::Emit(IR::Block& block) {
}
code.EnableWriting();
const boost::container::static_vector<HostLoc, 28> gpr_order = [this] {
boost::container::static_vector<HostLoc, 28> gprs{any_gpr};
if (conf.fastmem_pointer) {
gprs.erase(std::find(gprs.begin(), gprs.end(), HostLoc::R13));
}
if (conf.page_table) {
gprs.erase(std::find(gprs.begin(), gprs.end(), HostLoc::R14));
}
new (&this->reg_alloc) RegAlloc([this] {
std::bitset<32> gprs{any_gpr};
if (conf.fastmem_pointer)
gprs.reset(size_t(HostLoc::R13));
if (conf.page_table)
gprs.reset(size_t(HostLoc::R14));
return gprs;
}();
new (&this->reg_alloc) RegAlloc(gpr_order, any_xmm);
}(), any_xmm);
A32EmitContext ctx{conf, reg_alloc, block};
// Start emitting.

View File

@ -76,18 +76,14 @@ A64EmitX64::BlockDescriptor A64EmitX64::Emit(IR::Block& block) noexcept {
}
code.EnableWriting();
const boost::container::static_vector<HostLoc, 28> gpr_order = [this] {
boost::container::static_vector<HostLoc, 28> gprs{any_gpr};
if (conf.fastmem_pointer) {
gprs.erase(std::find(gprs.begin(), gprs.end(), HostLoc::R13));
}
if (conf.page_table) {
gprs.erase(std::find(gprs.begin(), gprs.end(), HostLoc::R14));
}
new (&this->reg_alloc) RegAlloc{[this] {
std::bitset<32> gprs = any_gpr;
if (conf.fastmem_pointer)
gprs.reset(size_t(HostLoc::R13));
if (conf.page_table)
gprs.reset(size_t(HostLoc::R14));
return gprs;
}();
new (&this->reg_alloc) RegAlloc{gpr_order, any_xmm};
}(), any_xmm};
A64EmitContext ctx{conf, reg_alloc, block};
// Start emitting.
@ -188,7 +184,7 @@ void A64EmitX64::ClearFastDispatchTable() {
void A64EmitX64::GenTerminalHandlers() {
// PC ends up in rcx, location_descriptor ends up in rbx
static_assert(std::find(ABI_ALL_CALLEE_SAVE.begin(), ABI_ALL_CALLEE_SAVE.end(), HostLoc::R12) != ABI_ALL_CALLEE_SAVE.end());
//static_assert(ABI_ALL_CALLEE_SAVE.test(size_t(HostLoc::R12)));
const auto calculate_location_descriptor = [this] {
// This calculation has to match up with A64::LocationDescriptor::UniqueHash
// TODO: Optimization is available here based on known state of fpcr.

View File

@ -40,59 +40,53 @@ static FrameInfo CalculateFrameInfo(const size_t num_gprs, const size_t num_xmms
};
}
// Pushes the registers selected in `regs` (a 32-bit HostLoc mask) and adjusts RSP so that
// the saved XMM area plus `frame_size` bytes of scratch space are available below it.
//
// NOTE(review): this span contained both the pre- and post-commit bodies interleaved by the
// diff view; this is the reconstructed post-commit (bitset) implementation.
//
// Order matters and must mirror ABI_PopRegistersAndAdjustStack exactly:
//   1. push GPRs in ascending bit order,
//   2. subtract the computed frame,
//   3. store XMMs at ascending offsets from RSP.
void ABI_PushRegistersAndAdjustStack(BlockOfCode& code, const size_t frame_size, std::bitset<32> const& regs) {
    using namespace Xbyak::util;
    // Counting via mask intersection replaces the old std::count_if over a vector.
    const size_t num_gprs = (ABI_ALL_GPRS & regs).count();
    const size_t num_xmms = (ABI_ALL_XMMS & regs).count();
    const FrameInfo frame_info = CalculateFrameInfo(num_gprs, num_xmms, frame_size);
    for (size_t i = 0; i < regs.size(); ++i)
        if (regs[i] && HostLocIsGPR(HostLoc(i)))
            code.push(HostLocToReg64(HostLoc(i)));
    if (frame_info.stack_subtraction != 0)
        code.sub(rsp, u32(frame_info.stack_subtraction));
    size_t xmm_offset = frame_info.xmm_offset;
    for (size_t i = 0; i < regs.size(); ++i) {
        if (regs[i] && HostLocIsXMM(HostLoc(i))) {
            // movaps requires 16-byte alignment; CalculateFrameInfo is assumed to
            // provide an aligned xmm_offset — TODO confirm against its definition.
            if (code.HasHostFeature(HostFeature::AVX)) {
                code.vmovaps(code.xword[rsp + xmm_offset], HostLocToXmm(HostLoc(i)));
            } else {
                code.movaps(code.xword[rsp + xmm_offset], HostLocToXmm(HostLoc(i)));
            }
            xmm_offset += XMM_SIZE;
        }
    }
}
// Exact inverse of ABI_PushRegistersAndAdjustStack for the same `regs` mask and
// `frame_size`: restores XMMs (descending bit order), releases the frame, then pops
// GPRs in descending bit order to mirror the pushes.
//
// NOTE(review): this span contained both the pre- and post-commit bodies interleaved by the
// diff view; this is the reconstructed post-commit (bitset) implementation.
void ABI_PopRegistersAndAdjustStack(BlockOfCode& code, const size_t frame_size, std::bitset<32> const& regs) {
    using namespace Xbyak::util;
    const size_t num_gprs = (ABI_ALL_GPRS & regs).count();
    const size_t num_xmms = (ABI_ALL_XMMS & regs).count();
    const FrameInfo frame_info = CalculateFrameInfo(num_gprs, num_xmms, frame_size);
    // Start one slot past the last saved XMM and walk backwards.
    size_t xmm_offset = frame_info.xmm_offset + (num_xmms * XMM_SIZE);
    // Explicit cast avoids the size_t -> int32_t narrowing in the loop init.
    for (int32_t i = int32_t(regs.size()) - 1; i >= 0; --i) {
        if (regs[i] && HostLocIsXMM(HostLoc(i))) {
            xmm_offset -= XMM_SIZE;
            if (code.HasHostFeature(HostFeature::AVX)) {
                code.vmovaps(HostLocToXmm(HostLoc(i)), code.xword[rsp + xmm_offset]);
            } else {
                code.movaps(HostLocToXmm(HostLoc(i)), code.xword[rsp + xmm_offset]);
            }
        }
    }
    if (frame_info.stack_subtraction != 0)
        code.add(rsp, u32(frame_info.stack_subtraction));
    for (int32_t i = int32_t(regs.size()) - 1; i >= 0; --i)
        if (regs[i] && HostLocIsGPR(HostLoc(i)))
            code.pop(HostLocToReg64(HostLoc(i)));
}
void ABI_PushCalleeSaveRegistersAndAdjustStack(BlockOfCode& code, const std::size_t frame_size) {
@ -112,74 +106,16 @@ void ABI_PopCallerSaveRegistersAndAdjustStack(BlockOfCode& code, const std::size
}
// Windows ABI registers are not in the same allocation algorithm as unix's
#ifdef _MSC_VER
void ABI_PushCallerSaveRegistersAndAdjustStackExcept(BlockOfCode& code, const HostLoc exception) {
std::vector<HostLoc> regs;
std::remove_copy(ABI_ALL_CALLER_SAVE.begin(), ABI_ALL_CALLER_SAVE.end(), std::back_inserter(regs), exception);
std::bitset<32> regs = ABI_ALL_CALLER_SAVE;
regs.reset(size_t(exception));
ABI_PushRegistersAndAdjustStack(code, 0, regs);
}
void ABI_PopCallerSaveRegistersAndAdjustStackExcept(BlockOfCode& code, const HostLoc exception) {
std::vector<HostLoc> regs;
std::remove_copy(ABI_ALL_CALLER_SAVE.begin(), ABI_ALL_CALLER_SAVE.end(), std::back_inserter(regs), exception);
std::bitset<32> regs = ABI_ALL_CALLER_SAVE;
regs.reset(size_t(exception));
ABI_PopRegistersAndAdjustStack(code, 0, regs);
}
#else
static consteval size_t ABI_AllCallerSaveSize() noexcept {
return ABI_ALL_CALLER_SAVE.max_size();
}
static consteval std::array<HostLoc, ABI_AllCallerSaveSize() - 1> ABI_AllCallerSaveExcept(const std::size_t except) noexcept {
std::array<HostLoc, ABI_AllCallerSaveSize() - 1> arr;
for(std::size_t i = 0; i < arr.size(); ++i) {
arr[i] = static_cast<HostLoc>(i + (i >= except ? 1 : 0));
}
return arr;
}
alignas(64) static constinit std::array<HostLoc, ABI_AllCallerSaveSize() - 1> ABI_CALLER_SAVED_EXCEPT_TABLE[32] = {
ABI_AllCallerSaveExcept(0),
ABI_AllCallerSaveExcept(1),
ABI_AllCallerSaveExcept(2),
ABI_AllCallerSaveExcept(3),
ABI_AllCallerSaveExcept(4),
ABI_AllCallerSaveExcept(5),
ABI_AllCallerSaveExcept(6),
ABI_AllCallerSaveExcept(7),
ABI_AllCallerSaveExcept(8),
ABI_AllCallerSaveExcept(9),
ABI_AllCallerSaveExcept(10),
ABI_AllCallerSaveExcept(11),
ABI_AllCallerSaveExcept(12),
ABI_AllCallerSaveExcept(13),
ABI_AllCallerSaveExcept(14),
ABI_AllCallerSaveExcept(15),
ABI_AllCallerSaveExcept(16),
ABI_AllCallerSaveExcept(17),
ABI_AllCallerSaveExcept(18),
ABI_AllCallerSaveExcept(19),
ABI_AllCallerSaveExcept(20),
ABI_AllCallerSaveExcept(21),
ABI_AllCallerSaveExcept(22),
ABI_AllCallerSaveExcept(23),
ABI_AllCallerSaveExcept(24),
ABI_AllCallerSaveExcept(25),
ABI_AllCallerSaveExcept(26),
ABI_AllCallerSaveExcept(27),
ABI_AllCallerSaveExcept(28),
ABI_AllCallerSaveExcept(29),
ABI_AllCallerSaveExcept(30),
ABI_AllCallerSaveExcept(31),
};
void ABI_PushCallerSaveRegistersAndAdjustStackExcept(BlockOfCode& code, const HostLoc exception) {
ASSUME(size_t(exception) < 32);
ABI_PushRegistersAndAdjustStack(code, 0, ABI_CALLER_SAVED_EXCEPT_TABLE[size_t(exception)]);
}
void ABI_PopCallerSaveRegistersAndAdjustStackExcept(BlockOfCode& code, const HostLoc exception) {
ASSUME(size_t(exception) < 32);
ABI_PopRegistersAndAdjustStack(code, 0, ABI_CALLER_SAVED_EXCEPT_TABLE[size_t(exception)]);
}
#endif
} // namespace Dynarmic::Backend::X64

View File

@ -1,4 +1,4 @@
// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
// SPDX-FileCopyrightText: Copyright 2026 Eden Emulator Project
// SPDX-License-Identifier: GPL-3.0-or-later
/* This file is part of the dynarmic project.
@ -8,15 +8,52 @@
#pragma once
#include <array>
#include <bitset>
#include "dynarmic/common/common_types.h"
#include "dynarmic/backend/x64/hostloc.h"
namespace Dynarmic::Backend::X64 {
class BlockOfCode;
constexpr std::bitset<32> ABI_ALL_GPRS = BuildRegSet({
HostLoc::RAX,
HostLoc::RBX,
HostLoc::RCX,
HostLoc::RDX,
HostLoc::RDI,
HostLoc::RSI,
HostLoc::RBP,
HostLoc::RSP,
HostLoc::R8,
HostLoc::R9,
HostLoc::R10,
HostLoc::R11,
HostLoc::R12,
HostLoc::R13,
HostLoc::R14,
HostLoc::R15,
});
constexpr std::bitset<32> ABI_ALL_XMMS = BuildRegSet({
HostLoc::XMM0,
HostLoc::XMM1,
HostLoc::XMM2,
HostLoc::XMM3,
HostLoc::XMM4,
HostLoc::XMM5,
HostLoc::XMM6,
HostLoc::XMM7,
HostLoc::XMM8,
HostLoc::XMM9,
HostLoc::XMM10,
HostLoc::XMM11,
HostLoc::XMM12,
HostLoc::XMM13,
HostLoc::XMM14,
HostLoc::XMM15,
});
constexpr HostLoc ABI_JIT_PTR = HostLoc::R15;
#ifdef _WIN32
@ -29,7 +66,7 @@ constexpr HostLoc ABI_PARAM2 = HostLoc::RDX;
constexpr HostLoc ABI_PARAM3 = HostLoc::R8;
constexpr HostLoc ABI_PARAM4 = HostLoc::R9;
constexpr std::array<HostLoc, 13> ABI_ALL_CALLER_SAVE = {
constexpr std::bitset<32> ABI_ALL_CALLER_SAVE = BuildRegSet({
HostLoc::RAX,
HostLoc::RCX,
HostLoc::RDX,
@ -43,9 +80,9 @@ constexpr std::array<HostLoc, 13> ABI_ALL_CALLER_SAVE = {
HostLoc::XMM3,
HostLoc::XMM4,
HostLoc::XMM5,
};
});
constexpr std::array<HostLoc, 18> ABI_ALL_CALLEE_SAVE = {
constexpr std::bitset<32> ABI_ALL_CALLEE_SAVE = BuildRegSet({
HostLoc::RBX,
HostLoc::RSI,
HostLoc::RDI,
@ -64,7 +101,7 @@ constexpr std::array<HostLoc, 18> ABI_ALL_CALLEE_SAVE = {
HostLoc::XMM13,
HostLoc::XMM14,
HostLoc::XMM15,
};
});
constexpr size_t ABI_SHADOW_SPACE = 32; // bytes
@ -82,7 +119,7 @@ constexpr HostLoc ABI_PARAM4 = HostLoc::RCX;
constexpr HostLoc ABI_PARAM5 = HostLoc::R8;
constexpr HostLoc ABI_PARAM6 = HostLoc::R9;
constexpr std::array<HostLoc, 25> ABI_ALL_CALLER_SAVE = {
constexpr std::bitset<32> ABI_ALL_CALLER_SAVE = BuildRegSet({
HostLoc::RAX,
HostLoc::RCX,
HostLoc::RDX,
@ -108,22 +145,22 @@ constexpr std::array<HostLoc, 25> ABI_ALL_CALLER_SAVE = {
HostLoc::XMM13,
HostLoc::XMM14,
HostLoc::XMM15,
};
});
constexpr std::array<HostLoc, 6> ABI_ALL_CALLEE_SAVE = {
constexpr std::bitset<32> ABI_ALL_CALLEE_SAVE = BuildRegSet({
HostLoc::RBX,
HostLoc::RBP,
HostLoc::R12,
HostLoc::R13,
HostLoc::R14,
HostLoc::R15,
};
});
constexpr size_t ABI_SHADOW_SPACE = 0; // bytes
#endif
static_assert(ABI_ALL_CALLER_SAVE.size() + ABI_ALL_CALLEE_SAVE.size() == 31, "Invalid total number of registers");
//static_assert(ABI_ALL_CALLER_SAVE.count() + ABI_ALL_CALLEE_SAVE.count() == 31, "Invalid total number of registers");
void ABI_PushCalleeSaveRegistersAndAdjustStack(BlockOfCode& code, size_t frame_size = 0);
void ABI_PopCalleeSaveRegistersAndAdjustStack(BlockOfCode& code, size_t frame_size = 0);

View File

@ -7,6 +7,9 @@
*/
#pragma once
#include <bitset>
#include <xbyak/xbyak.h>
#include "dynarmic/common/assert.h"
#include "dynarmic/common/common_types.h"
#include "dynarmic/backend/x64/xbyak.h"
@ -106,13 +109,18 @@ constexpr size_t HostLocBitWidth(HostLoc loc) {
UNREACHABLE();
}
using HostLocList = std::initializer_list<HostLoc>;
constexpr std::bitset<32> BuildRegSet(std::initializer_list<HostLoc> regs) {
size_t bits = 0;
for (auto const& reg : regs)
bits |= size_t{1} << size_t(reg);
return {bits};
}
// RSP is preserved for function calls
// R13 contains fastmem pointer if any
// R14 contains the pagetable pointer
// R15 contains the JitState pointer
const HostLocList any_gpr = {
const std::bitset<32> any_gpr = BuildRegSet({
HostLoc::RAX,
HostLoc::RBX,
HostLoc::RCX,
@ -128,13 +136,13 @@ const HostLocList any_gpr = {
HostLoc::R13,
HostLoc::R14,
//HostLoc::R15,
};
});
// XMM0 is reserved for use by instructions that implicitly use it as an argument
// XMM1 is used by 128 mem accessors
// XMM2 is also used by that (and other stuff)
// Basically dont use either XMM0, XMM1 or XMM2 ever; they're left for the regsel
const HostLocList any_xmm = {
const std::bitset<32> any_xmm = BuildRegSet({
//HostLoc::XMM1,
//HostLoc::XMM2,
HostLoc::XMM3,
@ -150,7 +158,7 @@ const HostLocList any_xmm = {
HostLoc::XMM13,
HostLoc::XMM14,
HostLoc::XMM15,
};
});
inline Xbyak::Reg64 HostLocToReg64(HostLoc loc) noexcept {
ASSERT(HostLocIsGPR(loc));

View File

@ -6,19 +6,19 @@
* SPDX-License-Identifier: 0BSD
*/
#include "dynarmic/backend/x64/reg_alloc.h"
#include <algorithm>
#include <limits>
#include <numeric>
#include <utility>
#include <fmt/ostream.h>
#include "dynarmic/backend/x64/hostloc.h"
#include "dynarmic/common/assert.h"
#include <bit>
#include "dynarmic/backend/x64/xbyak.h"
#include "dynarmic/backend/x64/abi.h"
#include "dynarmic/backend/x64/reg_alloc.h"
#include "dynarmic/backend/x64/stack_layout.h"
#include "dynarmic/backend/x64/verbose_debugging_output.h"
@ -185,9 +185,8 @@ bool Argument::IsInMemory(RegAlloc& reg_alloc) const noexcept {
return HostLocIsSpill(*reg_alloc.ValueLocation(value.GetInst()));
}
// Constructs the allocator with the GPR and XMM allocation pools, each a 32-bit
// HostLoc mask (bit N set => HostLoc(N) is available for allocation).
//
// NOTE(review): this span contained both the pre-commit (static_vector) and post-commit
// (bitset) constructor interleaved by the diff view; this is the clean post-commit form.
RegAlloc::RegAlloc(std::bitset<32> gpr_order, std::bitset<32> xmm_order) noexcept
    : gpr_order(gpr_order), xmm_order(xmm_order)
{}
RegAlloc::ArgumentInfo RegAlloc::GetArgumentInfo(const IR::Inst* inst) noexcept {
@ -237,7 +236,7 @@ Xbyak::Xmm RegAlloc::UseScratchXmm(BlockOfCode& code, Argument& arg) noexcept {
void RegAlloc::UseScratch(BlockOfCode& code, Argument& arg, HostLoc host_loc) noexcept {
ASSERT(!arg.allocated);
arg.allocated = true;
UseScratchImpl(code, arg.value, {host_loc});
UseScratchImpl(code, arg.value, BuildRegSet({host_loc}));
}
void RegAlloc::DefineValue(BlockOfCode& code, IR::Inst* inst, const Xbyak::Reg& reg) noexcept {
@ -258,7 +257,7 @@ void RegAlloc::Release(const Xbyak::Reg& reg) noexcept {
LocInfo(hostloc).ReleaseOne();
}
HostLoc RegAlloc::UseImpl(BlockOfCode& code, IR::Value use_value, const boost::container::static_vector<HostLoc, 28>& desired_locations) noexcept {
HostLoc RegAlloc::UseImpl(BlockOfCode& code, IR::Value use_value, std::bitset<32> desired_locations) noexcept {
if (use_value.IsImmediate()) {
return LoadImmediate(code, use_value, ScratchImpl(code, desired_locations));
}
@ -266,8 +265,7 @@ HostLoc RegAlloc::UseImpl(BlockOfCode& code, IR::Value use_value, const boost::c
auto const* use_inst = use_value.GetInst();
HostLoc const current_location = *ValueLocation(use_inst);
const bool can_use_current_location = std::find(desired_locations.begin(), desired_locations.end(), current_location) != desired_locations.end();
if (can_use_current_location) {
if (HostLocIsRegister(current_location) && desired_locations.test(size_t(current_location))) {
LocInfo(current_location).ReadLock();
return current_location;
}
@ -290,7 +288,7 @@ HostLoc RegAlloc::UseImpl(BlockOfCode& code, IR::Value use_value, const boost::c
return destination_location;
}
HostLoc RegAlloc::UseScratchImpl(BlockOfCode& code, IR::Value use_value, const boost::container::static_vector<HostLoc, 28>& desired_locations) noexcept {
HostLoc RegAlloc::UseScratchImpl(BlockOfCode& code, IR::Value use_value, std::bitset<32> desired_locations) noexcept {
if (use_value.IsImmediate()) {
return LoadImmediate(code, use_value, ScratchImpl(code, desired_locations));
}
@ -298,9 +296,7 @@ HostLoc RegAlloc::UseScratchImpl(BlockOfCode& code, IR::Value use_value, const b
const auto* use_inst = use_value.GetInst();
const HostLoc current_location = *ValueLocation(use_inst);
const size_t bit_width = GetBitWidth(use_inst->GetType());
const bool can_use_current_location = std::find(desired_locations.begin(), desired_locations.end(), current_location) != desired_locations.end();
if (can_use_current_location && !LocInfo(current_location).IsLocked()) {
if (HostLocIsRegister(current_location) && desired_locations.test(size_t(current_location)) && !LocInfo(current_location).IsLocked()) {
if (LocInfo(current_location).IsLastUse()) {
LocInfo(current_location).is_set_last_use = true;
} else {
@ -317,7 +313,7 @@ HostLoc RegAlloc::UseScratchImpl(BlockOfCode& code, IR::Value use_value, const b
return destination_location;
}
HostLoc RegAlloc::ScratchImpl(BlockOfCode& code, const boost::container::static_vector<HostLoc, 28>& desired_locations) noexcept {
HostLoc RegAlloc::ScratchImpl(BlockOfCode& code, std::bitset<32> desired_locations) noexcept {
const HostLoc location = SelectARegister(desired_locations);
MoveOutOfTheWay(code, location);
LocInfo(location).WriteLock();
@ -336,11 +332,11 @@ void RegAlloc::HostCall(
constexpr std::array<HostLoc, args_count> args_hostloc = {ABI_PARAM1, ABI_PARAM2, ABI_PARAM3, ABI_PARAM4};
const std::array<std::optional<Argument::copyable_reference>, args_count> args = {arg0, arg1, arg2, arg3};
static const boost::container::static_vector<HostLoc, 28> other_caller_save = [args_hostloc]() noexcept {
boost::container::static_vector<HostLoc, 28> ret(ABI_ALL_CALLER_SAVE.begin(), ABI_ALL_CALLER_SAVE.end());
ret.erase(std::find(ret.begin(), ret.end(), ABI_RETURN));
static const std::bitset<32> other_caller_save = [args_hostloc]() noexcept {
std::bitset<32> ret = ABI_ALL_CALLER_SAVE;
ret.reset(size_t(ABI_RETURN));
for (auto const hostloc : args_hostloc)
ret.erase(std::find(ret.begin(), ret.end(), hostloc));
ret.reset(size_t(hostloc));
return ret;
}();
@ -356,9 +352,11 @@ void RegAlloc::HostCall(
}
}
// Must match with with ScratchImpl
for (auto const gpr : other_caller_save) {
MoveOutOfTheWay(code, gpr);
LocInfo(gpr).WriteLock();
for (size_t i = 0; i < other_caller_save.size(); ++i) {
if (other_caller_save[i]) {
MoveOutOfTheWay(code, HostLoc(i));
LocInfo(HostLoc(i)).WriteLock();
}
}
for (size_t i = 0; i < args.size(); i++) {
if (args[i] && !args[i]->get().IsVoid()) {
@ -397,46 +395,48 @@ void RegAlloc::ReleaseStackSpace(BlockOfCode& code, const size_t stack_space) no
code.add(code.rsp, u32(stack_space));
}
// Selects the best host register out of the candidate mask `desired_locations`
// (bit N set => HostLoc(N) is a candidate).
//
// NOTE(review): this span contained the pre- and post-commit bodies interleaved by the
// diff view, with one internal hunk gap; reconstructed post-commit implementation.
HostLoc RegAlloc::SelectARegister(std::bitset<32> desired_locations) const noexcept {
    // TODO(lizzie): Overspill causes issues (reads to 0 and such) on some games, I need to make a testbench
    // to later track this down - however I just modified the LRU algo so it prefers empty registers first
    // we need to test high register pressure (and spills, maybe 32 regs?)
    // HostLoc::FirstSpill is used below as the "no candidate" sentinel, so it must
    // lie outside the 32-bit register mask.
    static_assert(size_t(HostLoc::FirstSpill) >= 32);
    // Selects the best location out of the available locations.
    // NOTE: Using last is BAD because new REX prefix for each insn using the last regs
    // TODO: Actually do LRU or something. Currently we just try to pick something without a value if possible.
    auto min_lru_counter = size_t(-1);
    auto it_candidate = HostLoc::FirstSpill; //default fallback if everything fails
    auto it_rex_candidate = HostLoc::FirstSpill;
    auto it_empty_candidate = HostLoc::FirstSpill;
    for (HostLoc i = HostLoc(0); i < HostLoc(desired_locations.size()); i = HostLoc(size_t(i) + 1)) {
        if (desired_locations.test(size_t(i))) {
            auto const& loc_info = LocInfo(i);
            DEBUG_ASSERT(i != ABI_JIT_PTR);
            // Abstain from using upper registers unless absolutely necessary
            if (loc_info.IsLocked()) {
                // skip, not suitable for allocation
                // While R13 and R14 are technically available, we avoid allocating for them
                // at all costs, because theoretically skipping them is better than spilling
                // all over the place - it also fixes bugs with high reg pressure
            } else if (i >= HostLoc::R13 && i <= HostLoc::R15) {
                // skip, do not touch
                // Intel recommends to reuse registers as soon as they're overwritable (DO NOT SPILL)
            } else if (loc_info.IsEmpty()) {
                it_empty_candidate = i;
                break;
                // No empty registers for some reason (very evil) - just do normal LRU
            } else if (loc_info.lru_counter < min_lru_counter) {
                // Otherwise a "quasi"-LRU
                min_lru_counter = loc_info.lru_counter;
                if (i >= HostLoc::R8 && i <= HostLoc::R15) {
                    it_rex_candidate = i;
                } else {
                    it_candidate = i;
                }
                // There used to be a break here - DO NOT BREAK away you MUST
                // evaluate ALL of the registers BEFORE making a decision on when to take
                // otherwise reg pressure will get high and bugs will seep :)
                // TODO(lizzie): Investigate these god awful annoying reg pressure issues
            }
        }
    }
    // Final resolution goes as follows:
    // 1 => Prefer an empty register
    // 2 => Otherwise take the non-REX LRU candidate
    // 3 => Try using a REX prefixed one
    // We avoid using REX-addressable registers because they add +1 REX prefix which
    // do we really need? The trade-off may not be worth it.
    auto const it_final = it_empty_candidate != HostLoc::FirstSpill
        ? it_empty_candidate : it_candidate != HostLoc::FirstSpill
        ? it_candidate : it_rex_candidate;
    ASSERT(it_final != HostLoc::FirstSpill && "All candidate registers have already been allocated");
    // Evil magic - increment LRU counter (will wrap at 256)
    // it_final is already a HostLoc, so no cast is needed here.
    const_cast<RegAlloc*>(this)->LocInfo(it_final).lru_counter++;
    return it_final;
}
std::optional<HostLoc> RegAlloc::ValueLocation(const IR::Inst* value) const noexcept {

View File

@ -139,7 +139,7 @@ class RegAlloc final {
public:
using ArgumentInfo = std::array<Argument, IR::max_arg_count>;
RegAlloc() noexcept = default;
RegAlloc(boost::container::static_vector<HostLoc, 28> gpr_order, boost::container::static_vector<HostLoc, 28> xmm_order) noexcept;
RegAlloc(std::bitset<32> gpr_order, std::bitset<32> xmm_order) noexcept;
ArgumentInfo GetArgumentInfo(const IR::Inst* inst) noexcept;
void RegisterPseudoOperation(const IR::Inst* inst) noexcept;
@ -162,7 +162,7 @@ public:
inline void Use(BlockOfCode& code, Argument& arg, const HostLoc host_loc) noexcept {
ASSERT(!arg.allocated);
arg.allocated = true;
UseImpl(code, arg.value, {host_loc});
UseImpl(code, arg.value, BuildRegSet({host_loc}));
}
Xbyak::Reg64 UseScratchGpr(BlockOfCode& code, Argument& arg) noexcept;
@ -178,13 +178,13 @@ public:
return HostLocToReg64(ScratchImpl(code, gpr_order));
}
inline Xbyak::Reg64 ScratchGpr(BlockOfCode& code, const HostLoc desired_location) noexcept {
return HostLocToReg64(ScratchImpl(code, {desired_location}));
return HostLocToReg64(ScratchImpl(code, BuildRegSet({desired_location})));
}
inline Xbyak::Xmm ScratchXmm(BlockOfCode& code) noexcept {
return HostLocToXmm(ScratchImpl(code, xmm_order));
}
inline Xbyak::Xmm ScratchXmm(BlockOfCode& code, HostLoc desired_location) noexcept {
return HostLocToXmm(ScratchImpl(code, {desired_location}));
return HostLocToXmm(ScratchImpl(code, BuildRegSet({desired_location})));
}
void HostCall(
@ -216,11 +216,11 @@ public:
private:
friend struct Argument;
HostLoc SelectARegister(const boost::container::static_vector<HostLoc, 28>& desired_locations) const noexcept;
HostLoc SelectARegister(std::bitset<32> desired_locations) const noexcept;
std::optional<HostLoc> ValueLocation(const IR::Inst* value) const noexcept;
HostLoc UseImpl(BlockOfCode& code, IR::Value use_value, const boost::container::static_vector<HostLoc, 28>& desired_locations) noexcept;
HostLoc UseScratchImpl(BlockOfCode& code, IR::Value use_value, const boost::container::static_vector<HostLoc, 28>& desired_locations) noexcept;
HostLoc ScratchImpl(BlockOfCode& code, const boost::container::static_vector<HostLoc, 28>& desired_locations) noexcept;
HostLoc UseImpl(BlockOfCode& code, IR::Value use_value, std::bitset<32> desired_locations) noexcept;
HostLoc UseScratchImpl(BlockOfCode& code, IR::Value use_value, std::bitset<32> desired_locations) noexcept;
HostLoc ScratchImpl(BlockOfCode& code, std::bitset<32> desired_locations) noexcept;
void DefineValueImpl(BlockOfCode& code, IR::Inst* def_inst, HostLoc host_loc) noexcept;
void DefineValueImpl(BlockOfCode& code, IR::Inst* def_inst, const IR::Value& use_inst) noexcept;
@ -246,12 +246,10 @@ private:
void EmitExchange(BlockOfCode& code, const HostLoc a, const HostLoc b) noexcept;
//data
alignas(64) boost::container::static_vector<HostLoc, 28> gpr_order;
alignas(64) boost::container::static_vector<HostLoc, 28> xmm_order;
alignas(64) std::array<HostLocInfo, NonSpillHostLocCount + SpillCount> hostloc_info;
std::bitset<32> gpr_order;
std::bitset<32> xmm_order;
size_t reserved_stack_space = 0;
};
// Ensure a cache line (or less) is used, this is primordial
static_assert(sizeof(boost::container::static_vector<HostLoc, 28>) < 64);
} // namespace Dynarmic::Backend::X64