From 83a28dc251c287888d99e045a2682a89c8cd4efe Mon Sep 17 00:00:00 2001
From: lizzie <lizzie@eden-emu.dev>
Date: Fri, 16 Jan 2026 23:39:16 +0100
Subject: [PATCH] [common, core] remove unneeded memory indirection overhead at
 startup (#3306)

for core stuff:
just remove unique_ptrs that don't need any pointer stability at all (after all, it's an allocation within an allocation, so yeah)

for fibers:
Main reasoning behind this is that VirtualBuffer<> is stupidly fucking expensive and it also clutters my fstat view
ALSO mmap is a syscall, and syscalls are bad for performance or whatever
ALSO std::vector<> is better suited for handling this kind of "fixed size thing where it's like big but not THAT big" (512 KiB isn't going to kill your memory usage for each fiber...)

for core.cpp stuff
- inlines stuff into std::optional<> as opposed to std::unique_ptr<> (because, y'know, we are making the Impl from a unique_ptr, so allocating within an allocation is unnecessary)
- reorganizes the structures a bit so padding doesn't screw us up (it's not perfect, but eh, it saves a measly 44 bytes)
- removes unused/dead code
- uses std::vector<> instead of std::deque<>

no perf impact expected, maybe some initialisation boost, but very minimal impact nonetheless
lto gets rid of most calls anyways - the heavy issue is with shared_ptr and the cache coherency from the atomics... but i clumped them together because well, they kinda do not suffer from cache coherency - hopefully not a mistake

this balloons the size of Impl to about 1.67 MB - which is fine because we throw it in the stack anyways

REST OF INTERFACES: most of them ballooned in size as well, but the overhead is ok since it's an allocation within an allocation; no stack is used (when it comes to storing these, I mean)

Signed-off-by: lizzie <lizzie@eden-emu.dev>
Reviewed-on: https://git.eden-emu.dev/eden-emu/eden/pulls/3306
Reviewed-by: CamilleLaVey <camillelavey99@gmail.com>
Reviewed-by: MaranBr <maranbr@eden-emu.dev>
Co-authored-by: lizzie <lizzie@eden-emu.dev>
Co-committed-by: lizzie <lizzie@eden-emu.dev>
---
 src/audio_core/adsp/adsp.cpp                  |   11 +-
 src/audio_core/adsp/adsp.h                    |    9 +-
 src/audio_core/opus/decoder.cpp               |   38 +-
 src/audio_core/opus/decoder.h                 |    6 +-
 src/common/fiber.cpp                          |  110 +-
 src/common/fiber.h                            |   10 -
 src/common/wall_clock.cpp                     |    8 +-
 src/common/wall_clock.h                       |    7 +-
 src/core/arm/dynarmic/arm_dynarmic_32.cpp     |  307 ++-
 src/core/arm/dynarmic/arm_dynarmic_32.h       |   49 +-
 src/core/arm/dynarmic/arm_dynarmic_64.cpp     |  390 ++--
 src/core/arm/dynarmic/arm_dynarmic_64.h       |   57 +-
 src/core/core.cpp                             |  185 +-
 src/core/core.h                               |   13 +-
 src/core/core_timing.cpp                      |   14 +-
 src/core/core_timing.h                        |    7 +-
 src/core/hle/kernel/k_process.cpp             |   14 +-
 src/core/hle/kernel/k_process.h               |  104 +-
 src/core/hle/kernel/kernel.cpp                |  119 +-
 src/core/hle/service/am/applet.h              |    6 +-
 .../service/ns/platform_service_manager.cpp   |   91 +-
 src/video_core/CMakeLists.txt                 |   12 +-
 src/video_core/dma_pusher.cpp                 |   67 +-
 src/video_core/engines/maxwell_3d.cpp         |   13 +-
 src/video_core/engines/maxwell_3d.h           |    4 +-
 src/video_core/macro.cpp                      | 1667 +++++++++++++++++
 src/video_core/{macro => }/macro.h            |   40 +-
 src/video_core/macro/macro.cpp                |  140 --
 src/video_core/macro/macro_hle.cpp            |  606 ------
 src/video_core/macro/macro_hle.h              |   33 -
 src/video_core/macro/macro_interpreter.cpp    |  362 ----
 src/video_core/macro/macro_interpreter.h      |   27 -
 src/video_core/macro/macro_jit_x64.cpp        |  678 -------
 src/video_core/macro/macro_jit_x64.h          |   26 -
 .../renderer_opengl/gl_texture_cache.cpp      |   17 +-
 .../renderer_opengl/gl_texture_cache.h        |    2 +-
 .../renderer_vulkan/vk_rasterizer.cpp         |   51 +-
 .../renderer_vulkan/vk_texture_cache.cpp      |   82 +-
 .../renderer_vulkan/vk_texture_cache.h        |  175 +-
 src/video_core/texture_cache/texture_cache.h  |    8 +-
 40 files changed, 2602 insertions(+), 2963 deletions(-)
 create mode 100644 src/video_core/macro.cpp
 rename src/video_core/{macro => }/macro.h (74%)
 delete mode 100644 src/video_core/macro/macro.cpp
 delete mode 100644 src/video_core/macro/macro_hle.cpp
 delete mode 100644 src/video_core/macro/macro_hle.h
 delete mode 100644 src/video_core/macro/macro_interpreter.cpp
 delete mode 100644 src/video_core/macro/macro_interpreter.h
 delete mode 100644 src/video_core/macro/macro_jit_x64.cpp
 delete mode 100644 src/video_core/macro/macro_jit_x64.h

diff --git a/src/audio_core/adsp/adsp.cpp b/src/audio_core/adsp/adsp.cpp
index 48f0a63d4a..a578461f7c 100644
--- a/src/audio_core/adsp/adsp.cpp
+++ b/src/audio_core/adsp/adsp.cpp
@@ -1,3 +1,6 @@
+// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
+// SPDX-License-Identifier: GPL-3.0-or-later
+
 // SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
 // SPDX-License-Identifier: GPL-2.0-or-later
 
@@ -7,8 +10,8 @@
 namespace AudioCore::ADSP {
 
 ADSP::ADSP(Core::System& system, Sink::Sink& sink) {
-    audio_renderer = std::make_unique<AudioRenderer::AudioRenderer>(system, sink);
-    opus_decoder = std::make_unique<OpusDecoder::OpusDecoder>(system);
+    audio_renderer.emplace(system, sink);
+    opus_decoder.emplace(system);
     opus_decoder->Send(Direction::DSP, OpusDecoder::Message::Start);
     if (opus_decoder->Receive(Direction::Host) != OpusDecoder::Message::StartOK) {
         LOG_ERROR(Service_Audio, "OpusDecoder failed to initialize.");
@@ -17,11 +20,11 @@ ADSP::ADSP(Core::System& system, Sink::Sink& sink) {
 }
 
 AudioRenderer::AudioRenderer& ADSP::AudioRenderer() {
-    return *audio_renderer.get();
+    return *audio_renderer;
 }
 
 OpusDecoder::OpusDecoder& ADSP::OpusDecoder() {
-    return *opus_decoder.get();
+    return *opus_decoder;
 }
 
 } // namespace AudioCore::ADSP
diff --git a/src/audio_core/adsp/adsp.h b/src/audio_core/adsp/adsp.h
index a0c24a16a2..028d87939d 100644
--- a/src/audio_core/adsp/adsp.h
+++ b/src/audio_core/adsp/adsp.h
@@ -1,8 +1,13 @@
+// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
+// SPDX-License-Identifier: GPL-3.0-or-later
+
 // SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
 // SPDX-License-Identifier: GPL-2.0-or-later
 
 #pragma once
 
+#include <optional>
+
 #include "audio_core/adsp/apps/audio_renderer/audio_renderer.h"
 #include "audio_core/adsp/apps/opus/opus_decoder.h"
 #include "common/common_types.h"
@@ -45,8 +50,8 @@ public:
 
 private:
     /// AudioRenderer app
-    std::unique_ptr<AudioRenderer::AudioRenderer> audio_renderer{};
-    std::unique_ptr<OpusDecoder::OpusDecoder> opus_decoder{};
+    std::optional<AudioRenderer::AudioRenderer> audio_renderer{};
+    std::optional<OpusDecoder::OpusDecoder> opus_decoder{};
 };
 
 } // namespace ADSP
diff --git a/src/audio_core/opus/decoder.cpp b/src/audio_core/opus/decoder.cpp
index e60a7d48d4..7d0cce74db 100644
--- a/src/audio_core/opus/decoder.cpp
+++ b/src/audio_core/opus/decoder.cpp
@@ -27,33 +27,31 @@ OpusDecoder::OpusDecoder(Core::System& system_, HardwareOpus& hardware_opus_)
 
 OpusDecoder::~OpusDecoder() {
     if (decode_object_initialized) {
-        hardware_opus.ShutdownDecodeObject(shared_buffer.get(), shared_buffer_size);
+        hardware_opus.ShutdownDecodeObject(shared_buffer.data(), shared_buffer.size());
     }
 }
 
-Result OpusDecoder::Initialize(const OpusParametersEx& params,
-                               Kernel::KTransferMemory* transfer_memory, u64 transfer_memory_size) {
+Result OpusDecoder::Initialize(const OpusParametersEx& params, Kernel::KTransferMemory* transfer_memory, u64 transfer_memory_size) {
     auto frame_size{params.use_large_frame_size ? 5760 : 1920};
-    shared_buffer_size = transfer_memory_size;
-    shared_buffer = std::make_unique<u8[]>(shared_buffer_size);
+    shared_buffer.resize(transfer_memory_size);
     shared_memory_mapped = true;
 
     buffer_size =
         Common::AlignUp((frame_size * params.channel_count) / (48'000 / params.sample_rate), 16);
 
-    out_data = {shared_buffer.get() + shared_buffer_size - buffer_size, buffer_size};
+    out_data = {shared_buffer.data() + shared_buffer.size() - buffer_size, buffer_size};
     size_t in_data_size{0x600u};
     in_data = {out_data.data() - in_data_size, in_data_size};
 
     ON_RESULT_FAILURE {
         if (shared_memory_mapped) {
             shared_memory_mapped = false;
-            ASSERT(R_SUCCEEDED(hardware_opus.UnmapMemory(shared_buffer.get(), shared_buffer_size)));
+            ASSERT(R_SUCCEEDED(hardware_opus.UnmapMemory(shared_buffer.data(), shared_buffer.size())));
         }
     };
 
     R_TRY(hardware_opus.InitializeDecodeObject(params.sample_rate, params.channel_count,
-                                               shared_buffer.get(), shared_buffer_size));
+                                               shared_buffer.data(), shared_buffer.size()));
 
     sample_rate = params.sample_rate;
     channel_count = params.channel_count;
@@ -62,31 +60,29 @@ Result OpusDecoder::Initialize(const OpusParametersEx& params,
     R_SUCCEED();
 }
 
-Result OpusDecoder::Initialize(const OpusMultiStreamParametersEx& params,
-                               Kernel::KTransferMemory* transfer_memory, u64 transfer_memory_size) {
+Result OpusDecoder::Initialize(const OpusMultiStreamParametersEx& params, Kernel::KTransferMemory* transfer_memory, u64 transfer_memory_size) {
     auto frame_size{params.use_large_frame_size ? 5760 : 1920};
-    shared_buffer_size = transfer_memory_size;
-    shared_buffer = std::make_unique<u8[]>(shared_buffer_size);
+    shared_buffer.resize(transfer_memory_size, 0);
     shared_memory_mapped = true;
 
     buffer_size =
         Common::AlignUp((frame_size * params.channel_count) / (48'000 / params.sample_rate), 16);
 
-    out_data = {shared_buffer.get() + shared_buffer_size - buffer_size, buffer_size};
+    out_data = {shared_buffer.data() + shared_buffer.size() - buffer_size, buffer_size};
     size_t in_data_size{Common::AlignUp(1500ull * params.total_stream_count, 64u)};
     in_data = {out_data.data() - in_data_size, in_data_size};
 
     ON_RESULT_FAILURE {
         if (shared_memory_mapped) {
             shared_memory_mapped = false;
-            ASSERT(R_SUCCEEDED(hardware_opus.UnmapMemory(shared_buffer.get(), shared_buffer_size)));
+            ASSERT(R_SUCCEEDED(hardware_opus.UnmapMemory(shared_buffer.data(), shared_buffer.size())));
         }
     };
 
     R_TRY(hardware_opus.InitializeMultiStreamDecodeObject(
         params.sample_rate, params.channel_count, params.total_stream_count,
-        params.stereo_stream_count, params.mappings.data(), shared_buffer.get(),
-        shared_buffer_size));
+        params.stereo_stream_count, params.mappings.data(), shared_buffer.data(),
+        shared_buffer.size()));
 
     sample_rate = params.sample_rate;
     channel_count = params.channel_count;
@@ -113,7 +109,7 @@ Result OpusDecoder::DecodeInterleaved(u32* out_data_size, u64* out_time_taken,
              ResultBufferTooSmall);
 
     if (!shared_memory_mapped) {
-        R_TRY(hardware_opus.MapMemory(shared_buffer.get(), shared_buffer_size));
+        R_TRY(hardware_opus.MapMemory(shared_buffer.data(), shared_buffer.size()));
         shared_memory_mapped = true;
     }
 
@@ -121,7 +117,7 @@ Result OpusDecoder::DecodeInterleaved(u32* out_data_size, u64* out_time_taken,
 
     R_TRY(hardware_opus.DecodeInterleaved(out_samples, out_data.data(), out_data.size_bytes(),
                                           channel_count, in_data.data(), header.size,
-                                          shared_buffer.get(), time_taken, reset));
+                                          shared_buffer.data(), time_taken, reset));
 
     std::memcpy(output_data.data(), out_data.data(), out_samples * channel_count * sizeof(s16));
 
@@ -136,7 +132,7 @@ Result OpusDecoder::DecodeInterleaved(u32* out_data_size, u64* out_time_taken,
 Result OpusDecoder::SetContext([[maybe_unused]] std::span<const u8> context) {
     R_SUCCEED_IF(shared_memory_mapped);
     shared_memory_mapped = true;
-    R_RETURN(hardware_opus.MapMemory(shared_buffer.get(), shared_buffer_size));
+    R_RETURN(hardware_opus.MapMemory(shared_buffer.data(), shared_buffer.size()));
 }
 
 Result OpusDecoder::DecodeInterleavedForMultiStream(u32* out_data_size, u64* out_time_taken,
@@ -159,7 +155,7 @@ Result OpusDecoder::DecodeInterleavedForMultiStream(u32* out_data_size, u64* out
              ResultBufferTooSmall);
 
     if (!shared_memory_mapped) {
-        R_TRY(hardware_opus.MapMemory(shared_buffer.get(), shared_buffer_size));
+        R_TRY(hardware_opus.MapMemory(shared_buffer.data(), shared_buffer.size()));
         shared_memory_mapped = true;
     }
 
@@ -167,7 +163,7 @@ Result OpusDecoder::DecodeInterleavedForMultiStream(u32* out_data_size, u64* out
 
     R_TRY(hardware_opus.DecodeInterleavedForMultiStream(
         out_samples, out_data.data(), out_data.size_bytes(), channel_count, in_data.data(),
-        header.size, shared_buffer.get(), time_taken, reset));
+        header.size, shared_buffer.data(), time_taken, reset));
 
     std::memcpy(output_data.data(), out_data.data(), out_samples * channel_count * sizeof(s16));
 
diff --git a/src/audio_core/opus/decoder.h b/src/audio_core/opus/decoder.h
index 1b8c257d43..33bf88e349 100644
--- a/src/audio_core/opus/decoder.h
+++ b/src/audio_core/opus/decoder.h
@@ -1,3 +1,6 @@
+// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
+// SPDX-License-Identifier: GPL-3.0-or-later
+
 // SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
 // SPDX-License-Identifier: GPL-2.0-or-later
 
@@ -36,8 +39,7 @@ public:
 private:
     Core::System& system;
     HardwareOpus& hardware_opus;
-    std::unique_ptr<u8[]> shared_buffer{};
-    u64 shared_buffer_size;
+    std::vector<u8> shared_buffer{};
     std::span<u8> in_data{};
     std::span<u8> out_data{};
     u64 buffer_size{};
diff --git a/src/common/fiber.cpp b/src/common/fiber.cpp
index 4f0f2b6430..ea3da3d053 100644
--- a/src/common/fiber.cpp
+++ b/src/common/fiber.cpp
@@ -4,6 +4,7 @@
 // SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project
 // SPDX-License-Identifier: GPL-2.0-or-later
 
+#include <thread>
 #include <mutex>
 
 #include "common/assert.h"
@@ -14,100 +15,70 @@
 
 namespace Common {
 
-constexpr std::size_t default_stack_size = 512 * 1024;
+constexpr size_t DEFAULT_STACK_SIZE = 128 * 4096;
+constexpr u32 CANARY_VALUE = 0xDEADBEEF;
 
 struct Fiber::FiberImpl {
-    FiberImpl() : stack{default_stack_size}, rewind_stack{default_stack_size} {}
+    FiberImpl() {}
 
-    VirtualBuffer<u8> stack;
-    VirtualBuffer<u8> rewind_stack;
+    std::array<u8, DEFAULT_STACK_SIZE> stack{};
+    std::array<u8, DEFAULT_STACK_SIZE> rewind_stack{};
+    u32 canary = CANARY_VALUE;
+
+    boost::context::detail::fcontext_t context{};
+    boost::context::detail::fcontext_t rewind_context{};
 
     std::mutex guard;
     std::function<void()> entry_point;
     std::function<void()> rewind_point;
     std::shared_ptr<Fiber> previous_fiber;
-    bool is_thread_fiber{};
-    bool released{};
 
-    u8* stack_limit{};
-    u8* rewind_stack_limit{};
-    boost::context::detail::fcontext_t context{};
-    boost::context::detail::fcontext_t rewind_context{};
+    u8* stack_limit = nullptr;
+    u8* rewind_stack_limit = nullptr;
+    bool is_thread_fiber = false;
+    bool released = false;
 };
 
 void Fiber::SetRewindPoint(std::function<void()>&& rewind_func) {
     impl->rewind_point = std::move(rewind_func);
 }
 
-void Fiber::Start(boost::context::detail::transfer_t& transfer) {
-    ASSERT(impl->previous_fiber != nullptr);
-    impl->previous_fiber->impl->context = transfer.fctx;
-    impl->previous_fiber->impl->guard.unlock();
-    impl->previous_fiber.reset();
-    impl->entry_point();
-    UNREACHABLE();
-}
-
-void Fiber::OnRewind([[maybe_unused]] boost::context::detail::transfer_t& transfer) {
-    ASSERT(impl->context != nullptr);
-    impl->context = impl->rewind_context;
-    impl->rewind_context = nullptr;
-    u8* tmp = impl->stack_limit;
-    impl->stack_limit = impl->rewind_stack_limit;
-    impl->rewind_stack_limit = tmp;
-    impl->rewind_point();
-    UNREACHABLE();
-}
-
-void Fiber::FiberStartFunc(boost::context::detail::transfer_t transfer) {
-    auto* fiber = static_cast<Fiber*>(transfer.data);
-    fiber->Start(transfer);
-}
-
-void Fiber::RewindStartFunc(boost::context::detail::transfer_t transfer) {
-    auto* fiber = static_cast<Fiber*>(transfer.data);
-    fiber->OnRewind(transfer);
-}
-
 Fiber::Fiber(std::function<void()>&& entry_point_func) : impl{std::make_unique<FiberImpl>()} {
     impl->entry_point = std::move(entry_point_func);
     impl->stack_limit = impl->stack.data();
     impl->rewind_stack_limit = impl->rewind_stack.data();
-    u8* stack_base = impl->stack_limit + default_stack_size;
-    impl->context =
-        boost::context::detail::make_fcontext(stack_base, impl->stack.size(), FiberStartFunc);
+    u8* stack_base = impl->stack_limit + DEFAULT_STACK_SIZE;
+    impl->context = boost::context::detail::make_fcontext(stack_base, impl->stack.size(), [](boost::context::detail::transfer_t transfer) -> void {
+        auto* fiber = static_cast<Fiber*>(transfer.data);
+        ASSERT(fiber && fiber->impl && fiber->impl->previous_fiber && fiber->impl->previous_fiber->impl);
+        ASSERT(fiber->impl->canary == CANARY_VALUE);
+        fiber->impl->previous_fiber->impl->context = transfer.fctx;
+        fiber->impl->previous_fiber->impl->guard.unlock();
+        fiber->impl->previous_fiber.reset();
+        fiber->impl->entry_point();
+        UNREACHABLE();
+    });
 }
 
 Fiber::Fiber() : impl{std::make_unique<FiberImpl>()} {}
 
 Fiber::~Fiber() {
-    if (impl->released) {
-        return;
-    }
-    // Make sure the Fiber is not being used
-    const bool locked = impl->guard.try_lock();
-    ASSERT_MSG(locked, "Destroying a fiber that's still running");
-    if (locked) {
-        impl->guard.unlock();
+    if (!impl->released) {
+        // Make sure the Fiber is not being used
+        const bool locked = impl->guard.try_lock();
+        ASSERT_MSG(locked, "Destroying a fiber that's still running");
+        if (locked) {
+            impl->guard.unlock();
+        }
     }
 }
 
 void Fiber::Exit() {
     ASSERT_MSG(impl->is_thread_fiber, "Exiting non main thread fiber");
-    if (!impl->is_thread_fiber) {
-        return;
+    if (impl->is_thread_fiber) {
+        impl->guard.unlock();
+        impl->released = true;
     }
-    impl->guard.unlock();
-    impl->released = true;
-}
-
-void Fiber::Rewind() {
-    ASSERT(impl->rewind_point);
-    ASSERT(impl->rewind_context == nullptr);
-    u8* stack_base = impl->rewind_stack_limit + default_stack_size;
-    impl->rewind_context =
-        boost::context::detail::make_fcontext(stack_base, impl->stack.size(), RewindStartFunc);
-    boost::context::detail::jump_fcontext(impl->rewind_context, this);
 }
 
 void Fiber::YieldTo(std::weak_ptr<Fiber> weak_from, Fiber& to) {
@@ -115,16 +86,15 @@ void Fiber::YieldTo(std::weak_ptr<Fiber> weak_from, Fiber& to) {
     to.impl->previous_fiber = weak_from.lock();
 
     auto transfer = boost::context::detail::jump_fcontext(to.impl->context, &to);
-
     // "from" might no longer be valid if the thread was killed
     if (auto from = weak_from.lock()) {
         if (from->impl->previous_fiber == nullptr) {
-            ASSERT_MSG(false, "previous_fiber is nullptr!");
-            return;
+            ASSERT(false && "previous_fiber is nullptr!");
+        } else {
+            from->impl->previous_fiber->impl->context = transfer.fctx;
+            from->impl->previous_fiber->impl->guard.unlock();
+            from->impl->previous_fiber.reset();
         }
-        from->impl->previous_fiber->impl->context = transfer.fctx;
-        from->impl->previous_fiber->impl->guard.unlock();
-        from->impl->previous_fiber.reset();
     }
 }
 
diff --git a/src/common/fiber.h b/src/common/fiber.h
index 8af6ae4d3a..eb128f4bb2 100644
--- a/src/common/fiber.h
+++ b/src/common/fiber.h
@@ -45,22 +45,12 @@ public:
     /// Fiber 'from' must be the currently running fiber.
     static void YieldTo(std::weak_ptr<Fiber> weak_from, Fiber& to);
     [[nodiscard]] static std::shared_ptr<Fiber> ThreadToFiber();
-
     void SetRewindPoint(std::function<void()>&& rewind_func);
-
-    void Rewind();
-
     /// Only call from main thread's fiber
     void Exit();
-
 private:
     Fiber();
-
-    void OnRewind(boost::context::detail::transfer_t& transfer);
     void Start(boost::context::detail::transfer_t& transfer);
-    static void FiberStartFunc(boost::context::detail::transfer_t transfer);
-    static void RewindStartFunc(boost::context::detail::transfer_t transfer);
-
     struct FiberImpl;
     std::unique_ptr<FiberImpl> impl;
 };
diff --git a/src/common/wall_clock.cpp b/src/common/wall_clock.cpp
index e14bf3e651..4f9c240905 100644
--- a/src/common/wall_clock.cpp
+++ b/src/common/wall_clock.cpp
@@ -1,3 +1,6 @@
+// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
+// SPDX-License-Identifier: GPL-3.0-or-later
+
 // SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project
 // SPDX-License-Identifier: GPL-2.0-or-later
 
@@ -9,7 +12,6 @@
 #include "common/x64/native_clock.h"
 #include "common/x64/rdtsc.h"
 #endif
-
 #ifdef HAS_NCE
 #include "common/arm64/native_clock.h"
 #endif
@@ -73,8 +75,4 @@ std::unique_ptr<WallClock> CreateOptimalClock() {
 #endif
 }
 
-std::unique_ptr<WallClock> CreateStandardWallClock() {
-    return std::make_unique<StandardWallClock>();
-}
-
 } // namespace Common
diff --git a/src/common/wall_clock.h b/src/common/wall_clock.h
index 3a0c43909a..7ad6536930 100644
--- a/src/common/wall_clock.h
+++ b/src/common/wall_clock.h
@@ -1,3 +1,6 @@
+// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
+// SPDX-License-Identifier: GPL-3.0-or-later
+
 // SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project
 // SPDX-License-Identifier: GPL-2.0-or-later
 
@@ -84,8 +87,6 @@ protected:
     using CPUTickToGPUTickRatio = std::ratio<GPUTickFreq, CPUTickFreq>;
 };
 
-std::unique_ptr<WallClock> CreateOptimalClock();
-
-std::unique_ptr<WallClock> CreateStandardWallClock();
+[[nodiscard]] std::unique_ptr<WallClock> CreateOptimalClock();
 
 } // namespace Common
diff --git a/src/core/arm/dynarmic/arm_dynarmic_32.cpp b/src/core/arm/dynarmic/arm_dynarmic_32.cpp
index b57996cb8b..0fa4ca6f06 100644
--- a/src/core/arm/dynarmic/arm_dynarmic_32.cpp
+++ b/src/core/arm/dynarmic/arm_dynarmic_32.cpp
@@ -16,170 +16,160 @@ namespace Core {
 
 using namespace Common::Literals;
 
-class DynarmicCallbacks32 : public Dynarmic::A32::UserCallbacks {
-public:
-    explicit DynarmicCallbacks32(ArmDynarmic32& parent, Kernel::KProcess* process)
-        : m_parent{parent}, m_memory(process->GetMemory()),
-          m_process(process), m_debugger_enabled{parent.m_system.DebuggerEnabled()},
-          m_check_memory_access{m_debugger_enabled ||
-                                !Settings::values.cpuopt_ignore_memory_aborts.GetValue()} {}
+DynarmicCallbacks32::DynarmicCallbacks32(ArmDynarmic32& parent, Kernel::KProcess* process)
+    : m_parent{parent}, m_memory(process->GetMemory())
+    , m_process(process), m_debugger_enabled{parent.m_system.DebuggerEnabled()}
+    , m_check_memory_access{m_debugger_enabled || !Settings::values.cpuopt_ignore_memory_aborts.GetValue()}
+{}
 
-    u8 MemoryRead8(u32 vaddr) override {
-        CheckMemoryAccess(vaddr, 1, Kernel::DebugWatchpointType::Read);
-        return m_memory.Read8(vaddr);
-    }
-    u16 MemoryRead16(u32 vaddr) override {
-        CheckMemoryAccess(vaddr, 2, Kernel::DebugWatchpointType::Read);
-        return m_memory.Read16(vaddr);
-    }
-    u32 MemoryRead32(u32 vaddr) override {
-        CheckMemoryAccess(vaddr, 4, Kernel::DebugWatchpointType::Read);
-        return m_memory.Read32(vaddr);
-    }
-    u64 MemoryRead64(u32 vaddr) override {
-        CheckMemoryAccess(vaddr, 8, Kernel::DebugWatchpointType::Read);
-        return m_memory.Read64(vaddr);
-    }
-    std::optional<u32> MemoryReadCode(u32 vaddr) override {
-        if (!m_memory.IsValidVirtualAddressRange(vaddr, sizeof(u32))) {
-            return std::nullopt;
-        }
-        return m_memory.Read32(vaddr);
-    }
+u8 DynarmicCallbacks32::MemoryRead8(u32 vaddr) {
+    CheckMemoryAccess(vaddr, 1, Kernel::DebugWatchpointType::Read);
+    return m_memory.Read8(vaddr);
+}
+u16 DynarmicCallbacks32::MemoryRead16(u32 vaddr) {
+    CheckMemoryAccess(vaddr, 2, Kernel::DebugWatchpointType::Read);
+    return m_memory.Read16(vaddr);
+}
+u32 DynarmicCallbacks32::MemoryRead32(u32 vaddr) {
+    CheckMemoryAccess(vaddr, 4, Kernel::DebugWatchpointType::Read);
+    return m_memory.Read32(vaddr);
+}
+u64 DynarmicCallbacks32::MemoryRead64(u32 vaddr) {
+    CheckMemoryAccess(vaddr, 8, Kernel::DebugWatchpointType::Read);
+    return m_memory.Read64(vaddr);
+}
+std::optional<u32> DynarmicCallbacks32::MemoryReadCode(u32 vaddr) {
+    if (!m_memory.IsValidVirtualAddressRange(vaddr, sizeof(u32)))
+        return std::nullopt;
+    return m_memory.Read32(vaddr);
+}
 
-    void MemoryWrite8(u32 vaddr, u8 value) override {
-        if (CheckMemoryAccess(vaddr, 1, Kernel::DebugWatchpointType::Write)) {
-            m_memory.Write8(vaddr, value);
-        }
+void DynarmicCallbacks32::MemoryWrite8(u32 vaddr, u8 value) {
+    if (CheckMemoryAccess(vaddr, 1, Kernel::DebugWatchpointType::Write)) {
+        m_memory.Write8(vaddr, value);
     }
-    void MemoryWrite16(u32 vaddr, u16 value) override {
-        if (CheckMemoryAccess(vaddr, 2, Kernel::DebugWatchpointType::Write)) {
-            m_memory.Write16(vaddr, value);
-        }
+}
+void DynarmicCallbacks32::MemoryWrite16(u32 vaddr, u16 value) {
+    if (CheckMemoryAccess(vaddr, 2, Kernel::DebugWatchpointType::Write)) {
+        m_memory.Write16(vaddr, value);
     }
-    void MemoryWrite32(u32 vaddr, u32 value) override {
-        if (CheckMemoryAccess(vaddr, 4, Kernel::DebugWatchpointType::Write)) {
-            m_memory.Write32(vaddr, value);
-        }
+}
+void DynarmicCallbacks32::MemoryWrite32(u32 vaddr, u32 value) {
+    if (CheckMemoryAccess(vaddr, 4, Kernel::DebugWatchpointType::Write)) {
+        m_memory.Write32(vaddr, value);
     }
-    void MemoryWrite64(u32 vaddr, u64 value) override {
-        if (CheckMemoryAccess(vaddr, 8, Kernel::DebugWatchpointType::Write)) {
-            m_memory.Write64(vaddr, value);
-        }
+}
+void DynarmicCallbacks32::MemoryWrite64(u32 vaddr, u64 value) {
+    if (CheckMemoryAccess(vaddr, 8, Kernel::DebugWatchpointType::Write)) {
+        m_memory.Write64(vaddr, value);
     }
+}
 
-    bool MemoryWriteExclusive8(u32 vaddr, u8 value, u8 expected) override {
-        return CheckMemoryAccess(vaddr, 1, Kernel::DebugWatchpointType::Write) &&
-               m_memory.WriteExclusive8(vaddr, value, expected);
-    }
-    bool MemoryWriteExclusive16(u32 vaddr, u16 value, u16 expected) override {
-        return CheckMemoryAccess(vaddr, 2, Kernel::DebugWatchpointType::Write) &&
-               m_memory.WriteExclusive16(vaddr, value, expected);
-    }
-    bool MemoryWriteExclusive32(u32 vaddr, u32 value, u32 expected) override {
-        return CheckMemoryAccess(vaddr, 4, Kernel::DebugWatchpointType::Write) &&
-               m_memory.WriteExclusive32(vaddr, value, expected);
-    }
-    bool MemoryWriteExclusive64(u32 vaddr, u64 value, u64 expected) override {
-        return CheckMemoryAccess(vaddr, 8, Kernel::DebugWatchpointType::Write) &&
-               m_memory.WriteExclusive64(vaddr, value, expected);
-    }
+bool DynarmicCallbacks32::MemoryWriteExclusive8(u32 vaddr, u8 value, u8 expected) {
+    return CheckMemoryAccess(vaddr, 1, Kernel::DebugWatchpointType::Write) &&
+            m_memory.WriteExclusive8(vaddr, value, expected);
+}
+bool DynarmicCallbacks32::MemoryWriteExclusive16(u32 vaddr, u16 value, u16 expected) {
+    return CheckMemoryAccess(vaddr, 2, Kernel::DebugWatchpointType::Write) &&
+            m_memory.WriteExclusive16(vaddr, value, expected);
+}
+bool DynarmicCallbacks32::MemoryWriteExclusive32(u32 vaddr, u32 value, u32 expected) {
+    return CheckMemoryAccess(vaddr, 4, Kernel::DebugWatchpointType::Write) &&
+            m_memory.WriteExclusive32(vaddr, value, expected);
+}
+bool DynarmicCallbacks32::MemoryWriteExclusive64(u32 vaddr, u64 value, u64 expected) {
+    return CheckMemoryAccess(vaddr, 8, Kernel::DebugWatchpointType::Write) &&
+            m_memory.WriteExclusive64(vaddr, value, expected);
+}
 
-    void InterpreterFallback(u32 pc, std::size_t num_instructions) override {
-        m_parent.LogBacktrace(m_process);
-        LOG_ERROR(Core_ARM,
-                  "Unimplemented instruction @ {:#X} for {} instructions (instr = {:08X})", pc,
-                  num_instructions, m_memory.Read32(pc));
-    }
+void DynarmicCallbacks32::InterpreterFallback(u32 pc, std::size_t num_instructions) {
+    m_parent.LogBacktrace(m_process);
+    LOG_ERROR(Core_ARM,
+                "Unimplemented instruction @ {:#X} for {} instructions (instr = {:08X})", pc,
+                num_instructions, m_memory.Read32(pc));
+}
 
-    void ExceptionRaised(u32 pc, Dynarmic::A32::Exception exception) override {
-        switch (exception) {
-        case Dynarmic::A32::Exception::NoExecuteFault:
-            LOG_CRITICAL(Core_ARM, "Cannot execute instruction at unmapped address {:#08x}", pc);
-            ReturnException(pc, PrefetchAbort);
+void DynarmicCallbacks32::ExceptionRaised(u32 pc, Dynarmic::A32::Exception exception) {
+    switch (exception) {
+    case Dynarmic::A32::Exception::NoExecuteFault:
+        LOG_CRITICAL(Core_ARM, "Cannot execute instruction at unmapped address {:#08x}", pc);
+        ReturnException(pc, PrefetchAbort);
+        return;
+    default:
+        if (m_debugger_enabled) {
+            ReturnException(pc, InstructionBreakpoint);
             return;
-        default:
-            if (m_debugger_enabled) {
-                ReturnException(pc, InstructionBreakpoint);
-                return;
-            }
-
-            m_parent.LogBacktrace(m_process);
-            LOG_CRITICAL(Core_ARM,
-                         "ExceptionRaised(exception = {}, pc = {:08X}, code = {:08X}, thumb = {})",
-                         exception, pc, m_memory.Read32(pc), m_parent.IsInThumbMode());
         }
+
+        m_parent.LogBacktrace(m_process);
+        LOG_CRITICAL(Core_ARM,
+                        "ExceptionRaised(exception = {}, pc = {:08X}, code = {:08X}, thumb = {})",
+                        exception, pc, m_memory.Read32(pc), m_parent.IsInThumbMode());
     }
+}
 
-    void CallSVC(u32 swi) override {
-        m_parent.m_svc_swi = swi;
-        m_parent.m_jit->HaltExecution(SupervisorCall);
-    }
+void DynarmicCallbacks32::CallSVC(u32 swi) {
+    m_parent.m_svc_swi = swi;
+    m_parent.m_jit->HaltExecution(SupervisorCall);
+}
 
-    void AddTicks(u64 ticks) override {
-        ASSERT_MSG(!m_parent.m_uses_wall_clock, "Dynarmic ticking disabled");
+void DynarmicCallbacks32::AddTicks(u64 ticks) {
+    ASSERT_MSG(!m_parent.m_uses_wall_clock, "Dynarmic ticking disabled");
 
-        // Divide the number of ticks by the amount of CPU cores. TODO(Subv): This yields only a
-        // rough approximation of the amount of executed ticks in the system, it may be thrown off
-        // if not all cores are doing a similar amount of work. Instead of doing this, we should
-        // device a way so that timing is consistent across all cores without increasing the ticks 4
-        // times.
-        u64 amortized_ticks = ticks / Core::Hardware::NUM_CPU_CORES;
-        // Always execute at least one tick.
-        amortized_ticks = std::max<u64>(amortized_ticks, 1);
+    // Divide the number of ticks by the amount of CPU cores. TODO(Subv): This yields only a
+    // rough approximation of the amount of executed ticks in the system, it may be thrown off
+    // if not all cores are doing a similar amount of work. Instead of doing this, we should
+    // devise a way so that timing is consistent across all cores without increasing the ticks 4
+    // times.
+    u64 amortized_ticks = ticks / Core::Hardware::NUM_CPU_CORES;
+    // Always execute at least one tick.
+    amortized_ticks = std::max<u64>(amortized_ticks, 1);
 
-        m_parent.m_system.CoreTiming().AddTicks(amortized_ticks);
-    }
+    m_parent.m_system.CoreTiming().AddTicks(amortized_ticks);
+}
 
-    u64 GetTicksRemaining() override {
-        ASSERT_MSG(!m_parent.m_uses_wall_clock, "Dynarmic ticking disabled");
+u64 DynarmicCallbacks32::GetTicksRemaining() {
+    ASSERT_MSG(!m_parent.m_uses_wall_clock, "Dynarmic ticking disabled");
 
-        return std::max<s64>(m_parent.m_system.CoreTiming().GetDowncount(), 0);
-    }
-
-    bool CheckMemoryAccess(u64 addr, u64 size, Kernel::DebugWatchpointType type) {
-        if (!m_check_memory_access) {
-            return true;
-        }
-
-        if (!m_memory.IsValidVirtualAddressRange(addr, size)) {
-            LOG_CRITICAL(Core_ARM, "Stopping execution due to unmapped memory access at {:#x}",
-                         addr);
-            m_parent.m_jit->HaltExecution(PrefetchAbort);
-            return false;
-        }
-
-        if (!m_debugger_enabled) {
-            return true;
-        }
-
-        const auto match{m_parent.MatchingWatchpoint(addr, size, type)};
-        if (match) {
-            m_parent.m_halted_watchpoint = match;
-            m_parent.m_jit->HaltExecution(DataAbort);
-            return false;
-        }
+    return std::max<s64>(m_parent.m_system.CoreTiming().GetDowncount(), 0);
+}
 
+bool DynarmicCallbacks32::CheckMemoryAccess(u64 addr, u64 size, Kernel::DebugWatchpointType type) {
+    if (!m_check_memory_access) {
         return true;
     }
 
-    void ReturnException(u32 pc, Dynarmic::HaltReason hr) {
-        m_parent.GetContext(m_parent.m_breakpoint_context);
-        m_parent.m_breakpoint_context.pc = pc;
-        m_parent.m_breakpoint_context.r[15] = pc;
-        m_parent.m_jit->HaltExecution(hr);
+    if (!m_memory.IsValidVirtualAddressRange(addr, size)) {
+        LOG_CRITICAL(Core_ARM, "Stopping execution due to unmapped memory access at {:#x}",
+                        addr);
+        m_parent.m_jit->HaltExecution(PrefetchAbort);
+        return false;
     }
 
-    ArmDynarmic32& m_parent;
-    Core::Memory::Memory& m_memory;
-    Kernel::KProcess* m_process{};
-    const bool m_debugger_enabled{};
-    const bool m_check_memory_access{};
-};
+    if (!m_debugger_enabled) {
+        return true;
+    }
 
-std::shared_ptr<Dynarmic::A32::Jit> ArmDynarmic32::MakeJit(Common::PageTable* page_table) const {
+    const auto match{m_parent.MatchingWatchpoint(addr, size, type)};
+    if (match) {
+        m_parent.m_halted_watchpoint = match;
+        m_parent.m_jit->HaltExecution(DataAbort);
+        return false;
+    }
+
+    return true;
+}
+
+void DynarmicCallbacks32::ReturnException(u32 pc, Dynarmic::HaltReason hr) {
+    m_parent.GetContext(m_parent.m_breakpoint_context);
+    m_parent.m_breakpoint_context.pc = pc;
+    m_parent.m_breakpoint_context.r[15] = pc;
+    m_parent.m_jit->HaltExecution(hr);
+}
+
+void ArmDynarmic32::MakeJit(Common::PageTable* page_table) {
     Dynarmic::A32::UserConfig config;
-    config.callbacks = m_cb.get();
+    config.callbacks = std::addressof(*m_cb);
     config.coprocessors[15] = m_cp15;
     config.define_unpredictable_behaviour = true;
 
@@ -315,7 +305,7 @@ std::shared_ptr<Dynarmic::A32::Jit> ArmDynarmic32::MakeJit(Common::PageTable* pa
     default:
         break;
     }
-    return std::make_unique<Dynarmic::A32::Jit>(config);
+    m_jit.emplace(config);
 }
 
 static std::pair<u32, u32> FpscrToFpsrFpcr(u32 fpscr) {
@@ -360,21 +350,17 @@ u32 ArmDynarmic32::GetSvcNumber() const {
 }
 
 void ArmDynarmic32::GetSvcArguments(std::span<uint64_t, 8> args) const {
-    Dynarmic::A32::Jit& j = *m_jit;
+    Dynarmic::A32::Jit const& j = *m_jit;
     auto& gpr = j.Regs();
-
-    for (size_t i = 0; i < 8; i++) {
+    for (size_t i = 0; i < 8; i++)
         args[i] = gpr[i];
-    }
 }
 
 void ArmDynarmic32::SetSvcArguments(std::span<const uint64_t, 8> args) {
     Dynarmic::A32::Jit& j = *m_jit;
     auto& gpr = j.Regs();
-
-    for (size_t i = 0; i < 8; i++) {
-        gpr[i] = static_cast<u32>(args[i]);
-    }
+    for (size_t i = 0; i < 8; i++)
+        gpr[i] = u32(args[i]);
 }
 
 const Kernel::DebugWatchpoint* ArmDynarmic32::HaltedWatchpoint() const {
@@ -387,11 +373,12 @@ void ArmDynarmic32::RewindBreakpointInstruction() {
 
 ArmDynarmic32::ArmDynarmic32(System& system, bool uses_wall_clock, Kernel::KProcess* process,
                              DynarmicExclusiveMonitor& exclusive_monitor, std::size_t core_index)
-    : ArmInterface{uses_wall_clock}, m_system{system}, m_exclusive_monitor{exclusive_monitor},
-      m_cb(std::make_unique<DynarmicCallbacks32>(*this, process)),
-      m_cp15(std::make_shared<DynarmicCP15>(*this)), m_core_index{core_index} {
+    : ArmInterface{uses_wall_clock}, m_system{system}, m_exclusive_monitor{exclusive_monitor}
+    , m_cb(std::make_optional<DynarmicCallbacks32>(*this, process))
+    , m_cp15(std::make_shared<DynarmicCP15>(*this)), m_core_index{core_index}
+{
     auto& page_table_impl = process->GetPageTable().GetBasePageTable().GetImpl();
-    m_jit = MakeJit(&page_table_impl);
+    MakeJit(&page_table_impl);
 }
 
 ArmDynarmic32::~ArmDynarmic32() = default;
@@ -401,23 +388,18 @@ void ArmDynarmic32::SetTpidrroEl0(u64 value) {
 }
 
 void ArmDynarmic32::GetContext(Kernel::Svc::ThreadContext& ctx) const {
-    Dynarmic::A32::Jit& j = *m_jit;
+    Dynarmic::A32::Jit const& j = *m_jit;
     auto& gpr = j.Regs();
     auto& fpr = j.ExtRegs();
-
-    for (size_t i = 0; i < 16; i++) {
+    for (size_t i = 0; i < 16; i++)
         ctx.r[i] = gpr[i];
-    }
-
     ctx.fp = gpr[11];
     ctx.sp = gpr[13];
     ctx.lr = gpr[14];
     ctx.pc = gpr[15];
     ctx.pstate = j.Cpsr();
-
     static_assert(sizeof(fpr) <= sizeof(ctx.v));
     std::memcpy(ctx.v.data(), &fpr, sizeof(fpr));
-
     auto [fpsr, fpcr] = FpscrToFpsrFpcr(j.Fpscr());
     ctx.fpcr = fpcr;
     ctx.fpsr = fpsr;
@@ -428,16 +410,11 @@ void ArmDynarmic32::SetContext(const Kernel::Svc::ThreadContext& ctx) {
     Dynarmic::A32::Jit& j = *m_jit;
     auto& gpr = j.Regs();
     auto& fpr = j.ExtRegs();
-
-    for (size_t i = 0; i < 16; i++) {
-        gpr[i] = static_cast<u32>(ctx.r[i]);
-    }
-
+    for (size_t i = 0; i < 16; i++)
+        gpr[i] = u32(ctx.r[i]);
     j.SetCpsr(ctx.pstate);
-
     static_assert(sizeof(fpr) <= sizeof(ctx.v));
     std::memcpy(&fpr, ctx.v.data(), sizeof(fpr));
-
     j.SetFpscr(FpsrFpcrToFpscr(ctx.fpsr, ctx.fpcr));
     m_cp15->uprw = static_cast<u32>(ctx.tpidr);
 }
diff --git a/src/core/arm/dynarmic/arm_dynarmic_32.h b/src/core/arm/dynarmic/arm_dynarmic_32.h
index b580efe615..1934934bd9 100644
--- a/src/core/arm/dynarmic/arm_dynarmic_32.h
+++ b/src/core/arm/dynarmic/arm_dynarmic_32.h
@@ -1,3 +1,6 @@
+// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
+// SPDX-License-Identifier: GPL-3.0-or-later
+
 // SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project
 // SPDX-License-Identifier: GPL-2.0-or-later
 
@@ -12,16 +15,50 @@ namespace Core::Memory {
 class Memory;
 }
 
+namespace Kernel {
+enum class DebugWatchpointType : u8;
+class KProcess; // forward declaration: this header only uses Kernel::KProcess by pointer
+}
+
 namespace Core {
 
-class DynarmicCallbacks32;
+class ArmDynarmic32;
 class DynarmicCP15;
 class System;
 
+class DynarmicCallbacks32 : public Dynarmic::A32::UserCallbacks {
+public:
+    explicit DynarmicCallbacks32(ArmDynarmic32& parent, Kernel::KProcess* process);
+    u8 MemoryRead8(u32 vaddr) override;
+    u16 MemoryRead16(u32 vaddr) override;
+    u32 MemoryRead32(u32 vaddr) override;
+    u64 MemoryRead64(u32 vaddr) override;
+    std::optional<u32> MemoryReadCode(u32 vaddr) override;
+    void MemoryWrite8(u32 vaddr, u8 value) override;
+    void MemoryWrite16(u32 vaddr, u16 value) override;
+    void MemoryWrite32(u32 vaddr, u32 value) override;
+    void MemoryWrite64(u32 vaddr, u64 value) override;
+    bool MemoryWriteExclusive8(u32 vaddr, u8 value, u8 expected) override;
+    bool MemoryWriteExclusive16(u32 vaddr, u16 value, u16 expected) override;
+    bool MemoryWriteExclusive32(u32 vaddr, u32 value, u32 expected) override;
+    bool MemoryWriteExclusive64(u32 vaddr, u64 value, u64 expected) override;
+    void InterpreterFallback(u32 pc, std::size_t num_instructions) override;
+    void ExceptionRaised(u32 pc, Dynarmic::A32::Exception exception) override;
+    void CallSVC(u32 swi) override;
+    void AddTicks(u64 ticks) override;
+    u64 GetTicksRemaining() override;
+    bool CheckMemoryAccess(u64 addr, u64 size, Kernel::DebugWatchpointType type);
+    void ReturnException(u32 pc, Dynarmic::HaltReason hr);
+    ArmDynarmic32& m_parent;
+    Core::Memory::Memory& m_memory;
+    Kernel::KProcess* m_process{};
+    const bool m_debugger_enabled{};
+    const bool m_check_memory_access{};
+};
+
 class ArmDynarmic32 final : public ArmInterface {
 public:
-    ArmDynarmic32(System& system, bool uses_wall_clock, Kernel::KProcess* process,
-                  DynarmicExclusiveMonitor& exclusive_monitor, std::size_t core_index);
+    ArmDynarmic32(System& system, bool uses_wall_clock, Kernel::KProcess* process, DynarmicExclusiveMonitor& exclusive_monitor, std::size_t core_index);
     ~ArmDynarmic32() override;
 
     Architecture GetArchitecture() const override {
@@ -57,13 +94,13 @@ private:
     friend class DynarmicCallbacks32;
     friend class DynarmicCP15;
 
-    std::shared_ptr<Dynarmic::A32::Jit> MakeJit(Common::PageTable* page_table) const;
+    void MakeJit(Common::PageTable* page_table);
 
-    std::unique_ptr<DynarmicCallbacks32> m_cb{};
+    std::optional<DynarmicCallbacks32> m_cb{};
     std::shared_ptr<DynarmicCP15> m_cp15{};
     std::size_t m_core_index{};
 
-    std::shared_ptr<Dynarmic::A32::Jit> m_jit{};
+    std::optional<Dynarmic::A32::Jit> m_jit{};
 
     // SVC callback
     u32 m_svc_swi{};
diff --git a/src/core/arm/dynarmic/arm_dynarmic_64.cpp b/src/core/arm/dynarmic/arm_dynarmic_64.cpp
index ba6178c1e4..92e1a70458 100644
--- a/src/core/arm/dynarmic/arm_dynarmic_64.cpp
+++ b/src/core/arm/dynarmic/arm_dynarmic_64.cpp
@@ -13,223 +13,203 @@
 
 namespace Core {
 
-using Vector = Dynarmic::A64::Vector;
 using namespace Common::Literals;
 
-class DynarmicCallbacks64 : public Dynarmic::A64::UserCallbacks {
-public:
-    explicit DynarmicCallbacks64(ArmDynarmic64& parent, Kernel::KProcess* process)
-        : m_parent{parent}, m_memory(process->GetMemory()),
-          m_process(process), m_debugger_enabled{parent.m_system.DebuggerEnabled()},
-          m_check_memory_access{m_debugger_enabled ||
-                                !Settings::values.cpuopt_ignore_memory_aborts.GetValue()} {}
+DynarmicCallbacks64::DynarmicCallbacks64(ArmDynarmic64& parent, Kernel::KProcess* process)
+    : m_parent{parent}, m_memory(process->GetMemory())
+    , m_process(process), m_debugger_enabled{parent.m_system.DebuggerEnabled()}
+    , m_check_memory_access{m_debugger_enabled || !Settings::values.cpuopt_ignore_memory_aborts.GetValue()}
+{}
 
-    u8 MemoryRead8(u64 vaddr) override {
-        CheckMemoryAccess(vaddr, 1, Kernel::DebugWatchpointType::Read);
-        return m_memory.Read8(vaddr);
-    }
-    u16 MemoryRead16(u64 vaddr) override {
-        CheckMemoryAccess(vaddr, 2, Kernel::DebugWatchpointType::Read);
-        return m_memory.Read16(vaddr);
-    }
-    u32 MemoryRead32(u64 vaddr) override {
-        CheckMemoryAccess(vaddr, 4, Kernel::DebugWatchpointType::Read);
-        return m_memory.Read32(vaddr);
-    }
-    u64 MemoryRead64(u64 vaddr) override {
-        CheckMemoryAccess(vaddr, 8, Kernel::DebugWatchpointType::Read);
-        return m_memory.Read64(vaddr);
-    }
-    Vector MemoryRead128(u64 vaddr) override {
-        CheckMemoryAccess(vaddr, 16, Kernel::DebugWatchpointType::Read);
-        return {m_memory.Read64(vaddr), m_memory.Read64(vaddr + 8)};
-    }
-    std::optional<u32> MemoryReadCode(u64 vaddr) override {
-        if (!m_memory.IsValidVirtualAddressRange(vaddr, sizeof(u32))) {
-            return std::nullopt;
-        }
-        return m_memory.Read32(vaddr);
-    }
+u8 DynarmicCallbacks64::MemoryRead8(u64 vaddr) {
+    CheckMemoryAccess(vaddr, 1, Kernel::DebugWatchpointType::Read);
+    return m_memory.Read8(vaddr);
+}
+u16 DynarmicCallbacks64::MemoryRead16(u64 vaddr) {
+    CheckMemoryAccess(vaddr, 2, Kernel::DebugWatchpointType::Read);
+    return m_memory.Read16(vaddr);
+}
+u32 DynarmicCallbacks64::MemoryRead32(u64 vaddr) {
+    CheckMemoryAccess(vaddr, 4, Kernel::DebugWatchpointType::Read);
+    return m_memory.Read32(vaddr);
+}
+u64 DynarmicCallbacks64::MemoryRead64(u64 vaddr) {
+    CheckMemoryAccess(vaddr, 8, Kernel::DebugWatchpointType::Read);
+    return m_memory.Read64(vaddr);
+}
+Dynarmic::A64::Vector DynarmicCallbacks64::MemoryRead128(u64 vaddr) {
+    CheckMemoryAccess(vaddr, 16, Kernel::DebugWatchpointType::Read);
+    return {m_memory.Read64(vaddr), m_memory.Read64(vaddr + 8)};
+}
+std::optional<u32> DynarmicCallbacks64::MemoryReadCode(u64 vaddr) {
+    if (!m_memory.IsValidVirtualAddressRange(vaddr, sizeof(u32)))
+        return std::nullopt;
+    return m_memory.Read32(vaddr);
+}
 
-    void MemoryWrite8(u64 vaddr, u8 value) override {
-        if (CheckMemoryAccess(vaddr, 1, Kernel::DebugWatchpointType::Write)) {
-            m_memory.Write8(vaddr, value);
-        }
+void DynarmicCallbacks64::MemoryWrite8(u64 vaddr, u8 value) {
+    if (CheckMemoryAccess(vaddr, 1, Kernel::DebugWatchpointType::Write)) {
+        m_memory.Write8(vaddr, value);
     }
-    void MemoryWrite16(u64 vaddr, u16 value) override {
-        if (CheckMemoryAccess(vaddr, 2, Kernel::DebugWatchpointType::Write)) {
-            m_memory.Write16(vaddr, value);
-        }
+}
+void DynarmicCallbacks64::MemoryWrite16(u64 vaddr, u16 value) {
+    if (CheckMemoryAccess(vaddr, 2, Kernel::DebugWatchpointType::Write)) {
+        m_memory.Write16(vaddr, value);
     }
-    void MemoryWrite32(u64 vaddr, u32 value) override {
-        if (CheckMemoryAccess(vaddr, 4, Kernel::DebugWatchpointType::Write)) {
-            m_memory.Write32(vaddr, value);
-        }
+}
+void DynarmicCallbacks64::MemoryWrite32(u64 vaddr, u32 value) {
+    if (CheckMemoryAccess(vaddr, 4, Kernel::DebugWatchpointType::Write)) {
+        m_memory.Write32(vaddr, value);
     }
-    void MemoryWrite64(u64 vaddr, u64 value) override {
-        if (CheckMemoryAccess(vaddr, 8, Kernel::DebugWatchpointType::Write)) {
-            m_memory.Write64(vaddr, value);
-        }
+}
+void DynarmicCallbacks64::MemoryWrite64(u64 vaddr, u64 value) {
+    if (CheckMemoryAccess(vaddr, 8, Kernel::DebugWatchpointType::Write)) {
+        m_memory.Write64(vaddr, value);
     }
-    void MemoryWrite128(u64 vaddr, Vector value) override {
-        if (CheckMemoryAccess(vaddr, 16, Kernel::DebugWatchpointType::Write)) {
-            m_memory.Write64(vaddr, value[0]);
-            m_memory.Write64(vaddr + 8, value[1]);
-        }
+}
+void DynarmicCallbacks64::MemoryWrite128(u64 vaddr, Dynarmic::A64::Vector value) {
+    if (CheckMemoryAccess(vaddr, 16, Kernel::DebugWatchpointType::Write)) {
+        m_memory.Write64(vaddr, value[0]);
+        m_memory.Write64(vaddr + 8, value[1]);
     }
+}
 
-    bool MemoryWriteExclusive8(u64 vaddr, std::uint8_t value, std::uint8_t expected) override {
-        return CheckMemoryAccess(vaddr, 1, Kernel::DebugWatchpointType::Write) &&
-               m_memory.WriteExclusive8(vaddr, value, expected);
-    }
-    bool MemoryWriteExclusive16(u64 vaddr, std::uint16_t value, std::uint16_t expected) override {
-        return CheckMemoryAccess(vaddr, 2, Kernel::DebugWatchpointType::Write) &&
-               m_memory.WriteExclusive16(vaddr, value, expected);
-    }
-    bool MemoryWriteExclusive32(u64 vaddr, std::uint32_t value, std::uint32_t expected) override {
-        return CheckMemoryAccess(vaddr, 4, Kernel::DebugWatchpointType::Write) &&
-               m_memory.WriteExclusive32(vaddr, value, expected);
-    }
-    bool MemoryWriteExclusive64(u64 vaddr, std::uint64_t value, std::uint64_t expected) override {
-        return CheckMemoryAccess(vaddr, 8, Kernel::DebugWatchpointType::Write) &&
-               m_memory.WriteExclusive64(vaddr, value, expected);
-    }
-    bool MemoryWriteExclusive128(u64 vaddr, Vector value, Vector expected) override {
-        return CheckMemoryAccess(vaddr, 16, Kernel::DebugWatchpointType::Write) &&
-               m_memory.WriteExclusive128(vaddr, value, expected);
-    }
+bool DynarmicCallbacks64::MemoryWriteExclusive8(u64 vaddr, std::uint8_t value, std::uint8_t expected) {
+    return CheckMemoryAccess(vaddr, 1, Kernel::DebugWatchpointType::Write) &&
+            m_memory.WriteExclusive8(vaddr, value, expected);
+}
+bool DynarmicCallbacks64::MemoryWriteExclusive16(u64 vaddr, std::uint16_t value, std::uint16_t expected) {
+    return CheckMemoryAccess(vaddr, 2, Kernel::DebugWatchpointType::Write) &&
+            m_memory.WriteExclusive16(vaddr, value, expected);
+}
+bool DynarmicCallbacks64::MemoryWriteExclusive32(u64 vaddr, std::uint32_t value, std::uint32_t expected) {
+    return CheckMemoryAccess(vaddr, 4, Kernel::DebugWatchpointType::Write) &&
+            m_memory.WriteExclusive32(vaddr, value, expected);
+}
+bool DynarmicCallbacks64::MemoryWriteExclusive64(u64 vaddr, std::uint64_t value, std::uint64_t expected) {
+    return CheckMemoryAccess(vaddr, 8, Kernel::DebugWatchpointType::Write) &&
+            m_memory.WriteExclusive64(vaddr, value, expected);
+}
+bool DynarmicCallbacks64::MemoryWriteExclusive128(u64 vaddr, Dynarmic::A64::Vector value, Dynarmic::A64::Vector expected) {
+    return CheckMemoryAccess(vaddr, 16, Kernel::DebugWatchpointType::Write) &&
+            m_memory.WriteExclusive128(vaddr, value, expected);
+}
 
-    void InterpreterFallback(u64 pc, std::size_t num_instructions) override {
-        m_parent.LogBacktrace(m_process);
-        LOG_ERROR(Core_ARM,
-                  "Unimplemented instruction @ {:#X} for {} instructions (instr = {:08X})", pc,
-                  num_instructions, m_memory.Read32(pc));
+void DynarmicCallbacks64::InterpreterFallback(u64 pc, std::size_t num_instructions) {
+    m_parent.LogBacktrace(m_process);
+    LOG_ERROR(Core_ARM, "Unimplemented instruction @ {:#X} for {} instructions (instr = {:08X})", pc,
+        num_instructions, m_memory.Read32(pc));
+    ReturnException(pc, PrefetchAbort);
+}
+
+void DynarmicCallbacks64::InstructionCacheOperationRaised(Dynarmic::A64::InstructionCacheOperation op, u64 value) {
+    switch (op) {
+    case Dynarmic::A64::InstructionCacheOperation::InvalidateByVAToPoU: {
+        static constexpr u64 ICACHE_LINE_SIZE = 64;
+        const u64 cache_line_start = value & ~(ICACHE_LINE_SIZE - 1);
+        m_parent.InvalidateCacheRange(cache_line_start, ICACHE_LINE_SIZE);
+        break;
+    }
+    case Dynarmic::A64::InstructionCacheOperation::InvalidateAllToPoU:
+        m_parent.ClearInstructionCache();
+        break;
+    case Dynarmic::A64::InstructionCacheOperation::InvalidateAllToPoUInnerSharable:
+    default:
+        LOG_DEBUG(Core_ARM, "Unprocesseed instruction cache operation: {}", op);
+        break;
+    }
+    m_parent.m_jit->HaltExecution(Dynarmic::HaltReason::CacheInvalidation);
+}
+
+void DynarmicCallbacks64::ExceptionRaised(u64 pc, Dynarmic::A64::Exception exception) {
+    switch (exception) {
+    case Dynarmic::A64::Exception::WaitForInterrupt:
+    case Dynarmic::A64::Exception::WaitForEvent:
+    case Dynarmic::A64::Exception::SendEvent:
+    case Dynarmic::A64::Exception::SendEventLocal:
+    case Dynarmic::A64::Exception::Yield:
+        LOG_TRACE(Core_ARM, "ExceptionRaised(exception = {}, pc = {:08X}, code = {:08X})", static_cast<std::size_t>(exception), pc, m_memory.Read32(pc));
+        return;
+    case Dynarmic::A64::Exception::NoExecuteFault:
+        LOG_CRITICAL(Core_ARM, "Cannot execute instruction at unmapped address {:#016x}", pc);
         ReturnException(pc, PrefetchAbort);
-    }
-
-    void InstructionCacheOperationRaised(Dynarmic::A64::InstructionCacheOperation op,
-                                         u64 value) override {
-        switch (op) {
-        case Dynarmic::A64::InstructionCacheOperation::InvalidateByVAToPoU: {
-            static constexpr u64 ICACHE_LINE_SIZE = 64;
-
-            const u64 cache_line_start = value & ~(ICACHE_LINE_SIZE - 1);
-            m_parent.InvalidateCacheRange(cache_line_start, ICACHE_LINE_SIZE);
-            break;
-        }
-        case Dynarmic::A64::InstructionCacheOperation::InvalidateAllToPoU:
-            m_parent.ClearInstructionCache();
-            break;
-        case Dynarmic::A64::InstructionCacheOperation::InvalidateAllToPoUInnerSharable:
-        default:
-            LOG_DEBUG(Core_ARM, "Unprocesseed instruction cache operation: {}", op);
-            break;
-        }
-
-        m_parent.m_jit->HaltExecution(Dynarmic::HaltReason::CacheInvalidation);
-    }
-
-    void ExceptionRaised(u64 pc, Dynarmic::A64::Exception exception) override {
-        switch (exception) {
-        case Dynarmic::A64::Exception::WaitForInterrupt:
-        case Dynarmic::A64::Exception::WaitForEvent:
-        case Dynarmic::A64::Exception::SendEvent:
-        case Dynarmic::A64::Exception::SendEventLocal:
-        case Dynarmic::A64::Exception::Yield:
-            LOG_TRACE(Core_ARM, "ExceptionRaised(exception = {}, pc = {:08X}, code = {:08X})", static_cast<std::size_t>(exception), pc, m_memory.Read32(pc));
-            return;
-        case Dynarmic::A64::Exception::NoExecuteFault:
-            LOG_CRITICAL(Core_ARM, "Cannot execute instruction at unmapped address {:#016x}", pc);
-            ReturnException(pc, PrefetchAbort);
-            return;
-        default:
-            if (m_debugger_enabled) {
-                ReturnException(pc, InstructionBreakpoint);
-            } else {
-                m_parent.LogBacktrace(m_process);
-                LOG_CRITICAL(Core_ARM, "ExceptionRaised(exception = {}, pc = {:08X}, code = {:08X})", static_cast<std::size_t>(exception), pc, m_memory.Read32(pc));
-            }
+        return;
+    default:
+        if (m_debugger_enabled) {
+            ReturnException(pc, InstructionBreakpoint);
+        } else {
+            m_parent.LogBacktrace(m_process);
+            LOG_CRITICAL(Core_ARM, "ExceptionRaised(exception = {}, pc = {:08X}, code = {:08X})", static_cast<std::size_t>(exception), pc, m_memory.Read32(pc));
         }
     }
+}
 
-    void CallSVC(u32 svc) override {
-        m_parent.m_svc = svc;
-        m_parent.m_jit->HaltExecution(SupervisorCall);
-    }
+void DynarmicCallbacks64::CallSVC(u32 svc) {
+    m_parent.m_svc = svc;
+    m_parent.m_jit->HaltExecution(SupervisorCall);
+}
 
-    void AddTicks(u64 ticks) override {
-        ASSERT_MSG(!m_parent.m_uses_wall_clock, "Dynarmic ticking disabled");
+void DynarmicCallbacks64::AddTicks(u64 ticks) {
+    ASSERT_MSG(!m_parent.m_uses_wall_clock, "Dynarmic ticking disabled");
 
-        // Divide the number of ticks by the amount of CPU cores. TODO(Subv): This yields only a
-        // rough approximation of the amount of executed ticks in the system, it may be thrown off
-        // if not all cores are doing a similar amount of work. Instead of doing this, we should
-        // device a way so that timing is consistent across all cores without increasing the ticks 4
-        // times.
-        u64 amortized_ticks = ticks / Core::Hardware::NUM_CPU_CORES;
-        // Always execute at least one tick.
-        amortized_ticks = std::max<u64>(amortized_ticks, 1);
+    // Divide the number of ticks by the amount of CPU cores. TODO(Subv): This yields only a
+    // rough approximation of the amount of executed ticks in the system, it may be thrown off
+    // if not all cores are doing a similar amount of work. Instead of doing this, we should
+    // devise a way so that timing is consistent across all cores without increasing the ticks 4
+    // times.
+    u64 amortized_ticks = ticks / Core::Hardware::NUM_CPU_CORES;
+    // Always execute at least one tick.
+    amortized_ticks = std::max<u64>(amortized_ticks, 1);
 
-        m_parent.m_system.CoreTiming().AddTicks(amortized_ticks);
-    }
+    m_parent.m_system.CoreTiming().AddTicks(amortized_ticks);
+}
 
-    u64 GetTicksRemaining() override {
-        ASSERT_MSG(!m_parent.m_uses_wall_clock, "Dynarmic ticking disabled");
+u64 DynarmicCallbacks64::GetTicksRemaining() {
+    ASSERT_MSG(!m_parent.m_uses_wall_clock, "Dynarmic ticking disabled");
+    return std::max<s64>(m_parent.m_system.CoreTiming().GetDowncount(), 0); // never report a negative downcount
+}
 
-        return std::max<s64>(m_parent.m_system.CoreTiming().GetDowncount(), 0);
-    }
-
-    u64 GetCNTPCT() override {
-        return m_parent.m_system.CoreTiming().GetClockTicks();
-    }
-
-    bool CheckMemoryAccess(u64 addr, u64 size, Kernel::DebugWatchpointType type) {
-        if (!m_check_memory_access) {
-            return true;
-        }
-
-        if (!m_memory.IsValidVirtualAddressRange(addr, size)) {
-            LOG_CRITICAL(Core_ARM, "Stopping execution due to unmapped memory access at {:#x}",
-                         addr);
-            m_parent.m_jit->HaltExecution(PrefetchAbort);
-            return false;
-        }
-
-        if (!m_debugger_enabled) {
-            return true;
-        }
-
-        const auto match{m_parent.MatchingWatchpoint(addr, size, type)};
-        if (match) {
-            m_parent.m_halted_watchpoint = match;
-            m_parent.m_jit->HaltExecution(DataAbort);
-            return false;
-        }
+u64 DynarmicCallbacks64::GetCNTPCT() {
+    return m_parent.m_system.CoreTiming().GetClockTicks();
+}
 
+bool DynarmicCallbacks64::CheckMemoryAccess(u64 addr, u64 size, Kernel::DebugWatchpointType type) {
+    if (!m_check_memory_access) {
         return true;
     }
 
-    void ReturnException(u64 pc, Dynarmic::HaltReason hr) {
-        m_parent.GetContext(m_parent.m_breakpoint_context);
-        m_parent.m_breakpoint_context.pc = pc;
-        m_parent.m_jit->HaltExecution(hr);
+    if (!m_memory.IsValidVirtualAddressRange(addr, size)) {
+        LOG_CRITICAL(Core_ARM, "Stopping execution due to unmapped memory access at {:#x}",
+                        addr);
+        m_parent.m_jit->HaltExecution(PrefetchAbort);
+        return false;
     }
 
-    ArmDynarmic64& m_parent;
-    Core::Memory::Memory& m_memory;
-    u64 m_tpidrro_el0{};
-    u64 m_tpidr_el0{};
-    Kernel::KProcess* m_process{};
-    const bool m_debugger_enabled{};
-    const bool m_check_memory_access{};
-    static constexpr u64 MinimumRunCycles = 10000U;
-};
+    if (!m_debugger_enabled) {
+        return true;
+    }
 
-std::shared_ptr<Dynarmic::A64::Jit> ArmDynarmic64::MakeJit(Common::PageTable* page_table,
-                                                           std::size_t address_space_bits) const {
+    const auto match{m_parent.MatchingWatchpoint(addr, size, type)};
+    if (match) {
+        m_parent.m_halted_watchpoint = match;
+        m_parent.m_jit->HaltExecution(DataAbort);
+        return false;
+    }
+
+    return true;
+}
+
+void DynarmicCallbacks64::ReturnException(u64 pc, Dynarmic::HaltReason hr) {
+    m_parent.GetContext(m_parent.m_breakpoint_context);
+    m_parent.m_breakpoint_context.pc = pc;
+    m_parent.m_jit->HaltExecution(hr);
+}
+
+void ArmDynarmic64::MakeJit(Common::PageTable* page_table, std::size_t address_space_bits) {
     Dynarmic::A64::UserConfig config;
 
     // Callbacks
-    config.callbacks = m_cb.get();
+    config.callbacks = std::addressof(*m_cb);
 
     // Memory
     if (page_table) {
@@ -375,7 +355,7 @@ std::shared_ptr<Dynarmic::A64::Jit> ArmDynarmic64::MakeJit(Common::PageTable* pa
     default:
         break;
     }
-    return std::make_shared<Dynarmic::A64::Jit>(config);
+    m_jit.emplace(config);
 }
 
 HaltReason ArmDynarmic64::RunThread(Kernel::KThread* thread) {
@@ -393,19 +373,15 @@ u32 ArmDynarmic64::GetSvcNumber() const {
 }
 
 void ArmDynarmic64::GetSvcArguments(std::span<uint64_t, 8> args) const {
-    Dynarmic::A64::Jit& j = *m_jit;
-
-    for (size_t i = 0; i < 8; i++) {
+    Dynarmic::A64::Jit const& j = *m_jit;
+    for (size_t i = 0; i < 8; i++)
         args[i] = j.GetRegister(i);
-    }
 }
 
 void ArmDynarmic64::SetSvcArguments(std::span<const uint64_t, 8> args) {
     Dynarmic::A64::Jit& j = *m_jit;
-
-    for (size_t i = 0; i < 8; i++) {
+    for (size_t i = 0; i < 8; i++)
         j.SetRegister(i, args[i]);
-    }
 }
 
 const Kernel::DebugWatchpoint* ArmDynarmic64::HaltedWatchpoint() const {
@@ -416,13 +392,14 @@ void ArmDynarmic64::RewindBreakpointInstruction() {
     this->SetContext(m_breakpoint_context);
 }
 
-ArmDynarmic64::ArmDynarmic64(System& system, bool uses_wall_clock, Kernel::KProcess* process,
-                             DynarmicExclusiveMonitor& exclusive_monitor, std::size_t core_index)
-    : ArmInterface{uses_wall_clock}, m_system{system}, m_exclusive_monitor{exclusive_monitor},
-      m_cb(std::make_unique<DynarmicCallbacks64>(*this, process)), m_core_index{core_index} {
+ArmDynarmic64::ArmDynarmic64(System& system, bool uses_wall_clock, Kernel::KProcess* process, DynarmicExclusiveMonitor& exclusive_monitor, std::size_t core_index)
+    : ArmInterface{uses_wall_clock}, m_system{system}, m_exclusive_monitor{exclusive_monitor}
+    , m_cb(std::make_optional<DynarmicCallbacks64>(*this, process))
+    , m_core_index{core_index}
+{
     auto& page_table = process->GetPageTable().GetBasePageTable();
     auto& page_table_impl = page_table.GetImpl();
-    m_jit = MakeJit(&page_table_impl, page_table.GetAddressSpaceWidth());
+    MakeJit(&page_table_impl, page_table.GetAddressSpaceWidth());
 }
 
 ArmDynarmic64::~ArmDynarmic64() = default;
@@ -432,17 +409,14 @@ void ArmDynarmic64::SetTpidrroEl0(u64 value) {
 }
 
 void ArmDynarmic64::GetContext(Kernel::Svc::ThreadContext& ctx) const {
-    Dynarmic::A64::Jit& j = *m_jit;
+    Dynarmic::A64::Jit const& j = *m_jit;
     auto gpr = j.GetRegisters();
     auto fpr = j.GetVectors();
-
     // TODO: this is inconvenient
-    for (size_t i = 0; i < 29; i++) {
+    for (size_t i = 0; i < 29; i++)
         ctx.r[i] = gpr[i];
-    }
     ctx.fp = gpr[29];
     ctx.lr = gpr[30];
-
     ctx.sp = j.GetSP();
     ctx.pc = j.GetPC();
     ctx.pstate = j.GetPstate();
@@ -454,16 +428,12 @@ void ArmDynarmic64::GetContext(Kernel::Svc::ThreadContext& ctx) const {
 
 void ArmDynarmic64::SetContext(const Kernel::Svc::ThreadContext& ctx) {
     Dynarmic::A64::Jit& j = *m_jit;
-
     // TODO: this is inconvenient
     std::array<u64, 31> gpr;
-
-    for (size_t i = 0; i < 29; i++) {
+    for (size_t i = 0; i < 29; i++)
         gpr[i] = ctx.r[i];
-    }
     gpr[29] = ctx.fp;
     gpr[30] = ctx.lr;
-
     j.SetRegisters(gpr);
     j.SetSP(ctx.sp);
     j.SetPC(ctx.pc);
diff --git a/src/core/arm/dynarmic/arm_dynarmic_64.h b/src/core/arm/dynarmic/arm_dynarmic_64.h
index 08cd982b30..2ea1505ce7 100644
--- a/src/core/arm/dynarmic/arm_dynarmic_64.h
+++ b/src/core/arm/dynarmic/arm_dynarmic_64.h
@@ -1,3 +1,6 @@
+// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
+// SPDX-License-Identifier: GPL-3.0-or-later
+
 // SPDX-FileCopyrightText: Copyright 2018 yuzu Emulator Project
 // SPDX-License-Identifier: GPL-2.0-or-later
 
@@ -17,12 +20,57 @@ namespace Core::Memory {
 class Memory;
 }
 
+namespace Kernel {
+enum class DebugWatchpointType : u8;
+class KProcess; // forward declaration (spelled to match the Kernel::KProcess* uses below)
+}
+
 namespace Core {
 
-class DynarmicCallbacks64;
+class ArmDynarmic64;
 class DynarmicExclusiveMonitor;
 class System;
 
+class DynarmicCallbacks64 : public Dynarmic::A64::UserCallbacks { // Dynarmic A64 callback set; stored by value in ArmDynarmic64::m_cb
+public:
+    explicit DynarmicCallbacks64(ArmDynarmic64& parent, Kernel::KProcess* process);
+
+    u8 MemoryRead8(u64 vaddr) override;
+    u16 MemoryRead16(u64 vaddr) override;
+    u32 MemoryRead32(u64 vaddr) override;
+    u64 MemoryRead64(u64 vaddr) override;
+    Dynarmic::A64::Vector MemoryRead128(u64 vaddr) override;
+    std::optional<u32> MemoryReadCode(u64 vaddr) override;
+    void MemoryWrite8(u64 vaddr, u8 value) override;
+    void MemoryWrite16(u64 vaddr, u16 value) override;
+    void MemoryWrite32(u64 vaddr, u32 value) override;
+    void MemoryWrite64(u64 vaddr, u64 value) override;
+    void MemoryWrite128(u64 vaddr, Dynarmic::A64::Vector value) override;
+    bool MemoryWriteExclusive8(u64 vaddr, std::uint8_t value, std::uint8_t expected) override;
+    bool MemoryWriteExclusive16(u64 vaddr, std::uint16_t value, std::uint16_t expected) override;
+    bool MemoryWriteExclusive32(u64 vaddr, std::uint32_t value, std::uint32_t expected) override;
+    bool MemoryWriteExclusive64(u64 vaddr, std::uint64_t value, std::uint64_t expected) override;
+    bool MemoryWriteExclusive128(u64 vaddr, Dynarmic::A64::Vector value, Dynarmic::A64::Vector expected) override;
+    void InterpreterFallback(u64 pc, std::size_t num_instructions) override;
+    void InstructionCacheOperationRaised(Dynarmic::A64::InstructionCacheOperation op, u64 value) override;
+    void ExceptionRaised(u64 pc, Dynarmic::A64::Exception exception) override;
+    void CallSVC(u32 svc) override;
+    void AddTicks(u64 ticks) override;
+    u64 GetTicksRemaining() override;
+    u64 GetCNTPCT() override;
+    bool CheckMemoryAccess(u64 addr, u64 size, Kernel::DebugWatchpointType type); // not an override: helper specific to this class
+    void ReturnException(u64 pc, Dynarmic::HaltReason hr); // not an override: helper specific to this class
+
+    ArmDynarmic64& m_parent; // the ArmDynarmic64 that constructs this object with *this
+    Core::Memory::Memory& m_memory;
+    u64 m_tpidrro_el0{};
+    u64 m_tpidr_el0{};
+    Kernel::KProcess* m_process{};
+    const bool m_debugger_enabled{}; // const: fixed at construction
+    const bool m_check_memory_access{}; // const: fixed at construction
+    static constexpr u64 MinimumRunCycles = 10000U; // NOTE(review): presumably a floor on ticks per run slice — confirm in the .cpp
+};
+
 class ArmDynarmic64 final : public ArmInterface {
 public:
     ArmDynarmic64(System& system, bool uses_wall_clock, Kernel::KProcess* process,
@@ -59,12 +107,11 @@ private:
 private:
     friend class DynarmicCallbacks64;
 
-    std::shared_ptr<Dynarmic::A64::Jit> MakeJit(Common::PageTable* page_table,
-                                                std::size_t address_space_bits) const;
-    std::unique_ptr<DynarmicCallbacks64> m_cb{};
+    void MakeJit(Common::PageTable* page_table, std::size_t address_space_bits);
+    std::optional<DynarmicCallbacks64> m_cb{};
     std::size_t m_core_index{};
 
-    std::shared_ptr<Dynarmic::A64::Jit> m_jit{};
+    std::optional<Dynarmic::A64::Jit> m_jit{};
 
     // SVC callback
     u32 m_svc{};
diff --git a/src/core/core.cpp b/src/core/core.cpp
index bf97184f8f..aea2b2b060 100644
--- a/src/core/core.cpp
+++ b/src/core/core.cpp
@@ -112,11 +112,10 @@ struct System::Impl {
     u64 program_id;
 
     void Initialize(System& system) {
-        device_memory = std::make_unique<Core::DeviceMemory>();
+        device_memory.emplace();
 
         is_multicore = Settings::values.use_multi_core.GetValue();
-        extended_memory_layout =
-            Settings::values.memory_layout_mode.GetValue() != Settings::MemoryLayout::Memory_4Gb;
+        extended_memory_layout = Settings::values.memory_layout_mode.GetValue() != Settings::MemoryLayout::Memory_4Gb;
 
         core_timing.SetMulticore(is_multicore);
         core_timing.Initialize([&system]() { system.RegisterHostThread(); });
@@ -132,7 +131,7 @@ struct System::Impl {
         // Create default implementations of applets if one is not provided.
         frontend_applets.SetDefaultAppletsIfMissing();
 
-        is_async_gpu = Settings::values.use_asynchronous_gpu_emulation.GetValue();
+        auto const is_async_gpu = Settings::values.use_asynchronous_gpu_emulation.GetValue();
 
         kernel.SetMulticore(is_multicore);
         cpu_manager.SetMulticore(is_multicore);
@@ -254,7 +253,7 @@ struct System::Impl {
     }
 
     void InitializeDebugger(System& system, u16 port) {
-        debugger = std::make_unique<Debugger>(system, port);
+        debugger.emplace(system, port);
     }
 
     void InitializeKernel(System& system) {
@@ -268,24 +267,22 @@ struct System::Impl {
     }
 
     SystemResultStatus SetupForApplicationProcess(System& system, Frontend::EmuWindow& emu_window) {
-        host1x_core = std::make_unique<Tegra::Host1x::Host1x>(system);
+        host1x_core.emplace(system);
         gpu_core = VideoCore::CreateGPU(emu_window, system);
-        if (!gpu_core) {
+        if (!gpu_core)
             return SystemResultStatus::ErrorVideoCore;
-        }
 
-        audio_core = std::make_unique<AudioCore::AudioCore>(system);
+        audio_core.emplace(system);
 
         service_manager = std::make_shared<Service::SM::ServiceManager>(kernel);
-        services =
-            std::make_unique<Service::Services>(service_manager, system, stop_event.get_token());
+        services.emplace(service_manager, system, stop_event.get_token());
 
         is_powered_on = true;
         exit_locked = false;
         exit_requested = false;
 
         if (Settings::values.enable_renderdoc_hotkey) {
-            renderdoc_api = std::make_unique<Tools::RenderdocAPI>();
+            renderdoc_api.emplace();
         }
 
         LOG_DEBUG(Core, "Initialized OK");
@@ -303,16 +300,11 @@ struct System::Impl {
         // Create the application process
         Loader::ResultStatus load_result{};
         std::vector<u8> control;
-        auto process =
-            Service::AM::CreateApplicationProcess(control, app_loader, load_result, system, file,
-                                                  params.program_id, params.program_index);
-
+        auto process = Service::AM::CreateApplicationProcess(control, app_loader, load_result, system, file, params.program_id, params.program_index);
         if (load_result != Loader::ResultStatus::Success) {
             LOG_CRITICAL(Core, "Failed to load ROM (Error {})!", load_result);
             ShutdownMainProcess();
-
-            return static_cast<SystemResultStatus>(
-                static_cast<u32>(SystemResultStatus::ErrorLoader) + static_cast<u32>(load_result));
+            return SystemResultStatus(u32(SystemResultStatus::ErrorLoader) + u32(load_result));
         }
 
         if (!app_loader) {
@@ -337,8 +329,7 @@ struct System::Impl {
         // Set up the rest of the system.
         SystemResultStatus init_result{SetupForApplicationProcess(system, emu_window)};
         if (init_result != SystemResultStatus::Success) {
-            LOG_CRITICAL(Core, "Failed to initialize system (Error {})!",
-                         static_cast<int>(init_result));
+            LOG_CRITICAL(Core, "Failed to initialize system (Error {})!", int(init_result));
             ShutdownMainProcess();
             return init_result;
         }
@@ -361,24 +352,19 @@ struct System::Impl {
             }
         }
 
-        perf_stats = std::make_unique<PerfStats>(params.program_id);
+        perf_stats.emplace(params.program_id);
         // Reset counters and set time origin to current frame
         GetAndResetPerfStats();
         perf_stats->BeginSystemFrame();
 
-        std::string title_version;
-        const FileSys::PatchManager pm(params.program_id, system.GetFileSystemController(),
-                                       system.GetContentProvider());
-        const auto metadata = pm.GetControlMetadata();
-        if (metadata.first != nullptr) {
-            title_version = metadata.first->GetVersionString();
-        }
+        const FileSys::PatchManager pm(params.program_id, system.GetFileSystemController(), system.GetContentProvider());
+        auto const metadata = pm.GetControlMetadata();
+        std::string title_version = metadata.first != nullptr ? metadata.first->GetVersionString() : "";
 
         if (app_loader->ReadProgramId(program_id) != Loader::ResultStatus::Success) {
             LOG_ERROR(Core, "Failed to find program id for ROM");
         }
 
-
         GameSettings::LoadOverrides(program_id, gpu_core->Renderer());
         if (auto room_member = Network::GetRoomMember().lock()) {
             Network::GameInfo game_info;
@@ -387,9 +373,7 @@ struct System::Impl {
             game_info.version = title_version;
             room_member->SendGameInfo(game_info);
         }
-
-        status = SystemResultStatus::Success;
-        return status;
+        return SystemResultStatus::Success;
     }
 
     void ShutdownMainProcess() {
@@ -448,112 +432,79 @@ struct System::Impl {
     }
 
     Loader::ResultStatus GetGameName(std::string& out) const {
-        if (app_loader == nullptr)
-            return Loader::ResultStatus::ErrorNotInitialized;
-        return app_loader->ReadTitle(out);
-    }
-
-    void SetStatus(SystemResultStatus new_status, const char* details = nullptr) {
-        status = new_status;
-        if (details) {
-            status_details = details;
-        }
+        return app_loader ? app_loader->ReadTitle(out) : Loader::ResultStatus::ErrorNotInitialized;
     }
 
     PerfStatsResults GetAndResetPerfStats() {
         return perf_stats->GetAndResetStats(core_timing.GetGlobalTimeUs());
     }
 
-    mutable std::mutex suspend_guard;
-    std::atomic_bool is_paused{};
-    std::atomic<bool> is_shutting_down{};
-
     Timing::CoreTiming core_timing;
     Kernel::KernelCore kernel;
     /// RealVfsFilesystem instance
     FileSys::VirtualFilesystem virtual_filesystem;
-    /// ContentProviderUnion instance
-    std::unique_ptr<FileSys::ContentProviderUnion> content_provider;
     Service::FileSystem::FileSystemController fs_controller;
-    /// AppLoader used to load the current executing application
-    std::unique_ptr<Loader::AppLoader> app_loader;
-    std::unique_ptr<Tegra::GPU> gpu_core;
-    std::unique_ptr<Tegra::Host1x::Host1x> host1x_core;
-    std::unique_ptr<Core::DeviceMemory> device_memory;
-    std::unique_ptr<AudioCore::AudioCore> audio_core;
     Core::HID::HIDCore hid_core;
-
     CpuManager cpu_manager;
-    std::atomic_bool is_powered_on{};
-    bool exit_locked = false;
-    bool exit_requested = false;
-
-    bool nvdec_active{};
-
     Reporter reporter;
-    std::unique_ptr<Memory::CheatEngine> cheat_engine;
-    std::unique_ptr<Tools::Freezer> memory_freezer;
-    std::array<u8, 0x20> build_id{};
-
-    std::unique_ptr<Tools::RenderdocAPI> renderdoc_api;
-
     /// Applets
     Service::AM::AppletManager applet_manager;
     Service::AM::Frontend::FrontendAppletHolder frontend_applets;
-
     /// APM (Performance) services
     Service::APM::Controller apm_controller{core_timing};
-
     /// Service State
     Service::Glue::ARPManager arp_manager;
     Service::Account::ProfileManager profile_manager;
+    /// Network instance
+    Network::NetworkInstance network_instance;
+    Core::SpeedLimiter speed_limiter;
+    ExecuteProgramCallback execute_program_callback;
+    ExitCallback exit_callback;
+
+    std::optional<Service::Services> services;
+    std::optional<Core::Debugger> debugger;
+    std::optional<Service::KernelHelpers::ServiceContext> general_channel_context;
+    std::optional<Service::Event> general_channel_event;
+    std::optional<Core::PerfStats> perf_stats;
+    std::optional<Tegra::Host1x::Host1x> host1x_core;
+    std::optional<Core::DeviceMemory> device_memory;
+    std::optional<AudioCore::AudioCore> audio_core;
+    std::optional<Memory::CheatEngine> cheat_engine;
+    std::optional<Tools::Freezer> memory_freezer;
+    std::optional<Tools::RenderdocAPI> renderdoc_api;
+
+    std::array<Core::GPUDirtyMemoryManager, Core::Hardware::NUM_CPU_CORES> gpu_dirty_memory_managers;
+    std::vector<std::vector<u8>> user_channel;
+    std::vector<std::vector<u8>> general_channel;
+
+    std::array<u64, Core::Hardware::NUM_CPU_CORES> dynarmic_ticks{};
+    std::array<u8, 0x20> build_id{};
 
     /// Service manager
     std::shared_ptr<Service::SM::ServiceManager> service_manager;
-
-    /// Services
-    std::unique_ptr<Service::Services> services;
-
-    /// Network instance
-    Network::NetworkInstance network_instance;
-
-    /// Debugger
-    std::unique_ptr<Core::Debugger> debugger;
-
-    SystemResultStatus status = SystemResultStatus::Success;
-    std::string status_details = "";
-
-    std::unique_ptr<Core::PerfStats> perf_stats;
-    Core::SpeedLimiter speed_limiter;
-
-    bool is_multicore{};
-    bool is_async_gpu{};
-    bool extended_memory_layout{};
-
-    ExecuteProgramCallback execute_program_callback;
-    ExitCallback exit_callback;
+    /// ContentProviderUnion instance
+    std::unique_ptr<FileSys::ContentProviderUnion> content_provider;
+    /// AppLoader used to load the current executing application
+    std::unique_ptr<Loader::AppLoader> app_loader;
+    std::unique_ptr<Tegra::GPU> gpu_core;
     std::stop_source stop_event;
 
-    std::array<u64, Core::Hardware::NUM_CPU_CORES> dynarmic_ticks{};
-
-    std::array<Core::GPUDirtyMemoryManager, Core::Hardware::NUM_CPU_CORES>
-        gpu_dirty_memory_managers;
-
-    std::deque<std::vector<u8>> user_channel;
-
+    mutable std::mutex suspend_guard;
     std::mutex general_channel_mutex;
-    std::deque<std::vector<u8>> general_channel;
-    std::unique_ptr<Service::KernelHelpers::ServiceContext> general_channel_context; // lazy
-    std::unique_ptr<Service::Event> general_channel_event;                           // lazy
-    bool general_channel_initialized{false};
+    std::atomic_bool is_paused{};
+    std::atomic_bool is_shutting_down{};
+    std::atomic_bool is_powered_on{};
+    bool is_multicore = false; // plain bools on purpose: adjacent bit-fields share one memory location, so writers of different flags would race
+    bool extended_memory_layout = false;
+    bool exit_locked = false;
+    bool exit_requested = false;
+    bool nvdec_active = false;
 
     void EnsureGeneralChannelInitialized(System& system) {
-        if (general_channel_initialized) {
-            return;
+        if (!general_channel_event) { // an engaged event means the lazy setup below already ran
+            general_channel_context.emplace(system, "GeneralChannel");
+            general_channel_event.emplace(*general_channel_context); // built from the context created just above
         }
-        general_channel_context = std::make_unique<Service::KernelHelpers::ServiceContext>(system, "GeneralChannel");
-        general_channel_event = std::make_unique<Service::Event>(*general_channel_context);
-        general_channel_initialized = true;
     }
 };
 
@@ -776,14 +727,6 @@ Loader::ResultStatus System::GetGameName(std::string& out) const {
     return impl->GetGameName(out);
 }
 
-void System::SetStatus(SystemResultStatus new_status, const char* details) {
-    impl->SetStatus(new_status, details);
-}
-
-const std::string& System::GetStatusDetails() const {
-    return impl->status_details;
-}
-
 Loader::AppLoader& System::GetAppLoader() {
     return *impl->app_loader;
 }
@@ -803,7 +746,7 @@ FileSys::VirtualFilesystem System::GetFilesystem() const {
 void System::RegisterCheatList(const std::vector<Memory::CheatEntry>& list,
                                const std::array<u8, 32>& build_id, u64 main_region_begin,
                                u64 main_region_size) {
-    impl->cheat_engine = std::make_unique<Memory::CheatEngine>(*this, list, build_id);
+    impl->cheat_engine.emplace(*this, list, build_id);
     impl->cheat_engine->SetMainMemoryParameters(main_region_begin, main_region_size);
 }
 
@@ -964,11 +907,13 @@ void System::ExecuteProgram(std::size_t program_index) {
     }
 }
 
-std::deque<std::vector<u8>>& System::GetUserChannel() {
+/// @brief Gets a reference to the user channel stack.
+/// It is used to transfer data between programs.
+std::vector<std::vector<u8>>& System::GetUserChannel() {
     return impl->user_channel;
 }
 
-std::deque<std::vector<u8>>& System::GetGeneralChannel() {
+std::vector<std::vector<u8>>& System::GetGeneralChannel() {
     return impl->general_channel;
 }
 
@@ -984,7 +929,7 @@ void System::PushGeneralChannelData(std::vector<u8>&& data) {
 
 bool System::TryPopGeneralChannel(std::vector<u8>& out_data) {
     std::scoped_lock lk{impl->general_channel_mutex};
-    if (!impl->general_channel_initialized || impl->general_channel.empty()) {
+    if (!impl->general_channel_event || impl->general_channel.empty()) {
         return false;
     }
     out_data = std::move(impl->general_channel.back());
diff --git a/src/core/core.h b/src/core/core.h
index 60bf73d4e1..702c5cc81b 100644
--- a/src/core/core.h
+++ b/src/core/core.h
@@ -325,10 +325,6 @@ public:
     /// Gets the name of the current game
     [[nodiscard]] Loader::ResultStatus GetGameName(std::string& out) const;
 
-    void SetStatus(SystemResultStatus new_status, const char* details);
-
-    [[nodiscard]] const std::string& GetStatusDetails() const;
-
     [[nodiscard]] Loader::AppLoader& GetAppLoader();
     [[nodiscard]] const Loader::AppLoader& GetAppLoader() const;
 
@@ -424,13 +420,8 @@ public:
      */
     void ExecuteProgram(std::size_t program_index);
 
-    /**
-     * Gets a reference to the user channel stack.
-     * It is used to transfer data between programs.
-     */
-    [[nodiscard]] std::deque<std::vector<u8>>& GetUserChannel();
-
-    [[nodiscard]] std::deque<std::vector<u8>>& GetGeneralChannel();
+    [[nodiscard]] std::vector<std::vector<u8>>& GetUserChannel();
+    [[nodiscard]] std::vector<std::vector<u8>>& GetGeneralChannel();
     void PushGeneralChannelData(std::vector<u8>&& data);
     bool TryPopGeneralChannel(std::vector<u8>& out_data);
     [[nodiscard]] Service::Event& GetGeneralChannelEvent();
diff --git a/src/core/core_timing.cpp b/src/core/core_timing.cpp
index 3c847c8359..5a582c8cff 100644
--- a/src/core/core_timing.cpp
+++ b/src/core/core_timing.cpp
@@ -53,13 +53,6 @@ CoreTiming::~CoreTiming() {
     Reset();
 }
 
-void CoreTiming::ThreadEntry(CoreTiming& instance) {
-    Common::SetCurrentThreadName("HostTiming");
-    Common::SetCurrentThreadPriority(Common::ThreadPriority::High);
-    instance.on_thread_init();
-    instance.ThreadLoop();
-}
-
 void CoreTiming::Initialize(std::function<void()>&& on_thread_init_) {
     Reset();
     on_thread_init = std::move(on_thread_init_);
@@ -67,7 +60,12 @@ void CoreTiming::Initialize(std::function<void()>&& on_thread_init_) {
     shutting_down = false;
     cpu_ticks = 0;
     if (is_multicore) {
-        timer_thread = std::make_unique<std::jthread>(ThreadEntry, std::ref(*this));
+        timer_thread.emplace([](CoreTiming& instance) {
+            Common::SetCurrentThreadName("HostTiming");
+            Common::SetCurrentThreadPriority(Common::ThreadPriority::High);
+            instance.on_thread_init();
+            instance.ThreadLoop();
+        }, std::ref(*this));
     }
 }
 
diff --git a/src/core/core_timing.h b/src/core/core_timing.h
index 7e4dff7f3d..ae9f56d519 100644
--- a/src/core/core_timing.h
+++ b/src/core/core_timing.h
@@ -1,3 +1,6 @@
+// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
+// SPDX-License-Identifier: GPL-3.0-or-later
+
 // SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project
 // SPDX-License-Identifier: GPL-2.0-or-later
 
@@ -140,8 +143,6 @@ public:
 
 private:
     struct Event;
-
-    static void ThreadEntry(CoreTiming& instance);
     void ThreadLoop();
 
     void Reset();
@@ -164,7 +165,7 @@ private:
     Common::Event pause_event{};
     mutable std::mutex basic_lock;
     std::mutex advance_lock;
-    std::unique_ptr<std::jthread> timer_thread;
+    std::optional<std::jthread> timer_thread;
     std::atomic<bool> paused{};
     std::atomic<bool> paused_set{};
     std::atomic<bool> wait_set{};
diff --git a/src/core/hle/kernel/k_process.cpp b/src/core/hle/kernel/k_process.cpp
index 322f971ba3..082049f957 100644
--- a/src/core/hle/kernel/k_process.cpp
+++ b/src/core/hle/kernel/k_process.cpp
@@ -1148,9 +1148,17 @@ Result KProcess::GetThreadList(s32* out_num_threads, KProcessAddress out_thread_
 void KProcess::Switch(KProcess* cur_process, KProcess* next_process) {}
 
 KProcess::KProcess(KernelCore& kernel)
-    : KAutoObjectWithSlabHeapAndContainer(kernel), m_page_table{kernel}, m_state_lock{kernel},
-      m_list_lock{kernel}, m_cond_var{kernel.System()}, m_address_arbiter{kernel.System()},
-      m_handle_table{kernel}, m_exclusive_monitor{}, m_memory{kernel.System()} {}
+    : KAutoObjectWithSlabHeapAndContainer(kernel) // initializers below follow the reordered member declaration order in k_process.h
+    , m_exclusive_monitor{}
+    , m_memory{kernel.System()}
+    , m_handle_table{kernel}
+    , m_page_table{kernel}
+    , m_state_lock{kernel}
+    , m_list_lock{kernel}
+    , m_cond_var{kernel.System()}
+    , m_address_arbiter{kernel.System()}
+{}
+
 KProcess::~KProcess() = default;
 
 Result KProcess::LoadFromMetadata(const FileSys::ProgramMetadata& metadata, std::size_t code_size,
diff --git a/src/core/hle/kernel/k_process.h b/src/core/hle/kernel/k_process.h
index 92ddb1aca4..13717cc090 100644
--- a/src/core/hle/kernel/k_process.h
+++ b/src/core/hle/kernel/k_process.h
@@ -66,60 +66,55 @@ public:
 
 private:
     using SharedMemoryInfoList = Common::IntrusiveListBaseTraits<KSharedMemoryInfo>::ListType;
-    using TLPTree =
-        Common::IntrusiveRedBlackTreeBaseTraits<KThreadLocalPage>::TreeType<KThreadLocalPage>;
+    using TLPTree = Common::IntrusiveRedBlackTreeBaseTraits<KThreadLocalPage>::TreeType<KThreadLocalPage>;
     using TLPIterator = TLPTree::iterator;
 
 private:
-    KProcessPageTable m_page_table;
-    std::atomic<size_t> m_used_kernel_memory_size{};
-    TLPTree m_fully_used_tlp_tree{};
-    TLPTree m_partially_used_tlp_tree{};
-    s32 m_ideal_core_id{};
-    KResourceLimit* m_resource_limit{};
-    KSystemResource* m_system_resource{};
-    size_t m_memory_release_hint{};
-    State m_state{};
-    KLightLock m_state_lock;
-    KLightLock m_list_lock;
-    KConditionVariable m_cond_var;
-    KAddressArbiter m_address_arbiter;
-    std::array<u64, 4> m_entropy{};
-    bool m_is_signaled{};
-    bool m_is_initialized{};
-    u32 m_pointer_buffer_size = 0x8000;  // Default pointer buffer size (can be game-specific later)
-    bool m_is_application{};
-    bool m_is_default_application_system_resource{};
-    bool m_is_hbl{};
-    std::array<char, 13> m_name{};
-    std::atomic<u16> m_num_running_threads{};
-    Svc::CreateProcessFlag m_flags{};
-    KMemoryManager::Pool m_memory_pool{};
-    s64 m_schedule_count{};
-    KCapabilities m_capabilities{};
-    u64 m_program_id{};
-    u64 m_process_id{};
-    KProcessAddress m_code_address{};
-    size_t m_code_size{};
-    size_t m_main_thread_stack_size{};
-    size_t m_max_process_memory{};
-    u32 m_version{};
-    KHandleTable m_handle_table;
-    KProcessAddress m_plr_address{};
-    KThread* m_exception_thread{};
-    ThreadList m_thread_list{};
-    SharedMemoryInfoList m_shared_memory_list{};
-    bool m_is_suspended{};
-    bool m_is_immortal{};
-    bool m_is_handle_table_initialized{};
-    std::array<std::unique_ptr<Core::ArmInterface>, Core::Hardware::NUM_CPU_CORES>
-        m_arm_interfaces{};
+    std::array<std::unique_ptr<Core::ArmInterface>, Core::Hardware::NUM_CPU_CORES> m_arm_interfaces{};
     std::array<KThread*, Core::Hardware::NUM_CPU_CORES> m_running_threads{};
     std::array<u64, Core::Hardware::NUM_CPU_CORES> m_running_thread_idle_counts{};
     std::array<u64, Core::Hardware::NUM_CPU_CORES> m_running_thread_switch_counts{};
     std::array<KThread*, Core::Hardware::NUM_CPU_CORES> m_pinned_threads{};
     std::array<DebugWatchpoint, Core::Hardware::NUM_WATCHPOINTS> m_watchpoints{};
     std::map<KProcessAddress, u64> m_debug_page_refcounts{};
+#ifdef HAS_NCE
+    std::unordered_map<u64, u64> m_post_handlers{};
+#endif
+    std::unique_ptr<Core::ExclusiveMonitor> m_exclusive_monitor;
+    Core::Memory::Memory m_memory;
+    KCapabilities m_capabilities{};
+    KProcessAddress m_code_address{};
+    KHandleTable m_handle_table;
+    KProcessAddress m_plr_address{};
+    ThreadList m_thread_list{};
+    SharedMemoryInfoList m_shared_memory_list{};
+    KProcessPageTable m_page_table;
+    std::atomic<size_t> m_used_kernel_memory_size{};
+    TLPTree m_fully_used_tlp_tree{};
+    TLPTree m_partially_used_tlp_tree{};
+    State m_state{};
+    KLightLock m_state_lock;
+    KLightLock m_list_lock;
+    KConditionVariable m_cond_var;
+    KAddressArbiter m_address_arbiter;
+    std::array<u64, 4> m_entropy{};
+    u32 m_pointer_buffer_size = 0x8000;  // Default pointer buffer size (can be game-specific later)
+    std::array<char, 13> m_name{};
+    Svc::CreateProcessFlag m_flags{};
+    KMemoryManager::Pool m_memory_pool{};
+
+    KResourceLimit* m_resource_limit{};
+    KSystemResource* m_system_resource{};
+    KThread* m_exception_thread{};
+
+    size_t m_code_size{};
+    size_t m_main_thread_stack_size{};
+    size_t m_max_process_memory{};
+    size_t m_memory_release_hint{};
+    s64 m_schedule_count{};
+    u64 m_program_id{};
+    u64 m_process_id{};
+
     std::atomic<s64> m_cpu_time{};
     std::atomic<s64> m_num_process_switches{};
     std::atomic<s64> m_num_thread_switches{};
@@ -128,11 +123,20 @@ private:
     std::atomic<s64> m_num_ipc_messages{};
     std::atomic<s64> m_num_ipc_replies{};
     std::atomic<s64> m_num_ipc_receives{};
-#ifdef HAS_NCE
-    std::unordered_map<u64, u64> m_post_handlers{};
-#endif
-    std::unique_ptr<Core::ExclusiveMonitor> m_exclusive_monitor;
-    Core::Memory::Memory m_memory;
+
+    s32 m_ideal_core_id{};
+    u32 m_version{};
+
+    std::atomic<u16> m_num_running_threads{};
+
+    bool m_is_signaled{}; // plain bools on purpose: adjacent bit-fields share one memory location, so flags guarded by different locks would race
+    bool m_is_initialized{};
+    bool m_is_application{};
+    bool m_is_default_application_system_resource{};
+    bool m_is_hbl{};
+    bool m_is_suspended{};
+    bool m_is_immortal{};
+    bool m_is_handle_table_initialized{};
 
 private:
     Result StartTermination();
diff --git a/src/core/hle/kernel/kernel.cpp b/src/core/hle/kernel/kernel.cpp
index 062387a29b..6986a98e35 100644
--- a/src/core/hle/kernel/kernel.cpp
+++ b/src/core/hle/kernel/kernel.cpp
@@ -88,11 +88,11 @@ struct KernelCore::Impl {
     }
 
     void Initialize(KernelCore& kernel) {
-        hardware_timer = std::make_unique<Kernel::KHardwareTimer>(kernel);
+        hardware_timer.emplace(kernel);
         hardware_timer->Initialize();
 
-        global_object_list_container = std::make_unique<KAutoObjectWithListContainer>(kernel);
-        global_scheduler_context = std::make_unique<Kernel::GlobalSchedulerContext>(kernel);
+        global_object_list_container.emplace(kernel);
+        global_scheduler_context.emplace(kernel);
 
         // Derive the initial memory layout from the emulated board
         Init::InitializeSlabResourceCounts(kernel);
@@ -212,10 +212,9 @@ struct KernelCore::Impl {
 
     void InitializePhysicalCores() {
         for (u32 i = 0; i < Core::Hardware::NUM_CPU_CORES; i++) {
-            const s32 core{static_cast<s32>(i)};
-
-            schedulers[i] = std::make_unique<Kernel::KScheduler>(system.Kernel());
-            cores[i] = std::make_unique<Kernel::PhysicalCore>(system.Kernel(), i);
+            auto const core = s32(i);
+            schedulers[i].emplace(system.Kernel());
+            cores[i].emplace(system.Kernel(), i);
 
             auto* main_thread{Kernel::KThread::Create(system.Kernel())};
             main_thread->SetCurrentCore(core);
@@ -280,57 +279,56 @@ struct KernelCore::Impl {
         size -= rc_size;
 
         // Initialize the resource managers' shared page manager.
-        resource_manager_page_manager = std::make_unique<KDynamicPageManager>();
+        resource_manager_page_manager.emplace();
         resource_manager_page_manager->Initialize(address, size, std::max<size_t>(PageSize, KPageBufferSlabHeap::BufferSize));
 
         // Initialize the KPageBuffer slab heap.
         page_buffer_slab_heap.Initialize(system);
 
         // Initialize the fixed-size slabheaps.
-        app_memory_block_heap = std::make_unique<KMemoryBlockSlabHeap>();
-        sys_memory_block_heap = std::make_unique<KMemoryBlockSlabHeap>();
-        block_info_heap = std::make_unique<KBlockInfoSlabHeap>();
-        app_memory_block_heap->Initialize(resource_manager_page_manager.get(), ApplicationMemoryBlockSlabHeapSize);
-        sys_memory_block_heap->Initialize(resource_manager_page_manager.get(), SystemMemoryBlockSlabHeapSize);
-        block_info_heap->Initialize(resource_manager_page_manager.get(), BlockInfoSlabHeapSize);
+        app_memory_block_heap.emplace();
+        sys_memory_block_heap.emplace();
+        block_info_heap.emplace();
+        app_memory_block_heap->Initialize(std::addressof(*resource_manager_page_manager), ApplicationMemoryBlockSlabHeapSize);
+        sys_memory_block_heap->Initialize(std::addressof(*resource_manager_page_manager), SystemMemoryBlockSlabHeapSize);
+        block_info_heap->Initialize(std::addressof(*resource_manager_page_manager), BlockInfoSlabHeapSize);
 
         // Reserve all but a fixed number of remaining pages for the page table heap.
         const size_t num_pt_pages = resource_manager_page_manager->GetCount() - resource_manager_page_manager->GetUsed() - ReservedDynamicPageCount;
-        page_table_heap = std::make_unique<KPageTableSlabHeap>();
+        page_table_heap.emplace();
 
         // TODO(bunnei): Pass in address once we support kernel virtual memory allocations.
         page_table_heap->Initialize(
-            resource_manager_page_manager.get(), num_pt_pages,
+            std::addressof(*resource_manager_page_manager), num_pt_pages,
             /*GetPointer<KPageTableManager::RefCount>(address + size)*/ nullptr);
 
         // Setup the slab managers.
         KDynamicPageManager* const app_dynamic_page_manager = nullptr;
         KDynamicPageManager* const sys_dynamic_page_manager =
             /*KTargetSystem::IsDynamicResourceLimitsEnabled()*/ true
-            ? resource_manager_page_manager.get()
-            : nullptr;
-        app_memory_block_manager = std::make_unique<KMemoryBlockSlabManager>();
-        sys_memory_block_manager = std::make_unique<KMemoryBlockSlabManager>();
-        app_block_info_manager = std::make_unique<KBlockInfoManager>();
-        sys_block_info_manager = std::make_unique<KBlockInfoManager>();
-        app_page_table_manager = std::make_unique<KPageTableManager>();
-        sys_page_table_manager = std::make_unique<KPageTableManager>();
+            ? std::addressof(*resource_manager_page_manager) : nullptr;
+        app_memory_block_manager.emplace();
+        sys_memory_block_manager.emplace();
+        app_block_info_manager.emplace();
+        sys_block_info_manager.emplace();
+        app_page_table_manager.emplace();
+        sys_page_table_manager.emplace();
 
-        app_memory_block_manager->Initialize(app_dynamic_page_manager, app_memory_block_heap.get());
-        sys_memory_block_manager->Initialize(sys_dynamic_page_manager, sys_memory_block_heap.get());
+        app_memory_block_manager->Initialize(app_dynamic_page_manager, std::addressof(*app_memory_block_heap));
+        sys_memory_block_manager->Initialize(sys_dynamic_page_manager, std::addressof(*sys_memory_block_heap));
 
-        app_block_info_manager->Initialize(app_dynamic_page_manager, block_info_heap.get());
-        sys_block_info_manager->Initialize(sys_dynamic_page_manager, block_info_heap.get());
+        app_block_info_manager->Initialize(app_dynamic_page_manager, std::addressof(*block_info_heap));
+        sys_block_info_manager->Initialize(sys_dynamic_page_manager, std::addressof(*block_info_heap));
 
-        app_page_table_manager->Initialize(app_dynamic_page_manager, page_table_heap.get());
-        sys_page_table_manager->Initialize(sys_dynamic_page_manager, page_table_heap.get());
+        app_page_table_manager->Initialize(app_dynamic_page_manager, std::addressof(*page_table_heap));
+        sys_page_table_manager->Initialize(sys_dynamic_page_manager, std::addressof(*page_table_heap));
 
         // Check that we have the correct number of dynamic pages available.
         ASSERT(resource_manager_page_manager->GetCount() - resource_manager_page_manager->GetUsed() == ReservedDynamicPageCount);
 
         // Create the system page table managers.
-        app_system_resource = std::make_unique<KSystemResource>(kernel);
-        sys_system_resource = std::make_unique<KSystemResource>(kernel);
+        app_system_resource.emplace(kernel);
+        sys_system_resource.emplace(kernel);
         KAutoObject::Create(std::addressof(*app_system_resource));
         KAutoObject::Create(std::addressof(*sys_system_resource));
 
@@ -349,7 +347,7 @@ struct KernelCore::Impl {
     }
 
     void InitializeGlobalData(KernelCore& kernel) {
-        object_name_global_data = std::make_unique<KObjectNameGlobalData>(kernel);
+        object_name_global_data.emplace(kernel); // constructs the KObjectNameGlobalData in place inside the optional member
     }
 
     void MakeApplicationProcess(KProcess* process) {
@@ -431,7 +429,7 @@ struct KernelCore::Impl {
     }
 
     void DeriveInitialMemoryLayout() {
-        memory_layout = std::make_unique<KMemoryLayout>();
+        memory_layout.emplace();
 
         // Insert the root region for the virtual memory tree, from which all other regions will
         // derive.
@@ -726,7 +724,7 @@ struct KernelCore::Impl {
 
     void InitializeMemoryLayout() {
         // Initialize the memory manager.
-        memory_manager = std::make_unique<KMemoryManager>(system);
+        memory_manager.emplace(system);
         const auto& management_region = memory_layout->GetPoolManagementRegion();
         ASSERT(management_region.GetEndAddress() != 0);
         memory_manager->Initialize(management_region.GetAddress(), management_region.GetSize());
@@ -774,8 +772,8 @@ struct KernelCore::Impl {
     std::mutex process_list_lock;
     std::vector<KProcess*> process_list;
     KProcess* application_process{};
-    std::unique_ptr<Kernel::GlobalSchedulerContext> global_scheduler_context;
-    std::unique_ptr<Kernel::KHardwareTimer> hardware_timer;
+    std::optional<Kernel::GlobalSchedulerContext> global_scheduler_context;
+    std::optional<Kernel::KHardwareTimer> hardware_timer;
 
     Init::KSlabResourceCounts slab_resource_counts{};
     KResourceLimit* system_resource_limit{};
@@ -784,9 +782,9 @@ struct KernelCore::Impl {
 
     std::shared_ptr<Core::Timing::EventType> preemption_event;
 
-    std::unique_ptr<KAutoObjectWithListContainer> global_object_list_container;
+    std::optional<KAutoObjectWithListContainer> global_object_list_container;
 
-    std::unique_ptr<KObjectNameGlobalData> object_name_global_data;
+    std::optional<KObjectNameGlobalData> object_name_global_data;
 
     std::unordered_set<KAutoObject*> registered_objects;
     std::unordered_set<KAutoObject*> registered_in_use_objects;
@@ -794,28 +792,28 @@ struct KernelCore::Impl {
     std::mutex server_lock;
     std::vector<std::unique_ptr<Service::ServerManager>> server_managers;
 
-    std::array<std::unique_ptr<Kernel::PhysicalCore>, Core::Hardware::NUM_CPU_CORES> cores;
+    std::array<std::optional<Kernel::PhysicalCore>, Core::Hardware::NUM_CPU_CORES> cores;
 
     // Next host thead ID to use, 0-3 IDs represent core threads, >3 represent others
     std::atomic<u32> next_host_thread_id{Core::Hardware::NUM_CPU_CORES};
 
     // Kernel memory management
-    std::unique_ptr<KMemoryManager> memory_manager;
+    std::optional<KMemoryManager> memory_manager;
 
     // Resource managers
-    std::unique_ptr<KDynamicPageManager> resource_manager_page_manager;
-    std::unique_ptr<KPageTableSlabHeap> page_table_heap;
-    std::unique_ptr<KMemoryBlockSlabHeap> app_memory_block_heap;
-    std::unique_ptr<KMemoryBlockSlabHeap> sys_memory_block_heap;
-    std::unique_ptr<KBlockInfoSlabHeap> block_info_heap;
-    std::unique_ptr<KPageTableManager> app_page_table_manager;
-    std::unique_ptr<KPageTableManager> sys_page_table_manager;
-    std::unique_ptr<KMemoryBlockSlabManager> app_memory_block_manager;
-    std::unique_ptr<KMemoryBlockSlabManager> sys_memory_block_manager;
-    std::unique_ptr<KBlockInfoManager> app_block_info_manager;
-    std::unique_ptr<KBlockInfoManager> sys_block_info_manager;
-    std::unique_ptr<KSystemResource> app_system_resource;
-    std::unique_ptr<KSystemResource> sys_system_resource;
+    std::optional<KDynamicPageManager> resource_manager_page_manager;
+    std::optional<KPageTableSlabHeap> page_table_heap;
+    std::optional<KMemoryBlockSlabHeap> app_memory_block_heap;
+    std::optional<KMemoryBlockSlabHeap> sys_memory_block_heap;
+    std::optional<KBlockInfoSlabHeap> block_info_heap;
+    std::optional<KPageTableManager> app_page_table_manager;
+    std::optional<KPageTableManager> sys_page_table_manager;
+    std::optional<KMemoryBlockSlabManager> app_memory_block_manager;
+    std::optional<KMemoryBlockSlabManager> sys_memory_block_manager;
+    std::optional<KBlockInfoManager> app_block_info_manager;
+    std::optional<KBlockInfoManager> sys_block_info_manager;
+    std::optional<KSystemResource> app_system_resource;
+    std::optional<KSystemResource> sys_system_resource;
 
     // Shared memory for services
     Kernel::KSharedMemory* hid_shared_mem{};
@@ -825,10 +823,10 @@ struct KernelCore::Impl {
     Kernel::KSharedMemory* hidbus_shared_mem{};
 
     // Memory layout
-    std::unique_ptr<KMemoryLayout> memory_layout;
+    std::optional<KMemoryLayout> memory_layout;
 
     std::array<KThread*, Core::Hardware::NUM_CPU_CORES> shutdown_threads{};
-    std::array<std::unique_ptr<Kernel::KScheduler>, Core::Hardware::NUM_CPU_CORES> schedulers{};
+    std::array<std::optional<Kernel::KScheduler>, Core::Hardware::NUM_CPU_CORES> schedulers{};
 
     bool is_multicore{};
     std::atomic_bool is_shutting_down{};
@@ -948,12 +946,9 @@ const Kernel::PhysicalCore& KernelCore::CurrentPhysicalCore() const {
 }
 
 Kernel::KScheduler* KernelCore::CurrentScheduler() {
-    const u32 core_id = impl->GetCurrentHostThreadID();
-    if (core_id >= Core::Hardware::NUM_CPU_CORES) {
-        // This is expected when called from not a guest thread
-        return {};
-    }
-    return impl->schedulers[core_id].get();
+    // Returns nullptr when called from a non-guest host thread (expected) or while that core's scheduler optional is empty.
+    auto const core_id = impl->GetCurrentHostThreadID();
+    return core_id < Core::Hardware::NUM_CPU_CORES && impl->schedulers[core_id] ? std::addressof(*impl->schedulers[core_id]) : nullptr;
 }
 
 Kernel::KHardwareTimer& KernelCore::HardwareTimer() {
diff --git a/src/core/hle/service/am/applet.h b/src/core/hle/service/am/applet.h
index 0763a5838e..a693a47d7a 100644
--- a/src/core/hle/service/am/applet.h
+++ b/src/core/hle/service/am/applet.h
@@ -95,9 +95,9 @@ struct Applet {
     bool request_exit_to_library_applet_at_execute_next_program_enabled{};
 
     // Channels
-    std::deque<std::vector<u8>> user_channel_launch_parameter{};
-    std::deque<std::vector<u8>> preselected_user_launch_parameter{};
-    std::deque<std::vector<u8>> friend_invitation_storage_channel{};
+    std::vector<std::vector<u8>> user_channel_launch_parameter{};
+    std::vector<std::vector<u8>> preselected_user_launch_parameter{};
+    std::vector<std::vector<u8>> friend_invitation_storage_channel{};
 
     // Context Stack
     std::stack<SharedPointer<IStorage>> context_stack{};
diff --git a/src/core/hle/service/ns/platform_service_manager.cpp b/src/core/hle/service/ns/platform_service_manager.cpp
index 293c014eae..ec9f64945d 100644
--- a/src/core/hle/service/ns/platform_service_manager.cpp
+++ b/src/core/hle/service/ns/platform_service_manager.cpp
@@ -7,6 +7,7 @@
 #include <algorithm>
 #include <cstring>
 #include <vector>
+#include <boost/container/static_vector.hpp>
 
 #include "common/assert.h"
 #include "common/common_types.h"
@@ -40,96 +41,51 @@ constexpr u32 EXPECTED_MAGIC{0x36f81a1e};  // What we expect the encrypted bfttf
 constexpr u64 SHARED_FONT_MEM_SIZE{0x1100000};
 constexpr FontRegion EMPTY_REGION{0, 0};
 
-static void DecryptSharedFont(const std::vector<u32>& input, Kernel::PhysicalMemory& output,
-                              std::size_t& offset) {
-    ASSERT_MSG(offset + (input.size() * sizeof(u32)) < SHARED_FONT_MEM_SIZE,
-               "Shared fonts exceeds 17mb!");
-    ASSERT_MSG(input[0] == EXPECTED_MAGIC, "Failed to derive key, unexpected magic number");
-
+static void DecryptSharedFont(const std::span<u32 const> input, std::span<u8> output, std::size_t& offset) { // writes the transformed font at output[offset...] and advances offset
+    ASSERT(offset + input.size_bytes() < SHARED_FONT_MEM_SIZE && offset + input.size_bytes() <= output.size() && "Shared fonts exceeds 17mb!"); // also bound by the real destination size
+    ASSERT(input.size() >= 2 && input[0] == EXPECTED_MAGIC && "Failed to derive key, unexpected magic number"); // words 0 and 1 are both accessed below
     const u32 KEY = input[0] ^ EXPECTED_RESULT; // Derive key using an inverse xor
     std::vector<u32> transformed_font(input.size());
     // TODO(ogniK): Figure out a better way to do this
-    std::transform(input.begin(), input.end(), transformed_font.begin(),
-                   [&KEY](u32 font_data) { return Common::swap32(font_data ^ KEY); });
+    std::transform(input.begin(), input.end(), transformed_font.begin(), [&KEY](u32 font_data) { return Common::swap32(font_data ^ KEY); });
     transformed_font[1] = Common::swap32(transformed_font[1]) ^ KEY; // "re-encrypt" the size
-    std::memcpy(output.data() + offset, transformed_font.data(),
-                transformed_font.size() * sizeof(u32));
+    std::memcpy(output.data() + offset, transformed_font.data(), transformed_font.size() * sizeof(u32));
     offset += transformed_font.size() * sizeof(u32);
 }
 
 void DecryptSharedFontToTTF(const std::vector<u32>& input, std::vector<u8>& output) {
     ASSERT_MSG(input[0] == EXPECTED_MAGIC, "Failed to derive key, unexpected magic number");
-
     if (input.size() < 2) {
         LOG_ERROR(Service_NS, "Input font is empty");
         return;
     }
-
     const u32 KEY = input[0] ^ EXPECTED_RESULT; // Derive key using an inverse xor
     std::vector<u32> transformed_font(input.size());
     // TODO(ogniK): Figure out a better way to do this
-    std::transform(input.begin(), input.end(), transformed_font.begin(),
-                   [&KEY](u32 font_data) { return Common::swap32(font_data ^ KEY); });
-    std::memcpy(output.data(), transformed_font.data() + 2,
-                (transformed_font.size() - 2) * sizeof(u32));
+    std::transform(input.begin(), input.end(), transformed_font.begin(), [&KEY](u32 font_data) { return Common::swap32(font_data ^ KEY); }); // NOTE(review): the ASSERT_MSG at the top reads input[0] before the size check runs
+    std::memcpy(output.data(), transformed_font.data() + 2, (transformed_font.size() - 2) * sizeof(u32)); // the first two words are skipped; only the remaining words are copied to output
 }
 
-void EncryptSharedFont(const std::vector<u32>& input, std::vector<u8>& output,
-                       std::size_t& offset) {
-    ASSERT_MSG(offset + (input.size() * sizeof(u32)) < SHARED_FONT_MEM_SIZE,
-               "Shared fonts exceeds 17mb!");
-
+void EncryptSharedFont(const std::vector<u32>& input, std::vector<u8>& output, std::size_t& offset) { // writes a 2-word header plus the xor'ed input at output[offset...] and advances offset
+    ASSERT(offset + ((input.size() + 2) * sizeof(u32)) < SHARED_FONT_MEM_SIZE && offset + ((input.size() + 2) * sizeof(u32)) <= output.size() && "Shared fonts exceeds 17mb!"); // count the 2 header words and the real destination size
     const auto key = Common::swap32(EXPECTED_RESULT ^ EXPECTED_MAGIC);
     std::vector<u32> transformed_font(input.size() + 2);
     transformed_font[0] = Common::swap32(EXPECTED_MAGIC);
     transformed_font[1] = Common::swap32(static_cast<u32>(input.size() * sizeof(u32))) ^ key;
-    std::transform(input.begin(), input.end(), transformed_font.begin() + 2,
-                   [key](u32 in) { return in ^ key; });
-    std::memcpy(output.data() + offset, transformed_font.data(),
-                transformed_font.size() * sizeof(u32));
+    std::transform(input.begin(), input.end(), transformed_font.begin() + 2, [key](u32 in) { return in ^ key; });
+    std::memcpy(output.data() + offset, transformed_font.data(), transformed_font.size() * sizeof(u32));
     offset += transformed_font.size() * sizeof(u32);
 }
 
-// Helper function to make BuildSharedFontsRawRegions a bit nicer
-static u32 GetU32Swapped(const u8* data) {
-    u32 value;
-    std::memcpy(&value, data, sizeof(value));
-    return Common::swap32(value);
-}
-
 struct IPlatformServiceManager::Impl {
     const FontRegion& GetSharedFontRegion(std::size_t index) const {
-        if (index >= shared_font_regions.size() || shared_font_regions.empty()) {
-            // No font fallback
-            return EMPTY_REGION;
-        }
-        return shared_font_regions.at(index);
+        return index < shared_font_regions.size() ? shared_font_regions[index] : EMPTY_REGION; // out-of-range index (including an empty list) -> EMPTY_REGION, i.e. no font fallback
     }
-
-    void BuildSharedFontsRawRegions(const Kernel::PhysicalMemory& input) {
-        // As we can derive the xor key we can just populate the offsets
-        // based on the shared memory dump
-        unsigned cur_offset = 0;
-
-        for (std::size_t i = 0; i < SHARED_FONTS.size(); i++) {
-            // Out of shared fonts/invalid font
-            if (GetU32Swapped(input.data() + cur_offset) != EXPECTED_RESULT) {
-                break;
-            }
-
-            // Derive key within inverse xor
-            const u32 KEY = GetU32Swapped(input.data() + cur_offset) ^ EXPECTED_MAGIC;
-            const u32 SIZE = GetU32Swapped(input.data() + cur_offset + 4) ^ KEY;
-            shared_font_regions.push_back(FontRegion{cur_offset + 8, SIZE});
-            cur_offset += SIZE + 8;
-        }
-    }
-
-    /// Backing memory for the shared font data
-    std::shared_ptr<Kernel::PhysicalMemory> shared_font;
-
     // Automatically populated based on shared_fonts dump or system archives.
-    std::vector<FontRegion> shared_font_regions;
+    // Fixed capacity of 8 regions: 6 builtin fonts + 2 spare slots; the container cannot grow beyond that.
+    boost::container::static_vector<FontRegion, 8> shared_font_regions;
+    /// Backing memory for the shared font data; value-initialised so bytes not covered by a font are always zero
+    std::array<u8, SHARED_FONT_MEM_SIZE> shared_font{};
 };
 
 IPlatformServiceManager::IPlatformServiceManager(Core::System& system_, const char* service_name_)
@@ -162,8 +118,6 @@ IPlatformServiceManager::IPlatformServiceManager(Core::System& system_, const ch
     const auto* nand = fsc.GetSystemNANDContents();
     std::size_t offset = 0;
     // Rebuild shared fonts from data ncas or synthesize
-
-    impl->shared_font = std::make_shared<Kernel::PhysicalMemory>(SHARED_FONT_MEM_SIZE);
     for (auto& font : SHARED_FONTS) {
         FileSys::VirtualFile romfs;
         const auto nca =
@@ -197,9 +151,8 @@ IPlatformServiceManager::IPlatformServiceManager(Core::System& system_, const ch
         std::transform(font_data_u32.begin(), font_data_u32.end(), font_data_u32.begin(),
                        Common::swap32);
         // Font offset and size do not account for the header
-        const FontRegion region{static_cast<u32>(offset + 8),
-                                static_cast<u32>((font_data_u32.size() * sizeof(u32)) - 8)};
-        DecryptSharedFont(font_data_u32, *impl->shared_font, offset);
+        const FontRegion region{u32(offset + 8), u32((font_data_u32.size() * sizeof(u32)) - 8)};
+        DecryptSharedFont(font_data_u32, impl->shared_font, offset);
         impl->shared_font_regions.push_back(region);
     }
 }
@@ -231,14 +184,12 @@ Result IPlatformServiceManager::GetSharedMemoryAddressOffset(Out<u32> out_shared
     R_SUCCEED();
 }
 
-Result IPlatformServiceManager::GetSharedMemoryNativeHandle(
-    OutCopyHandle<Kernel::KSharedMemory> out_shared_memory_native_handle) {
+Result IPlatformServiceManager::GetSharedMemoryNativeHandle(OutCopyHandle<Kernel::KSharedMemory> out_shared_memory_native_handle) {
     // Map backing memory for the font data
     LOG_DEBUG(Service_NS, "called");
 
     // Create shared font memory object
-    std::memcpy(kernel.GetFontSharedMem().GetPointer(), impl->shared_font->data(),
-                impl->shared_font->size());
+    std::memcpy(kernel.GetFontSharedMem().GetPointer(), impl->shared_font.data(), impl->shared_font.size());
 
     // FIXME: this shouldn't belong to the kernel
     *out_shared_memory_native_handle = &kernel.GetFontSharedMem();
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index 94599532b3..c94b66e6bc 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -87,12 +87,8 @@ add_library(video_core STATIC
     host1x/syncpoint_manager.h
     host1x/vic.cpp
     host1x/vic.h
-    macro/macro.cpp
-    macro/macro.h
-    macro/macro_hle.cpp
-    macro/macro_hle.h
-    macro/macro_interpreter.cpp
-    macro/macro_interpreter.h
+    macro.cpp
+    macro.h
     fence_manager.h
     gpu.cpp
     gpu.h
@@ -375,10 +371,6 @@ else()
 endif()
 
 if (ARCHITECTURE_x86_64)
-    target_sources(video_core PRIVATE
-        macro/macro_jit_x64.cpp
-        macro/macro_jit_x64.h
-    )
     target_link_libraries(video_core PUBLIC xbyak::xbyak)
 endif()
 
diff --git a/src/video_core/dma_pusher.cpp b/src/video_core/dma_pusher.cpp
index a67b35453b..03b2e3fdf9 100644
--- a/src/video_core/dma_pusher.cpp
+++ b/src/video_core/dma_pusher.cpp
@@ -107,35 +107,27 @@ bool DmaPusher::Step() {
 }
 
 void DmaPusher::ProcessCommands(std::span<const CommandHeader> commands) {
-    for (std::size_t index = 0; index < commands.size();) {
-        const CommandHeader& command_header = commands[index];
-
-        if (dma_state.method_count) {
-            // Data word of methods command
-            dma_state.dma_word_offset = static_cast<u32>(index * sizeof(u32));
-            if (dma_state.non_incrementing) {
-                const u32 max_write = static_cast<u32>(
-                    std::min<std::size_t>(index + dma_state.method_count, commands.size()) - index);
-                CallMultiMethod(&command_header.argument, max_write);
-                dma_state.method_count -= max_write;
-                dma_state.is_last_call = true;
-                index += max_write;
-                continue;
-            } else {
-                dma_state.is_last_call = dma_state.method_count <= 1;
-                CallMethod(command_header.argument);
-            }
-
-            if (!dma_state.non_incrementing) {
-                dma_state.method++;
-            }
-
-            if (dma_increment_once) {
-                dma_state.non_incrementing = true;
-            }
-
+    for (size_t index = 0; index < commands.size();) {
+        // Data word of methods command
+        if (dma_state.method_count && dma_state.non_incrementing) {
+            auto const& command_header = commands[index]; // must stay a reference: CallMultiMethod reads max_write words starting at &command_header.argument
+            dma_state.dma_word_offset = u32(index * sizeof(u32));
+            const u32 max_write = u32(std::min<std::size_t>(index + dma_state.method_count, commands.size()) - index);
+            CallMultiMethod(&command_header.argument, max_write);
+            dma_state.method_count -= max_write;
+            dma_state.is_last_call = true;
+            index += max_write;
+        } else if (dma_state.method_count) {
+            auto const command_header = commands[index]; //can copy
+            dma_state.dma_word_offset = u32(index * sizeof(u32));
+            dma_state.is_last_call = dma_state.method_count <= 1;
+            CallMethod(command_header.argument);
+            dma_state.method += !dma_state.non_incrementing ? 1 : 0;
+            dma_state.non_incrementing |= dma_increment_once;
             dma_state.method_count--;
+            index++;
         } else {
+            auto const command_header = commands[index]; //can copy
             // No command active - this is the first word of a new one
             switch (command_header.mode) {
             case SubmissionMode::Increasing:
@@ -151,8 +143,7 @@ void DmaPusher::ProcessCommands(std::span<const CommandHeader> commands) {
             case SubmissionMode::Inline:
                 dma_state.method = command_header.method;
                 dma_state.subchannel = command_header.subchannel;
-                dma_state.dma_word_offset = static_cast<u64>(
-                    -static_cast<s64>(dma_state.dma_get)); // negate to set address as 0
+                dma_state.dma_word_offset = u64(-s64(dma_state.dma_get)); // negate to set address as 0
                 CallMethod(command_header.arg_count);
                 dma_state.non_incrementing = true;
                 dma_increment_once = false;
@@ -165,8 +156,8 @@ void DmaPusher::ProcessCommands(std::span<const CommandHeader> commands) {
             default:
                 break;
             }
+            index++;
         }
-        index++;
     }
 }
 
@@ -186,26 +177,24 @@ void DmaPusher::CallMethod(u32 argument) const {
         });
     } else {
         auto subchannel = subchannels[dma_state.subchannel];
-        if (!subchannel->execution_mask[dma_state.method]) [[likely]] {
+        if (!subchannel->execution_mask[dma_state.method]) {
             subchannel->method_sink.emplace_back(dma_state.method, argument);
-            return;
+        } else {
+            subchannel->ConsumeSink();
+            subchannel->current_dma_segment = dma_state.dma_get + dma_state.dma_word_offset;
+            subchannel->CallMethod(dma_state.method, argument, dma_state.is_last_call);
         }
-        subchannel->ConsumeSink();
-        subchannel->current_dma_segment = dma_state.dma_get + dma_state.dma_word_offset;
-        subchannel->CallMethod(dma_state.method, argument, dma_state.is_last_call);
     }
 }
 
 void DmaPusher::CallMultiMethod(const u32* base_start, u32 num_methods) const {
     if (dma_state.method < non_puller_methods) {
-        puller.CallMultiMethod(dma_state.method, dma_state.subchannel, base_start, num_methods,
-                               dma_state.method_count);
+        puller.CallMultiMethod(dma_state.method, dma_state.subchannel, base_start, num_methods, dma_state.method_count); // methods below non_puller_methods go to the puller
     } else {
         auto subchannel = subchannels[dma_state.subchannel];
         subchannel->ConsumeSink();
         subchannel->current_dma_segment = dma_state.dma_get + dma_state.dma_word_offset;
-        subchannel->CallMultiMethod(dma_state.method, base_start, num_methods,
-                                    dma_state.method_count);
+        subchannel->CallMultiMethod(dma_state.method, base_start, num_methods, dma_state.method_count); // every other method is forwarded to the selected subchannel, after its sink was consumed above
     }
 }
 
diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp
index 77729fd5b6..d8d2ad74c6 100644
--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@@ -27,9 +27,7 @@ constexpr u32 MacroRegistersStart = 0xE00;
 
 Maxwell3D::Maxwell3D(Core::System& system_, MemoryManager& memory_manager_)
     : draw_manager{std::make_unique<DrawManager>(this)}, system{system_},
-      memory_manager{memory_manager_}, macro_engine{GetMacroEngine(*this)}, upload_state{
-                                                                                memory_manager,
-                                                                                regs.upload} {
+      memory_manager{memory_manager_}, macro_engine{GetMacroEngine(*this)}, upload_state{memory_manager, regs.upload} {
     dirty.flags.flip();
     InitializeRegisterDefaults();
     execution_mask.reset();
@@ -329,8 +327,7 @@ void Maxwell3D::ProcessDirtyRegisters(u32 method, u32 argument) {
     }
 }
 
-void Maxwell3D::ProcessMethodCall(u32 method, u32 argument, u32 nonshadow_argument,
-                                  bool is_last_call) {
+void Maxwell3D::ProcessMethodCall(u32 method, u32 argument, u32 nonshadow_argument, bool is_last_call) {
     switch (method) {
     case MAXWELL3D_REG_INDEX(wait_for_idle):
         return rasterizer->WaitForIdle();
@@ -427,9 +424,7 @@ void Maxwell3D::CallMethod(u32 method, u32 method_argument, bool is_last_call) {
         return;
     }
 
-    ASSERT_MSG(method < Regs::NUM_REGS,
-               "Invalid Maxwell3D register, increase the size of the Regs structure");
-
+    ASSERT(method < Regs::NUM_REGS && "Invalid Maxwell3D register, increase the size of the Regs structure");
     const u32 argument = ProcessShadowRam(method, method_argument);
     ProcessDirtyRegisters(method, argument);
     ProcessMethodCall(method, argument, method_argument, is_last_call);
@@ -670,7 +665,7 @@ Texture::TSCEntry Maxwell3D::GetTSCEntry(u32 tsc_index) const {
 }
 
 u32 Maxwell3D::GetRegisterValue(u32 method) const {
-    ASSERT_MSG(method < Regs::NUM_REGS, "Invalid Maxwell3D register");
+    ASSERT(method < Regs::NUM_REGS && "Invalid Maxwell3D register");
     return regs.reg_array[method];
 }
 
diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h
index ae2e7a84c4..8c50a4ea2f 100644
--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
@@ -23,7 +23,7 @@
 #include "video_core/engines/engine_interface.h"
 #include "video_core/engines/engine_upload.h"
 #include "video_core/gpu.h"
-#include "video_core/macro/macro.h"
+#include "video_core/macro.h"
 #include "video_core/textures/texture.h"
 
 namespace Core {
@@ -3203,7 +3203,7 @@ private:
     std::vector<u32> macro_params;
 
     /// Interpreter for the macro codes uploaded to the GPU.
-    std::unique_ptr<MacroEngine> macro_engine;
+    std::optional<MacroEngine> macro_engine;
 
     Upload::State upload_state;
 
diff --git a/src/video_core/macro.cpp b/src/video_core/macro.cpp
new file mode 100644
index 0000000000..3fe69be4dd
--- /dev/null
+++ b/src/video_core/macro.cpp
@@ -0,0 +1,1667 @@
+// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <cstring>
+#include <fstream>
+#include <optional>
+#include <span>
+
+#include <fstream>
+#ifdef ARCHITECTURE_x86_64
+// xbyak's headers trip -Wconversion and -Wshadow, so both are ignored from here on (no push/pop, so this is not restored)
+#ifdef __GNUC__
+#pragma GCC diagnostic ignored "-Wconversion"
+#pragma GCC diagnostic ignored "-Wshadow"
+#endif
+#ifdef __clang__
+#pragma clang diagnostic ignored "-Wconversion"
+#pragma clang diagnostic ignored "-Wshadow"
+#endif
+#include <xbyak/xbyak.h>
+#endif
+
+#include "common/assert.h"
+#include "common/scope_exit.h"
+#include "common/fs/fs.h"
+#include "common/fs/path_util.h"
+#include "common/settings.h"
+#include "common/container_hash.h"
+#include "video_core/engines/maxwell_3d.h"
+#include "video_core/engines/draw_manager.h"
+#include "video_core/dirty_flags.h"
+#include "video_core/rasterizer_interface.h"
+#include "video_core/macro.h"
+
+#include "common/assert.h"
+#include "common/bit_field.h"
+#include "common/logging/log.h"
+#ifdef ARCHITECTURE_x86_64
+#include "common/x64/xbyak_abi.h"
+#include "common/x64/xbyak_util.h"
+#endif
+#include "video_core/engines/maxwell_3d.h"
+
+namespace Tegra {
+
+using Maxwell3D = Engines::Maxwell3D;
+
+namespace {
+
+/// Returns true for the primitive topologies listed below and false for every
+/// other value (Quads, QuadStrip, Polygon and anything unrecognised).
+bool IsTopologySafe(Maxwell3D::Regs::PrimitiveTopology topology) {
+    using Topology = Maxwell3D::Regs::PrimitiveTopology;
+    switch (topology) {
+    case Topology::Points:
+    case Topology::Lines:
+    case Topology::LineLoop:
+    case Topology::LineStrip:
+    case Topology::Triangles:
+    case Topology::TriangleStrip:
+    case Topology::TriangleFan:
+    case Topology::LinesAdjacency:
+    case Topology::LineStripAdjacency:
+    case Topology::TrianglesAdjacency:
+    case Topology::TriangleStripAdjacency:
+    case Topology::Patches:
+        return true;
+    default:
+        return false;
+    }
+}
+
+/// Base class shared by the HLE macro implementations in this file.
+/// Adds nothing of its own; the constructor only forwards the engine reference to CachedMacro.
+class HLEMacroImpl : public CachedMacro {
+public:
+    explicit HLEMacroImpl(Maxwell3D& maxwell3d_) : CachedMacro(maxwell3d_) {}
+};
+
+/// HLE implementation of an indirect array draw. @note: there is a normal and an extended version;
+/// the extended one (extended == true) additionally installs the BaseInstance replacement attribute.
+template <bool extended>
+class HLE_DrawArraysIndirect final : public HLEMacroImpl {
+public:
+    explicit HLE_DrawArraysIndirect(Maxwell3D& maxwell3d_)
+        : HLEMacroImpl(maxwell3d_)
+    {}
+
+    void Execute(const std::vector<u32>& parameters, [[maybe_unused]] u32 method) override {
+        auto topology = static_cast<Maxwell3D::Regs::PrimitiveTopology>(parameters[0]); // parameters[0] carries the primitive topology
+        if (!maxwell3d.AnyParametersDirty() || !IsTopologySafe(topology)) { // no dirty parameters or an unsafe topology: draw directly from the parameter values instead
+            Fallback(parameters);
+            return;
+        }
+
+        auto& params = maxwell3d.draw_manager->GetIndirectParams(); // fill in the draw manager's indirect-draw description
+        params.is_byte_count = false;
+        params.is_indexed = false;
+        params.include_count = false;
+        params.count_start_address = 0;
+        params.indirect_start_address = maxwell3d.GetMacroAddress(1); // presumably the GPU address backing macro parameter 1 — TODO confirm
+        params.buffer_size = 4 * sizeof(u32);
+        params.max_draw_counts = 1;
+        params.stride = 0;
+
+        if (extended) {
+            maxwell3d.engine_state = Maxwell3D::EngineHint::OnHLEMacro;
+            maxwell3d.SetHLEReplacementAttributeType(0, 0x640, Maxwell3D::HLEReplacementAttributeType::BaseInstance);
+        }
+
+        maxwell3d.draw_manager->DrawArrayIndirect(topology);
+
+        if (extended) { // undo the extended-only state set before the draw
+            maxwell3d.engine_state = Maxwell3D::EngineHint::None;
+            maxwell3d.replace_table.clear();
+        }
+    }
+
+private:
+    void Fallback(const std::vector<u32>& parameters) { // non-indirect path: issues DrawArray with values read from `parameters`
+        SCOPE_EXIT { // runs on every exit from Fallback, including the early return below
+            if (extended) {
+                maxwell3d.engine_state = Maxwell3D::EngineHint::None;
+                maxwell3d.replace_table.clear();
+            }
+        };
+        maxwell3d.RefreshParameters();
+        const u32 instance_count = (maxwell3d.GetRegisterValue(0xD1B) & parameters[2]); // parameters[2] masked with the value of register 0xD1B
+
+        auto topology = static_cast<Maxwell3D::Regs::PrimitiveTopology>(parameters[0]);
+        const u32 vertex_first = parameters[3];
+        const u32 vertex_count = parameters[1];
+
+        if (!IsTopologySafe(topology) && size_t(maxwell3d.GetMaxCurrentVertices()) < size_t(vertex_first) + size_t(vertex_count)) { // NOTE(review): the range check only applies to unsafe topologies — confirm && is intended
+            ASSERT(false && "Faulty draw!");
+            return;
+        }
+
+        const u32 base_instance = parameters[4]; // read for both variants, so parameters must hold at least 5 words
+        if (extended) {
+            maxwell3d.regs.global_base_instance_index = base_instance;
+            maxwell3d.engine_state = Maxwell3D::EngineHint::OnHLEMacro;
+            maxwell3d.SetHLEReplacementAttributeType(
+                0, 0x640, Maxwell3D::HLEReplacementAttributeType::BaseInstance);
+        }
+
+        maxwell3d.draw_manager->DrawArray(topology, vertex_first, vertex_count, base_instance,
+                                          instance_count);
+
+        if (extended) { // resets the base-instance register; the engine_state/replace_table reset is repeated by SCOPE_EXIT
+            maxwell3d.regs.global_base_instance_index = 0;
+            maxwell3d.engine_state = Maxwell3D::EngineHint::None;
+            maxwell3d.replace_table.clear();
+        }
+    }
+};
+
+/*
+ * @note: these macros have two versions, a normal and an extended version; the extended version
+ * additionally publishes base vertex/instance as HLE replacement attributes (0x640 / 0x644).
+ */
+template <bool extended>
+class HLE_DrawIndexedIndirect final : public HLEMacroImpl {
+public:
+    explicit HLE_DrawIndexedIndirect(Maxwell3D& maxwell3d_) : HLEMacroImpl(maxwell3d_) {}
+
+    void Execute(const std::vector<u32>& parameters, [[maybe_unused]] u32 method) override {
+        auto topology = static_cast<Maxwell3D::Regs::PrimitiveTopology>(parameters[0]);
+        if (!maxwell3d.AnyParametersDirty() || !IsTopologySafe(topology)) { // direct-draw path
+            Fallback(parameters);
+            return;
+        }
+
+        const u32 estimate = static_cast<u32>(maxwell3d.EstimateIndexBufferSize());
+        const u32 element_base = parameters[4];
+        const u32 base_instance = parameters[5];
+        maxwell3d.regs.vertex_id_base = element_base;
+        maxwell3d.regs.global_base_vertex_index = element_base;
+        maxwell3d.regs.global_base_instance_index = base_instance;
+        maxwell3d.dirty.flags[VideoCommon::Dirty::IndexBuffer] = true;
+        if (extended) {
+            maxwell3d.engine_state = Maxwell3D::EngineHint::OnHLEMacro;
+            maxwell3d.SetHLEReplacementAttributeType(0, 0x640, Maxwell3D::HLEReplacementAttributeType::BaseVertex);
+            maxwell3d.SetHLEReplacementAttributeType(0, 0x644, Maxwell3D::HLEReplacementAttributeType::BaseInstance);
+        }
+        auto& params = maxwell3d.draw_manager->GetIndirectParams();
+        params.is_byte_count = false;
+        params.is_indexed = true;
+        params.include_count = false;
+        params.count_start_address = 0;
+        params.indirect_start_address = maxwell3d.GetMacroAddress(1); // address of macro parameter 1
+        params.buffer_size = 5 * sizeof(u32); // five u32 words of indirect data
+        params.max_draw_counts = 1;
+        params.stride = 0;
+        maxwell3d.dirty.flags[VideoCommon::Dirty::IndexBuffer] = true; // same flag as set above
+        maxwell3d.draw_manager->DrawIndexedIndirect(topology, 0, estimate);
+        maxwell3d.regs.vertex_id_base = 0x0; // base registers go back to zero after the draw
+        maxwell3d.regs.global_base_vertex_index = 0x0;
+        maxwell3d.regs.global_base_instance_index = 0x0;
+        if (extended) {
+            maxwell3d.engine_state = Maxwell3D::EngineHint::None;
+            maxwell3d.replace_table.clear();
+        }
+    }
+
+private:
+    void Fallback(const std::vector<u32>& parameters) { // non-indirect variant of Execute()
+        maxwell3d.RefreshParameters();
+        const u32 instance_count = (maxwell3d.GetRegisterValue(0xD1B) & parameters[2]); // masked by reg 0xD1B
+        const u32 element_base = parameters[4];
+        const u32 base_instance = parameters[5];
+        maxwell3d.regs.vertex_id_base = element_base;
+        maxwell3d.regs.global_base_vertex_index = element_base;
+        maxwell3d.regs.global_base_instance_index = base_instance;
+        maxwell3d.dirty.flags[VideoCommon::Dirty::IndexBuffer] = true;
+        if (extended) {
+            maxwell3d.engine_state = Maxwell3D::EngineHint::OnHLEMacro;
+            maxwell3d.SetHLEReplacementAttributeType(0, 0x640, Maxwell3D::HLEReplacementAttributeType::BaseVertex);
+            maxwell3d.SetHLEReplacementAttributeType(0, 0x644, Maxwell3D::HLEReplacementAttributeType::BaseInstance);
+        }
+
+        maxwell3d.draw_manager->DrawIndex(Tegra::Maxwell3D::Regs::PrimitiveTopology(parameters[0]), parameters[3], parameters[1], element_base, base_instance, instance_count);
+
+        maxwell3d.regs.vertex_id_base = 0x0; // same cleanup as the indirect path
+        maxwell3d.regs.global_base_vertex_index = 0x0;
+        maxwell3d.regs.global_base_instance_index = 0x0;
+        if (extended) {
+            maxwell3d.engine_state = Maxwell3D::EngineHint::None;
+            maxwell3d.replace_table.clear();
+        }
+    }
+};
+
+class HLE_MultiLayerClear final : public HLEMacroImpl {
+public:
+    explicit HLE_MultiLayerClear(Maxwell3D& maxwell3d_) : HLEMacroImpl(maxwell3d_) {}
+
+    void Execute(const std::vector<u32>& parameters, [[maybe_unused]] u32 method) override {
+        maxwell3d.RefreshParameters();
+        ASSERT(parameters.size() == 1);
+        // The single argument is a raw ClearSurface word that must address layer 0.
+        const Maxwell3D::Regs::ClearSurface surface{parameters[0]};
+        ASSERT(surface.layer == 0);
+        const u32 target = surface.RT;
+        const u32 layer_count = maxwell3d.regs.rt[target].depth;
+
+        maxwell3d.regs.clear_surface.raw = surface.raw;
+        maxwell3d.draw_manager->Clear(layer_count);
+    }
+};
+
+class HLE_MultiDrawIndexedIndirectCount final : public HLEMacroImpl {
+public:
+    explicit HLE_MultiDrawIndexedIndirectCount(Maxwell3D& maxwell3d_) : HLEMacroImpl(maxwell3d_) {}
+
+    void Execute(const std::vector<u32>& parameters, [[maybe_unused]] u32 method) override {
+        const auto topology = Maxwell3D::Regs::PrimitiveTopology(parameters[2]);
+        if (!IsTopologySafe(topology)) { // unsafe topologies are drawn one by one instead
+            Fallback(parameters);
+            return;
+        }
+
+        const u32 start_indirect = parameters[0];
+        const u32 end_indirect = parameters[1];
+        if (start_indirect >= end_indirect) {
+            // Nothing to do.
+            return;
+        }
+
+        const u32 padding = parameters[3]; // padding is in words
+
+        // size of each indirect segment: 5 command words plus the padding words
+        const u32 indirect_words = 5 + padding;
+        const u32 stride = indirect_words * sizeof(u32);
+        const std::size_t draw_count = end_indirect - start_indirect;
+        const u32 estimate = static_cast<u32>(maxwell3d.EstimateIndexBufferSize());
+        maxwell3d.dirty.flags[VideoCommon::Dirty::IndexBuffer] = true;
+        auto& params = maxwell3d.draw_manager->GetIndirectParams();
+        params.is_byte_count = false;
+        params.is_indexed = true;
+        params.include_count = true;
+        params.count_start_address = maxwell3d.GetMacroAddress(4); // address of macro parameter 4
+        params.indirect_start_address = maxwell3d.GetMacroAddress(5); // address of macro parameter 5
+        params.buffer_size = stride * draw_count;
+        params.max_draw_counts = draw_count;
+        params.stride = stride;
+        maxwell3d.dirty.flags[VideoCommon::Dirty::IndexBuffer] = true; // same flag as set above
+        maxwell3d.engine_state = Maxwell3D::EngineHint::OnHLEMacro;
+        maxwell3d.SetHLEReplacementAttributeType(
+            0, 0x640, Maxwell3D::HLEReplacementAttributeType::BaseVertex);
+        maxwell3d.SetHLEReplacementAttributeType(
+            0, 0x644, Maxwell3D::HLEReplacementAttributeType::BaseInstance);
+        maxwell3d.SetHLEReplacementAttributeType(0, 0x648,
+                                                 Maxwell3D::HLEReplacementAttributeType::DrawID);
+        maxwell3d.draw_manager->DrawIndexedIndirect(topology, 0, estimate);
+        maxwell3d.engine_state = Maxwell3D::EngineHint::None;
+        maxwell3d.replace_table.clear();
+    }
+
+private:
+    void Fallback(const std::vector<u32>& parameters) {
+        SCOPE_EXIT {
+            // Clean everything on every way out of this function.
+            maxwell3d.regs.vertex_id_base = 0x0;
+            maxwell3d.engine_state = Maxwell3D::EngineHint::None;
+            maxwell3d.replace_table.clear();
+        };
+        maxwell3d.RefreshParameters();
+        const u32 start_indirect = parameters[0];
+        const u32 end_indirect = parameters[1];
+        if (start_indirect >= end_indirect) {
+            // Nothing to do.
+            return;
+        }
+        const auto topology = static_cast<Maxwell3D::Regs::PrimitiveTopology>(parameters[2]);
+        const u32 padding = parameters[3];
+        const std::size_t max_draws = parameters[4];
+
+        const u32 indirect_words = 5 + padding;
+        const std::size_t first_draw = start_indirect;
+        const std::size_t effective_draws = end_indirect - start_indirect;
+        const std::size_t last_draw = start_indirect + (std::min)(effective_draws, max_draws);
+
+        for (std::size_t index = first_draw; index < last_draw; index++) {
+            const std::size_t base = index * indirect_words + 5; // records follow the 5 header parameters
+            const u32 base_vertex = parameters[base + 3];
+            const u32 base_instance = parameters[base + 4];
+            maxwell3d.regs.vertex_id_base = base_vertex;
+            maxwell3d.engine_state = Maxwell3D::EngineHint::OnHLEMacro;
+            maxwell3d.SetHLEReplacementAttributeType(
+                0, 0x640, Maxwell3D::HLEReplacementAttributeType::BaseVertex);
+            maxwell3d.SetHLEReplacementAttributeType(
+                0, 0x644, Maxwell3D::HLEReplacementAttributeType::BaseInstance);
+            maxwell3d.CallMethod(0x8e3, 0x648, true); // NOTE(review): presumably selects slot 0x648 (DrawID above) — confirm
+            maxwell3d.CallMethod(0x8e4, static_cast<u32>(index), true); // NOTE(review): presumably stores the draw index — confirm
+            maxwell3d.dirty.flags[VideoCommon::Dirty::IndexBuffer] = true;
+            maxwell3d.draw_manager->DrawIndex(topology, parameters[base + 2], parameters[base],
+                                              base_vertex, base_instance, parameters[base + 1]);
+        }
+    }
+};
+
+class HLE_DrawIndirectByteCount final : public HLEMacroImpl {
+public:
+    explicit HLE_DrawIndirectByteCount(Maxwell3D& maxwell3d_) : HLEMacroImpl(maxwell3d_) {}
+
+    void Execute(const std::vector<u32>& parameters, [[maybe_unused]] u32 method) override {
+        const bool force = maxwell3d.Rasterizer().HasDrawTransformFeedback(); // forces the indirect path
+
+        auto topology = static_cast<Maxwell3D::Regs::PrimitiveTopology>(parameters[0] & 0xFFFFU); // low 16 bits
+        if (!force && (!maxwell3d.AnyParametersDirty() || !IsTopologySafe(topology))) {
+            Fallback(parameters);
+            return;
+        }
+        auto& params = maxwell3d.draw_manager->GetIndirectParams();
+        params.is_byte_count = true;
+        params.is_indexed = false;
+        params.include_count = false;
+        params.count_start_address = 0;
+        params.indirect_start_address = maxwell3d.GetMacroAddress(2); // address of macro parameter 2
+        params.buffer_size = 4;
+        params.max_draw_counts = 1;
+        params.stride = parameters[1];
+        maxwell3d.regs.draw.begin = parameters[0];
+        maxwell3d.regs.draw_auto_stride = parameters[1];
+        maxwell3d.regs.draw_auto_byte_count = parameters[2];
+
+        maxwell3d.draw_manager->DrawArrayIndirect(topology);
+    }
+
+private:
+    void Fallback(const std::vector<u32>& parameters) { // direct draw: vertex count = byte count / stride
+        maxwell3d.RefreshParameters();
+
+        maxwell3d.regs.draw.begin = parameters[0];
+        maxwell3d.regs.draw_auto_stride = parameters[1];
+        maxwell3d.regs.draw_auto_byte_count = parameters[2];
+
+        maxwell3d.draw_manager->DrawArray(
+            maxwell3d.regs.draw.topology, 0,
+            maxwell3d.regs.draw_auto_byte_count / maxwell3d.regs.draw_auto_stride, 0, 1); // NOTE(review): zero stride is not guarded
+    }
+};
+
+class HLE_C713C83D8F63CCF3 final : public HLEMacroImpl {
+public:
+    explicit HLE_C713C83D8F63CCF3(Maxwell3D& maxwell3d_) : HLEMacroImpl(maxwell3d_) {}
+
+    void Execute(const std::vector<u32>& parameters, [[maybe_unused]] u32 method) override {
+        maxwell3d.RefreshParameters();
+        auto& cb = maxwell3d.regs.const_buffer;
+        const u32 scratch = maxwell3d.regs.shadow_scratch[24];
+        cb.size = 0x7000;
+        cb.address_high = (scratch >> 24) & 0xFF;
+        cb.address_low = scratch << 8;
+        // The low 30 bits of the first argument, scaled by 4, give the buffer offset.
+        cb.offset = (parameters[0] & 0x3FFFFFFF) << 2;
+    }
+};
+
+class HLE_D7333D26E0A93EDE final : public HLEMacroImpl {
+public:
+    explicit HLE_D7333D26E0A93EDE(Maxwell3D& maxwell3d_) : HLEMacroImpl(maxwell3d_) {}
+
+    void Execute(const std::vector<u32>& parameters, [[maybe_unused]] u32 method) override {
+        maxwell3d.RefreshParameters();
+        const size_t slot = parameters[0]; // selects the scratch pair (42 + slot, 47 + slot)
+        const auto& scratch = maxwell3d.regs.shadow_scratch;
+        const u32 packed_address = scratch[42 + slot];
+        auto& cb = maxwell3d.regs.const_buffer;
+        cb.size = scratch[47 + slot];
+        cb.address_high = (packed_address >> 24) & 0xFF;
+        cb.address_low = packed_address << 8;
+    }
+};
+
+class HLE_BindShader final : public HLEMacroImpl {
+public:
+    explicit HLE_BindShader(Maxwell3D& maxwell3d_) : HLEMacroImpl(maxwell3d_) {}
+
+    void Execute(const std::vector<u32>& parameters, [[maybe_unused]] u32 method) override {
+        maxwell3d.RefreshParameters();
+        auto& regs = maxwell3d.regs;
+        const u32 index = parameters[0];
+        if ((parameters[1] - regs.shadow_scratch[28 + index]) == 0) { // unchanged since last call
+            return;
+        }
+
+        regs.pipelines[index & 0xF].offset = parameters[2];
+        maxwell3d.dirty.flags[VideoCommon::Dirty::Shaders] = true;
+        regs.shadow_scratch[28 + index] = parameters[1]; // remembered for the early-out above
+        regs.shadow_scratch[34 + index] = parameters[2];
+
+        const u32 address = parameters[4];
+        auto& const_buffer = regs.const_buffer;
+        const_buffer.size = 0x10000;
+        const_buffer.address_high = (address >> 24) & 0xFF;
+        const_buffer.address_low = address << 8;
+
+        const size_t bind_group_id = parameters[3] & 0x7F; // low 7 bits select the bind group
+        auto& bind_group = regs.bind_groups[bind_group_id];
+        bind_group.raw_config = 0x11;
+        maxwell3d.ProcessCBBind(bind_group_id);
+    }
+};
+
+class HLE_SetRasterBoundingBox final : public HLEMacroImpl {
+public:
+    explicit HLE_SetRasterBoundingBox(Maxwell3D& maxwell3d_) : HLEMacroImpl(maxwell3d_) {}
+
+    void Execute(const std::vector<u32>& parameters, [[maybe_unused]] u32 method) override {
+        maxwell3d.RefreshParameters();
+        auto& regs = maxwell3d.regs;
+        const u32 conservative_enable = regs.conservative_raster_enable;
+        const u32 scratch = regs.shadow_scratch[52];
+        // Take the argument with bits 4..11 cleared, then assign the pad field separately.
+        regs.raster_bounding_box.raw = parameters[0] & 0xFFFFF00F;
+        regs.raster_bounding_box.pad.Assign(scratch & conservative_enable);
+    }
+};
+
+template <size_t base_size>
+class HLE_ClearConstBuffer final : public HLEMacroImpl {
+public:
+    explicit HLE_ClearConstBuffer(Maxwell3D& maxwell3d_) : HLEMacroImpl(maxwell3d_) {}
+
+    void Execute(const std::vector<u32>& parameters, [[maybe_unused]] u32 method) override {
+        maxwell3d.RefreshParameters();
+        static constexpr std::array<u32, base_size> zeroes{}; // all-zero source for the upload
+        auto& cb = maxwell3d.regs.const_buffer;
+        cb.size = static_cast<u32>(base_size);
+        cb.address_high = parameters[0];
+        cb.address_low = parameters[1];
+        cb.offset = 0;
+        maxwell3d.ProcessCBMultiData(zeroes.data(), parameters[2] * 4);
+    }
+};
+
+class HLE_ClearMemory final : public HLEMacroImpl {
+public:
+    explicit HLE_ClearMemory(Maxwell3D& maxwell3d_) : HLEMacroImpl(maxwell3d_) {}
+
+    void Execute(const std::vector<u32>& parameters, [[maybe_unused]] u32 method) override {
+        maxwell3d.RefreshParameters();
+        // parameters[2] is divided by sizeof(u32) to get the number of zero words to send.
+        const u32 word_count = parameters[2] / sizeof(u32);
+        if (zero_memory.size() < word_count) {
+            zero_memory.resize(word_count, 0);
+        }
+        auto& upload = maxwell3d.regs.upload;
+        upload.line_length_in = parameters[2];
+        upload.line_count = 1;
+        upload.dest.address_high = parameters[0];
+        upload.dest.address_low = parameters[1];
+        maxwell3d.CallMethod(size_t(MAXWELL3D_REG_INDEX(launch_dma)), 0x1011, true);
+        maxwell3d.CallMultiMethod(size_t(MAXWELL3D_REG_INDEX(inline_data)), zero_memory.data(), word_count, word_count);
+    }
+
+private:
+    std::vector<u32> zero_memory; // zero-filled words reused between calls; only ever grows
+};
+
+class HLE_TransformFeedbackSetup final : public HLEMacroImpl {
+public:
+    explicit HLE_TransformFeedbackSetup(Maxwell3D& maxwell3d_) : HLEMacroImpl(maxwell3d_) {}
+
+    void Execute(const std::vector<u32>& parameters, [[maybe_unused]] u32 method) override {
+        maxwell3d.RefreshParameters();
+        auto& regs = maxwell3d.regs;
+        regs.transform_feedback_enabled = 1;
+        // Zero the start offset of transform feedback buffers 0..3.
+        for (std::size_t slot = 0; slot < 4; ++slot) {
+            regs.transform_feedback.buffers[slot].start_offset = 0;
+        }
+
+        // Inline-upload one 4-byte word (the stride of control 0) to parameters[0]/[1].
+        regs.upload.line_length_in = 4;
+        regs.upload.line_count = 1;
+        regs.upload.dest.address_high = parameters[0];
+        regs.upload.dest.address_low = parameters[1];
+        maxwell3d.CallMethod(size_t(MAXWELL3D_REG_INDEX(launch_dma)), 0x1011, true);
+        maxwell3d.CallMethod(size_t(MAXWELL3D_REG_INDEX(inline_data)), regs.transform_feedback.controls[0].stride, true);
+
+        maxwell3d.Rasterizer().RegisterTransformFeedback(regs.upload.dest.Address());
+    }
+};
+
+} // Anonymous namespace
+
+HLEMacro::HLEMacro(Maxwell3D& maxwell3d_) : maxwell3d{maxwell3d_} {} // engine later handed to each HLE program
+
+HLEMacro::~HLEMacro() = default;
+
+std::unique_ptr<CachedMacro> HLEMacro::GetHLEProgram(u64 hash) const {
+    // Dispatch on the macro hash; the compiler is trusted to lower this switch into a fast lookup.
+    switch (hash) {
+    case 0x0D61FC9FAAC9FCADULL: return std::make_unique<HLE_DrawArraysIndirect<false>>(maxwell3d);
+    case 0x8A4D173EB99A8603ULL: return std::make_unique<HLE_DrawArraysIndirect<true>>(maxwell3d);
+    case 0x771BB18C62444DA0ULL: return std::make_unique<HLE_DrawIndexedIndirect<false>>(maxwell3d);
+    case 0x0217920100488FF7ULL: return std::make_unique<HLE_DrawIndexedIndirect<true>>(maxwell3d);
+    case 0x3F5E74B9C9A50164ULL: return std::make_unique<HLE_MultiDrawIndexedIndirectCount>(maxwell3d);
+    case 0xEAD26C3E2109B06BULL: return std::make_unique<HLE_MultiLayerClear>(maxwell3d);
+    case 0xC713C83D8F63CCF3ULL: return std::make_unique<HLE_C713C83D8F63CCF3>(maxwell3d);
+    case 0xD7333D26E0A93EDEULL: return std::make_unique<HLE_D7333D26E0A93EDE>(maxwell3d);
+    case 0xEB29B2A09AA06D38ULL: return std::make_unique<HLE_BindShader>(maxwell3d);
+    case 0xDB1341DBEB4C8AF7ULL: return std::make_unique<HLE_SetRasterBoundingBox>(maxwell3d);
+    case 0x6C97861D891EDf7EULL: return std::make_unique<HLE_ClearConstBuffer<0x5F00>>(maxwell3d);
+    case 0xD246FDDF3A6173D7ULL: return std::make_unique<HLE_ClearConstBuffer<0x7000>>(maxwell3d);
+    case 0xEE4D0004BEC8ECF4ULL: return std::make_unique<HLE_ClearMemory>(maxwell3d);
+    case 0xFC0CF27F5FFAA661ULL: return std::make_unique<HLE_TransformFeedbackSetup>(maxwell3d);
+    case 0xB5F74EDB717278ECULL: return std::make_unique<HLE_DrawIndirectByteCount>(maxwell3d);
+    default: // no HLE replacement is registered for this hash
+        return nullptr;
+    }
+}
+
+namespace {
+class MacroInterpreterImpl final : public CachedMacro {
+public:
+    explicit MacroInterpreterImpl(Engines::Maxwell3D& maxwell3d_, const std::vector<u32>& code_)
+        : CachedMacro(maxwell3d_)
+        , code{code_} // bound by reference, not copied
+    {}
+
+    void Execute(const std::vector<u32>& params, u32 method) override;
+
+private:
+    /// Resets the execution engine state, zeroing registers, etc.
+    void Reset();
+
+    /**
+     * Executes a single macro instruction located at the current program counter. Returns whether
+     * the interpreter should keep running.
+     *
+     * @param is_delay_slot Whether the current step is being executed due to a delay slot in a
+     *                      previous instruction.
+     */
+    bool Step(bool is_delay_slot);
+
+    /// Calculates the result of an ALU operation. src_a OP src_b;
+    u32 GetALUResult(Macro::ALUOperation operation, u32 src_a, u32 src_b);
+
+    /// Performs the result operation on the input result and stores it in the specified register
+    /// (if necessary).
+    void ProcessResult(Macro::ResultOperation operation, u32 reg, u32 result);
+
+    /// Evaluates the branch condition and returns whether the branch should be taken or not.
+    bool EvaluateBranchCondition(Macro::BranchCondition cond, u32 value) const;
+
+    /// Reads an opcode at the current program counter location.
+    Macro::Opcode GetOpcode() const;
+
+    /// Returns the specified register's value. Register 0 is hardcoded to always return 0.
+    u32 GetRegister(u32 register_id) const;
+
+    /// Sets the register to the input value.
+    void SetRegister(u32 register_id, u32 value);
+
+    /// Sets the method address to use for the next Send instruction.
+    void SetMethodAddress(u32 address);
+
+    /// Calls a GPU Engine method with the input parameter.
+    void Send(u32 value);
+
+    /// Reads a GPU register located at the method address.
+    u32 Read(u32 method) const;
+
+    /// Returns the next parameter in the parameter queue.
+    u32 FetchParameter();
+
+    /// Current program counter
+    u32 pc{};
+    /// Program counter to execute at after the delay slot is executed.
+    std::optional<u32> delayed_pc;
+
+    /// General purpose macro registers.
+    std::array<u32, Macro::NUM_MACRO_REGISTERS> registers = {};
+
+    /// Method address to use for the next Send instruction.
+    Macro::MethodAddress method_address = {};
+
+    /// Input parameters of the current macro.
+    std::unique_ptr<u32[]> parameters;
+    std::size_t num_parameters = 0; // number of valid entries in `parameters`
+    std::size_t parameters_capacity = 0; // allocated size of `parameters`, in entries
+    /// Index of the next parameter that will be fetched by the 'parm' instruction.
+    u32 next_parameter_index = 0;
+
+    bool carry_flag = false; // carry/borrow state written by the arithmetic ALU operations
+    const std::vector<u32>& code; // non-owning: the referenced vector must outlive this object
+};
+
+void MacroInterpreterImpl::Execute(const std::vector<u32>& params, u32 method) {
+    Reset();
+
+    registers[1] = params[0]; // NOTE(review): assumes params is never empty — confirm with callers
+    num_parameters = params.size();
+
+    if (num_parameters > parameters_capacity) { // the buffer only grows and is reused between calls
+        parameters_capacity = num_parameters;
+        parameters = std::make_unique<u32[]>(num_parameters);
+    }
+    std::memcpy(parameters.get(), params.data(), num_parameters * sizeof(u32));
+
+    // Execute the code until we hit an exit condition.
+    bool keep_executing = true;
+    while (keep_executing) {
+        keep_executing = Step(false);
+    }
+
+    // Assert that the macro used all the input parameters.
+    ASSERT(next_parameter_index == num_parameters);
+}
+
+void MacroInterpreterImpl::Reset() {
+    // Control-flow state.
+    pc = 0;
+    delayed_pc.reset();
+    // Data state.
+    registers.fill(0);
+    carry_flag = false;
+    method_address.raw = 0;
+    num_parameters = 0;
+    next_parameter_index = 1; // $r1 already holds the first parameter, so fetching resumes at 1
+}
+
+bool MacroInterpreterImpl::Step(bool is_delay_slot) {
+    u32 base_address = pc; // address of this instruction; branch targets are relative to it
+
+    Macro::Opcode opcode = GetOpcode();
+    pc += 4; // advance by one 32-bit opcode word
+
+    // Update the program counter if we were delayed
+    if (delayed_pc) {
+        ASSERT(is_delay_slot);
+        pc = *delayed_pc;
+        delayed_pc = {};
+    }
+
+    switch (opcode.operation) {
+    case Macro::Operation::ALU: {
+        u32 result = GetALUResult(opcode.alu_operation, GetRegister(opcode.src_a),
+                                  GetRegister(opcode.src_b));
+        ProcessResult(opcode.result_operation, opcode.dst, result);
+        break;
+    }
+    case Macro::Operation::AddImmediate: {
+        ProcessResult(opcode.result_operation, opcode.dst,
+                      GetRegister(opcode.src_a) + opcode.immediate);
+        break;
+    }
+    case Macro::Operation::ExtractInsert: { // copy a bitfield of src_b into src_a's value
+        u32 dst = GetRegister(opcode.src_a);
+        u32 src = GetRegister(opcode.src_b);
+
+        src = (src >> opcode.bf_src_bit) & opcode.GetBitfieldMask();
+        dst &= ~(opcode.GetBitfieldMask() << opcode.bf_dst_bit);
+        dst |= src << opcode.bf_dst_bit;
+        ProcessResult(opcode.result_operation, opcode.dst, dst);
+        break;
+    }
+    case Macro::Operation::ExtractShiftLeftImmediate: { // right shift by register, left by immediate
+        u32 dst = GetRegister(opcode.src_a);
+        u32 src = GetRegister(opcode.src_b);
+
+        u32 result = ((src >> dst) & opcode.GetBitfieldMask()) << opcode.bf_dst_bit;
+
+        ProcessResult(opcode.result_operation, opcode.dst, result);
+        break;
+    }
+    case Macro::Operation::ExtractShiftLeftRegister: { // right shift by immediate, left by register
+        u32 dst = GetRegister(opcode.src_a);
+        u32 src = GetRegister(opcode.src_b);
+
+        u32 result = ((src >> opcode.bf_src_bit) & opcode.GetBitfieldMask()) << dst;
+
+        ProcessResult(opcode.result_operation, opcode.dst, result);
+        break;
+    }
+    case Macro::Operation::Read: {
+        u32 result = Read(GetRegister(opcode.src_a) + opcode.immediate);
+        ProcessResult(opcode.result_operation, opcode.dst, result);
+        break;
+    }
+    case Macro::Operation::Branch: {
+        ASSERT_MSG(!is_delay_slot, "Executing a branch in a delay slot is not valid");
+        u32 value = GetRegister(opcode.src_a);
+        bool taken = EvaluateBranchCondition(opcode.branch_condition, value);
+        if (taken) {
+            // Ignore the delay slot if the branch has the annul bit.
+            if (opcode.branch_annul) {
+                pc = base_address + opcode.GetBranchTarget();
+                return true;
+            }
+
+            delayed_pc = base_address + opcode.GetBranchTarget();
+            // Execute one more instruction due to the delay slot.
+            return Step(true);
+        }
+        break;
+    }
+    default:
+        UNIMPLEMENTED_MSG("Unimplemented macro operation {}", opcode.operation.Value());
+        break;
+    }
+
+    // An instruction with the Exit flag will not actually
+    // cause an exit if it's executed inside a delay slot.
+    if (opcode.is_exit && !is_delay_slot) {
+        // Exit has a delay slot, execute the next instruction
+        Step(true);
+        return false; // tells Execute() to stop looping
+    }
+
+    return true;
+}
+
+u32 MacroInterpreterImpl::GetALUResult(Macro::ALUOperation operation, u32 src_a, u32 src_b) {
+    switch (operation) {
+    case Macro::ALUOperation::Add: {
+        const u64 wide_sum = static_cast<u64>(src_a) + static_cast<u64>(src_b);
+        carry_flag = (wide_sum >> 32) != 0; // carry out of bit 31
+        return static_cast<u32>(wide_sum);
+    }
+    case Macro::ALUOperation::AddWithCarry: {
+        const u64 wide_sum = static_cast<u64>(src_a) + static_cast<u64>(src_b) + (carry_flag ? 1ULL : 0ULL);
+        carry_flag = (wide_sum >> 32) != 0; // carry out of bit 31
+        return static_cast<u32>(wide_sum);
+    }
+    case Macro::ALUOperation::Subtract: {
+        const u64 wide_diff = static_cast<u64>(src_a) - static_cast<u64>(src_b);
+        carry_flag = (wide_diff >> 32) == 0; // set when the 64-bit difference did not wrap
+        return static_cast<u32>(wide_diff);
+    }
+    case Macro::ALUOperation::SubtractWithBorrow: {
+        const u64 wide_diff = static_cast<u64>(src_a) - static_cast<u64>(src_b) - (carry_flag ? 0ULL : 1ULL);
+        carry_flag = (wide_diff >> 32) == 0; // set when the 64-bit difference did not wrap
+        return static_cast<u32>(wide_diff);
+    }
+    case Macro::ALUOperation::Xor:
+        return src_b ^ src_a;
+    case Macro::ALUOperation::Or:
+        return src_b | src_a;
+    case Macro::ALUOperation::And:
+        return src_b & src_a;
+    case Macro::ALUOperation::AndNot:
+        return ~src_b & src_a;
+    case Macro::ALUOperation::Nand:
+        return ~src_a | ~src_b;
+    // Every other operation is reported as unimplemented and yields 0.
+    default:
+        UNIMPLEMENTED_MSG("Unimplemented ALU operation {}", operation);
+        return 0;
+    }
+}
+
+void MacroInterpreterImpl::ProcessResult(Macro::ResultOperation operation, u32 reg, u32 result) {
+    switch (operation) {
+    case Macro::ResultOperation::IgnoreAndFetch:
+        // Fetch parameter into the register and ignore result.
+        SetRegister(reg, FetchParameter());
+        break;
+    case Macro::ResultOperation::Move:
+        // Move result into the register.
+        SetRegister(reg, result);
+        break;
+    case Macro::ResultOperation::MoveAndSetMethod:
+        // Move result and use as Method Address.
+        SetRegister(reg, result);
+        SetMethodAddress(result);
+        break;
+    case Macro::ResultOperation::FetchAndSend:
+        // Fetch parameter into the register and send result.
+        SetRegister(reg, FetchParameter());
+        Send(result);
+        break;
+    case Macro::ResultOperation::MoveAndSend:
+        // Move and send result.
+        SetRegister(reg, result);
+        Send(result);
+        break;
+    case Macro::ResultOperation::FetchAndSetMethod:
+        // Fetch parameter into the register and use result as Method Address.
+        SetRegister(reg, FetchParameter());
+        SetMethodAddress(result);
+        break;
+    case Macro::ResultOperation::MoveAndSetMethodFetchAndSend:
+        // Move result and use as Method Address, then fetch and send parameter.
+        SetRegister(reg, result);
+        SetMethodAddress(result);
+        Send(FetchParameter());
+        break;
+    case Macro::ResultOperation::MoveAndSetMethodSend:
+        // Move result and use as Method Address, then send bits 12:17 of result.
+        SetRegister(reg, result);
+        SetMethodAddress(result);
+        Send((result >> 12) & 0b111111);
+        break;
+    default: // unknown result operations are reported; registers and method address stay untouched
+        UNIMPLEMENTED_MSG("Unimplemented result operation {}", operation);
+        break;
+    }
+}
+
+bool MacroInterpreterImpl::EvaluateBranchCondition(Macro::BranchCondition cond, u32 value) const {
+    // Both conditions compare the value against zero; anything else is unreachable.
+    if (cond == Macro::BranchCondition::Zero) {
+        return value == 0;
+    } else if (cond == Macro::BranchCondition::NotZero) {
+        return value != 0;
+    }
+    UNREACHABLE();
+}
+
+Macro::Opcode MacroInterpreterImpl::GetOpcode() const {
+    ASSERT((pc % sizeof(u32)) == 0); // pc is a byte offset and must be word aligned
+    ASSERT(pc < code.size() * sizeof(u32)); // pc must lie inside the code vector
+    return {code[pc / sizeof(u32)]}; // convert the byte offset into a word index
+}
+
+u32 MacroInterpreterImpl::GetRegister(u32 register_id) const {
+    return registers.at(register_id); // bounds-checked access
+}
+
+void MacroInterpreterImpl::SetRegister(u32 register_id, u32 value) {
+    // Register 0 is the hardwired zero register, so a
+    // write aimed at it is dropped instead of stored.
+    const bool writable = register_id != 0;
+
+    if (writable) {
+        registers.at(register_id) = value;
+    }
+}
+
+void MacroInterpreterImpl::SetMethodAddress(u32 address) {
+    method_address.raw = address; // overwrites the whole raw word that Send() reads address/increment from
+}
+
+void MacroInterpreterImpl::Send(u32 value) {
+    maxwell3d.CallMethod(method_address.address, value, true);
+    // Advance the target address by the configured increment, ready for the next Send.
+    const auto current_address = method_address.address.Value();
+    method_address.address.Assign(current_address + method_address.increment.Value());
+}
+
+u32 MacroInterpreterImpl::Read(u32 method) const {
+    return maxwell3d.GetRegisterValue(method); // forwarded to the engine unchanged
+}
+
+u32 MacroInterpreterImpl::FetchParameter() {
+    ASSERT(next_parameter_index < num_parameters); // fetching past the supplied parameters is a bug
+    return parameters[next_parameter_index++]; // returns the current entry, then advances the cursor
+}
+} // Anonymous namespace
+
+#ifdef ARCHITECTURE_x86_64
+namespace {
+constexpr Xbyak::Reg64 STATE = Xbyak::util::rbx; // base register for JITState field accesses
+constexpr Xbyak::Reg32 RESULT = Xbyak::util::r10d; // destination used when loading src_a in Compile_ALU
+constexpr Xbyak::Reg64 MAX_PARAMETER = Xbyak::util::r11; // presumably the end of the parameter range — confirm
+constexpr Xbyak::Reg64 PARAMETERS = Xbyak::util::r12; // presumably the parameter cursor — confirm
+constexpr Xbyak::Reg32 METHOD_ADDRESS = Xbyak::util::r14d;
+constexpr Xbyak::Reg64 BRANCH_HOLDER = Xbyak::util::r15;
+
+constexpr std::bitset<32> PERSISTENT_REGISTERS = Common::X64::BuildRegSet({ // all registers named above
+    STATE,
+    RESULT,
+    MAX_PARAMETER,
+    PARAMETERS,
+    METHOD_ADDRESS,
+    BRANCH_HOLDER,
+});
+
+// Arbitrarily chosen based on current booting games.
+constexpr size_t MAX_CODE_SIZE = 0x10000; // code buffer size and length of the JIT label arrays
+
+std::bitset<32> PersistentCallerSavedRegs() {
+    return PERSISTENT_REGISTERS & Common::X64::ABI_ALL_CALLER_SAVED; // persistent registers that are caller-saved
+}
+
+/// @brief Must enforce W^X constraints, as we don't yet have a global "NO_EXECUTE" support flag.
+/// The speed loss is minimal, and may in fact be negligible; however, for peace of mind,
+/// only the OSes known to have had W^X issues are listed here.
+#if defined(__OpenBSD__) || defined(__NetBSD__) || defined(__DragonFly__)
+static const auto default_cg_mode = Xbyak::DontSetProtectRWE;
+#else
+static const auto default_cg_mode = nullptr; // Allow RWE
+#endif
+
+class MacroJITx64Impl final : public Xbyak::CodeGenerator, public CachedMacro {
+public:
+    explicit MacroJITx64Impl(Engines::Maxwell3D& maxwell3d_, const std::vector<u32>& code_)
+        : Xbyak::CodeGenerator(MAX_CODE_SIZE, default_cg_mode)
+        , CachedMacro(maxwell3d_)
+        , code{code_} // bound by reference, not copied
+    {
+        Compile(); // compilation happens at construction time
+    }
+
+    void Execute(const std::vector<u32>& parameters, u32 method) override;
+
+    void Compile_ALU(Macro::Opcode opcode);
+    void Compile_AddImmediate(Macro::Opcode opcode);
+    void Compile_ExtractInsert(Macro::Opcode opcode);
+    void Compile_ExtractShiftLeftImmediate(Macro::Opcode opcode);
+    void Compile_ExtractShiftLeftRegister(Macro::Opcode opcode);
+    void Compile_Read(Macro::Opcode opcode);
+    void Compile_Branch(Macro::Opcode opcode);
+
+private:
+    void Optimizer_ScanFlags();
+
+    void Compile();
+    bool Compile_NextInstruction();
+
+    Xbyak::Reg32 Compile_FetchParameter();
+    Xbyak::Reg32 Compile_GetRegister(u32 index, Xbyak::Reg32 dst);
+
+    void Compile_ProcessResult(Macro::ResultOperation operation, u32 reg);
+    void Compile_Send(Xbyak::Reg32 value);
+
+    Macro::Opcode GetOpCode() const;
+
+    struct JITState { // per-invocation state handed to the generated program
+        Engines::Maxwell3D* maxwell3d{};
+        std::array<u32, Macro::NUM_MACRO_REGISTERS> registers{};
+        u32 carry_flag{};
+    };
+    static_assert(offsetof(JITState, maxwell3d) == 0, "Maxwell3D is not at 0x0");
+    using ProgramType = void (*)(JITState*, const u32*, const u32*); // (state, parameters begin, parameters end)
+
+    struct OptimizerState { // flags consulted while emitting code
+        bool can_skip_carry{};
+        bool has_delayed_pc{};
+        bool zero_reg_skip{};
+        bool skip_dummy_addimmediate{};
+        bool optimize_for_method_move{};
+        bool enable_asserts{};
+    };
+    OptimizerState optimizer{};
+
+    std::optional<Macro::Opcode> next_opcode{};
+    ProgramType program{nullptr}; // Execute() asserts this is non-null before calling it
+
+    std::array<Xbyak::Label, MAX_CODE_SIZE> labels;
+    std::array<Xbyak::Label, MAX_CODE_SIZE> delay_skip;
+    Xbyak::Label end_of_code{};
+
+    bool is_delay_slot{};
+    u32 pc{};
+
+    const std::vector<u32>& code; // non-owning: the referenced vector must outlive this object
+};
+
+void MacroJITx64Impl::Execute(const std::vector<u32>& parameters, u32 method) {
+    ASSERT_OR_EXECUTE(program != nullptr, { return; });
+    JITState fresh_state{}; // value-initialised: registers and carry flag start at zero
+    fresh_state.maxwell3d = &maxwell3d;
+    const u32* const first_param = parameters.data();
+    program(&fresh_state, first_param, first_param + parameters.size());
+}
+
+void MacroJITx64Impl::Compile_ALU(Macro::Opcode opcode) { // Emits x64 code for one ALU macro instruction, then routes RESULT.
+    const bool is_a_zero = opcode.src_a == 0;
+    const bool is_b_zero = opcode.src_b == 0;
+    const bool valid_operation = !is_a_zero && !is_b_zero; // true when neither operand is register 0
+    [[maybe_unused]] const bool is_move_operation = !is_a_zero && is_b_zero;
+    const bool has_zero_register = is_a_zero || is_b_zero;
+    const bool no_zero_reg_skip = opcode.alu_operation == Macro::ALUOperation::AddWithCarry || // carry ops always load both operands
+                                  opcode.alu_operation == Macro::ALUOperation::SubtractWithBorrow;
+
+    Xbyak::Reg32 src_a; // operand A is loaded into RESULT
+    Xbyak::Reg32 src_b; // operand B is loaded into eax
+
+    if (!optimizer.zero_reg_skip || no_zero_reg_skip) {
+        src_a = Compile_GetRegister(opcode.src_a, RESULT);
+        src_b = Compile_GetRegister(opcode.src_b, eax);
+    } else { // NOTE(review): with src_a == 0 nothing here writes RESULT, yet Compile_ProcessResult below still consumes it - confirm intended
+        if (!is_a_zero) {
+            src_a = Compile_GetRegister(opcode.src_a, RESULT);
+        }
+        if (!is_b_zero) {
+            src_b = Compile_GetRegister(opcode.src_b, eax);
+        }
+    }
+
+    bool has_emitted = false; // only tracked by the Subtract case
+
+    switch (opcode.alu_operation) {
+    case Macro::ALUOperation::Add:
+        if (optimizer.zero_reg_skip) {
+            if (valid_operation) {
+                add(src_a, src_b);
+            }
+        } else {
+            add(src_a, src_b);
+        }
+
+        if (!optimizer.can_skip_carry) { // carry is recorded even when the add above was skipped
+            setc(byte[STATE + offsetof(JITState, carry_flag)]);
+        }
+        break;
+    case Macro::ALUOperation::AddWithCarry:
+        bt(dword[STATE + offsetof(JITState, carry_flag)], 0); // copy bit 0 of the saved carry into CF
+        adc(src_a, src_b);
+        setc(byte[STATE + offsetof(JITState, carry_flag)]);
+        break;
+    case Macro::ALUOperation::Subtract:
+        if (optimizer.zero_reg_skip) {
+            if (valid_operation) {
+                sub(src_a, src_b);
+                has_emitted = true;
+            }
+        } else {
+            sub(src_a, src_b);
+            has_emitted = true;
+        }
+        if (!optimizer.can_skip_carry && has_emitted) { // unlike Add, carry is only stored when a sub was emitted
+            setc(byte[STATE + offsetof(JITState, carry_flag)]);
+        }
+        break;
+    case Macro::ALUOperation::SubtractWithBorrow:
+        bt(dword[STATE + offsetof(JITState, carry_flag)], 0); // copy bit 0 of the saved carry into CF
+        sbb(src_a, src_b);
+        setc(byte[STATE + offsetof(JITState, carry_flag)]);
+        break;
+    case Macro::ALUOperation::Xor:
+        if (optimizer.zero_reg_skip) {
+            if (valid_operation) {
+                xor_(src_a, src_b);
+            }
+        } else {
+            xor_(src_a, src_b);
+        }
+        break;
+    case Macro::ALUOperation::Or:
+        if (optimizer.zero_reg_skip) {
+            if (valid_operation) {
+                or_(src_a, src_b);
+            }
+        } else {
+            or_(src_a, src_b);
+        }
+        break;
+    case Macro::ALUOperation::And:
+        if (optimizer.zero_reg_skip) {
+            if (!has_zero_register) {
+                and_(src_a, src_b);
+            }
+        } else {
+            and_(src_a, src_b);
+        }
+        break;
+    case Macro::ALUOperation::AndNot: // a & ~b
+        if (optimizer.zero_reg_skip) {
+            if (!is_a_zero) {
+                not_(src_b);
+                and_(src_a, src_b);
+            }
+        } else {
+            not_(src_b);
+            and_(src_a, src_b);
+        }
+        break;
+    case Macro::ALUOperation::Nand: // ~(a & b)
+        if (optimizer.zero_reg_skip) {
+            if (!is_a_zero) {
+                and_(src_a, src_b);
+                not_(src_a);
+            }
+        } else {
+            and_(src_a, src_b);
+            not_(src_a);
+        }
+        break;
+    default:
+        UNIMPLEMENTED_MSG("Unimplemented ALU operation {}", opcode.alu_operation.Value());
+        break;
+    }
+    Compile_ProcessResult(opcode.result_operation, opcode.dst); // store/send whatever is in RESULT
+}
+
+void MacroJITx64Impl::Compile_AddImmediate(Macro::Opcode opcode) { // Emits RESULT = reg[src_a] + immediate, then routes RESULT.
+    if (optimizer.skip_dummy_addimmediate) {
+        // Games tend to use this as an exit instruction placeholder. It's to encode an instruction
+        // without doing anything. In our case we can just not emit anything.
+        if (opcode.result_operation == Macro::ResultOperation::Move && opcode.dst == 0) {
+            return;
+        }
+    }
+    // Check for redundant moves
+    if (optimizer.optimize_for_method_move &&
+        opcode.result_operation == Macro::ResultOperation::MoveAndSetMethod) {
+        if (next_opcode.has_value()) {
+            const auto next = *next_opcode;
+            if (next.result_operation == Macro::ResultOperation::MoveAndSetMethod &&
+                opcode.dst == next.dst) { // the next instruction overwrites the same destination and method
+                return;
+            }
+        }
+    }
+    if (optimizer.zero_reg_skip && opcode.src_a == 0) { // 0 + immediate: load the constant directly
+        if (opcode.immediate == 0) {
+            xor_(RESULT, RESULT);
+        } else {
+            mov(RESULT, opcode.immediate);
+        }
+    } else {
+        auto result = Compile_GetRegister(opcode.src_a, RESULT);
+        if (opcode.immediate > 1) { // was `> 2`, which emitted nothing for an immediate of exactly 2
+            add(result, opcode.immediate);
+        } else if (opcode.immediate == 1) {
+            inc(result);
+        } else if (opcode.immediate < 0) {
+            sub(result, opcode.immediate * -1);
+        } // immediate == 0 needs no instruction
+    }
+    Compile_ProcessResult(opcode.result_operation, opcode.dst);
+}
+
+void MacroJITx64Impl::Compile_ExtractInsert(Macro::Opcode opcode) { // Copies a bitfield of reg[src_b] into reg[src_a]'s value.
+    auto dst = Compile_GetRegister(opcode.src_a, RESULT);
+    auto src = Compile_GetRegister(opcode.src_b, eax);
+
+    const u32 mask = ~(opcode.GetBitfieldMask() << opcode.bf_dst_bit); // clears the destination field
+    and_(dst, mask);
+    shr(src, opcode.bf_src_bit); // bring the source field down to bit 0
+    and_(src, opcode.GetBitfieldMask());
+    shl(src, opcode.bf_dst_bit); // move it to the destination position
+    or_(dst, src);
+
+    Compile_ProcessResult(opcode.result_operation, opcode.dst);
+}
+
+void MacroJITx64Impl::Compile_ExtractShiftLeftImmediate(Macro::Opcode opcode) { // RESULT = ((reg[src_b] >> reg[src_a]) & mask) << bf_dst_bit
+    const auto dst = Compile_GetRegister(opcode.src_a, ecx); // loaded into ecx so its low byte (cl) can be the shift count
+    const auto src = Compile_GetRegister(opcode.src_b, RESULT);
+
+    shr(src, dst.cvt8()); // right shift amount comes from the register
+    and_(src, opcode.GetBitfieldMask());
+    shl(src, opcode.bf_dst_bit); // left shift amount is the immediate field
+
+    Compile_ProcessResult(opcode.result_operation, opcode.dst);
+}
+
+void MacroJITx64Impl::Compile_ExtractShiftLeftRegister(Macro::Opcode opcode) { // RESULT = ((reg[src_b] >> bf_src_bit) & mask) << reg[src_a]
+    const auto dst = Compile_GetRegister(opcode.src_a, ecx); // loaded into ecx so its low byte (cl) can be the shift count
+    const auto src = Compile_GetRegister(opcode.src_b, RESULT);
+
+    shr(src, opcode.bf_src_bit); // right shift amount is the immediate field
+    and_(src, opcode.GetBitfieldMask());
+    shl(src, dst.cvt8()); // left shift amount comes from the register
+
+    Compile_ProcessResult(opcode.result_operation, opcode.dst);
+}
+
+void MacroJITx64Impl::Compile_Read(Macro::Opcode opcode) { // Emits RESULT = maxwell3d regs.reg_array[reg[src_a] + immediate].
+    if (optimizer.zero_reg_skip && opcode.src_a == 0) { // 0 + immediate: load the constant index directly
+        if (opcode.immediate == 0) {
+            xor_(RESULT, RESULT);
+        } else {
+            mov(RESULT, opcode.immediate);
+        }
+    } else {
+        auto result = Compile_GetRegister(opcode.src_a, RESULT);
+        if (opcode.immediate > 1) { // was `> 2`, which emitted nothing for an immediate of exactly 2
+            add(result, opcode.immediate);
+        } else if (opcode.immediate == 1) {
+            inc(result);
+        } else if (opcode.immediate < 0) {
+            sub(result, opcode.immediate * -1);
+        } // immediate == 0 needs no instruction
+    }
+
+    // Equivalent to Engines::Maxwell3D::GetRegisterValue:
+    if (optimizer.enable_asserts) {
+        Xbyak::Label pass_range_check;
+        cmp(RESULT, static_cast<u32>(Engines::Maxwell3D::Regs::NUM_REGS));
+        jb(pass_range_check);
+        int3(); // trap when the index is out of range
+        L(pass_range_check);
+    }
+    mov(rax, qword[STATE]); // JITState::maxwell3d sits at offset 0 (see the static_assert on JITState)
+    mov(RESULT,
+        dword[rax + offsetof(Engines::Maxwell3D, regs) +
+              offsetof(Engines::Maxwell3D::Regs, reg_array) + RESULT.cvt64() * sizeof(u32)]);
+
+    Compile_ProcessResult(opcode.result_operation, opcode.dst);
+}
+
+void Send(Engines::Maxwell3D* maxwell3d, Macro::MethodAddress method_address, u32 value) { // Trampoline called from JIT code (see Compile_Send).
+    maxwell3d->CallMethod(method_address.address, value, true); // only the address field of the method word is forwarded
+}
+
+void MacroJITx64Impl::Compile_Send(Xbyak::Reg32 value) { // Emits a call to Send(), then advances METHOD_ADDRESS by its increment.
+    Common::X64::ABI_PushRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);
+    mov(Common::X64::ABI_PARAM1, qword[STATE]); // Maxwell3D* stored at offset 0 of JITState
+    mov(Common::X64::ABI_PARAM2.cvt32(), METHOD_ADDRESS);
+    mov(Common::X64::ABI_PARAM3.cvt32(), value);
+    Common::X64::CallFarFunction(*this, &Send);
+    Common::X64::ABI_PopRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);
+
+    Xbyak::Label dont_process{};
+    // Get increment
+    test(METHOD_ADDRESS, 0x3f000);
+    // If zero, method address doesn't update
+    je(dont_process);
+
+    mov(ecx, METHOD_ADDRESS);
+    and_(METHOD_ADDRESS, 0xfff); // keep bits 0..11 (address)
+    shr(ecx, 12);
+    and_(ecx, 0x3f); // ecx = bits 12..17 (increment)
+    lea(eax, ptr[rcx + METHOD_ADDRESS.cvt64()]); // eax = address + increment
+    sal(ecx, 12);
+    or_(eax, ecx); // put the increment back into bits 12..17
+
+    mov(METHOD_ADDRESS, eax);
+
+    L(dont_process);
+}
+
+void MacroJITx64Impl::Compile_Branch(Macro::Opcode opcode) { // Emits a conditional branch on reg[src_a] being zero / non-zero.
+    ASSERT_MSG(!is_delay_slot, "Executing a branch in a delay slot is not valid");
+    const s32 jump_address = // target instruction index: pc plus the branch target divided by sizeof(s32)
+        static_cast<s32>(pc) + static_cast<s32>(opcode.GetBranchTarget() / sizeof(s32));
+
+    Xbyak::Label end;
+    auto value = Compile_GetRegister(opcode.src_a, eax);
+    cmp(value, 0); // test(value, value);
+    if (optimizer.has_delayed_pc) {
+        switch (opcode.branch_condition) { // inverted jumps: skip to `end` when the branch is NOT taken
+        case Macro::BranchCondition::Zero:
+            jne(end, T_NEAR);
+            break;
+        case Macro::BranchCondition::NotZero:
+            je(end, T_NEAR);
+            break;
+        }
+
+        if (opcode.branch_annul) { // annulled: jump straight to the target
+            xor_(BRANCH_HOLDER, BRANCH_HOLDER);
+            jmp(labels[jump_address], T_NEAR);
+        } else { // not annulled: park the jump in BRANCH_HOLDER so a later instruction epilogue takes it
+            Xbyak::Label handle_post_exit{};
+            Xbyak::Label skip{};
+            jmp(skip, T_NEAR);
+
+            L(handle_post_exit); // reached later through `jmp rax` emitted by Compile_NextInstruction
+            xor_(BRANCH_HOLDER, BRANCH_HOLDER);
+            jmp(labels[jump_address], T_NEAR);
+
+            L(skip);
+            mov(BRANCH_HOLDER, handle_post_exit);
+            jmp(delay_skip[pc], T_NEAR); // delay_skip[pc] is bound after this instruction's own BRANCH_HOLDER check
+        }
+    } else { // no delayed branches in this macro: plain conditional jumps
+        switch (opcode.branch_condition) {
+        case Macro::BranchCondition::Zero:
+            je(labels[jump_address], T_NEAR);
+            break;
+        case Macro::BranchCondition::NotZero:
+            jne(labels[jump_address], T_NEAR);
+            break;
+        }
+    }
+
+    L(end);
+}
+
+void MacroJITx64Impl::Optimizer_ScanFlags() {
+    // Pre-pass over the raw macro words; decides which emission shortcuts are safe.
+    bool carry_consumed = false;
+    bool delayed_branch_seen = false;
+    for (const u32 word : code) {
+        Macro::Opcode decoded{};
+        decoded.raw = word;
+
+        // Carry bookkeeping is only needed when an ALU instruction reads the carry
+        // back, i.e. AddWithCarry or SubtractWithBorrow appears somewhere in the macro.
+        const bool is_alu = decoded.operation == Macro::Operation::ALU;
+        const bool reads_carry =
+            decoded.alu_operation == Macro::ALUOperation::AddWithCarry ||
+            decoded.alu_operation == Macro::ALUOperation::SubtractWithBorrow;
+        carry_consumed = carry_consumed || (is_alu && reads_carry);
+
+        // Any branch without the annul bit turns on the delayed-pc code path.
+        const bool is_branch = decoded.operation == Macro::Operation::Branch;
+        delayed_branch_seen = delayed_branch_seen || (is_branch && !decoded.branch_annul);
+    }
+    optimizer.can_skip_carry = !carry_consumed;
+    optimizer.has_delayed_pc = delayed_branch_seen;
+}
+
+void MacroJITx64Impl::Compile() { // Emits prologue, every macro instruction, and epilogue; publishes `program`.
+    labels.fill(Xbyak::Label());
+
+    Common::X64::ABI_PushRegistersAndAdjustStack(*this, Common::X64::ABI_ALL_CALLEE_SAVED, 8);
+    // JIT state
+    mov(STATE, Common::X64::ABI_PARAM1);
+    mov(PARAMETERS, Common::X64::ABI_PARAM2);
+    mov(MAX_PARAMETER, Common::X64::ABI_PARAM3);
+    xor_(RESULT, RESULT);
+    xor_(METHOD_ADDRESS, METHOD_ADDRESS);
+    xor_(BRANCH_HOLDER, BRANCH_HOLDER);
+
+    mov(dword[STATE + offsetof(JITState, registers) + 4], Compile_FetchParameter()); // first parameter goes to registers[1]
+
+    // Track get register for zero registers and mark it as no-op
+    optimizer.zero_reg_skip = true;
+
+    // AddImmediate tends to be used as a NOP instruction, if we detect this we can
+    // completely skip the entire code path and not emit anything
+    optimizer.skip_dummy_addimmediate = true;
+
+    // SMO tends to emit a lot of unnecessary method moves, we can mitigate this by only emitting
+    // one if our register isn't "dirty"
+    optimizer.optimize_for_method_move = true;
+
+    // Enable run-time assertions in JITted code
+    optimizer.enable_asserts = false;
+
+    // Check to see if we can skip emitting certain instructions
+    Optimizer_ScanFlags();
+
+    const u32 op_count = static_cast<u32>(code.size());
+    for (u32 i = 0; i < op_count; i++) {
+        if (i < op_count - 1) { // peek at the following instruction by temporarily moving pc
+            pc = i + 1;
+            next_opcode = GetOpCode();
+        } else {
+            next_opcode = {};
+        }
+        pc = i;
+        Compile_NextInstruction(); // return value is not used here
+    }
+
+    L(end_of_code);
+
+    Common::X64::ABI_PopRegistersAndAdjustStack(*this, Common::X64::ABI_ALL_CALLEE_SAVED, 8);
+    ret();
+    ready();
+    program = getCode<ProgramType>();
+}
+
+bool MacroJITx64Impl::Compile_NextInstruction() { // Emits the instruction at `pc` plus its branch/exit epilogue.
+    const auto opcode = GetOpCode();
+    if (labels[pc].getAddress()) { // presumably non-null once the label is bound, i.e. already emitted - TODO confirm
+        return false;
+    }
+
+    L(labels[pc]); // branch targets jump here
+
+    switch (opcode.operation) {
+    case Macro::Operation::ALU:
+        Compile_ALU(opcode);
+        break;
+    case Macro::Operation::AddImmediate:
+        Compile_AddImmediate(opcode);
+        break;
+    case Macro::Operation::ExtractInsert:
+        Compile_ExtractInsert(opcode);
+        break;
+    case Macro::Operation::ExtractShiftLeftImmediate:
+        Compile_ExtractShiftLeftImmediate(opcode);
+        break;
+    case Macro::Operation::ExtractShiftLeftRegister:
+        Compile_ExtractShiftLeftRegister(opcode);
+        break;
+    case Macro::Operation::Read:
+        Compile_Read(opcode);
+        break;
+    case Macro::Operation::Branch:
+        Compile_Branch(opcode);
+        break;
+    default:
+        UNIMPLEMENTED_MSG("Unimplemented opcode {}", opcode.operation.Value());
+        break;
+    }
+
+    if (optimizer.has_delayed_pc) {
+        if (opcode.is_exit) {
+            mov(rax, end_of_code);
+            test(BRANCH_HOLDER, BRANCH_HOLDER);
+            cmove(BRANCH_HOLDER, rax); // no pending branch: arm end_of_code as the pending jump
+            // Jump to next instruction to skip delay slot check
+            je(labels[pc + 1], T_NEAR);
+        } else {
+            // TODO(ogniK): Optimize delay slot branching
+            Xbyak::Label no_delay_slot{};
+            test(BRANCH_HOLDER, BRANCH_HOLDER);
+            je(no_delay_slot, T_NEAR);
+            mov(rax, BRANCH_HOLDER); // a jump is pending: clear the holder and take it
+            xor_(BRANCH_HOLDER, BRANCH_HOLDER);
+            jmp(rax);
+            L(no_delay_slot);
+        }
+        L(delay_skip[pc]); // Compile_Branch jumps here to bypass the check above
+        if (opcode.is_exit) {
+            return false;
+        }
+    } else {
+        test(BRANCH_HOLDER, BRANCH_HOLDER);
+        jne(end_of_code, T_NEAR); // a previous exit instruction set the holder: leave now
+        if (opcode.is_exit) {
+            inc(BRANCH_HOLDER); // make the holder non-zero so the next instruction's check exits
+            return false;
+        }
+    }
+    return true;
+}
+
+static void WarnInvalidParameter(uintptr_t parameter, uintptr_t max_parameter) { // Called from JIT code; both arguments are addresses.
+    LOG_CRITICAL(HW_GPU,
+                 "Macro JIT: invalid parameter access 0x{:x} (0x{:x} is the last parameter)",
+                 parameter, max_parameter - sizeof(u32)); // max_parameter is one past the end, so step back one word
+}
+
+Xbyak::Reg32 MacroJITx64Impl::Compile_FetchParameter() { // Emits a load of the next parameter word into eax and advances PARAMETERS.
+    Xbyak::Label parameter_ok{};
+    cmp(PARAMETERS, MAX_PARAMETER);
+    jb(parameter_ok, T_NEAR); // in range: skip the warning call
+    Common::X64::ABI_PushRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);
+    mov(Common::X64::ABI_PARAM1, PARAMETERS);
+    mov(Common::X64::ABI_PARAM2, MAX_PARAMETER);
+    Common::X64::CallFarFunction(*this, &WarnInvalidParameter);
+    Common::X64::ABI_PopRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);
+    L(parameter_ok); // NOTE(review): the out-of-range path falls through and still performs the load below
+    mov(eax, dword[PARAMETERS]);
+    add(PARAMETERS, sizeof(u32));
+    return eax;
+}
+
+Xbyak::Reg32 MacroJITx64Impl::Compile_GetRegister(u32 index, Xbyak::Reg32 dst) { // Emits a load of macro register `index` into `dst`.
+    if (index == 0) {
+        // Register 0 is always zero
+        xor_(dst, dst);
+    } else {
+        mov(dst, dword[STATE + offsetof(JITState, registers) + index * sizeof(u32)]); // read JITState::registers[index]
+    }
+
+    return dst; // same register that was passed in
+}
+
+void MacroJITx64Impl::Compile_ProcessResult(Macro::ResultOperation operation, u32 reg) { // Emits the store / method-address / send step for RESULT.
+    const auto SetRegister = [this](u32 reg_index, const Xbyak::Reg32& result) {
+        // Register 0 is supposed to always return 0. NOP is implemented as a store to the zero
+        // register.
+        if (reg_index == 0) {
+            return;
+        }
+        mov(dword[STATE + offsetof(JITState, registers) + reg_index * sizeof(u32)], result);
+    };
+    const auto SetMethodAddress = [this](const Xbyak::Reg32& reg32) { mov(METHOD_ADDRESS, reg32); };
+
+    switch (operation) {
+    case Macro::ResultOperation::IgnoreAndFetch: // RESULT is discarded; the register receives the next parameter
+        SetRegister(reg, Compile_FetchParameter());
+        break;
+    case Macro::ResultOperation::Move:
+        SetRegister(reg, RESULT);
+        break;
+    case Macro::ResultOperation::MoveAndSetMethod:
+        SetRegister(reg, RESULT);
+        SetMethodAddress(RESULT);
+        break;
+    case Macro::ResultOperation::FetchAndSend:
+        // Fetch parameter and send result.
+        SetRegister(reg, Compile_FetchParameter());
+        Compile_Send(RESULT);
+        break;
+    case Macro::ResultOperation::MoveAndSend:
+        // Move and send result.
+        SetRegister(reg, RESULT);
+        Compile_Send(RESULT);
+        break;
+    case Macro::ResultOperation::FetchAndSetMethod:
+        // Fetch parameter and use result as Method Address.
+        SetRegister(reg, Compile_FetchParameter());
+        SetMethodAddress(RESULT);
+        break;
+    case Macro::ResultOperation::MoveAndSetMethodFetchAndSend:
+        // Move result and use as Method Address, then fetch and send parameter.
+        SetRegister(reg, RESULT);
+        SetMethodAddress(RESULT);
+        Compile_Send(Compile_FetchParameter());
+        break;
+    case Macro::ResultOperation::MoveAndSetMethodSend:
+        // Move result and use as Method Address, then send bits 12:17 of result.
+        SetRegister(reg, RESULT);
+        SetMethodAddress(RESULT);
+        shr(RESULT, 12);
+        and_(RESULT, 0b111111);
+        Compile_Send(RESULT);
+        break;
+    default:
+        UNIMPLEMENTED_MSG("Unimplemented macro operation {}", operation);
+        break;
+    }
+}
+
+Macro::Opcode MacroJITx64Impl::GetOpCode() const { // Decodes the macro word at the current `pc`.
+    ASSERT(pc < code.size());
+    return {code[pc]}; // the raw word initialises the Opcode
+}
+} // Anonymous namespace
+#endif
+
+static void Dump(u64 hash, std::span<const u32> code, bool decompiled = false) { // Writes the raw macro words to <DumpDir>/macros/<hash>.macro
+    const auto base_dir{Common::FS::GetEdenPath(Common::FS::EdenPath::DumpDir)};
+    const auto macro_dir{base_dir / "macros"};
+    if (!Common::FS::CreateDir(base_dir) || !Common::FS::CreateDir(macro_dir)) {
+        LOG_ERROR(Common_Filesystem, "Failed to create macro dump directories");
+        return;
+    }
+    auto name{macro_dir / fmt::format("{:016x}.macro", hash)};
+
+    if (decompiled) { // MacroEngine::Execute passes has_hle_program here
+        auto new_name{macro_dir / fmt::format("decompiled_{:016x}.macro", hash)};
+        if (Common::FS::Exists(name)) {
+            (void)Common::FS::RenameFile(name, new_name); // an earlier plain dump exists: rename it instead of writing again
+            return;
+        }
+        name = new_name;
+    }
+
+    std::fstream macro_file(name, std::ios::out | std::ios::binary);
+    if (!macro_file) {
+        LOG_ERROR(Common_Filesystem, "Unable to open or create file at {}", Common::FS::PathToUTF8String(name));
+        return;
+    }
+    macro_file.write(reinterpret_cast<const char*>(code.data()), code.size_bytes()); // in-memory bytes, written as-is
+}
+
+MacroEngine::MacroEngine(Engines::Maxwell3D& maxwell3d_, bool is_interpreted_)
+    : hle_macros{std::make_optional<Tegra::HLEMacro>(maxwell3d_)} // stored inline in the optional rather than behind a unique_ptr
+    , maxwell3d{maxwell3d_}
+    , is_interpreted{is_interpreted_} // when true, Compile() never selects the x64 JIT
+{}
+
+MacroEngine::~MacroEngine() = default; // defaulted out of line, in the .cpp rather than the header
+
+void MacroEngine::AddCode(u32 method, u32 data) { // Appends one word to the code uploaded for `method`.
+    uploaded_macro_code[method].push_back(data); // operator[] creates the entry on first upload
+}
+
+void MacroEngine::ClearCode(u32 method) { // Forgets both the compiled program and the uploaded words for `method`.
+    macro_cache.erase(method);
+    uploaded_macro_code.erase(method);
+}
+
+void MacroEngine::Execute(u32 method, const std::vector<u32>& parameters) { // Runs the macro for `method`, compiling and caching it on first use.
+    auto compiled_macro = macro_cache.find(method);
+    if (compiled_macro != macro_cache.end()) {
+        const auto& cache_info = compiled_macro->second;
+        if (cache_info.has_hle_program) {
+            cache_info.hle_program->Execute(parameters, method);
+        } else {
+            maxwell3d.RefreshParameters();
+            cache_info.lle_program->Execute(parameters, method);
+        }
+    } else {
+        // Macro not compiled, check if it's uploaded and if so, compile it
+        std::optional<u32> mid_method; // base method of an upload that contains `method` in its middle
+        const auto macro_code = uploaded_macro_code.find(method);
+        if (macro_code == uploaded_macro_code.end()) {
+            for (const auto& [method_base, code] : uploaded_macro_code) {
+                if (method >= method_base && (method - method_base) < code.size()) {
+                    mid_method = method_base;
+                    break;
+                }
+            }
+            if (!mid_method.has_value()) {
+                ASSERT_MSG(false, "Macro 0x{0:x} was not uploaded", method);
+                return;
+            }
+        }
+        auto& cache_info = macro_cache[method];
+
+        if (!mid_method.has_value()) {
+            cache_info.lle_program = Compile(macro_code->second);
+            cache_info.hash = Common::HashValue(macro_code->second);
+        } else { // entry point inside another upload: copy its tail into an entry of its own
+            const auto& macro_cached = uploaded_macro_code[mid_method.value()];
+            const auto rebased_method = method - mid_method.value();
+            auto& code = uploaded_macro_code[method];
+            code.resize(macro_cached.size() - rebased_method);
+            std::memcpy(code.data(), macro_cached.data() + rebased_method, code.size() * sizeof(u32));
+            cache_info.hash = Common::HashValue(code);
+            cache_info.lle_program = Compile(code);
+        }
+
+        auto hle_program = hle_macros->GetHLEProgram(cache_info.hash);
+        if (!hle_program || Settings::values.disable_macro_hle) {
+            maxwell3d.RefreshParameters();
+            cache_info.lle_program->Execute(parameters, method);
+        } else {
+            cache_info.has_hle_program = true;
+            cache_info.hle_program = std::move(hle_program);
+            cache_info.hle_program->Execute(parameters, method);
+        }
+
+        if (Settings::values.dump_macros) {
+            Dump(cache_info.hash, uploaded_macro_code[method], cache_info.has_hle_program); // macro_code is end() on the mid-method path; this entry exists on both paths
+        }
+    }
+}
+
+std::unique_ptr<CachedMacro> MacroEngine::Compile(const std::vector<u32>& code) { // Picks the backend for one uploaded macro.
+#ifdef ARCHITECTURE_x86_64
+    if (!is_interpreted)
+        return std::make_unique<MacroJITx64Impl>(maxwell3d, code); // x64 JIT; it is compiled inside the constructor
+#endif
+    return std::make_unique<MacroInterpreterImpl>(maxwell3d, code); // fallback on every architecture
+}
+
+std::optional<MacroEngine> GetMacroEngine(Engines::Maxwell3D& maxwell3d) { // Builds the engine; the returned optional is always engaged.
+#ifdef ARCHITECTURE_x86_64
+    return std::make_optional<MacroEngine>(maxwell3d, bool(Settings::values.disable_macro_jit)); // interpreted only when the JIT is disabled in settings
+#else
+    return std::make_optional<MacroEngine>(maxwell3d, true); // the JIT is only built for x86_64: always interpret
+#endif
+}
+
+} // namespace Tegra
diff --git a/src/video_core/macro/macro.h b/src/video_core/macro.h
similarity index 74%
rename from src/video_core/macro/macro.h
rename to src/video_core/macro.h
index 737ced9a45..685097a693 100644
--- a/src/video_core/macro/macro.h
+++ b/src/video_core/macro.h
@@ -1,3 +1,6 @@
+// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
+// SPDX-License-Identifier: GPL-3.0-or-later
+
 // SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project
 // SPDX-License-Identifier: GPL-2.0-or-later
 
@@ -95,24 +98,34 @@ union MethodAddress {
 
 } // namespace Macro
 
-class HLEMacro;
-
 class CachedMacro {
 public:
+    explicit CachedMacro(Engines::Maxwell3D& maxwell3d_) // explicit, like HLEMacro and MacroEngine in this header
+        : maxwell3d{maxwell3d_} // the engine is stored by reference
+    {}
     virtual ~CachedMacro() = default;
-    /**
-     * Executes the macro code with the specified input parameters.
-     *
-     * @param parameters The parameters of the macro
-     * @param method     The method to execute
-     */
+    /// Executes the macro code with the specified input parameters.
+    /// @param parameters The parameters of the macro
+    /// @param method     The method to execute
     virtual void Execute(const std::vector<u32>& parameters, u32 method) = 0;
+    Engines::Maxwell3D& maxwell3d;
+};
+
+class HLEMacro { // Looks up HLE replacements for uploaded macros by code hash.
+public:
+    explicit HLEMacro(Engines::Maxwell3D& maxwell3d_);
+    ~HLEMacro();
+    /// Allocates and returns a cached macro if the hash matches a known function.
+    /// Returns nullptr otherwise.
+    [[nodiscard]] std::unique_ptr<CachedMacro> GetHLEProgram(u64 hash) const;
+private:
+    Engines::Maxwell3D& maxwell3d; // stored by reference
+};
 
 class MacroEngine {
 public:
-    explicit MacroEngine(Engines::Maxwell3D& maxwell3d);
-    virtual ~MacroEngine();
+    explicit MacroEngine(Engines::Maxwell3D& maxwell3d, bool is_interpreted);
+    ~MacroEngine();
 
     // Store the uploaded macro code to compile them when they're called.
     void AddCode(u32 method, u32 data);
@@ -124,7 +137,7 @@ public:
     void Execute(u32 method, const std::vector<u32>& parameters);
 
 protected:
-    virtual std::unique_ptr<CachedMacro> Compile(const std::vector<u32>& code) = 0;
+    std::unique_ptr<CachedMacro> Compile(const std::vector<u32>& code);
 
 private:
     struct CacheInfo {
@@ -136,10 +149,11 @@ private:
 
     std::unordered_map<u32, CacheInfo> macro_cache;
     std::unordered_map<u32, std::vector<u32>> uploaded_macro_code;
-    std::unique_ptr<HLEMacro> hle_macros;
+    std::optional<HLEMacro> hle_macros;
     Engines::Maxwell3D& maxwell3d;
+    bool is_interpreted;
 };
 
-std::unique_ptr<MacroEngine> GetMacroEngine(Engines::Maxwell3D& maxwell3d);
+std::optional<MacroEngine> GetMacroEngine(Engines::Maxwell3D& maxwell3d);
 
 } // namespace Tegra
diff --git a/src/video_core/macro/macro.cpp b/src/video_core/macro/macro.cpp
deleted file mode 100644
index 2ff5e21c5e..0000000000
--- a/src/video_core/macro/macro.cpp
+++ /dev/null
@@ -1,140 +0,0 @@
-// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
-// SPDX-License-Identifier: GPL-3.0-or-later
-
-// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project
-// SPDX-License-Identifier: GPL-2.0-or-later
-
-#include <cstring>
-#include <fstream>
-#include <optional>
-#include <span>
-
-#include "common/container_hash.h"
-
-#include <fstream>
-#include "common/assert.h"
-#include "common/fs/fs.h"
-#include "common/fs/path_util.h"
-#include "common/settings.h"
-#include "video_core/engines/maxwell_3d.h"
-#include "video_core/macro/macro.h"
-#include "video_core/macro/macro_hle.h"
-#include "video_core/macro/macro_interpreter.h"
-
-#ifdef ARCHITECTURE_x86_64
-#include "video_core/macro/macro_jit_x64.h"
-#endif
-
-namespace Tegra {
-
-static void Dump(u64 hash, std::span<const u32> code, bool decompiled = false) {
-    const auto base_dir{Common::FS::GetEdenPath(Common::FS::EdenPath::DumpDir)};
-    const auto macro_dir{base_dir / "macros"};
-    if (!Common::FS::CreateDir(base_dir) || !Common::FS::CreateDir(macro_dir)) {
-        LOG_ERROR(Common_Filesystem, "Failed to create macro dump directories");
-        return;
-    }
-    auto name{macro_dir / fmt::format("{:016x}.macro", hash)};
-
-    if (decompiled) {
-        auto new_name{macro_dir / fmt::format("decompiled_{:016x}.macro", hash)};
-        if (Common::FS::Exists(name)) {
-            (void)Common::FS::RenameFile(name, new_name);
-            return;
-        }
-        name = new_name;
-    }
-
-    std::fstream macro_file(name, std::ios::out | std::ios::binary);
-    if (!macro_file) {
-        LOG_ERROR(Common_Filesystem, "Unable to open or create file at {}",
-                  Common::FS::PathToUTF8String(name));
-        return;
-    }
-    macro_file.write(reinterpret_cast<const char*>(code.data()), code.size_bytes());
-}
-
-MacroEngine::MacroEngine(Engines::Maxwell3D& maxwell3d_)
-    : hle_macros{std::make_unique<Tegra::HLEMacro>(maxwell3d_)}, maxwell3d{maxwell3d_} {}
-
-MacroEngine::~MacroEngine() = default;
-
-void MacroEngine::AddCode(u32 method, u32 data) {
-    uploaded_macro_code[method].push_back(data);
-}
-
-void MacroEngine::ClearCode(u32 method) {
-    macro_cache.erase(method);
-    uploaded_macro_code.erase(method);
-}
-
-void MacroEngine::Execute(u32 method, const std::vector<u32>& parameters) {
-    auto compiled_macro = macro_cache.find(method);
-    if (compiled_macro != macro_cache.end()) {
-        const auto& cache_info = compiled_macro->second;
-        if (cache_info.has_hle_program) {
-            cache_info.hle_program->Execute(parameters, method);
-        } else {
-            maxwell3d.RefreshParameters();
-            cache_info.lle_program->Execute(parameters, method);
-        }
-    } else {
-        // Macro not compiled, check if it's uploaded and if so, compile it
-        std::optional<u32> mid_method;
-        const auto macro_code = uploaded_macro_code.find(method);
-        if (macro_code == uploaded_macro_code.end()) {
-            for (const auto& [method_base, code] : uploaded_macro_code) {
-                if (method >= method_base && (method - method_base) < code.size()) {
-                    mid_method = method_base;
-                    break;
-                }
-            }
-            if (!mid_method.has_value()) {
-                ASSERT_MSG(false, "Macro 0x{0:x} was not uploaded", method);
-                return;
-            }
-        }
-        auto& cache_info = macro_cache[method];
-
-        if (!mid_method.has_value()) {
-            cache_info.lle_program = Compile(macro_code->second);
-            cache_info.hash = Common::HashValue(macro_code->second);
-        } else {
-            const auto& macro_cached = uploaded_macro_code[mid_method.value()];
-            const auto rebased_method = method - mid_method.value();
-            auto& code = uploaded_macro_code[method];
-            code.resize(macro_cached.size() - rebased_method);
-            std::memcpy(code.data(), macro_cached.data() + rebased_method,
-                        code.size() * sizeof(u32));
-            cache_info.hash = Common::HashValue(code);
-            cache_info.lle_program = Compile(code);
-        }
-
-        auto hle_program = hle_macros->GetHLEProgram(cache_info.hash);
-        if (!hle_program || Settings::values.disable_macro_hle) {
-            maxwell3d.RefreshParameters();
-            cache_info.lle_program->Execute(parameters, method);
-        } else {
-            cache_info.has_hle_program = true;
-            cache_info.hle_program = std::move(hle_program);
-            cache_info.hle_program->Execute(parameters, method);
-        }
-
-        if (Settings::values.dump_macros) {
-            Dump(cache_info.hash, macro_code->second, cache_info.has_hle_program);
-        }
-    }
-}
-
-std::unique_ptr<MacroEngine> GetMacroEngine(Engines::Maxwell3D& maxwell3d) {
-    if (Settings::values.disable_macro_jit) {
-        return std::make_unique<MacroInterpreter>(maxwell3d);
-    }
-#ifdef ARCHITECTURE_x86_64
-    return std::make_unique<MacroJITx64>(maxwell3d);
-#else
-    return std::make_unique<MacroInterpreter>(maxwell3d);
-#endif
-}
-
-} // namespace Tegra
diff --git a/src/video_core/macro/macro_hle.cpp b/src/video_core/macro/macro_hle.cpp
deleted file mode 100644
index 2f41e806c2..0000000000
--- a/src/video_core/macro/macro_hle.cpp
+++ /dev/null
@@ -1,606 +0,0 @@
-// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
-// SPDX-License-Identifier: GPL-3.0-or-later
-
-// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
-// SPDX-License-Identifier: GPL-3.0-or-later
-
-#include <array>
-#include <vector>
-#include "common/assert.h"
-#include "common/scope_exit.h"
-#include "video_core/dirty_flags.h"
-#include "video_core/engines/draw_manager.h"
-#include "video_core/engines/maxwell_3d.h"
-#include "video_core/macro/macro.h"
-#include "video_core/macro/macro_hle.h"
-#include "video_core/memory_manager.h"
-#include "video_core/rasterizer_interface.h"
-
-namespace Tegra {
-
-using Maxwell3D = Engines::Maxwell3D;
-
-namespace {
-
-bool IsTopologySafe(Maxwell3D::Regs::PrimitiveTopology topology) {
-    switch (topology) {
-    case Maxwell3D::Regs::PrimitiveTopology::Points:
-    case Maxwell3D::Regs::PrimitiveTopology::Lines:
-    case Maxwell3D::Regs::PrimitiveTopology::LineLoop:
-    case Maxwell3D::Regs::PrimitiveTopology::LineStrip:
-    case Maxwell3D::Regs::PrimitiveTopology::Triangles:
-    case Maxwell3D::Regs::PrimitiveTopology::TriangleStrip:
-    case Maxwell3D::Regs::PrimitiveTopology::TriangleFan:
-    case Maxwell3D::Regs::PrimitiveTopology::LinesAdjacency:
-    case Maxwell3D::Regs::PrimitiveTopology::LineStripAdjacency:
-    case Maxwell3D::Regs::PrimitiveTopology::TrianglesAdjacency:
-    case Maxwell3D::Regs::PrimitiveTopology::TriangleStripAdjacency:
-    case Maxwell3D::Regs::PrimitiveTopology::Patches:
-        return true;
-    case Maxwell3D::Regs::PrimitiveTopology::Quads:
-    case Maxwell3D::Regs::PrimitiveTopology::QuadStrip:
-    case Maxwell3D::Regs::PrimitiveTopology::Polygon:
-    default:
-        return false;
-    }
-}
-
-class HLEMacroImpl : public CachedMacro {
-public:
-    explicit HLEMacroImpl(Maxwell3D& maxwell3d_) : maxwell3d{maxwell3d_} {}
-
-protected:
-    Maxwell3D& maxwell3d;
-};
-
-/*
- * @note: these macros have two versions, a normal and extended version, with the extended version
- * also assigning the base vertex/instance.
- */
-template <bool extended>
-class HLE_DrawArraysIndirect final : public HLEMacroImpl {
-public:
-    explicit HLE_DrawArraysIndirect(Maxwell3D& maxwell3d_) : HLEMacroImpl(maxwell3d_) {}
-
-    void Execute(const std::vector<u32>& parameters, [[maybe_unused]] u32 method) override {
-        auto topology = static_cast<Maxwell3D::Regs::PrimitiveTopology>(parameters[0]);
-        if (!maxwell3d.AnyParametersDirty() || !IsTopologySafe(topology)) {
-            Fallback(parameters);
-            return;
-        }
-
-        auto& params = maxwell3d.draw_manager->GetIndirectParams();
-        params.is_byte_count = false;
-        params.is_indexed = false;
-        params.include_count = false;
-        params.count_start_address = 0;
-        params.indirect_start_address = maxwell3d.GetMacroAddress(1);
-        params.buffer_size = 4 * sizeof(u32);
-        params.max_draw_counts = 1;
-        params.stride = 0;
-
-        if constexpr (extended) {
-            maxwell3d.engine_state = Maxwell3D::EngineHint::OnHLEMacro;
-            maxwell3d.SetHLEReplacementAttributeType(
-                0, 0x640, Maxwell3D::HLEReplacementAttributeType::BaseInstance);
-        }
-
-        maxwell3d.draw_manager->DrawArrayIndirect(topology);
-
-        if constexpr (extended) {
-            maxwell3d.engine_state = Maxwell3D::EngineHint::None;
-            maxwell3d.replace_table.clear();
-        }
-    }
-
-private:
-    void Fallback(const std::vector<u32>& parameters) {
-        SCOPE_EXIT {
-            if (extended) {
-                maxwell3d.engine_state = Maxwell3D::EngineHint::None;
-                maxwell3d.replace_table.clear();
-            }
-        };
-        maxwell3d.RefreshParameters();
-        const u32 instance_count = (maxwell3d.GetRegisterValue(0xD1B) & parameters[2]);
-
-        auto topology = static_cast<Maxwell3D::Regs::PrimitiveTopology>(parameters[0]);
-        const u32 vertex_first = parameters[3];
-        const u32 vertex_count = parameters[1];
-
-        if (!IsTopologySafe(topology) &&
-            static_cast<size_t>(maxwell3d.GetMaxCurrentVertices()) <
-                static_cast<size_t>(vertex_first) + static_cast<size_t>(vertex_count)) {
-            ASSERT_MSG(false, "Faulty draw!");
-            return;
-        }
-
-        const u32 base_instance = parameters[4];
-        if constexpr (extended) {
-            maxwell3d.regs.global_base_instance_index = base_instance;
-            maxwell3d.engine_state = Maxwell3D::EngineHint::OnHLEMacro;
-            maxwell3d.SetHLEReplacementAttributeType(
-                0, 0x640, Maxwell3D::HLEReplacementAttributeType::BaseInstance);
-        }
-
-        maxwell3d.draw_manager->DrawArray(topology, vertex_first, vertex_count, base_instance,
-                                          instance_count);
-
-        if constexpr (extended) {
-            maxwell3d.regs.global_base_instance_index = 0;
-            maxwell3d.engine_state = Maxwell3D::EngineHint::None;
-            maxwell3d.replace_table.clear();
-        }
-    }
-};
-
-/*
- * @note: these macros have two versions, a normal and extended version, with the extended version
- * also assigning the base vertex/instance.
- */
-template <bool extended>
-class HLE_DrawIndexedIndirect final : public HLEMacroImpl {
-public:
-    explicit HLE_DrawIndexedIndirect(Maxwell3D& maxwell3d_) : HLEMacroImpl(maxwell3d_) {}
-
-    void Execute(const std::vector<u32>& parameters, [[maybe_unused]] u32 method) override {
-        auto topology = static_cast<Maxwell3D::Regs::PrimitiveTopology>(parameters[0]);
-        if (!maxwell3d.AnyParametersDirty() || !IsTopologySafe(topology)) {
-            Fallback(parameters);
-            return;
-        }
-
-        const u32 estimate = static_cast<u32>(maxwell3d.EstimateIndexBufferSize());
-        const u32 element_base = parameters[4];
-        const u32 base_instance = parameters[5];
-        maxwell3d.regs.vertex_id_base = element_base;
-        maxwell3d.regs.global_base_vertex_index = element_base;
-        maxwell3d.regs.global_base_instance_index = base_instance;
-        maxwell3d.dirty.flags[VideoCommon::Dirty::IndexBuffer] = true;
-        if constexpr (extended) {
-            maxwell3d.engine_state = Maxwell3D::EngineHint::OnHLEMacro;
-            maxwell3d.SetHLEReplacementAttributeType(
-                0, 0x640, Maxwell3D::HLEReplacementAttributeType::BaseVertex);
-            maxwell3d.SetHLEReplacementAttributeType(
-                0, 0x644, Maxwell3D::HLEReplacementAttributeType::BaseInstance);
-        }
-        auto& params = maxwell3d.draw_manager->GetIndirectParams();
-        params.is_byte_count = false;
-        params.is_indexed = true;
-        params.include_count = false;
-        params.count_start_address = 0;
-        params.indirect_start_address = maxwell3d.GetMacroAddress(1);
-        params.buffer_size = 5 * sizeof(u32);
-        params.max_draw_counts = 1;
-        params.stride = 0;
-        maxwell3d.dirty.flags[VideoCommon::Dirty::IndexBuffer] = true;
-        maxwell3d.draw_manager->DrawIndexedIndirect(topology, 0, estimate);
-        maxwell3d.regs.vertex_id_base = 0x0;
-        maxwell3d.regs.global_base_vertex_index = 0x0;
-        maxwell3d.regs.global_base_instance_index = 0x0;
-        if constexpr (extended) {
-            maxwell3d.engine_state = Maxwell3D::EngineHint::None;
-            maxwell3d.replace_table.clear();
-        }
-    }
-
-private:
-    void Fallback(const std::vector<u32>& parameters) {
-        maxwell3d.RefreshParameters();
-        const u32 instance_count = (maxwell3d.GetRegisterValue(0xD1B) & parameters[2]);
-        const u32 element_base = parameters[4];
-        const u32 base_instance = parameters[5];
-        maxwell3d.regs.vertex_id_base = element_base;
-        maxwell3d.regs.global_base_vertex_index = element_base;
-        maxwell3d.regs.global_base_instance_index = base_instance;
-        maxwell3d.dirty.flags[VideoCommon::Dirty::IndexBuffer] = true;
-        if constexpr (extended) {
-            maxwell3d.engine_state = Maxwell3D::EngineHint::OnHLEMacro;
-            maxwell3d.SetHLEReplacementAttributeType(
-                0, 0x640, Maxwell3D::HLEReplacementAttributeType::BaseVertex);
-            maxwell3d.SetHLEReplacementAttributeType(
-                0, 0x644, Maxwell3D::HLEReplacementAttributeType::BaseInstance);
-        }
-
-        maxwell3d.draw_manager->DrawIndex(
-            static_cast<Tegra::Maxwell3D::Regs::PrimitiveTopology>(parameters[0]), parameters[3],
-            parameters[1], element_base, base_instance, instance_count);
-
-        maxwell3d.regs.vertex_id_base = 0x0;
-        maxwell3d.regs.global_base_vertex_index = 0x0;
-        maxwell3d.regs.global_base_instance_index = 0x0;
-        if constexpr (extended) {
-            maxwell3d.engine_state = Maxwell3D::EngineHint::None;
-            maxwell3d.replace_table.clear();
-        }
-    }
-};
-
-class HLE_MultiLayerClear final : public HLEMacroImpl {
-public:
-    explicit HLE_MultiLayerClear(Maxwell3D& maxwell3d_) : HLEMacroImpl(maxwell3d_) {}
-
-    void Execute(const std::vector<u32>& parameters, [[maybe_unused]] u32 method) override {
-        maxwell3d.RefreshParameters();
-        ASSERT(parameters.size() == 1);
-
-        const Maxwell3D::Regs::ClearSurface clear_params{parameters[0]};
-        const u32 rt_index = clear_params.RT;
-        const u32 num_layers = maxwell3d.regs.rt[rt_index].depth;
-        ASSERT(clear_params.layer == 0);
-
-        maxwell3d.regs.clear_surface.raw = clear_params.raw;
-        maxwell3d.draw_manager->Clear(num_layers);
-    }
-};
-
-class HLE_MultiDrawIndexedIndirectCount final : public HLEMacroImpl {
-public:
-    explicit HLE_MultiDrawIndexedIndirectCount(Maxwell3D& maxwell3d_) : HLEMacroImpl(maxwell3d_) {}
-
-    void Execute(const std::vector<u32>& parameters, [[maybe_unused]] u32 method) override {
-        const auto topology = static_cast<Maxwell3D::Regs::PrimitiveTopology>(parameters[2]);
-        if (!IsTopologySafe(topology)) {
-            Fallback(parameters);
-            return;
-        }
-
-        const u32 start_indirect = parameters[0];
-        const u32 end_indirect = parameters[1];
-        if (start_indirect >= end_indirect) {
-            // Nothing to do.
-            return;
-        }
-
-        const u32 padding = parameters[3]; // padding is in words
-
-        // size of each indirect segment
-        const u32 indirect_words = 5 + padding;
-        const u32 stride = indirect_words * sizeof(u32);
-        const std::size_t draw_count = end_indirect - start_indirect;
-        const u32 estimate = static_cast<u32>(maxwell3d.EstimateIndexBufferSize());
-        maxwell3d.dirty.flags[VideoCommon::Dirty::IndexBuffer] = true;
-        auto& params = maxwell3d.draw_manager->GetIndirectParams();
-        params.is_byte_count = false;
-        params.is_indexed = true;
-        params.include_count = true;
-        params.count_start_address = maxwell3d.GetMacroAddress(4);
-        params.indirect_start_address = maxwell3d.GetMacroAddress(5);
-        params.buffer_size = stride * draw_count;
-        params.max_draw_counts = draw_count;
-        params.stride = stride;
-        maxwell3d.dirty.flags[VideoCommon::Dirty::IndexBuffer] = true;
-        maxwell3d.engine_state = Maxwell3D::EngineHint::OnHLEMacro;
-        maxwell3d.SetHLEReplacementAttributeType(
-            0, 0x640, Maxwell3D::HLEReplacementAttributeType::BaseVertex);
-        maxwell3d.SetHLEReplacementAttributeType(
-            0, 0x644, Maxwell3D::HLEReplacementAttributeType::BaseInstance);
-        maxwell3d.SetHLEReplacementAttributeType(0, 0x648,
-                                                 Maxwell3D::HLEReplacementAttributeType::DrawID);
-        maxwell3d.draw_manager->DrawIndexedIndirect(topology, 0, estimate);
-        maxwell3d.engine_state = Maxwell3D::EngineHint::None;
-        maxwell3d.replace_table.clear();
-    }
-
-private:
-    void Fallback(const std::vector<u32>& parameters) {
-        SCOPE_EXIT {
-            // Clean everything.
-            maxwell3d.regs.vertex_id_base = 0x0;
-            maxwell3d.engine_state = Maxwell3D::EngineHint::None;
-            maxwell3d.replace_table.clear();
-        };
-        maxwell3d.RefreshParameters();
-        const u32 start_indirect = parameters[0];
-        const u32 end_indirect = parameters[1];
-        if (start_indirect >= end_indirect) {
-            // Nothing to do.
-            return;
-        }
-        const auto topology = static_cast<Maxwell3D::Regs::PrimitiveTopology>(parameters[2]);
-        const u32 padding = parameters[3];
-        const std::size_t max_draws = parameters[4];
-
-        const u32 indirect_words = 5 + padding;
-        const std::size_t first_draw = start_indirect;
-        const std::size_t effective_draws = end_indirect - start_indirect;
-        const std::size_t last_draw = start_indirect + (std::min)(effective_draws, max_draws);
-
-        for (std::size_t index = first_draw; index < last_draw; index++) {
-            const std::size_t base = index * indirect_words + 5;
-            const u32 base_vertex = parameters[base + 3];
-            const u32 base_instance = parameters[base + 4];
-            maxwell3d.regs.vertex_id_base = base_vertex;
-            maxwell3d.engine_state = Maxwell3D::EngineHint::OnHLEMacro;
-            maxwell3d.SetHLEReplacementAttributeType(
-                0, 0x640, Maxwell3D::HLEReplacementAttributeType::BaseVertex);
-            maxwell3d.SetHLEReplacementAttributeType(
-                0, 0x644, Maxwell3D::HLEReplacementAttributeType::BaseInstance);
-            maxwell3d.CallMethod(0x8e3, 0x648, true);
-            maxwell3d.CallMethod(0x8e4, static_cast<u32>(index), true);
-            maxwell3d.dirty.flags[VideoCommon::Dirty::IndexBuffer] = true;
-            maxwell3d.draw_manager->DrawIndex(topology, parameters[base + 2], parameters[base],
-                                              base_vertex, base_instance, parameters[base + 1]);
-        }
-    }
-};
-
-class HLE_DrawIndirectByteCount final : public HLEMacroImpl {
-public:
-    explicit HLE_DrawIndirectByteCount(Maxwell3D& maxwell3d_) : HLEMacroImpl(maxwell3d_) {}
-
-    void Execute(const std::vector<u32>& parameters, [[maybe_unused]] u32 method) override {
-        const bool force = maxwell3d.Rasterizer().HasDrawTransformFeedback();
-
-        auto topology = static_cast<Maxwell3D::Regs::PrimitiveTopology>(parameters[0] & 0xFFFFU);
-        if (!force && (!maxwell3d.AnyParametersDirty() || !IsTopologySafe(topology))) {
-            Fallback(parameters);
-            return;
-        }
-        auto& params = maxwell3d.draw_manager->GetIndirectParams();
-        params.is_byte_count = true;
-        params.is_indexed = false;
-        params.include_count = false;
-        params.count_start_address = 0;
-        params.indirect_start_address = maxwell3d.GetMacroAddress(2);
-        params.buffer_size = 4;
-        params.max_draw_counts = 1;
-        params.stride = parameters[1];
-        maxwell3d.regs.draw.begin = parameters[0];
-        maxwell3d.regs.draw_auto_stride = parameters[1];
-        maxwell3d.regs.draw_auto_byte_count = parameters[2];
-
-        maxwell3d.draw_manager->DrawArrayIndirect(topology);
-    }
-
-private:
-    void Fallback(const std::vector<u32>& parameters) {
-        maxwell3d.RefreshParameters();
-
-        maxwell3d.regs.draw.begin = parameters[0];
-        maxwell3d.regs.draw_auto_stride = parameters[1];
-        maxwell3d.regs.draw_auto_byte_count = parameters[2];
-
-        maxwell3d.draw_manager->DrawArray(
-            maxwell3d.regs.draw.topology, 0,
-            maxwell3d.regs.draw_auto_byte_count / maxwell3d.regs.draw_auto_stride, 0, 1);
-    }
-};
-
-class HLE_C713C83D8F63CCF3 final : public HLEMacroImpl {
-public:
-    explicit HLE_C713C83D8F63CCF3(Maxwell3D& maxwell3d_) : HLEMacroImpl(maxwell3d_) {}
-
-    void Execute(const std::vector<u32>& parameters, [[maybe_unused]] u32 method) override {
-        maxwell3d.RefreshParameters();
-        const u32 offset = (parameters[0] & 0x3FFFFFFF) << 2;
-        const u32 address = maxwell3d.regs.shadow_scratch[24];
-        auto& const_buffer = maxwell3d.regs.const_buffer;
-        const_buffer.size = 0x7000;
-        const_buffer.address_high = (address >> 24) & 0xFF;
-        const_buffer.address_low = address << 8;
-        const_buffer.offset = offset;
-    }
-};
-
-class HLE_D7333D26E0A93EDE final : public HLEMacroImpl {
-public:
-    explicit HLE_D7333D26E0A93EDE(Maxwell3D& maxwell3d_) : HLEMacroImpl(maxwell3d_) {}
-
-    void Execute(const std::vector<u32>& parameters, [[maybe_unused]] u32 method) override {
-        maxwell3d.RefreshParameters();
-        const size_t index = parameters[0];
-        const u32 address = maxwell3d.regs.shadow_scratch[42 + index];
-        const u32 size = maxwell3d.regs.shadow_scratch[47 + index];
-        auto& const_buffer = maxwell3d.regs.const_buffer;
-        const_buffer.size = size;
-        const_buffer.address_high = (address >> 24) & 0xFF;
-        const_buffer.address_low = address << 8;
-    }
-};
-
-class HLE_BindShader final : public HLEMacroImpl {
-public:
-    explicit HLE_BindShader(Maxwell3D& maxwell3d_) : HLEMacroImpl(maxwell3d_) {}
-
-    void Execute(const std::vector<u32>& parameters, [[maybe_unused]] u32 method) override {
-        maxwell3d.RefreshParameters();
-        auto& regs = maxwell3d.regs;
-        const u32 index = parameters[0];
-        if ((parameters[1] - regs.shadow_scratch[28 + index]) == 0) {
-            return;
-        }
-
-        regs.pipelines[index & 0xF].offset = parameters[2];
-        maxwell3d.dirty.flags[VideoCommon::Dirty::Shaders] = true;
-        regs.shadow_scratch[28 + index] = parameters[1];
-        regs.shadow_scratch[34 + index] = parameters[2];
-
-        const u32 address = parameters[4];
-        auto& const_buffer = regs.const_buffer;
-        const_buffer.size = 0x10000;
-        const_buffer.address_high = (address >> 24) & 0xFF;
-        const_buffer.address_low = address << 8;
-
-        const size_t bind_group_id = parameters[3] & 0x7F;
-        auto& bind_group = regs.bind_groups[bind_group_id];
-        bind_group.raw_config = 0x11;
-        maxwell3d.ProcessCBBind(bind_group_id);
-    }
-};
-
-class HLE_SetRasterBoundingBox final : public HLEMacroImpl {
-public:
-    explicit HLE_SetRasterBoundingBox(Maxwell3D& maxwell3d_) : HLEMacroImpl(maxwell3d_) {}
-
-    void Execute(const std::vector<u32>& parameters, [[maybe_unused]] u32 method) override {
-        maxwell3d.RefreshParameters();
-        const u32 raster_mode = parameters[0];
-        auto& regs = maxwell3d.regs;
-        const u32 raster_enabled = maxwell3d.regs.conservative_raster_enable;
-        const u32 scratch_data = maxwell3d.regs.shadow_scratch[52];
-        regs.raster_bounding_box.raw = raster_mode & 0xFFFFF00F;
-        regs.raster_bounding_box.pad.Assign(scratch_data & raster_enabled);
-    }
-};
-
-template <size_t base_size>
-class HLE_ClearConstBuffer final : public HLEMacroImpl {
-public:
-    explicit HLE_ClearConstBuffer(Maxwell3D& maxwell3d_) : HLEMacroImpl(maxwell3d_) {}
-
-    void Execute(const std::vector<u32>& parameters, [[maybe_unused]] u32 method) override {
-        maxwell3d.RefreshParameters();
-        static constexpr std::array<u32, base_size> zeroes{};
-        auto& regs = maxwell3d.regs;
-        regs.const_buffer.size = static_cast<u32>(base_size);
-        regs.const_buffer.address_high = parameters[0];
-        regs.const_buffer.address_low = parameters[1];
-        regs.const_buffer.offset = 0;
-        maxwell3d.ProcessCBMultiData(zeroes.data(), parameters[2] * 4);
-    }
-};
-
-class HLE_ClearMemory final : public HLEMacroImpl {
-public:
-    explicit HLE_ClearMemory(Maxwell3D& maxwell3d_) : HLEMacroImpl(maxwell3d_) {}
-
-    void Execute(const std::vector<u32>& parameters, [[maybe_unused]] u32 method) override {
-        maxwell3d.RefreshParameters();
-
-        const u32 needed_memory = parameters[2] / sizeof(u32);
-        if (needed_memory > zero_memory.size()) {
-            zero_memory.resize(needed_memory, 0);
-        }
-        auto& regs = maxwell3d.regs;
-        regs.upload.line_length_in = parameters[2];
-        regs.upload.line_count = 1;
-        regs.upload.dest.address_high = parameters[0];
-        regs.upload.dest.address_low = parameters[1];
-        maxwell3d.CallMethod(static_cast<size_t>(MAXWELL3D_REG_INDEX(launch_dma)), 0x1011, true);
-        maxwell3d.CallMultiMethod(static_cast<size_t>(MAXWELL3D_REG_INDEX(inline_data)),
-                                  zero_memory.data(), needed_memory, needed_memory);
-    }
-
-private:
-    std::vector<u32> zero_memory;
-};
-
-class HLE_TransformFeedbackSetup final : public HLEMacroImpl {
-public:
-    explicit HLE_TransformFeedbackSetup(Maxwell3D& maxwell3d_) : HLEMacroImpl(maxwell3d_) {}
-
-    void Execute(const std::vector<u32>& parameters, [[maybe_unused]] u32 method) override {
-        maxwell3d.RefreshParameters();
-
-        auto& regs = maxwell3d.regs;
-        regs.transform_feedback_enabled = 1;
-        regs.transform_feedback.buffers[0].start_offset = 0;
-        regs.transform_feedback.buffers[1].start_offset = 0;
-        regs.transform_feedback.buffers[2].start_offset = 0;
-        regs.transform_feedback.buffers[3].start_offset = 0;
-
-        regs.upload.line_length_in = 4;
-        regs.upload.line_count = 1;
-        regs.upload.dest.address_high = parameters[0];
-        regs.upload.dest.address_low = parameters[1];
-        maxwell3d.CallMethod(static_cast<size_t>(MAXWELL3D_REG_INDEX(launch_dma)), 0x1011, true);
-        maxwell3d.CallMethod(static_cast<size_t>(MAXWELL3D_REG_INDEX(inline_data)),
-                             regs.transform_feedback.controls[0].stride, true);
-
-        maxwell3d.Rasterizer().RegisterTransformFeedback(regs.upload.dest.Address());
-    }
-};
-
-} // Anonymous namespace
-
-HLEMacro::HLEMacro(Maxwell3D& maxwell3d_) : maxwell3d{maxwell3d_} {
-    builders.emplace(0x0D61FC9FAAC9FCADULL,
-                     std::function<std::unique_ptr<CachedMacro>(Maxwell3D&)>(
-                         [](Maxwell3D& maxwell3d__) -> std::unique_ptr<CachedMacro> {
-                             return std::make_unique<HLE_DrawArraysIndirect<false>>(maxwell3d__);
-                         }));
-    builders.emplace(0x8A4D173EB99A8603ULL,
-                     std::function<std::unique_ptr<CachedMacro>(Maxwell3D&)>(
-                         [](Maxwell3D& maxwell3d__) -> std::unique_ptr<CachedMacro> {
-                             return std::make_unique<HLE_DrawArraysIndirect<true>>(maxwell3d__);
-                         }));
-    builders.emplace(0x771BB18C62444DA0ULL,
-                     std::function<std::unique_ptr<CachedMacro>(Maxwell3D&)>(
-                         [](Maxwell3D& maxwell3d__) -> std::unique_ptr<CachedMacro> {
-                             return std::make_unique<HLE_DrawIndexedIndirect<false>>(maxwell3d__);
-                         }));
-    builders.emplace(0x0217920100488FF7ULL,
-                     std::function<std::unique_ptr<CachedMacro>(Maxwell3D&)>(
-                         [](Maxwell3D& maxwell3d__) -> std::unique_ptr<CachedMacro> {
-                             return std::make_unique<HLE_DrawIndexedIndirect<true>>(maxwell3d__);
-                         }));
-    builders.emplace(0x3F5E74B9C9A50164ULL,
-                     std::function<std::unique_ptr<CachedMacro>(Maxwell3D&)>(
-                         [](Maxwell3D& maxwell3d__) -> std::unique_ptr<CachedMacro> {
-                             return std::make_unique<HLE_MultiDrawIndexedIndirectCount>(
-                                 maxwell3d__);
-                         }));
-    builders.emplace(0xEAD26C3E2109B06BULL,
-                     std::function<std::unique_ptr<CachedMacro>(Maxwell3D&)>(
-                         [](Maxwell3D& maxwell3d__) -> std::unique_ptr<CachedMacro> {
-                             return std::make_unique<HLE_MultiLayerClear>(maxwell3d__);
-                         }));
-    builders.emplace(0xC713C83D8F63CCF3ULL,
-                     std::function<std::unique_ptr<CachedMacro>(Maxwell3D&)>(
-                         [](Maxwell3D& maxwell3d__) -> std::unique_ptr<CachedMacro> {
-                             return std::make_unique<HLE_C713C83D8F63CCF3>(maxwell3d__);
-                         }));
-    builders.emplace(0xD7333D26E0A93EDEULL,
-                     std::function<std::unique_ptr<CachedMacro>(Maxwell3D&)>(
-                         [](Maxwell3D& maxwell3d__) -> std::unique_ptr<CachedMacro> {
-                             return std::make_unique<HLE_D7333D26E0A93EDE>(maxwell3d__);
-                         }));
-    builders.emplace(0xEB29B2A09AA06D38ULL,
-                     std::function<std::unique_ptr<CachedMacro>(Maxwell3D&)>(
-                         [](Maxwell3D& maxwell3d__) -> std::unique_ptr<CachedMacro> {
-                             return std::make_unique<HLE_BindShader>(maxwell3d__);
-                         }));
-    builders.emplace(0xDB1341DBEB4C8AF7ULL,
-                     std::function<std::unique_ptr<CachedMacro>(Maxwell3D&)>(
-                         [](Maxwell3D& maxwell3d__) -> std::unique_ptr<CachedMacro> {
-                             return std::make_unique<HLE_SetRasterBoundingBox>(maxwell3d__);
-                         }));
-    builders.emplace(0x6C97861D891EDf7EULL,
-                     std::function<std::unique_ptr<CachedMacro>(Maxwell3D&)>(
-                         [](Maxwell3D& maxwell3d__) -> std::unique_ptr<CachedMacro> {
-                             return std::make_unique<HLE_ClearConstBuffer<0x5F00>>(maxwell3d__);
-                         }));
-    builders.emplace(0xD246FDDF3A6173D7ULL,
-                     std::function<std::unique_ptr<CachedMacro>(Maxwell3D&)>(
-                         [](Maxwell3D& maxwell3d__) -> std::unique_ptr<CachedMacro> {
-                             return std::make_unique<HLE_ClearConstBuffer<0x7000>>(maxwell3d__);
-                         }));
-    builders.emplace(0xEE4D0004BEC8ECF4ULL,
-                     std::function<std::unique_ptr<CachedMacro>(Maxwell3D&)>(
-                         [](Maxwell3D& maxwell3d__) -> std::unique_ptr<CachedMacro> {
-                             return std::make_unique<HLE_ClearMemory>(maxwell3d__);
-                         }));
-    builders.emplace(0xFC0CF27F5FFAA661ULL,
-                     std::function<std::unique_ptr<CachedMacro>(Maxwell3D&)>(
-                         [](Maxwell3D& maxwell3d__) -> std::unique_ptr<CachedMacro> {
-                             return std::make_unique<HLE_TransformFeedbackSetup>(maxwell3d__);
-                         }));
-    builders.emplace(0xB5F74EDB717278ECULL,
-                     std::function<std::unique_ptr<CachedMacro>(Maxwell3D&)>(
-                         [](Maxwell3D& maxwell3d__) -> std::unique_ptr<CachedMacro> {
-                             return std::make_unique<HLE_DrawIndirectByteCount>(maxwell3d__);
-                         }));
-}
-
-HLEMacro::~HLEMacro() = default;
-
-std::unique_ptr<CachedMacro> HLEMacro::GetHLEProgram(u64 hash) const {
-    const auto it = builders.find(hash);
-    if (it == builders.end()) {
-        return nullptr;
-    }
-    return it->second(maxwell3d);
-}
-
-} // namespace Tegra
diff --git a/src/video_core/macro/macro_hle.h b/src/video_core/macro/macro_hle.h
deleted file mode 100644
index 33f92fab16..0000000000
--- a/src/video_core/macro/macro_hle.h
+++ /dev/null
@@ -1,33 +0,0 @@
-// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project
-// SPDX-License-Identifier: GPL-2.0-or-later
-
-#pragma once
-
-#include <functional>
-#include <memory>
-#include <unordered_map>
-
-#include "common/common_types.h"
-
-namespace Tegra {
-
-namespace Engines {
-class Maxwell3D;
-}
-
-class HLEMacro {
-public:
-    explicit HLEMacro(Engines::Maxwell3D& maxwell3d_);
-    ~HLEMacro();
-
-    // Allocates and returns a cached macro if the hash matches a known function.
-    // Returns nullptr otherwise.
-    [[nodiscard]] std::unique_ptr<CachedMacro> GetHLEProgram(u64 hash) const;
-
-private:
-    Engines::Maxwell3D& maxwell3d;
-    std::unordered_map<u64, std::function<std::unique_ptr<CachedMacro>(Engines::Maxwell3D&)>>
-        builders;
-};
-
-} // namespace Tegra
diff --git a/src/video_core/macro/macro_interpreter.cpp b/src/video_core/macro/macro_interpreter.cpp
deleted file mode 100644
index f9befce676..0000000000
--- a/src/video_core/macro/macro_interpreter.cpp
+++ /dev/null
@@ -1,362 +0,0 @@
-// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
-// SPDX-License-Identifier: GPL-3.0-or-later
-
-// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project
-// SPDX-License-Identifier: GPL-2.0-or-later
-
-#include <array>
-#include <optional>
-
-#include "common/assert.h"
-#include "common/logging/log.h"
-#include "video_core/engines/maxwell_3d.h"
-#include "video_core/macro/macro_interpreter.h"
-
-namespace Tegra {
-namespace {
-class MacroInterpreterImpl final : public CachedMacro {
-public:
-    explicit MacroInterpreterImpl(Engines::Maxwell3D& maxwell3d_, const std::vector<u32>& code_)
-        : maxwell3d{maxwell3d_}, code{code_} {}
-
-    void Execute(const std::vector<u32>& params, u32 method) override;
-
-private:
-    /// Resets the execution engine state, zeroing registers, etc.
-    void Reset();
-
-    /**
-     * Executes a single macro instruction located at the current program counter. Returns whether
-     * the interpreter should keep running.
-     *
-     * @param is_delay_slot Whether the current step is being executed due to a delay slot in a
-     *                      previous instruction.
-     */
-    bool Step(bool is_delay_slot);
-
-    /// Calculates the result of an ALU operation. src_a OP src_b;
-    u32 GetALUResult(Macro::ALUOperation operation, u32 src_a, u32 src_b);
-
-    /// Performs the result operation on the input result and stores it in the specified register
-    /// (if necessary).
-    void ProcessResult(Macro::ResultOperation operation, u32 reg, u32 result);
-
-    /// Evaluates the branch condition and returns whether the branch should be taken or not.
-    bool EvaluateBranchCondition(Macro::BranchCondition cond, u32 value) const;
-
-    /// Reads an opcode at the current program counter location.
-    Macro::Opcode GetOpcode() const;
-
-    /// Returns the specified register's value. Register 0 is hardcoded to always return 0.
-    u32 GetRegister(u32 register_id) const;
-
-    /// Sets the register to the input value.
-    void SetRegister(u32 register_id, u32 value);
-
-    /// Sets the method address to use for the next Send instruction.
-    void SetMethodAddress(u32 address);
-
-    /// Calls a GPU Engine method with the input parameter.
-    void Send(u32 value);
-
-    /// Reads a GPU register located at the method address.
-    u32 Read(u32 method) const;
-
-    /// Returns the next parameter in the parameter queue.
-    u32 FetchParameter();
-
-    Engines::Maxwell3D& maxwell3d;
-
-    /// Current program counter
-    u32 pc{};
-    /// Program counter to execute at after the delay slot is executed.
-    std::optional<u32> delayed_pc;
-
-    /// General purpose macro registers.
-    std::array<u32, Macro::NUM_MACRO_REGISTERS> registers = {};
-
-    /// Method address to use for the next Send instruction.
-    Macro::MethodAddress method_address = {};
-
-    /// Input parameters of the current macro.
-    std::unique_ptr<u32[]> parameters;
-    std::size_t num_parameters = 0;
-    std::size_t parameters_capacity = 0;
-    /// Index of the next parameter that will be fetched by the 'parm' instruction.
-    u32 next_parameter_index = 0;
-
-    bool carry_flag = false;
-    const std::vector<u32>& code;
-};
-
-void MacroInterpreterImpl::Execute(const std::vector<u32>& params, u32 method) {
-    Reset();
-
-    registers[1] = params[0];
-    num_parameters = params.size();
-
-    if (num_parameters > parameters_capacity) {
-        parameters_capacity = num_parameters;
-        parameters = std::make_unique<u32[]>(num_parameters);
-    }
-    std::memcpy(parameters.get(), params.data(), num_parameters * sizeof(u32));
-
-    // Execute the code until we hit an exit condition.
-    bool keep_executing = true;
-    while (keep_executing) {
-        keep_executing = Step(false);
-    }
-
-    // Assert the the macro used all the input parameters
-    ASSERT(next_parameter_index == num_parameters);
-}
-
-void MacroInterpreterImpl::Reset() {
-    registers = {};
-    pc = 0;
-    delayed_pc = {};
-    method_address.raw = 0;
-    num_parameters = 0;
-    // The next parameter index starts at 1, because $r1 already has the value of the first
-    // parameter.
-    next_parameter_index = 1;
-    carry_flag = false;
-}
-
-bool MacroInterpreterImpl::Step(bool is_delay_slot) {
-    u32 base_address = pc;
-
-    Macro::Opcode opcode = GetOpcode();
-    pc += 4;
-
-    // Update the program counter if we were delayed
-    if (delayed_pc) {
-        ASSERT(is_delay_slot);
-        pc = *delayed_pc;
-        delayed_pc = {};
-    }
-
-    switch (opcode.operation) {
-    case Macro::Operation::ALU: {
-        u32 result = GetALUResult(opcode.alu_operation, GetRegister(opcode.src_a),
-                                  GetRegister(opcode.src_b));
-        ProcessResult(opcode.result_operation, opcode.dst, result);
-        break;
-    }
-    case Macro::Operation::AddImmediate: {
-        ProcessResult(opcode.result_operation, opcode.dst,
-                      GetRegister(opcode.src_a) + opcode.immediate);
-        break;
-    }
-    case Macro::Operation::ExtractInsert: {
-        u32 dst = GetRegister(opcode.src_a);
-        u32 src = GetRegister(opcode.src_b);
-
-        src = (src >> opcode.bf_src_bit) & opcode.GetBitfieldMask();
-        dst &= ~(opcode.GetBitfieldMask() << opcode.bf_dst_bit);
-        dst |= src << opcode.bf_dst_bit;
-        ProcessResult(opcode.result_operation, opcode.dst, dst);
-        break;
-    }
-    case Macro::Operation::ExtractShiftLeftImmediate: {
-        u32 dst = GetRegister(opcode.src_a);
-        u32 src = GetRegister(opcode.src_b);
-
-        u32 result = ((src >> dst) & opcode.GetBitfieldMask()) << opcode.bf_dst_bit;
-
-        ProcessResult(opcode.result_operation, opcode.dst, result);
-        break;
-    }
-    case Macro::Operation::ExtractShiftLeftRegister: {
-        u32 dst = GetRegister(opcode.src_a);
-        u32 src = GetRegister(opcode.src_b);
-
-        u32 result = ((src >> opcode.bf_src_bit) & opcode.GetBitfieldMask()) << dst;
-
-        ProcessResult(opcode.result_operation, opcode.dst, result);
-        break;
-    }
-    case Macro::Operation::Read: {
-        u32 result = Read(GetRegister(opcode.src_a) + opcode.immediate);
-        ProcessResult(opcode.result_operation, opcode.dst, result);
-        break;
-    }
-    case Macro::Operation::Branch: {
-        ASSERT_MSG(!is_delay_slot, "Executing a branch in a delay slot is not valid");
-        u32 value = GetRegister(opcode.src_a);
-        bool taken = EvaluateBranchCondition(opcode.branch_condition, value);
-        if (taken) {
-            // Ignore the delay slot if the branch has the annul bit.
-            if (opcode.branch_annul) {
-                pc = base_address + opcode.GetBranchTarget();
-                return true;
-            }
-
-            delayed_pc = base_address + opcode.GetBranchTarget();
-            // Execute one more instruction due to the delay slot.
-            return Step(true);
-        }
-        break;
-    }
-    default:
-        UNIMPLEMENTED_MSG("Unimplemented macro operation {}", opcode.operation.Value());
-        break;
-    }
-
-    // An instruction with the Exit flag will not actually
-    // cause an exit if it's executed inside a delay slot.
-    if (opcode.is_exit && !is_delay_slot) {
-        // Exit has a delay slot, execute the next instruction
-        Step(true);
-        return false;
-    }
-
-    return true;
-}
-
-u32 MacroInterpreterImpl::GetALUResult(Macro::ALUOperation operation, u32 src_a, u32 src_b) {
-    switch (operation) {
-    case Macro::ALUOperation::Add: {
-        const u64 result{static_cast<u64>(src_a) + src_b};
-        carry_flag = result > 0xffffffff;
-        return static_cast<u32>(result);
-    }
-    case Macro::ALUOperation::AddWithCarry: {
-        const u64 result{static_cast<u64>(src_a) + src_b + (carry_flag ? 1ULL : 0ULL)};
-        carry_flag = result > 0xffffffff;
-        return static_cast<u32>(result);
-    }
-    case Macro::ALUOperation::Subtract: {
-        const u64 result{static_cast<u64>(src_a) - src_b};
-        carry_flag = result < 0x100000000;
-        return static_cast<u32>(result);
-    }
-    case Macro::ALUOperation::SubtractWithBorrow: {
-        const u64 result{static_cast<u64>(src_a) - src_b - (carry_flag ? 0ULL : 1ULL)};
-        carry_flag = result < 0x100000000;
-        return static_cast<u32>(result);
-    }
-    case Macro::ALUOperation::Xor:
-        return src_a ^ src_b;
-    case Macro::ALUOperation::Or:
-        return src_a | src_b;
-    case Macro::ALUOperation::And:
-        return src_a & src_b;
-    case Macro::ALUOperation::AndNot:
-        return src_a & ~src_b;
-    case Macro::ALUOperation::Nand:
-        return ~(src_a & src_b);
-
-    default:
-        UNIMPLEMENTED_MSG("Unimplemented ALU operation {}", operation);
-        return 0;
-    }
-}
-
-void MacroInterpreterImpl::ProcessResult(Macro::ResultOperation operation, u32 reg, u32 result) {
-    switch (operation) {
-    case Macro::ResultOperation::IgnoreAndFetch:
-        // Fetch parameter and ignore result.
-        SetRegister(reg, FetchParameter());
-        break;
-    case Macro::ResultOperation::Move:
-        // Move result.
-        SetRegister(reg, result);
-        break;
-    case Macro::ResultOperation::MoveAndSetMethod:
-        // Move result and use as Method Address.
-        SetRegister(reg, result);
-        SetMethodAddress(result);
-        break;
-    case Macro::ResultOperation::FetchAndSend:
-        // Fetch parameter and send result.
-        SetRegister(reg, FetchParameter());
-        Send(result);
-        break;
-    case Macro::ResultOperation::MoveAndSend:
-        // Move and send result.
-        SetRegister(reg, result);
-        Send(result);
-        break;
-    case Macro::ResultOperation::FetchAndSetMethod:
-        // Fetch parameter and use result as Method Address.
-        SetRegister(reg, FetchParameter());
-        SetMethodAddress(result);
-        break;
-    case Macro::ResultOperation::MoveAndSetMethodFetchAndSend:
-        // Move result and use as Method Address, then fetch and send parameter.
-        SetRegister(reg, result);
-        SetMethodAddress(result);
-        Send(FetchParameter());
-        break;
-    case Macro::ResultOperation::MoveAndSetMethodSend:
-        // Move result and use as Method Address, then send bits 12:17 of result.
-        SetRegister(reg, result);
-        SetMethodAddress(result);
-        Send((result >> 12) & 0b111111);
-        break;
-    default:
-        UNIMPLEMENTED_MSG("Unimplemented result operation {}", operation);
-        break;
-    }
-}
-
-bool MacroInterpreterImpl::EvaluateBranchCondition(Macro::BranchCondition cond, u32 value) const {
-    switch (cond) {
-    case Macro::BranchCondition::Zero:
-        return value == 0;
-    case Macro::BranchCondition::NotZero:
-        return value != 0;
-    }
-    UNREACHABLE();
-}
-
-Macro::Opcode MacroInterpreterImpl::GetOpcode() const {
-    ASSERT((pc % sizeof(u32)) == 0);
-    ASSERT(pc < code.size() * sizeof(u32));
-    return {code[pc / sizeof(u32)]};
-}
-
-u32 MacroInterpreterImpl::GetRegister(u32 register_id) const {
-    return registers.at(register_id);
-}
-
-void MacroInterpreterImpl::SetRegister(u32 register_id, u32 value) {
-    // Register 0 is hardwired as the zero register.
-    // Ensure no writes to it actually occur.
-    if (register_id == 0) {
-        return;
-    }
-
-    registers.at(register_id) = value;
-}
-
-void MacroInterpreterImpl::SetMethodAddress(u32 address) {
-    method_address.raw = address;
-}
-
-void MacroInterpreterImpl::Send(u32 value) {
-    maxwell3d.CallMethod(method_address.address, value, true);
-    // Increment the method address by the method increment.
-    method_address.address.Assign(method_address.address.Value() +
-                                  method_address.increment.Value());
-}
-
-u32 MacroInterpreterImpl::Read(u32 method) const {
-    return maxwell3d.GetRegisterValue(method);
-}
-
-u32 MacroInterpreterImpl::FetchParameter() {
-    ASSERT(next_parameter_index < num_parameters);
-    return parameters[next_parameter_index++];
-}
-} // Anonymous namespace
-
-MacroInterpreter::MacroInterpreter(Engines::Maxwell3D& maxwell3d_)
-    : MacroEngine{maxwell3d_}, maxwell3d{maxwell3d_} {}
-
-std::unique_ptr<CachedMacro> MacroInterpreter::Compile(const std::vector<u32>& code) {
-    return std::make_unique<MacroInterpreterImpl>(maxwell3d, code);
-}
-
-} // namespace Tegra
diff --git a/src/video_core/macro/macro_interpreter.h b/src/video_core/macro/macro_interpreter.h
deleted file mode 100644
index f5eeb0b76f..0000000000
--- a/src/video_core/macro/macro_interpreter.h
+++ /dev/null
@@ -1,27 +0,0 @@
-// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project
-// SPDX-License-Identifier: GPL-2.0-or-later
-
-#pragma once
-
-#include <vector>
-
-#include "common/common_types.h"
-#include "video_core/macro/macro.h"
-
-namespace Tegra {
-namespace Engines {
-class Maxwell3D;
-}
-
-class MacroInterpreter final : public MacroEngine {
-public:
-    explicit MacroInterpreter(Engines::Maxwell3D& maxwell3d_);
-
-protected:
-    std::unique_ptr<CachedMacro> Compile(const std::vector<u32>& code) override;
-
-private:
-    Engines::Maxwell3D& maxwell3d;
-};
-
-} // namespace Tegra
diff --git a/src/video_core/macro/macro_jit_x64.cpp b/src/video_core/macro/macro_jit_x64.cpp
deleted file mode 100644
index 65935f6c62..0000000000
--- a/src/video_core/macro/macro_jit_x64.cpp
+++ /dev/null
@@ -1,678 +0,0 @@
-// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
-// SPDX-License-Identifier: GPL-3.0-or-later
-
-// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project
-// SPDX-License-Identifier: GPL-2.0-or-later
-
-#include <array>
-#include <bitset>
-#include <optional>
-
-#include <xbyak/xbyak.h>
-
-#include "common/assert.h"
-#include "common/bit_field.h"
-#include "common/logging/log.h"
-#include "common/x64/xbyak_abi.h"
-#include "common/x64/xbyak_util.h"
-#include "video_core/engines/maxwell_3d.h"
-#include "video_core/macro/macro_interpreter.h"
-#include "video_core/macro/macro_jit_x64.h"
-
-namespace Tegra {
-namespace {
-constexpr Xbyak::Reg64 STATE = Xbyak::util::rbx;
-constexpr Xbyak::Reg32 RESULT = Xbyak::util::r10d;
-constexpr Xbyak::Reg64 MAX_PARAMETER = Xbyak::util::r11;
-constexpr Xbyak::Reg64 PARAMETERS = Xbyak::util::r12;
-constexpr Xbyak::Reg32 METHOD_ADDRESS = Xbyak::util::r14d;
-constexpr Xbyak::Reg64 BRANCH_HOLDER = Xbyak::util::r15;
-
-constexpr std::bitset<32> PERSISTENT_REGISTERS = Common::X64::BuildRegSet({
-    STATE,
-    RESULT,
-    MAX_PARAMETER,
-    PARAMETERS,
-    METHOD_ADDRESS,
-    BRANCH_HOLDER,
-});
-
-// Arbitrarily chosen based on current booting games.
-constexpr size_t MAX_CODE_SIZE = 0x10000;
-
-std::bitset<32> PersistentCallerSavedRegs() {
-    return PERSISTENT_REGISTERS & Common::X64::ABI_ALL_CALLER_SAVED;
-}
-
-/// @brief Must enforce W^X constraints, as we yet don't havea  global "NO_EXECUTE" support flag
-/// the speed loss is minimal, and in fact may be negligible, however for your peace of mind
-/// I simply included known OSes whom had W^X issues
-#if defined(__OpenBSD__) || defined(__NetBSD__) || defined(__DragonFly__)
-static const auto default_cg_mode = Xbyak::DontSetProtectRWE;
-#else
-static const auto default_cg_mode = nullptr; //Allow RWE
-#endif
-
-class MacroJITx64Impl final : public Xbyak::CodeGenerator, public CachedMacro {
-public:
-    explicit MacroJITx64Impl(Engines::Maxwell3D& maxwell3d_, const std::vector<u32>& code_)
-        : Xbyak::CodeGenerator(MAX_CODE_SIZE, default_cg_mode)
-        , code{code_}, maxwell3d{maxwell3d_} {
-        Compile();
-    }
-
-    void Execute(const std::vector<u32>& parameters, u32 method) override;
-
-    void Compile_ALU(Macro::Opcode opcode);
-    void Compile_AddImmediate(Macro::Opcode opcode);
-    void Compile_ExtractInsert(Macro::Opcode opcode);
-    void Compile_ExtractShiftLeftImmediate(Macro::Opcode opcode);
-    void Compile_ExtractShiftLeftRegister(Macro::Opcode opcode);
-    void Compile_Read(Macro::Opcode opcode);
-    void Compile_Branch(Macro::Opcode opcode);
-
-private:
-    void Optimizer_ScanFlags();
-
-    void Compile();
-    bool Compile_NextInstruction();
-
-    Xbyak::Reg32 Compile_FetchParameter();
-    Xbyak::Reg32 Compile_GetRegister(u32 index, Xbyak::Reg32 dst);
-
-    void Compile_ProcessResult(Macro::ResultOperation operation, u32 reg);
-    void Compile_Send(Xbyak::Reg32 value);
-
-    Macro::Opcode GetOpCode() const;
-
-    struct JITState {
-        Engines::Maxwell3D* maxwell3d{};
-        std::array<u32, Macro::NUM_MACRO_REGISTERS> registers{};
-        u32 carry_flag{};
-    };
-    static_assert(offsetof(JITState, maxwell3d) == 0, "Maxwell3D is not at 0x0");
-    using ProgramType = void (*)(JITState*, const u32*, const u32*);
-
-    struct OptimizerState {
-        bool can_skip_carry{};
-        bool has_delayed_pc{};
-        bool zero_reg_skip{};
-        bool skip_dummy_addimmediate{};
-        bool optimize_for_method_move{};
-        bool enable_asserts{};
-    };
-    OptimizerState optimizer{};
-
-    std::optional<Macro::Opcode> next_opcode{};
-    ProgramType program{nullptr};
-
-    std::array<Xbyak::Label, MAX_CODE_SIZE> labels;
-    std::array<Xbyak::Label, MAX_CODE_SIZE> delay_skip;
-    Xbyak::Label end_of_code{};
-
-    bool is_delay_slot{};
-    u32 pc{};
-
-    const std::vector<u32>& code;
-    Engines::Maxwell3D& maxwell3d;
-};
-
-void MacroJITx64Impl::Execute(const std::vector<u32>& parameters, u32 method) {
-    ASSERT_OR_EXECUTE(program != nullptr, { return; });
-    JITState state{};
-    state.maxwell3d = &maxwell3d;
-    state.registers = {};
-    program(&state, parameters.data(), parameters.data() + parameters.size());
-}
-
-void MacroJITx64Impl::Compile_ALU(Macro::Opcode opcode) {
-    const bool is_a_zero = opcode.src_a == 0;
-    const bool is_b_zero = opcode.src_b == 0;
-    const bool valid_operation = !is_a_zero && !is_b_zero;
-    [[maybe_unused]] const bool is_move_operation = !is_a_zero && is_b_zero;
-    const bool has_zero_register = is_a_zero || is_b_zero;
-    const bool no_zero_reg_skip = opcode.alu_operation == Macro::ALUOperation::AddWithCarry ||
-                                  opcode.alu_operation == Macro::ALUOperation::SubtractWithBorrow;
-
-    Xbyak::Reg32 src_a;
-    Xbyak::Reg32 src_b;
-
-    if (!optimizer.zero_reg_skip || no_zero_reg_skip) {
-        src_a = Compile_GetRegister(opcode.src_a, RESULT);
-        src_b = Compile_GetRegister(opcode.src_b, eax);
-    } else {
-        if (!is_a_zero) {
-            src_a = Compile_GetRegister(opcode.src_a, RESULT);
-        }
-        if (!is_b_zero) {
-            src_b = Compile_GetRegister(opcode.src_b, eax);
-        }
-    }
-
-    bool has_emitted = false;
-
-    switch (opcode.alu_operation) {
-    case Macro::ALUOperation::Add:
-        if (optimizer.zero_reg_skip) {
-            if (valid_operation) {
-                add(src_a, src_b);
-            }
-        } else {
-            add(src_a, src_b);
-        }
-
-        if (!optimizer.can_skip_carry) {
-            setc(byte[STATE + offsetof(JITState, carry_flag)]);
-        }
-        break;
-    case Macro::ALUOperation::AddWithCarry:
-        bt(dword[STATE + offsetof(JITState, carry_flag)], 0);
-        adc(src_a, src_b);
-        setc(byte[STATE + offsetof(JITState, carry_flag)]);
-        break;
-    case Macro::ALUOperation::Subtract:
-        if (optimizer.zero_reg_skip) {
-            if (valid_operation) {
-                sub(src_a, src_b);
-                has_emitted = true;
-            }
-        } else {
-            sub(src_a, src_b);
-            has_emitted = true;
-        }
-        if (!optimizer.can_skip_carry && has_emitted) {
-            setc(byte[STATE + offsetof(JITState, carry_flag)]);
-        }
-        break;
-    case Macro::ALUOperation::SubtractWithBorrow:
-        bt(dword[STATE + offsetof(JITState, carry_flag)], 0);
-        sbb(src_a, src_b);
-        setc(byte[STATE + offsetof(JITState, carry_flag)]);
-        break;
-    case Macro::ALUOperation::Xor:
-        if (optimizer.zero_reg_skip) {
-            if (valid_operation) {
-                xor_(src_a, src_b);
-            }
-        } else {
-            xor_(src_a, src_b);
-        }
-        break;
-    case Macro::ALUOperation::Or:
-        if (optimizer.zero_reg_skip) {
-            if (valid_operation) {
-                or_(src_a, src_b);
-            }
-        } else {
-            or_(src_a, src_b);
-        }
-        break;
-    case Macro::ALUOperation::And:
-        if (optimizer.zero_reg_skip) {
-            if (!has_zero_register) {
-                and_(src_a, src_b);
-            }
-        } else {
-            and_(src_a, src_b);
-        }
-        break;
-    case Macro::ALUOperation::AndNot:
-        if (optimizer.zero_reg_skip) {
-            if (!is_a_zero) {
-                not_(src_b);
-                and_(src_a, src_b);
-            }
-        } else {
-            not_(src_b);
-            and_(src_a, src_b);
-        }
-        break;
-    case Macro::ALUOperation::Nand:
-        if (optimizer.zero_reg_skip) {
-            if (!is_a_zero) {
-                and_(src_a, src_b);
-                not_(src_a);
-            }
-        } else {
-            and_(src_a, src_b);
-            not_(src_a);
-        }
-        break;
-    default:
-        UNIMPLEMENTED_MSG("Unimplemented ALU operation {}", opcode.alu_operation.Value());
-        break;
-    }
-    Compile_ProcessResult(opcode.result_operation, opcode.dst);
-}
-
-void MacroJITx64Impl::Compile_AddImmediate(Macro::Opcode opcode) {
-    if (optimizer.skip_dummy_addimmediate) {
-        // Games tend to use this as an exit instruction placeholder. It's to encode an instruction
-        // without doing anything. In our case we can just not emit anything.
-        if (opcode.result_operation == Macro::ResultOperation::Move && opcode.dst == 0) {
-            return;
-        }
-    }
-    // Check for redundant moves
-    if (optimizer.optimize_for_method_move &&
-        opcode.result_operation == Macro::ResultOperation::MoveAndSetMethod) {
-        if (next_opcode.has_value()) {
-            const auto next = *next_opcode;
-            if (next.result_operation == Macro::ResultOperation::MoveAndSetMethod &&
-                opcode.dst == next.dst) {
-                return;
-            }
-        }
-    }
-    if (optimizer.zero_reg_skip && opcode.src_a == 0) {
-        if (opcode.immediate == 0) {
-            xor_(RESULT, RESULT);
-        } else {
-            mov(RESULT, opcode.immediate);
-        }
-    } else {
-        auto result = Compile_GetRegister(opcode.src_a, RESULT);
-        if (opcode.immediate > 2) {
-            add(result, opcode.immediate);
-        } else if (opcode.immediate == 1) {
-            inc(result);
-        } else if (opcode.immediate < 0) {
-            sub(result, opcode.immediate * -1);
-        }
-    }
-    Compile_ProcessResult(opcode.result_operation, opcode.dst);
-}
-
-void MacroJITx64Impl::Compile_ExtractInsert(Macro::Opcode opcode) {
-    auto dst = Compile_GetRegister(opcode.src_a, RESULT);
-    auto src = Compile_GetRegister(opcode.src_b, eax);
-
-    const u32 mask = ~(opcode.GetBitfieldMask() << opcode.bf_dst_bit);
-    and_(dst, mask);
-    shr(src, opcode.bf_src_bit);
-    and_(src, opcode.GetBitfieldMask());
-    shl(src, opcode.bf_dst_bit);
-    or_(dst, src);
-
-    Compile_ProcessResult(opcode.result_operation, opcode.dst);
-}
-
-void MacroJITx64Impl::Compile_ExtractShiftLeftImmediate(Macro::Opcode opcode) {
-    const auto dst = Compile_GetRegister(opcode.src_a, ecx);
-    const auto src = Compile_GetRegister(opcode.src_b, RESULT);
-
-    shr(src, dst.cvt8());
-    and_(src, opcode.GetBitfieldMask());
-    shl(src, opcode.bf_dst_bit);
-
-    Compile_ProcessResult(opcode.result_operation, opcode.dst);
-}
-
-void MacroJITx64Impl::Compile_ExtractShiftLeftRegister(Macro::Opcode opcode) {
-    const auto dst = Compile_GetRegister(opcode.src_a, ecx);
-    const auto src = Compile_GetRegister(opcode.src_b, RESULT);
-
-    shr(src, opcode.bf_src_bit);
-    and_(src, opcode.GetBitfieldMask());
-    shl(src, dst.cvt8());
-
-    Compile_ProcessResult(opcode.result_operation, opcode.dst);
-}
-
-void MacroJITx64Impl::Compile_Read(Macro::Opcode opcode) {
-    if (optimizer.zero_reg_skip && opcode.src_a == 0) {
-        if (opcode.immediate == 0) {
-            xor_(RESULT, RESULT);
-        } else {
-            mov(RESULT, opcode.immediate);
-        }
-    } else {
-        auto result = Compile_GetRegister(opcode.src_a, RESULT);
-        if (opcode.immediate > 2) {
-            add(result, opcode.immediate);
-        } else if (opcode.immediate == 1) {
-            inc(result);
-        } else if (opcode.immediate < 0) {
-            sub(result, opcode.immediate * -1);
-        }
-    }
-
-    // Equivalent to Engines::Maxwell3D::GetRegisterValue:
-    if (optimizer.enable_asserts) {
-        Xbyak::Label pass_range_check;
-        cmp(RESULT, static_cast<u32>(Engines::Maxwell3D::Regs::NUM_REGS));
-        jb(pass_range_check);
-        int3();
-        L(pass_range_check);
-    }
-    mov(rax, qword[STATE]);
-    mov(RESULT,
-        dword[rax + offsetof(Engines::Maxwell3D, regs) +
-              offsetof(Engines::Maxwell3D::Regs, reg_array) + RESULT.cvt64() * sizeof(u32)]);
-
-    Compile_ProcessResult(opcode.result_operation, opcode.dst);
-}
-
-void Send(Engines::Maxwell3D* maxwell3d, Macro::MethodAddress method_address, u32 value) {
-    maxwell3d->CallMethod(method_address.address, value, true);
-}
-
-void MacroJITx64Impl::Compile_Send(Xbyak::Reg32 value) {
-    Common::X64::ABI_PushRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);
-    mov(Common::X64::ABI_PARAM1, qword[STATE]);
-    mov(Common::X64::ABI_PARAM2.cvt32(), METHOD_ADDRESS);
-    mov(Common::X64::ABI_PARAM3.cvt32(), value);
-    Common::X64::CallFarFunction(*this, &Send);
-    Common::X64::ABI_PopRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);
-
-    Xbyak::Label dont_process{};
-    // Get increment
-    test(METHOD_ADDRESS, 0x3f000);
-    // If zero, method address doesn't update
-    je(dont_process);
-
-    mov(ecx, METHOD_ADDRESS);
-    and_(METHOD_ADDRESS, 0xfff);
-    shr(ecx, 12);
-    and_(ecx, 0x3f);
-    lea(eax, ptr[rcx + METHOD_ADDRESS.cvt64()]);
-    sal(ecx, 12);
-    or_(eax, ecx);
-
-    mov(METHOD_ADDRESS, eax);
-
-    L(dont_process);
-}
-
-void MacroJITx64Impl::Compile_Branch(Macro::Opcode opcode) {
-    ASSERT_MSG(!is_delay_slot, "Executing a branch in a delay slot is not valid");
-    const s32 jump_address =
-        static_cast<s32>(pc) + static_cast<s32>(opcode.GetBranchTarget() / sizeof(s32));
-
-    Xbyak::Label end;
-    auto value = Compile_GetRegister(opcode.src_a, eax);
-    cmp(value, 0); // test(value, value);
-    if (optimizer.has_delayed_pc) {
-        switch (opcode.branch_condition) {
-        case Macro::BranchCondition::Zero:
-            jne(end, T_NEAR);
-            break;
-        case Macro::BranchCondition::NotZero:
-            je(end, T_NEAR);
-            break;
-        }
-
-        if (opcode.branch_annul) {
-            xor_(BRANCH_HOLDER, BRANCH_HOLDER);
-            jmp(labels[jump_address], T_NEAR);
-        } else {
-            Xbyak::Label handle_post_exit{};
-            Xbyak::Label skip{};
-            jmp(skip, T_NEAR);
-
-            L(handle_post_exit);
-            xor_(BRANCH_HOLDER, BRANCH_HOLDER);
-            jmp(labels[jump_address], T_NEAR);
-
-            L(skip);
-            mov(BRANCH_HOLDER, handle_post_exit);
-            jmp(delay_skip[pc], T_NEAR);
-        }
-    } else {
-        switch (opcode.branch_condition) {
-        case Macro::BranchCondition::Zero:
-            je(labels[jump_address], T_NEAR);
-            break;
-        case Macro::BranchCondition::NotZero:
-            jne(labels[jump_address], T_NEAR);
-            break;
-        }
-    }
-
-    L(end);
-}
-
-void MacroJITx64Impl::Optimizer_ScanFlags() {
-    optimizer.can_skip_carry = true;
-    optimizer.has_delayed_pc = false;
-    for (auto raw_op : code) {
-        Macro::Opcode op{};
-        op.raw = raw_op;
-
-        if (op.operation == Macro::Operation::ALU) {
-            // Scan for any ALU operations which actually use the carry flag, if they don't exist in
-            // our current code we can skip emitting the carry flag handling operations
-            if (op.alu_operation == Macro::ALUOperation::AddWithCarry ||
-                op.alu_operation == Macro::ALUOperation::SubtractWithBorrow) {
-                optimizer.can_skip_carry = false;
-            }
-        }
-
-        if (op.operation == Macro::Operation::Branch) {
-            if (!op.branch_annul) {
-                optimizer.has_delayed_pc = true;
-            }
-        }
-    }
-}
-
-void MacroJITx64Impl::Compile() {
-    labels.fill(Xbyak::Label());
-
-    Common::X64::ABI_PushRegistersAndAdjustStack(*this, Common::X64::ABI_ALL_CALLEE_SAVED, 8);
-    // JIT state
-    mov(STATE, Common::X64::ABI_PARAM1);
-    mov(PARAMETERS, Common::X64::ABI_PARAM2);
-    mov(MAX_PARAMETER, Common::X64::ABI_PARAM3);
-    xor_(RESULT, RESULT);
-    xor_(METHOD_ADDRESS, METHOD_ADDRESS);
-    xor_(BRANCH_HOLDER, BRANCH_HOLDER);
-
-    mov(dword[STATE + offsetof(JITState, registers) + 4], Compile_FetchParameter());
-
-    // Track get register for zero registers and mark it as no-op
-    optimizer.zero_reg_skip = true;
-
-    // AddImmediate tends to be used as a NOP instruction, if we detect this we can
-    // completely skip the entire code path and no emit anything
-    optimizer.skip_dummy_addimmediate = true;
-
-    // SMO tends to emit a lot of unnecessary method moves, we can mitigate this by only emitting
-    // one if our register isn't "dirty"
-    optimizer.optimize_for_method_move = true;
-
-    // Enable run-time assertions in JITted code
-    optimizer.enable_asserts = false;
-
-    // Check to see if we can skip emitting certain instructions
-    Optimizer_ScanFlags();
-
-    const u32 op_count = static_cast<u32>(code.size());
-    for (u32 i = 0; i < op_count; i++) {
-        if (i < op_count - 1) {
-            pc = i + 1;
-            next_opcode = GetOpCode();
-        } else {
-            next_opcode = {};
-        }
-        pc = i;
-        Compile_NextInstruction();
-    }
-
-    L(end_of_code);
-
-    Common::X64::ABI_PopRegistersAndAdjustStack(*this, Common::X64::ABI_ALL_CALLEE_SAVED, 8);
-    ret();
-    ready();
-    program = getCode<ProgramType>();
-}
-
-bool MacroJITx64Impl::Compile_NextInstruction() {
-    const auto opcode = GetOpCode();
-    if (labels[pc].getAddress()) {
-        return false;
-    }
-
-    L(labels[pc]);
-
-    switch (opcode.operation) {
-    case Macro::Operation::ALU:
-        Compile_ALU(opcode);
-        break;
-    case Macro::Operation::AddImmediate:
-        Compile_AddImmediate(opcode);
-        break;
-    case Macro::Operation::ExtractInsert:
-        Compile_ExtractInsert(opcode);
-        break;
-    case Macro::Operation::ExtractShiftLeftImmediate:
-        Compile_ExtractShiftLeftImmediate(opcode);
-        break;
-    case Macro::Operation::ExtractShiftLeftRegister:
-        Compile_ExtractShiftLeftRegister(opcode);
-        break;
-    case Macro::Operation::Read:
-        Compile_Read(opcode);
-        break;
-    case Macro::Operation::Branch:
-        Compile_Branch(opcode);
-        break;
-    default:
-        UNIMPLEMENTED_MSG("Unimplemented opcode {}", opcode.operation.Value());
-        break;
-    }
-
-    if (optimizer.has_delayed_pc) {
-        if (opcode.is_exit) {
-            mov(rax, end_of_code);
-            test(BRANCH_HOLDER, BRANCH_HOLDER);
-            cmove(BRANCH_HOLDER, rax);
-            // Jump to next instruction to skip delay slot check
-            je(labels[pc + 1], T_NEAR);
-        } else {
-            // TODO(ogniK): Optimize delay slot branching
-            Xbyak::Label no_delay_slot{};
-            test(BRANCH_HOLDER, BRANCH_HOLDER);
-            je(no_delay_slot, T_NEAR);
-            mov(rax, BRANCH_HOLDER);
-            xor_(BRANCH_HOLDER, BRANCH_HOLDER);
-            jmp(rax);
-            L(no_delay_slot);
-        }
-        L(delay_skip[pc]);
-        if (opcode.is_exit) {
-            return false;
-        }
-    } else {
-        test(BRANCH_HOLDER, BRANCH_HOLDER);
-        jne(end_of_code, T_NEAR);
-        if (opcode.is_exit) {
-            inc(BRANCH_HOLDER);
-            return false;
-        }
-    }
-    return true;
-}
-
-static void WarnInvalidParameter(uintptr_t parameter, uintptr_t max_parameter) {
-    LOG_CRITICAL(HW_GPU,
-                 "Macro JIT: invalid parameter access 0x{:x} (0x{:x} is the last parameter)",
-                 parameter, max_parameter - sizeof(u32));
-}
-
-Xbyak::Reg32 MacroJITx64Impl::Compile_FetchParameter() {
-    Xbyak::Label parameter_ok{};
-    cmp(PARAMETERS, MAX_PARAMETER);
-    jb(parameter_ok, T_NEAR);
-    Common::X64::ABI_PushRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);
-    mov(Common::X64::ABI_PARAM1, PARAMETERS);
-    mov(Common::X64::ABI_PARAM2, MAX_PARAMETER);
-    Common::X64::CallFarFunction(*this, &WarnInvalidParameter);
-    Common::X64::ABI_PopRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);
-    L(parameter_ok);
-    mov(eax, dword[PARAMETERS]);
-    add(PARAMETERS, sizeof(u32));
-    return eax;
-}
-
-Xbyak::Reg32 MacroJITx64Impl::Compile_GetRegister(u32 index, Xbyak::Reg32 dst) {
-    if (index == 0) {
-        // Register 0 is always zero
-        xor_(dst, dst);
-    } else {
-        mov(dst, dword[STATE + offsetof(JITState, registers) + index * sizeof(u32)]);
-    }
-
-    return dst;
-}
-
-void MacroJITx64Impl::Compile_ProcessResult(Macro::ResultOperation operation, u32 reg) {
-    const auto SetRegister = [this](u32 reg_index, const Xbyak::Reg32& result) {
-        // Register 0 is supposed to always return 0. NOP is implemented as a store to the zero
-        // register.
-        if (reg_index == 0) {
-            return;
-        }
-        mov(dword[STATE + offsetof(JITState, registers) + reg_index * sizeof(u32)], result);
-    };
-    const auto SetMethodAddress = [this](const Xbyak::Reg32& reg32) { mov(METHOD_ADDRESS, reg32); };
-
-    switch (operation) {
-    case Macro::ResultOperation::IgnoreAndFetch:
-        SetRegister(reg, Compile_FetchParameter());
-        break;
-    case Macro::ResultOperation::Move:
-        SetRegister(reg, RESULT);
-        break;
-    case Macro::ResultOperation::MoveAndSetMethod:
-        SetRegister(reg, RESULT);
-        SetMethodAddress(RESULT);
-        break;
-    case Macro::ResultOperation::FetchAndSend:
-        // Fetch parameter and send result.
-        SetRegister(reg, Compile_FetchParameter());
-        Compile_Send(RESULT);
-        break;
-    case Macro::ResultOperation::MoveAndSend:
-        // Move and send result.
-        SetRegister(reg, RESULT);
-        Compile_Send(RESULT);
-        break;
-    case Macro::ResultOperation::FetchAndSetMethod:
-        // Fetch parameter and use result as Method Address.
-        SetRegister(reg, Compile_FetchParameter());
-        SetMethodAddress(RESULT);
-        break;
-    case Macro::ResultOperation::MoveAndSetMethodFetchAndSend:
-        // Move result and use as Method Address, then fetch and send parameter.
-        SetRegister(reg, RESULT);
-        SetMethodAddress(RESULT);
-        Compile_Send(Compile_FetchParameter());
-        break;
-    case Macro::ResultOperation::MoveAndSetMethodSend:
-        // Move result and use as Method Address, then send bits 12:17 of result.
-        SetRegister(reg, RESULT);
-        SetMethodAddress(RESULT);
-        shr(RESULT, 12);
-        and_(RESULT, 0b111111);
-        Compile_Send(RESULT);
-        break;
-    default:
-        UNIMPLEMENTED_MSG("Unimplemented macro operation {}", operation);
-        break;
-    }
-}
-
-Macro::Opcode MacroJITx64Impl::GetOpCode() const {
-    ASSERT(pc < code.size());
-    return {code[pc]};
-}
-} // Anonymous namespace
-
-MacroJITx64::MacroJITx64(Engines::Maxwell3D& maxwell3d_)
-    : MacroEngine{maxwell3d_}, maxwell3d{maxwell3d_} {}
-
-std::unique_ptr<CachedMacro> MacroJITx64::Compile(const std::vector<u32>& code) {
-    return std::make_unique<MacroJITx64Impl>(maxwell3d, code);
-}
-} // namespace Tegra
diff --git a/src/video_core/macro/macro_jit_x64.h b/src/video_core/macro/macro_jit_x64.h
deleted file mode 100644
index 99ee1b9e68..0000000000
--- a/src/video_core/macro/macro_jit_x64.h
+++ /dev/null
@@ -1,26 +0,0 @@
-// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project
-// SPDX-License-Identifier: GPL-2.0-or-later
-
-#pragma once
-
-#include "common/common_types.h"
-#include "video_core/macro/macro.h"
-
-namespace Tegra {
-
-namespace Engines {
-class Maxwell3D;
-}
-
-class MacroJITx64 final : public MacroEngine {
-public:
-    explicit MacroJITx64(Engines::Maxwell3D& maxwell3d_);
-
-protected:
-    std::unique_ptr<CachedMacro> Compile(const std::vector<u32>& code) override;
-
-private:
-    Engines::Maxwell3D& maxwell3d;
-};
-
-} // namespace Tegra
diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp
index 75254049a6..14ab5dd967 100644
--- a/src/video_core/renderer_opengl/gl_texture_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp
@@ -1214,19 +1214,16 @@ ImageView::ImageView(TextureCacheRuntime& runtime, const VideoCommon::NullImageV
 ImageView::~ImageView() = default;
 
 GLuint ImageView::StorageView(Shader::TextureType texture_type, Shader::ImageFormat image_format) {
-    if (image_format == Shader::ImageFormat::Typeless) {
+    if (image_format == Shader::ImageFormat::Typeless)
         return Handle(texture_type);
-    }
-    const bool is_signed{image_format == Shader::ImageFormat::R8_SINT ||
-                         image_format == Shader::ImageFormat::R16_SINT};
-    if (!storage_views) {
-        storage_views = std::make_unique<StorageViews>();
-    }
+    const bool is_signed = image_format == Shader::ImageFormat::R8_SINT
+        || image_format == Shader::ImageFormat::R16_SINT;
+    if (!storage_views)
+        storage_views.emplace();
     auto& type_views{is_signed ? storage_views->signeds : storage_views->unsigneds};
-    GLuint& view{type_views[static_cast<size_t>(texture_type)]};
-    if (view == 0) {
+    GLuint& view{type_views[size_t(texture_type)]};
+    if (view == 0)
         view = MakeView(texture_type, ShaderFormat(image_format));
-    }
     return view;
 }
 
diff --git a/src/video_core/renderer_opengl/gl_texture_cache.h b/src/video_core/renderer_opengl/gl_texture_cache.h
index 3de24508fe..e2a2022cb2 100644
--- a/src/video_core/renderer_opengl/gl_texture_cache.h
+++ b/src/video_core/renderer_opengl/gl_texture_cache.h
@@ -302,7 +302,7 @@ private:
 
     std::array<GLuint, Shader::NUM_TEXTURE_TYPES> views{};
     std::vector<OGLTextureView> stored_views;
-    std::unique_ptr<StorageViews> storage_views;
+    std::optional<StorageViews> storage_views;
     GLenum internal_format = GL_NONE;
     GLuint default_handle = 0;
     u32 buffer_size = 0;
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
index 101a884fd7..c3a5ed391b 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -376,7 +376,6 @@ void RasterizerVulkan::DrawTexture() {
 }
 
 void RasterizerVulkan::Clear(u32 layer_count) {
-
     FlushWork();
     gpu_memory->FlushCaching();
 
@@ -396,9 +395,7 @@ void RasterizerVulkan::Clear(u32 layer_count) {
     scheduler.RequestRenderpass(framebuffer);
 
     query_cache.NotifySegment(true);
-    query_cache.CounterEnable(VideoCommon::QueryType::ZPassPixelCount64,
-                              maxwell3d->regs.zpass_pixel_count_enable);
-
+    query_cache.CounterEnable(VideoCommon::QueryType::ZPassPixelCount64, maxwell3d->regs.zpass_pixel_count_enable);
     u32 up_scale = 1;
     u32 down_shift = 0;
     if (texture_cache.IsRescaling()) {
@@ -443,14 +440,14 @@ void RasterizerVulkan::Clear(u32 layer_count) {
                 offset = 0;
                 return;
             }
-            if (offset >= static_cast<s32>(limit)) {
-                offset = static_cast<s32>(limit);
+            if (offset >= s32(limit)) {
+                offset = s32(limit);
                 extent = 0;
                 return;
             }
-            const u64 end_coord = static_cast<u64>(offset) + extent;
+            const u64 end_coord = u64(offset) + extent;
             if (end_coord > limit) {
-                extent = limit - static_cast<u32>(offset);
+                extent = limit - u32(offset);
             }
         };
 
@@ -464,30 +461,22 @@ void RasterizerVulkan::Clear(u32 layer_count) {
 
     const u32 color_attachment = regs.clear_surface.RT;
     if (use_color && framebuffer->HasAspectColorBit(color_attachment)) {
-        const auto format =
-            VideoCore::Surface::PixelFormatFromRenderTargetFormat(regs.rt[color_attachment].format);
+        const auto format = VideoCore::Surface::PixelFormatFromRenderTargetFormat(regs.rt[color_attachment].format);
         bool is_integer = IsPixelFormatInteger(format);
         bool is_signed = IsPixelFormatSignedInteger(format);
         size_t int_size = PixelComponentSizeBitsInteger(format);
         VkClearValue clear_value{};
         if (!is_integer) {
-            std::memcpy(clear_value.color.float32, regs.clear_color.data(),
-                        regs.clear_color.size() * sizeof(f32));
+            std::memcpy(clear_value.color.float32, regs.clear_color.data(), regs.clear_color.size() * sizeof(f32));
         } else if (!is_signed) {
-            for (size_t i = 0; i < 4; i++) {
-                clear_value.color.uint32[i] = static_cast<u32>(
-                    static_cast<f32>(static_cast<u64>(int_size) << 1U) * regs.clear_color[i]);
-            }
+            for (size_t i = 0; i < 4; i++)
+                clear_value.color.uint32[i] = u32(f32(u64(int_size) << 1U) * regs.clear_color[i]);
         } else {
-            for (size_t i = 0; i < 4; i++) {
-                clear_value.color.int32[i] =
-                    static_cast<s32>(static_cast<f32>(static_cast<s64>(int_size - 1) << 1) *
-                                     (regs.clear_color[i] - 0.5f));
-            }
+            for (size_t i = 0; i < 4; i++)
+                clear_value.color.int32[i] = s32(f32(s64(int_size - 1) << 1) * (regs.clear_color[i] - 0.5f));
         }
 
-        if (regs.clear_surface.R && regs.clear_surface.G && regs.clear_surface.B &&
-            regs.clear_surface.A) {
+        if (regs.clear_surface.R && regs.clear_surface.G && regs.clear_surface.B && regs.clear_surface.A) {
             scheduler.Record([color_attachment, clear_value, clear_rect](vk::CommandBuffer cmdbuf) {
                 const VkClearAttachment attachment{
                     .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
@@ -497,14 +486,11 @@ void RasterizerVulkan::Clear(u32 layer_count) {
                 cmdbuf.ClearAttachments(attachment, clear_rect);
             });
         } else {
-            u8 color_mask = static_cast<u8>(regs.clear_surface.R | regs.clear_surface.G << 1 |
-                                            regs.clear_surface.B << 2 | regs.clear_surface.A << 3);
+            u8 color_mask = u8(regs.clear_surface.R | regs.clear_surface.G << 1 | regs.clear_surface.B << 2 | regs.clear_surface.A << 3);
             Region2D dst_region = {
                 Offset2D{.x = clear_rect.rect.offset.x, .y = clear_rect.rect.offset.y},
-                Offset2D{.x = clear_rect.rect.offset.x +
-                              static_cast<s32>(clear_rect.rect.extent.width),
-                         .y = clear_rect.rect.offset.y +
-                              static_cast<s32>(clear_rect.rect.extent.height)}};
+                Offset2D{.x = clear_rect.rect.offset.x + s32(clear_rect.rect.extent.width),
+                         .y = clear_rect.rect.offset.y + s32(clear_rect.rect.extent.height)}};
             blit_image.ClearColor(framebuffer, color_mask, regs.clear_color, dst_region);
         }
     }
@@ -527,11 +513,10 @@ void RasterizerVulkan::Clear(u32 layer_count) {
         regs.stencil_front_mask != 0) {
         Region2D dst_region = {
             Offset2D{.x = clear_rect.rect.offset.x, .y = clear_rect.rect.offset.y},
-            Offset2D{.x = clear_rect.rect.offset.x + static_cast<s32>(clear_rect.rect.extent.width),
-                     .y = clear_rect.rect.offset.y +
-                          static_cast<s32>(clear_rect.rect.extent.height)}};
+            Offset2D{.x = clear_rect.rect.offset.x + s32(clear_rect.rect.extent.width),
+                     .y = clear_rect.rect.offset.y + s32(clear_rect.rect.extent.height)}};
         blit_image.ClearDepthStencil(framebuffer, use_depth, regs.clear_depth,
-                                     static_cast<u8>(regs.stencil_front_mask), regs.clear_stencil,
+                                     u8(regs.stencil_front_mask), regs.clear_stencil,
                                      regs.stencil_front_func_mask, dst_region);
     } else {
         scheduler.Record([clear_depth = regs.clear_depth, clear_stencil = regs.clear_stencil,
diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp
index cff7a73903..a950ffed7a 100644
--- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp
@@ -860,8 +860,7 @@ TextureCacheRuntime::TextureCacheRuntime(const Device& device_, Scheduler& sched
                                   compute_pass_descriptor_queue, memory_allocator);
     }
     if (device.IsStorageImageMultisampleSupported()) {
-        msaa_copy_pass = std::make_unique<MSAACopyPass>(
-            device, scheduler, descriptor_pool, staging_buffer_pool, compute_pass_descriptor_queue);
+        msaa_copy_pass.emplace(device, scheduler, descriptor_pool, staging_buffer_pool, compute_pass_descriptor_queue);
     }
     if (!device.IsKhrImageFormatListSupported()) {
         return;
@@ -1675,10 +1674,10 @@ void Image::UploadMemory(VkBuffer buffer, VkDeviceSize offset,
     // CHANGE: Gate the MSAA path more strictly and only use it for color, when the pass and device
     //         support are available. Avoid running the MSAA path when prerequisites aren't met,
     //         preventing validation and runtime issues.
-    const bool wants_msaa_upload = info.num_samples > 1 &&
-                                   (aspect_mask & VK_IMAGE_ASPECT_COLOR_BIT) != 0 &&
-                                   runtime->CanUploadMSAA() && runtime->msaa_copy_pass != nullptr &&
-                                   runtime->device.IsStorageImageMultisampleSupported();
+    const bool wants_msaa_upload = info.num_samples > 1
+        && (aspect_mask & VK_IMAGE_ASPECT_COLOR_BIT) != 0
+        && runtime->CanUploadMSAA() && runtime->msaa_copy_pass.has_value()
+        && runtime->device.IsStorageImageMultisampleSupported();
 
     if (wants_msaa_upload) {
         // Create a temporary non-MSAA image to upload the data first
@@ -2047,8 +2046,7 @@ bool Image::BlitScaleHelper(bool scale_up) {
     const u32 scaled_width = resolution.ScaleUp(info.size.width);
     const u32 scaled_height = is_2d ? resolution.ScaleUp(info.size.height) : info.size.height;
     std::unique_ptr<ImageView>& blit_view = scale_up ? scale_view : normal_view;
-    std::unique_ptr<Framebuffer>& blit_framebuffer =
-        scale_up ? scale_framebuffer : normal_framebuffer;
+    std::optional<Framebuffer>& blit_framebuffer = scale_up ? scale_framebuffer : normal_framebuffer;
     if (!blit_view) {
         const auto view_info = ImageViewInfo(ImageViewType::e2D, info.format);
         blit_view = std::make_unique<ImageView>(*runtime, view_info, NULL_IMAGE_ID, *this);
@@ -2060,11 +2058,11 @@ bool Image::BlitScaleHelper(bool scale_up) {
     const u32 dst_height = scale_up ? scaled_height : info.size.height;
     const Region2D src_region{
         .start = {0, 0},
-        .end = {static_cast<s32>(src_width), static_cast<s32>(src_height)},
+        .end = {s32(src_width), s32(src_height)},
     };
     const Region2D dst_region{
         .start = {0, 0},
-        .end = {static_cast<s32>(dst_width), static_cast<s32>(dst_height)},
+        .end = {s32(dst_width), s32(dst_height)},
     };
     const VkExtent2D extent{
         .width = (std::max)(scaled_width, info.size.width),
@@ -2073,21 +2071,15 @@ bool Image::BlitScaleHelper(bool scale_up) {
 
     auto* view_ptr = blit_view.get();
     if (aspect_mask == VK_IMAGE_ASPECT_COLOR_BIT) {
-        if (!blit_framebuffer) {
-            blit_framebuffer =
-                std::make_unique<Framebuffer>(*runtime, view_ptr, nullptr, extent, scale_up);
-        }
-
-        runtime->blit_image_helper.BlitColor(blit_framebuffer.get(), *blit_view, dst_region,
-                                             src_region, operation, BLIT_OPERATION);
+        if (!blit_framebuffer)
+            blit_framebuffer.emplace(*runtime, view_ptr, nullptr, extent, scale_up);
+        runtime->blit_image_helper.BlitColor(&*blit_framebuffer, *blit_view,
+            dst_region, src_region, operation, BLIT_OPERATION);
     } else if (aspect_mask == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) {
-        if (!blit_framebuffer) {
-            blit_framebuffer =
-                std::make_unique<Framebuffer>(*runtime, nullptr, view_ptr, extent, scale_up);
-        }
-        runtime->blit_image_helper.BlitDepthStencil(blit_framebuffer.get(), *blit_view,
-                                                    dst_region, src_region, operation,
-                                                    BLIT_OPERATION);
+        if (!blit_framebuffer)
+            blit_framebuffer.emplace(*runtime, nullptr, view_ptr, extent, scale_up);
+        runtime->blit_image_helper.BlitDepthStencil(&*blit_framebuffer, *blit_view,
+            dst_region, src_region, operation, BLIT_OPERATION);
     } else {
         // TODO: Use helper blits where applicable
         flags &= ~ImageFlagBits::Rescaled;
@@ -2200,9 +2192,9 @@ ImageView::ImageView(TextureCacheRuntime& runtime, const VideoCommon::ImageViewI
     }
 }
 
-ImageView::ImageView(TextureCacheRuntime& runtime, const VideoCommon::ImageViewInfo& info,
-                     ImageId image_id_, Image& image, const SlotVector<Image>& slot_imgs)
-    : ImageView{runtime, info, image_id_, image} {
+ImageView::ImageView(TextureCacheRuntime& runtime, const VideoCommon::ImageViewInfo& info, ImageId image_id_, Image& image, const SlotVector<Image>& slot_imgs)
+    : ImageView{runtime, info, image_id_, image}
+{
     slot_images = &slot_imgs;
 }
 
@@ -2267,33 +2259,25 @@ VkImageView ImageView::ColorView() {
 
 VkImageView ImageView::StorageView(Shader::TextureType texture_type,
                                    Shader::ImageFormat image_format) {
-    if (!image_handle) {
-        return VK_NULL_HANDLE;
-    }
-    if (image_format == Shader::ImageFormat::Typeless) {
-        return Handle(texture_type);
-    }
-    const bool is_signed{image_format == Shader::ImageFormat::R8_SINT ||
-                         image_format == Shader::ImageFormat::R16_SINT};
-    if (!storage_views) {
-        storage_views = std::make_unique<StorageViews>();
-    }
-    auto& views{is_signed ? storage_views->signeds : storage_views->unsigneds};
-    auto& view{views[static_cast<size_t>(texture_type)]};
-    if (view) {
+    if (image_handle) {
+        if (image_format == Shader::ImageFormat::Typeless) {
+            return Handle(texture_type);
+        }
+        const bool is_signed = image_format == Shader::ImageFormat::R8_SINT
+            || image_format == Shader::ImageFormat::R16_SINT;
+        if (!storage_views)
+            storage_views.emplace();
+        auto& views{is_signed ? storage_views->signeds : storage_views->unsigneds};
+        auto& view{views[size_t(texture_type)]};
+        if (!view)
+            view = MakeView(Format(image_format), VK_IMAGE_ASPECT_COLOR_BIT);
         return *view;
     }
-    view = MakeView(Format(image_format), VK_IMAGE_ASPECT_COLOR_BIT);
-    return *view;
+    return VK_NULL_HANDLE;
 }
 
 bool ImageView::IsRescaled() const noexcept {
-    if (!slot_images) {
-        return false;
-    }
-    const auto& slots = *slot_images;
-    const auto& src_image = slots[image_id];
-    return src_image.IsRescaled();
+    return slot_images && (*slot_images)[image_id].IsRescaled(); // slot_images is null unless the SlotVector ctor ran
 }
 
 vk::ImageView ImageView::MakeView(VkFormat vk_format, VkImageAspectFlags aspect_mask) {
diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.h b/src/video_core/renderer_vulkan/vk_texture_cache.h
index dcc835f05e..4bb9687ab0 100644
--- a/src/video_core/renderer_vulkan/vk_texture_cache.h
+++ b/src/video_core/renderer_vulkan/vk_texture_cache.h
@@ -133,7 +133,7 @@ public:
     vk::Buffer swizzle_table_buffer;
     VkDeviceSize swizzle_table_size = 0;
 
-    std::unique_ptr<MSAACopyPass> msaa_copy_pass;
+    std::optional<MSAACopyPass> msaa_copy_pass;
     const Settings::ResolutionScalingInfo& resolution;
     std::array<std::vector<VkFormat>, VideoCore::Surface::MaxPixelFormat> view_formats;
 
@@ -141,6 +141,89 @@ public:
     std::array<vk::Buffer, indexing_slots> buffers{};
 };
 
+class Framebuffer {
+public:
+    explicit Framebuffer(TextureCacheRuntime& runtime, std::span<ImageView*, NUM_RT> color_buffers,
+                         ImageView* depth_buffer, const VideoCommon::RenderTargets& key);
+
+    explicit Framebuffer(TextureCacheRuntime& runtime, ImageView* color_buffer,
+                         ImageView* depth_buffer, VkExtent2D extent, bool is_rescaled);
+
+    ~Framebuffer();
+
+    Framebuffer(const Framebuffer&) = delete;
+    Framebuffer& operator=(const Framebuffer&) = delete;
+
+    Framebuffer(Framebuffer&&) = default;
+    Framebuffer& operator=(Framebuffer&&) = default;
+
+    void CreateFramebuffer(TextureCacheRuntime& runtime,
+                           std::span<ImageView*, NUM_RT> color_buffers, ImageView* depth_buffer,
+                           bool is_rescaled = false);
+
+    [[nodiscard]] VkFramebuffer Handle() const noexcept {
+        return *framebuffer;
+    }
+
+    [[nodiscard]] VkRenderPass RenderPass() const noexcept {
+        return renderpass;
+    }
+
+    [[nodiscard]] VkExtent2D RenderArea() const noexcept {
+        return render_area;
+    }
+
+    [[nodiscard]] VkSampleCountFlagBits Samples() const noexcept {
+        return samples;
+    }
+
+    [[nodiscard]] u32 NumColorBuffers() const noexcept {
+        return num_color_buffers;
+    }
+
+    [[nodiscard]] u32 NumImages() const noexcept {
+        return num_images;
+    }
+
+    [[nodiscard]] const std::array<VkImage, 9>& Images() const noexcept {
+        return images;
+    }
+
+    [[nodiscard]] const std::array<VkImageSubresourceRange, 9>& ImageRanges() const noexcept {
+        return image_ranges;
+    }
+
+    [[nodiscard]] bool HasAspectColorBit(size_t index) const noexcept {
+        return (image_ranges.at(rt_map[index]).aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) != 0;
+    }
+
+    [[nodiscard]] bool HasAspectDepthBit() const noexcept {
+        return has_depth;
+    }
+
+    [[nodiscard]] bool HasAspectStencilBit() const noexcept {
+        return has_stencil;
+    }
+
+    [[nodiscard]] bool IsRescaled() const noexcept {
+        return is_rescaled;
+    }
+
+private:
+    vk::Framebuffer framebuffer;
+    VkRenderPass renderpass{};
+    VkExtent2D render_area{};
+    VkSampleCountFlagBits samples = VK_SAMPLE_COUNT_1_BIT;
+    u32 num_color_buffers = 0;
+    u32 num_images = 0;
+    std::array<VkImage, 9> images{};
+    std::array<VkImageSubresourceRange, 9> image_ranges{};
+    std::array<size_t, NUM_RT> rt_map{};
+    bool has_depth{};
+    bool has_stencil{};
+    bool is_rescaled{};
+};
+
 class Image : public VideoCommon::ImageBase {
 public:
     explicit Image(TextureCacheRuntime&, const VideoCommon::ImageInfo& info, GPUVAddr gpu_addr,
@@ -226,10 +309,9 @@ private:
     VkImageAspectFlags aspect_mask = 0;
     bool initialized = false;
 
-    std::unique_ptr<Framebuffer> scale_framebuffer;
+    std::optional<Framebuffer> scale_framebuffer;
+    std::optional<Framebuffer> normal_framebuffer;
     std::unique_ptr<ImageView> scale_view;
-
-    std::unique_ptr<Framebuffer> normal_framebuffer;
     std::unique_ptr<ImageView> normal_view;
 };
 
@@ -297,7 +379,7 @@ private:
     const SlotVector<Image>* slot_images = nullptr;
 
     std::array<vk::ImageView, Shader::NUM_TEXTURE_TYPES> image_views;
-    std::unique_ptr<StorageViews> storage_views;
+    std::optional<StorageViews> storage_views;
     vk::ImageView depth_view;
     vk::ImageView stencil_view;
     vk::ImageView color_view;
@@ -331,89 +413,6 @@ private:
     vk::Sampler sampler_default_anisotropy;
 };
 
-class Framebuffer {
-public:
-    explicit Framebuffer(TextureCacheRuntime& runtime, std::span<ImageView*, NUM_RT> color_buffers,
-                         ImageView* depth_buffer, const VideoCommon::RenderTargets& key);
-
-    explicit Framebuffer(TextureCacheRuntime& runtime, ImageView* color_buffer,
-                         ImageView* depth_buffer, VkExtent2D extent, bool is_rescaled);
-
-    ~Framebuffer();
-
-    Framebuffer(const Framebuffer&) = delete;
-    Framebuffer& operator=(const Framebuffer&) = delete;
-
-    Framebuffer(Framebuffer&&) = default;
-    Framebuffer& operator=(Framebuffer&&) = default;
-
-    void CreateFramebuffer(TextureCacheRuntime& runtime,
-                           std::span<ImageView*, NUM_RT> color_buffers, ImageView* depth_buffer,
-                           bool is_rescaled = false);
-
-    [[nodiscard]] VkFramebuffer Handle() const noexcept {
-        return *framebuffer;
-    }
-
-    [[nodiscard]] VkRenderPass RenderPass() const noexcept {
-        return renderpass;
-    }
-
-    [[nodiscard]] VkExtent2D RenderArea() const noexcept {
-        return render_area;
-    }
-
-    [[nodiscard]] VkSampleCountFlagBits Samples() const noexcept {
-        return samples;
-    }
-
-    [[nodiscard]] u32 NumColorBuffers() const noexcept {
-        return num_color_buffers;
-    }
-
-    [[nodiscard]] u32 NumImages() const noexcept {
-        return num_images;
-    }
-
-    [[nodiscard]] const std::array<VkImage, 9>& Images() const noexcept {
-        return images;
-    }
-
-    [[nodiscard]] const std::array<VkImageSubresourceRange, 9>& ImageRanges() const noexcept {
-        return image_ranges;
-    }
-
-    [[nodiscard]] bool HasAspectColorBit(size_t index) const noexcept {
-        return (image_ranges.at(rt_map[index]).aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) != 0;
-    }
-
-    [[nodiscard]] bool HasAspectDepthBit() const noexcept {
-        return has_depth;
-    }
-
-    [[nodiscard]] bool HasAspectStencilBit() const noexcept {
-        return has_stencil;
-    }
-
-    [[nodiscard]] bool IsRescaled() const noexcept {
-        return is_rescaled;
-    }
-
-private:
-    vk::Framebuffer framebuffer;
-    VkRenderPass renderpass{};
-    VkExtent2D render_area{};
-    VkSampleCountFlagBits samples = VK_SAMPLE_COUNT_1_BIT;
-    u32 num_color_buffers = 0;
-    u32 num_images = 0;
-    std::array<VkImage, 9> images{};
-    std::array<VkImageSubresourceRange, 9> image_ranges{};
-    std::array<size_t, NUM_RT> rt_map{};
-    bool has_depth{};
-    bool has_stencil{};
-    bool is_rescaled{};
-};
-
 struct TextureCacheParams {
     static constexpr bool ENABLE_VALIDATION = true;
     static constexpr bool FRAMEBUFFER_BLITS = false;
diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h
index 425c8e23de..53fb57317f 100644
--- a/src/video_core/texture_cache/texture_cache.h
+++ b/src/video_core/texture_cache/texture_cache.h
@@ -596,10 +596,10 @@ FramebufferId TextureCache<P>::GetFramebufferId(const RenderTargets& key) {
         return framebuffer_id;
     }
     std::array<ImageView*, NUM_RT> color_buffers;
-    std::ranges::transform(key.color_buffer_ids, color_buffers.begin(),
-                           [this](ImageViewId id) { return id ? &slot_image_views[id] : nullptr; });
-    ImageView* const depth_buffer =
-        key.depth_buffer_id ? &slot_image_views[key.depth_buffer_id] : nullptr;
+    std::ranges::transform(key.color_buffer_ids, color_buffers.begin(), [this](ImageViewId id) {
+        return id ? &slot_image_views[id] : nullptr;
+    });
+    ImageView* const depth_buffer = key.depth_buffer_id ? &slot_image_views[key.depth_buffer_id] : nullptr;
     framebuffer_id = slot_framebuffers.insert(runtime, color_buffers, depth_buffer, key);
     return framebuffer_id;
 }