[crypto] Rework AES CTR/XTS streaming and squash heap churn (#2782)
AES Updates: Replaced heap churn with stack scratch buffers; tail handling now stays in-place, with no more recursive transcode detours. CTR/XTS modes read in larger, aligned chunks and still handle odd offsets cleanly. XTS prefetches a few sectors ahead to reduce extra reads. The AesCtrStorage writer now uses the pooled buffer properly: one stack slab, chunk forward, bump the counter, repeat. Result: less malloc noise and fewer watchdog spikes at startup (though mbedtls still sets the pace). This should make loading speed slightly better than before. Make sure to test. Reviewed-on: https://git.eden-emu.dev/eden-emu/eden/pulls/2782 Reviewed-by: MaranBr <maranbr@eden-emu.dev> Reviewed-by: crueter <crueter@eden-emu.dev> Co-authored-by: godpow <thesaviorsrule@yahoo.com> Co-committed-by: godpow <thesaviorsrule@yahoo.com>
This commit is contained in:
parent
73ebf59af7
commit
41e15e95b1
|
|
@ -4,8 +4,9 @@
|
|||
// SPDX-FileCopyrightText: Copyright 2018 yuzu Emulator Project
|
||||
// SPDX-License-Identifier: GPL-2.0-or-later
|
||||
|
||||
#include <algorithm>
|
||||
#include <array>
|
||||
#include <vector>
|
||||
#include <cstring>
|
||||
#include <mbedtls/cipher.h>
|
||||
#include "common/assert.h"
|
||||
#include "common/logging/log.h"
|
||||
|
|
@ -15,6 +16,7 @@
|
|||
namespace Core::Crypto {
|
||||
namespace {
|
||||
using NintendoTweak = std::array<u8, 16>;
|
||||
constexpr std::size_t AesBlockBytes = 16;
|
||||
|
||||
NintendoTweak CalculateNintendoTweak(std::size_t sector_id) {
|
||||
NintendoTweak out{};
|
||||
|
|
@ -75,39 +77,51 @@ void AESCipher<Key, KeySize>::Transcode(const u8* src, std::size_t size, u8* des
|
|||
|
||||
mbedtls_cipher_reset(context);
|
||||
|
||||
// Only ECB strictly requires block sized chunks.
|
||||
if (size == 0)
|
||||
return;
|
||||
|
||||
const auto mode = mbedtls_cipher_get_cipher_mode(context);
|
||||
std::size_t written = 0;
|
||||
if (mbedtls_cipher_get_cipher_mode(context) != MBEDTLS_MODE_ECB) {
|
||||
mbedtls_cipher_update(context, src, size, dest, &written);
|
||||
if (written != size)
|
||||
|
||||
if (mode != MBEDTLS_MODE_ECB) {
|
||||
const int ret = mbedtls_cipher_update(context, src, size, dest, &written);
|
||||
ASSERT(ret == 0);
|
||||
if (written != size) {
|
||||
LOG_WARNING(Crypto, "Not all data was processed requested={:016X}, actual={:016X}.", size, written);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
// ECB path: operate in block sized chunks and mirror previous behavior.
|
||||
const auto block_size = mbedtls_cipher_get_block_size(context);
|
||||
if (size < block_size) {
|
||||
std::vector<u8> block(block_size);
|
||||
std::memcpy(block.data(), src, size);
|
||||
Transcode(block.data(), block.size(), block.data(), op);
|
||||
std::memcpy(dest, block.data(), size);
|
||||
return;
|
||||
}
|
||||
ASSERT(block_size <= AesBlockBytes);
|
||||
|
||||
for (std::size_t offset = 0; offset < size; offset += block_size) {
|
||||
const auto length = std::min<std::size_t>(block_size, size - offset);
|
||||
mbedtls_cipher_update(context, src + offset, length, dest + offset, &written);
|
||||
if (written != length) {
|
||||
if (length < block_size) {
|
||||
std::vector<u8> block(block_size);
|
||||
std::memcpy(block.data(), src + offset, length);
|
||||
Transcode(block.data(), block.size(), block.data(), op);
|
||||
std::memcpy(dest + offset, block.data(), length);
|
||||
return;
|
||||
}
|
||||
LOG_WARNING(Crypto, "Not all data was processed requested={:016X}, actual={:016X}.", length, written);
|
||||
const std::size_t whole_block_bytes = size - (size % block_size);
|
||||
if (whole_block_bytes != 0) {
|
||||
const int ret = mbedtls_cipher_update(context, src, whole_block_bytes, dest, &written);
|
||||
ASSERT(ret == 0);
|
||||
if (written != whole_block_bytes) {
|
||||
LOG_WARNING(Crypto, "Not all data was processed requested={:016X}, actual={:016X}.",
|
||||
whole_block_bytes, written);
|
||||
}
|
||||
}
|
||||
|
||||
const std::size_t tail = size - whole_block_bytes;
|
||||
if (tail == 0)
|
||||
return;
|
||||
|
||||
std::array<u8, AesBlockBytes> tail_buffer{};
|
||||
std::memcpy(tail_buffer.data(), src + whole_block_bytes, tail);
|
||||
|
||||
std::size_t tail_written = 0;
|
||||
const int ret = mbedtls_cipher_update(context, tail_buffer.data(), block_size, tail_buffer.data(),
|
||||
&tail_written);
|
||||
ASSERT(ret == 0);
|
||||
if (tail_written != block_size) {
|
||||
LOG_WARNING(Crypto, "Not all data was processed requested={:016X}, actual={:016X}.", block_size,
|
||||
tail_written);
|
||||
}
|
||||
|
||||
std::memcpy(dest + whole_block_bytes, tail_buffer.data(), tail);
|
||||
}
|
||||
|
||||
template <typename Key, std::size_t KeySize>
|
||||
|
|
|
|||
|
|
@ -5,6 +5,7 @@
|
|||
// SPDX-License-Identifier: GPL-2.0-or-later
|
||||
|
||||
#include <algorithm>
|
||||
#include <array>
|
||||
#include <cstring>
|
||||
#include "core/crypto/ctr_encryption_layer.h"
|
||||
|
||||
|
|
@ -18,35 +19,84 @@ std::size_t CTREncryptionLayer::Read(u8* data, std::size_t length, std::size_t o
|
|||
if (length == 0)
|
||||
return 0;
|
||||
|
||||
constexpr std::size_t BlockSize = 0x10;
|
||||
constexpr std::size_t MaxChunkSize = 0x10000;
|
||||
|
||||
std::size_t total_read = 0;
|
||||
// Handle an initial misaligned portion if needed.
|
||||
if (auto const sector_offset = offset & 0xF; sector_offset != 0) {
|
||||
const std::size_t aligned_off = offset - sector_offset;
|
||||
std::array<u8, 0x10> block{};
|
||||
if (auto const got = base->Read(block.data(), block.size(), aligned_off); got != 0) {
|
||||
UpdateIV(base_offset + aligned_off);
|
||||
cipher.Transcode(block.data(), got, block.data(), Op::Decrypt);
|
||||
auto const to_copy = std::min<std::size_t>(length, got > sector_offset ? got - sector_offset : 0);
|
||||
if (to_copy > 0) {
|
||||
std::memcpy(data, block.data() + sector_offset, to_copy);
|
||||
data += to_copy;
|
||||
offset += to_copy;
|
||||
length -= to_copy;
|
||||
total_read += to_copy;
|
||||
}
|
||||
} else {
|
||||
return 0;
|
||||
auto* out = data;
|
||||
std::size_t remaining = length;
|
||||
std::size_t current_offset = offset;
|
||||
|
||||
const auto read_exact = [this](u8* dst, std::size_t bytes, std::size_t src_offset) {
|
||||
std::size_t filled = 0;
|
||||
while (filled < bytes) {
|
||||
const std::size_t got = base->Read(dst + filled, bytes - filled, src_offset + filled);
|
||||
if (got == 0)
|
||||
break;
|
||||
filled += got;
|
||||
}
|
||||
return filled;
|
||||
};
|
||||
|
||||
if (const std::size_t intra_block = current_offset & (BlockSize - 1); intra_block != 0) {
|
||||
std::array<u8, BlockSize> block{};
|
||||
const std::size_t aligned_offset = current_offset - intra_block;
|
||||
const std::size_t got = read_exact(block.data(), BlockSize, aligned_offset);
|
||||
if (got <= intra_block)
|
||||
return total_read;
|
||||
|
||||
UpdateIV(base_offset + aligned_offset);
|
||||
cipher.Transcode(block.data(), got, block.data(), Op::Decrypt);
|
||||
|
||||
const std::size_t available = got - intra_block;
|
||||
const std::size_t to_copy = std::min<std::size_t>(remaining, available);
|
||||
std::memcpy(out, block.data() + intra_block, to_copy);
|
||||
|
||||
out += to_copy;
|
||||
current_offset += to_copy;
|
||||
remaining -= to_copy;
|
||||
total_read += to_copy;
|
||||
|
||||
if (to_copy != available)
|
||||
return total_read;
|
||||
}
|
||||
if (length > 0) {
|
||||
// Now aligned to 0x10
|
||||
UpdateIV(base_offset + offset);
|
||||
const std::size_t got = base->Read(data, length, offset);
|
||||
if (got > 0) {
|
||||
cipher.Transcode(data, got, data, Op::Decrypt);
|
||||
total_read += got;
|
||||
}
|
||||
|
||||
while (remaining >= BlockSize) {
|
||||
const std::size_t chunk_request = std::min<std::size_t>(remaining, MaxChunkSize);
|
||||
const std::size_t aligned_request = chunk_request - (chunk_request % BlockSize);
|
||||
if (aligned_request == 0)
|
||||
break;
|
||||
|
||||
const std::size_t got = read_exact(out, aligned_request, current_offset);
|
||||
if (got == 0)
|
||||
break;
|
||||
|
||||
UpdateIV(base_offset + current_offset);
|
||||
cipher.Transcode(out, got, out, Op::Decrypt);
|
||||
|
||||
out += got;
|
||||
current_offset += got;
|
||||
remaining -= got;
|
||||
total_read += got;
|
||||
|
||||
if (got < aligned_request)
|
||||
return total_read;
|
||||
}
|
||||
|
||||
if (remaining > 0) {
|
||||
std::array<u8, BlockSize> block{};
|
||||
const std::size_t got = read_exact(block.data(), BlockSize, current_offset);
|
||||
if (got == 0)
|
||||
return total_read;
|
||||
|
||||
UpdateIV(base_offset + current_offset);
|
||||
cipher.Transcode(block.data(), got, block.data(), Op::Decrypt);
|
||||
|
||||
const std::size_t to_copy = std::min<std::size_t>(remaining, got);
|
||||
std::memcpy(out, block.data(), to_copy);
|
||||
total_read += to_copy;
|
||||
}
|
||||
|
||||
return total_read;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -20,67 +20,49 @@ std::size_t XTSEncryptionLayer::Read(u8* data, std::size_t length, std::size_t o
|
|||
if (length == 0)
|
||||
return 0;
|
||||
|
||||
constexpr std::size_t PrefetchSectors = 4;
|
||||
|
||||
auto* out = data;
|
||||
std::size_t remaining = length;
|
||||
std::size_t current_offset = offset;
|
||||
std::size_t total_read = 0;
|
||||
// Handle initial unaligned part within a sector.
|
||||
if (auto const sector_offset = offset % XTS_SECTOR_SIZE; sector_offset != 0) {
|
||||
const std::size_t aligned_off = offset - sector_offset;
|
||||
std::array<u8, XTS_SECTOR_SIZE> block{};
|
||||
if (auto const got = base->Read(block.data(), XTS_SECTOR_SIZE, aligned_off); got > 0) {
|
||||
if (got < XTS_SECTOR_SIZE)
|
||||
std::memset(block.data() + got, 0, XTS_SECTOR_SIZE - got);
|
||||
cipher.XTSTranscode(block.data(), XTS_SECTOR_SIZE, block.data(), aligned_off / XTS_SECTOR_SIZE,
|
||||
XTS_SECTOR_SIZE, Op::Decrypt);
|
||||
|
||||
auto const to_copy = std::min<std::size_t>(length, got > sector_offset ? got - sector_offset : 0);
|
||||
if (to_copy > 0) {
|
||||
std::memcpy(data, block.data() + sector_offset, to_copy);
|
||||
data += to_copy;
|
||||
offset += to_copy;
|
||||
length -= to_copy;
|
||||
total_read += to_copy;
|
||||
}
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
std::array<u8, XTS_SECTOR_SIZE> sector{};
|
||||
|
||||
if (length > 0) {
|
||||
// Process aligned middle inplace, in sector sized multiples.
|
||||
while (length >= XTS_SECTOR_SIZE) {
|
||||
const std::size_t req = (length / XTS_SECTOR_SIZE) * XTS_SECTOR_SIZE;
|
||||
const std::size_t got = base->Read(data, req, offset);
|
||||
if (got == 0) {
|
||||
while (remaining > 0) {
|
||||
const std::size_t sector_index = current_offset / XTS_SECTOR_SIZE;
|
||||
const std::size_t sector_offset = current_offset % XTS_SECTOR_SIZE;
|
||||
|
||||
const std::size_t sectors_to_read = std::min<std::size_t>(PrefetchSectors,
|
||||
(remaining + sector_offset +
|
||||
XTS_SECTOR_SIZE - 1) /
|
||||
XTS_SECTOR_SIZE);
|
||||
|
||||
for (std::size_t s = 0; s < sectors_to_read && remaining > 0; ++s) {
|
||||
const std::size_t index = sector_index + s;
|
||||
const std::size_t read_offset = index * XTS_SECTOR_SIZE;
|
||||
const std::size_t got = base->Read(sector.data(), XTS_SECTOR_SIZE, read_offset);
|
||||
if (got == 0)
|
||||
return total_read;
|
||||
}
|
||||
const std::size_t got_rounded = got - (got % XTS_SECTOR_SIZE);
|
||||
if (got_rounded > 0) {
|
||||
cipher.XTSTranscode(data, got_rounded, data, offset / XTS_SECTOR_SIZE, XTS_SECTOR_SIZE, Op::Decrypt);
|
||||
data += got_rounded;
|
||||
offset += got_rounded;
|
||||
length -= got_rounded;
|
||||
total_read += got_rounded;
|
||||
}
|
||||
// If we didn't get a full sector next, break to handle tail.
|
||||
if (got_rounded != got) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
// Handle tail within a sector, if any.
|
||||
if (length > 0) {
|
||||
std::array<u8, XTS_SECTOR_SIZE> block{};
|
||||
const std::size_t got = base->Read(block.data(), XTS_SECTOR_SIZE, offset);
|
||||
if (got > 0) {
|
||||
if (got < XTS_SECTOR_SIZE) {
|
||||
std::memset(block.data() + got, 0, XTS_SECTOR_SIZE - got);
|
||||
}
|
||||
cipher.XTSTranscode(block.data(), XTS_SECTOR_SIZE, block.data(),
|
||||
offset / XTS_SECTOR_SIZE, XTS_SECTOR_SIZE, Op::Decrypt);
|
||||
const std::size_t to_copy = std::min<std::size_t>(length, got);
|
||||
std::memcpy(data, block.data(), to_copy);
|
||||
total_read += to_copy;
|
||||
}
|
||||
|
||||
if (got < XTS_SECTOR_SIZE)
|
||||
std::memset(sector.data() + got, 0, XTS_SECTOR_SIZE - got);
|
||||
|
||||
cipher.XTSTranscode(sector.data(), XTS_SECTOR_SIZE, sector.data(), index, XTS_SECTOR_SIZE,
|
||||
Op::Decrypt);
|
||||
|
||||
const std::size_t local_offset = (s == 0) ? sector_offset : 0;
|
||||
const std::size_t available = XTS_SECTOR_SIZE - local_offset;
|
||||
const std::size_t to_copy = std::min<std::size_t>(available, remaining);
|
||||
std::memcpy(out, sector.data() + local_offset, to_copy);
|
||||
|
||||
out += to_copy;
|
||||
current_offset += to_copy;
|
||||
remaining -= to_copy;
|
||||
total_read += to_copy;
|
||||
}
|
||||
}
|
||||
|
||||
return total_read;
|
||||
}
|
||||
} // namespace Core::Crypto
|
||||
|
|
|
|||
|
|
@ -86,18 +86,21 @@ size_t AesCtrStorage::Write(const u8* buffer, size_t size, size_t offset) {
|
|||
|
||||
// Loop until all data is written using a pooled buffer residing on the stack (blocksize = 0x10)
|
||||
boost::container::static_vector<u8, BlockSize> pooled_buffer;
|
||||
for (size_t remaining = size; remaining > 0; ) {
|
||||
// Determine data we're writing and where.
|
||||
auto const write_size = (std::min)(pooled_buffer.size(), remaining);
|
||||
u8* write_buf = pooled_buffer.data();
|
||||
pooled_buffer.resize(BlockSize);
|
||||
|
||||
const u8* cur = buffer;
|
||||
size_t remaining = size;
|
||||
size_t current_offset = offset;
|
||||
|
||||
while (remaining > 0) {
|
||||
const size_t write_size = std::min<std::size_t>(pooled_buffer.size(), remaining);
|
||||
|
||||
// Encrypt the data and then write it.
|
||||
m_cipher->SetIV(ctr);
|
||||
m_cipher->Transcode(buffer, write_size, write_buf, Core::Crypto::Op::Encrypt);
|
||||
m_base_storage->Write(write_buf, write_size, offset);
|
||||
m_cipher->Transcode(cur, write_size, pooled_buffer.data(), Core::Crypto::Op::Encrypt);
|
||||
m_base_storage->Write(pooled_buffer.data(), write_size, current_offset);
|
||||
|
||||
// Advance next write chunk
|
||||
offset += write_size;
|
||||
cur += write_size;
|
||||
current_offset += write_size;
|
||||
remaining -= write_size;
|
||||
if (remaining > 0)
|
||||
AddCounter(ctr.data(), IvSize, write_size / BlockSize);
|
||||
|
|
|
|||
|
|
@ -65,10 +65,13 @@ size_t AesXtsStorage::Read(u8* buffer, size_t size, size_t offset) const {
|
|||
if ((offset % m_block_size) != 0) {
|
||||
// Decrypt into our pooled stack buffer (max bound = NCA::XtsBlockSize)
|
||||
boost::container::static_vector<u8, NcaHeader::XtsBlockSize> tmp_buf;
|
||||
ASSERT(m_block_size <= tmp_buf.max_size());
|
||||
tmp_buf.resize(m_block_size);
|
||||
// Determine the size of the pre-data read.
|
||||
auto const skip_size = size_t(offset - Common::AlignDown(offset, m_block_size));
|
||||
auto const data_size = (std::min)(size, m_block_size - skip_size);
|
||||
std::fill_n(tmp_buf.begin(), skip_size, u8{0});
|
||||
if (skip_size > 0)
|
||||
std::fill_n(tmp_buf.begin(), skip_size, u8{0});
|
||||
std::memcpy(tmp_buf.data() + skip_size, buffer, data_size);
|
||||
m_cipher->SetIV(ctr);
|
||||
m_cipher->Transcode(tmp_buf.data(), m_block_size, tmp_buf.data(), Core::Crypto::Op::Decrypt);
|
||||
|
|
|
|||
Loading…
Reference in New Issue