From 667dc2dcacba5341a79ac520ce712d2ce4cae1cc Mon Sep 17 00:00:00 2001
From: Michael Butler
Date: Tue, 29 Dec 2020 18:56:44 -0800
Subject: [PATCH] Add recovery code to NN ResilientPreparedModel and *Buffer

Prior to this CL, ResilientPreparedModel and ResilientBuffer were
passthrough interfaces that simply forwarded calls to the underlying
interface object. This CL implements the full recovery mechanism for
these two classes. However, because we do not yet want to enable this
functionality in the NN runtime, ResilientDevice hides the paths that
create ResilientPreparedModel and ResilientBuffer behind an #if until
we want to enable those paths.

Bug: N/A
Test: mma
Change-Id: Idfe8093c63c7ba2f16c995eec872d150696e7a08
---
 .../include/nnapi/hal/ResilientBuffer.h       |  2 +-
 .../nnapi/hal/ResilientPreparedModel.h        |  4 +-
 .../utils/common/src/ResilientBuffer.cpp      | 48 ++++++++++++++--
 .../utils/common/src/ResilientDevice.cpp      | 19 ++++++-
 .../common/src/ResilientPreparedModel.cpp     | 55 +++++++++++++++++--
 5 files changed, 113 insertions(+), 15 deletions(-)

diff --git a/neuralnetworks/utils/common/include/nnapi/hal/ResilientBuffer.h b/neuralnetworks/utils/common/include/nnapi/hal/ResilientBuffer.h
index 9d5e3e6a05..d2c2469403 100644
--- a/neuralnetworks/utils/common/include/nnapi/hal/ResilientBuffer.h
+++ b/neuralnetworks/utils/common/include/nnapi/hal/ResilientBuffer.h
@@ -42,7 +42,7 @@ class ResilientBuffer final : public nn::IBuffer {
                     nn::SharedBuffer buffer);
 
     nn::SharedBuffer getBuffer() const;
-    nn::SharedBuffer recover(const nn::IBuffer* failingBuffer, bool blocking) const;
+    nn::GeneralResult<nn::SharedBuffer> recover(const nn::IBuffer* failingBuffer) const;
 
     nn::Request::MemoryDomainToken getToken() const override;

diff --git a/neuralnetworks/utils/common/include/nnapi/hal/ResilientPreparedModel.h b/neuralnetworks/utils/common/include/nnapi/hal/ResilientPreparedModel.h
index faae673ba7..9b8d92435c 100644
--- a/neuralnetworks/utils/common/include/nnapi/hal/ResilientPreparedModel.h
+++ b/neuralnetworks/utils/common/include/nnapi/hal/ResilientPreparedModel.h
@@ -43,8 +43,8 @@ class ResilientPreparedModel final : public nn::IPreparedModel {
                           nn::SharedPreparedModel preparedModel);
 
     nn::SharedPreparedModel getPreparedModel() const;
-    nn::SharedPreparedModel recover(const nn::IPreparedModel* failingPreparedModel,
-                                    bool blocking) const;
+    nn::GeneralResult<nn::SharedPreparedModel> recover(
+            const nn::IPreparedModel* failingPreparedModel) const;
 
     nn::ExecutionResult<std::pair<std::vector<nn::OutputShape>, nn::Timing>> execute(
             const nn::Request& request, nn::MeasureTiming measure,
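A note on the interface change above: recover() now returns nn::GeneralResult<nn::SharedBuffer> (respectively nn::SharedPreparedModel) instead of a bare shared pointer, and drops the unused blocking flag, so a failed re-creation of the underlying object can report why it failed rather than silently handing back a stale pointer. Below is a minimal standalone sketch of that shape, using C++23 std::expected as a stand-in for nn::GeneralResult; the names MiniError, MiniResult, Buffer, and the recover() signature here are hypothetical illustrations, not part of this CL.

#include <expected>
#include <iostream>
#include <memory>

enum class MiniError { DeadObject, GeneralFailure };  // stand-in for nn::ErrorStatus
template <typename T>
using MiniResult = std::expected<T, MiniError>;       // stand-in for nn::GeneralResult<T>
struct Buffer {};                                     // stand-in for nn::IBuffer

// Old shape: a failed recovery could only be signalled with nullptr,
// losing the error code and message along the way.
//   std::shared_ptr<Buffer> recover(const Buffer* failing, bool blocking);

// New shape: failure carries a structured error the caller can propagate.
MiniResult<std::shared_ptr<Buffer>> recover(bool factoryWorks) {
    if (!factoryWorks) {
        // Imagine the stored factory (kMakeBuffer) failing here.
        return std::unexpected(MiniError::GeneralFailure);
    }
    return std::make_shared<Buffer>();
}

int main() {
    std::cout << recover(true).has_value() << ' '     // prints 1
              << recover(false).has_value() << '\n';  // prints 0
}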
diff --git a/neuralnetworks/utils/common/src/ResilientBuffer.cpp b/neuralnetworks/utils/common/src/ResilientBuffer.cpp
index cf5496ac39..47abbe268f 100644
--- a/neuralnetworks/utils/common/src/ResilientBuffer.cpp
+++ b/neuralnetworks/utils/common/src/ResilientBuffer.cpp
@@ -20,6 +20,7 @@
 #include <android-base/thread_annotations.h>
 #include <nnapi/IBuffer.h>
 #include <nnapi/Result.h>
+#include <nnapi/TypeUtils.h>
 #include <nnapi/Types.h>
 
 #include <functional>
@@ -29,6 +30,34 @@
 #include <vector>
 
 namespace android::hardware::neuralnetworks::utils {
+namespace {
+
+template <typename FnType>
+auto protect(const ResilientBuffer& resilientBuffer, const FnType& fn)
+        -> decltype(fn(*resilientBuffer.getBuffer())) {
+    auto buffer = resilientBuffer.getBuffer();
+    auto result = fn(*buffer);
+
+    // Immediately return if the buffer is not dead.
+    if (result.has_value() || result.error().code != nn::ErrorStatus::DEAD_OBJECT) {
+        return result;
+    }
+
+    // Attempt recovery and return if it fails.
+    auto maybeBuffer = resilientBuffer.recover(buffer.get());
+    if (!maybeBuffer.has_value()) {
+        const auto& [resultErrorMessage, resultErrorCode] = result.error();
+        const auto& [recoveryErrorMessage, recoveryErrorCode] = maybeBuffer.error();
+        return nn::error(resultErrorCode)
+               << resultErrorMessage << ", and failed to recover dead buffer with error "
+               << recoveryErrorCode << ": " << recoveryErrorMessage;
+    }
+    buffer = std::move(maybeBuffer).value();
+
+    return fn(*buffer);
+}
+
+}  // namespace
 
 nn::GeneralResult<std::shared_ptr<const ResilientBuffer>> ResilientBuffer::create(
         Factory makeBuffer) {
@@ -53,9 +82,16 @@ nn::SharedBuffer ResilientBuffer::getBuffer() const {
     std::lock_guard guard(mMutex);
     return mBuffer;
 }
-nn::SharedBuffer ResilientBuffer::recover(const nn::IBuffer* /*failingBuffer*/,
-                                          bool /*blocking*/) const {
+nn::GeneralResult<nn::SharedBuffer> ResilientBuffer::recover(
+        const nn::IBuffer* failingBuffer) const {
     std::lock_guard guard(mMutex);
+
+    // Another caller already updated the failing buffer.
+    if (mBuffer.get() != failingBuffer) {
+        return mBuffer;
+    }
+
+    mBuffer = NN_TRY(kMakeBuffer());
     return mBuffer;
 }
 
@@ -64,12 +100,16 @@ nn::Request::MemoryDomainToken ResilientBuffer::getToken() const {
 }
 
 nn::GeneralResult<void> ResilientBuffer::copyTo(const nn::Memory& dst) const {
-    return getBuffer()->copyTo(dst);
+    const auto fn = [&dst](const nn::IBuffer& buffer) { return buffer.copyTo(dst); };
+    return protect(*this, fn);
 }
 
 nn::GeneralResult<void> ResilientBuffer::copyFrom(const nn::Memory& src,
                                                   const nn::Dimensions& dimensions) const {
-    return getBuffer()->copyFrom(src, dimensions);
+    const auto fn = [&src, &dimensions](const nn::IBuffer& buffer) {
+        return buffer.copyFrom(src, dimensions);
+    };
+    return protect(*this, fn);
 }
 
 }  // namespace android::hardware::neuralnetworks::utils
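The protect() helper added above implements a retry-once policy: invoke the call, and only if it failed with DEAD_OBJECT, recover the buffer and invoke the call one more time; any other error is returned untouched. Below is a condensed, self-contained sketch of the same flow, again using C++23 std::expected; Err, Buffer, copyToSomewhere, and protectedCopy are hypothetical stand-ins, not the real NN types.

#include <expected>
#include <iostream>
#include <memory>

enum class Err { DeadObject, General };

struct Buffer {
    bool dead = false;
    std::expected<void, Err> copyToSomewhere() const {
        if (dead) return std::unexpected(Err::DeadObject);
        return {};
    }
};

std::expected<void, Err> protectedCopy(std::shared_ptr<Buffer>& buf) {
    auto result = buf->copyToSomewhere();
    // Immediately return on success or on any non-fatal error.
    if (result.has_value() || result.error() != Err::DeadObject) return result;
    // Attempt recovery (imagine kMakeBuffer() here), then retry exactly once.
    buf = std::make_shared<Buffer>();
    return buf->copyToSomewhere();
}

int main() {
    auto buf = std::make_shared<Buffer>();
    buf->dead = true;  // simulate the driver process dying
    std::cout << (protectedCopy(buf).has_value() ? "recovered" : "failed") << '\n';
}

Retrying exactly once keeps a persistently crashing driver from turning into an infinite loop: if the retried call dies again, that second error simply surfaces to the caller.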
diff --git a/neuralnetworks/utils/common/src/ResilientDevice.cpp b/neuralnetworks/utils/common/src/ResilientDevice.cpp
index 6ad3fadee6..2023c9af30 100644
--- a/neuralnetworks/utils/common/src/ResilientDevice.cpp
+++ b/neuralnetworks/utils/common/src/ResilientDevice.cpp
@@ -180,6 +180,7 @@ nn::GeneralResult<nn::SharedPreparedModel> ResilientDevice::prepareModel(
         const nn::Model& model, nn::ExecutionPreference preference, nn::Priority priority,
         nn::OptionalTimePoint deadline, const std::vector<nn::SharedHandle>& modelCache,
         const std::vector<nn::SharedHandle>& dataCache, const nn::CacheToken& token) const {
+#if 0
     auto self = shared_from_this();
     ResilientPreparedModel::Factory makePreparedModel =
             [device = std::move(self), model, preference, priority, deadline, modelCache,
@@ -188,29 +189,41 @@ nn::GeneralResult<nn::SharedPreparedModel> ResilientDevice::prepareModel(
                                                     dataCache, token);
             };
     return ResilientPreparedModel::create(std::move(makePreparedModel));
+#else
+    return prepareModelInternal(model, preference, priority, deadline, modelCache, dataCache,
+                                token);
+#endif
 }
 
 nn::GeneralResult<nn::SharedPreparedModel> ResilientDevice::prepareModelFromCache(
         nn::OptionalTimePoint deadline, const std::vector<nn::SharedHandle>& modelCache,
         const std::vector<nn::SharedHandle>& dataCache, const nn::CacheToken& token) const {
+#if 0
     auto self = shared_from_this();
     ResilientPreparedModel::Factory makePreparedModel =
             [device = std::move(self), deadline, modelCache, dataCache, token] {
                return device->prepareModelFromCacheInternal(deadline, modelCache, dataCache, token);
            };
     return ResilientPreparedModel::create(std::move(makePreparedModel));
+#else
+    return prepareModelFromCacheInternal(deadline, modelCache, dataCache, token);
+#endif
 }
 
 nn::GeneralResult<nn::SharedBuffer> ResilientDevice::allocate(
         const nn::BufferDesc& desc, const std::vector<nn::SharedPreparedModel>& preparedModels,
         const std::vector<nn::BufferRole>& inputRoles,
         const std::vector<nn::BufferRole>& outputRoles) const {
+#if 0
    auto self = shared_from_this();
    ResilientBuffer::Factory makeBuffer =
            [device = std::move(self), desc, preparedModels, inputRoles, outputRoles] {
                return device->allocateInternal(desc, preparedModels, inputRoles, outputRoles);
            };
    return ResilientBuffer::create(std::move(makeBuffer));
+#else
+    return allocateInternal(desc, preparedModels, inputRoles, outputRoles);
+#endif
 }
 
 bool ResilientDevice::isValidInternal() const {
@@ -225,8 +238,8 @@ nn::GeneralResult<nn::SharedPreparedModel> ResilientDevice::prepareModelInternal(
     if (!isValidInternal()) {
         return std::make_shared<const InvalidPreparedModel>();
     }
-    const auto fn = [&model, preference, priority, deadline, &modelCache, &dataCache,
-                     token](const nn::IDevice& device) {
+    const auto fn = [&model, preference, priority, &deadline, &modelCache, &dataCache,
+                     &token](const nn::IDevice& device) {
         return device.prepareModel(model, preference, priority, deadline, modelCache, dataCache,
                                    token);
     };
@@ -239,7 +252,7 @@ nn::GeneralResult<nn::SharedPreparedModel> ResilientDevice::prepareModelFromCach
     if (!isValidInternal()) {
         return std::make_shared<const InvalidPreparedModel>();
     }
-    const auto fn = [deadline, &modelCache, &dataCache, token](const nn::IDevice& device) {
+    const auto fn = [&deadline, &modelCache, &dataCache, &token](const nn::IDevice& device) {
         return device.prepareModelFromCache(deadline, modelCache, dataCache, token);
     };
     return protect(*this, fn, /*blocking=*/false);
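Beyond the #if 0 gating, the last two hunks above also change the internal lambdas to capture deadline and token by reference (&deadline, &token). This is safe because protect() invokes the lambda synchronously, before prepareModelInternal() returns, so the referenced parameters outlive every use; capturing by reference merely avoids needless copies. A small sketch of that reasoning follows; protect() here is a hypothetical stand-in that only preserves the synchronous-call property, and useToken is invented for illustration.

#include <iostream>
#include <string>

// Like the real protect(): runs fn before returning, so fn never outlives
// the caller's stack frame.
template <typename Fn>
auto protect(const Fn& fn) {
    return fn();
}

int useToken(const std::string& token) {
    // Safe: fn is only invoked synchronously above, so &token cannot dangle.
    const auto fn = [&token] { return static_cast<int>(token.size()); };
    return protect(fn);
}

int main() {
    std::cout << useToken("cache-token") << '\n';  // prints 11
}

By contrast, the factory lambdas in the #if 0 paths still capture by value: they are stored inside ResilientPreparedModel/ResilientBuffer for later recovery and therefore do outlive the enclosing call.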
diff --git a/neuralnetworks/utils/common/src/ResilientPreparedModel.cpp b/neuralnetworks/utils/common/src/ResilientPreparedModel.cpp
index b8acee16c9..667df2b590 100644
--- a/neuralnetworks/utils/common/src/ResilientPreparedModel.cpp
+++ b/neuralnetworks/utils/common/src/ResilientPreparedModel.cpp
@@ -20,15 +20,45 @@
 #include <android-base/thread_annotations.h>
 #include <nnapi/IPreparedModel.h>
 #include <nnapi/Result.h>
+#include <nnapi/TypeUtils.h>
 #include <nnapi/Types.h>
 
 #include <functional>
 #include <memory>
 #include <mutex>
+#include <sstream>
 #include <utility>
 #include <vector>
 
 namespace android::hardware::neuralnetworks::utils {
+namespace {
+
+template <typename FnType>
+auto protect(const ResilientPreparedModel& resilientPreparedModel, const FnType& fn)
+        -> decltype(fn(*resilientPreparedModel.getPreparedModel())) {
+    auto preparedModel = resilientPreparedModel.getPreparedModel();
+    auto result = fn(*preparedModel);
+
+    // Immediately return if the prepared model is not dead.
+    if (result.has_value() || result.error().code != nn::ErrorStatus::DEAD_OBJECT) {
+        return result;
+    }
+
+    // Attempt recovery and return if it fails.
+    auto maybePreparedModel = resilientPreparedModel.recover(preparedModel.get());
+    if (!maybePreparedModel.has_value()) {
+        const auto& [message, code] = maybePreparedModel.error();
+        std::ostringstream oss;
+        oss << ", and failed to recover dead prepared model with error " << code << ": " << message;
+        result.error().message += oss.str();
+        return result;
+    }
+    preparedModel = std::move(maybePreparedModel).value();
+
+    return fn(*preparedModel);
+}
+
+}  // namespace
 
 nn::GeneralResult<std::shared_ptr<const ResilientPreparedModel>> ResilientPreparedModel::create(
         Factory makePreparedModel) {
@@ -55,9 +85,16 @@ nn::SharedPreparedModel ResilientPreparedModel::getPreparedModel() const {
     return mPreparedModel;
 }
 
-nn::SharedPreparedModel ResilientPreparedModel::recover(
-        const nn::IPreparedModel* /*failingPreparedModel*/, bool /*blocking*/) const {
+nn::GeneralResult<nn::SharedPreparedModel> ResilientPreparedModel::recover(
+        const nn::IPreparedModel* failingPreparedModel) const {
     std::lock_guard guard(mMutex);
+
+    // Another caller already updated the failing prepared model.
+    if (mPreparedModel.get() != failingPreparedModel) {
+        return mPreparedModel;
+    }
+
+    mPreparedModel = NN_TRY(kMakePreparedModel());
     return mPreparedModel;
 }
 
@@ -65,7 +102,11 @@ nn::ExecutionResult<std::pair<std::vector<nn::OutputShape>, nn::Timing>>
 ResilientPreparedModel::execute(const nn::Request& request, nn::MeasureTiming measure,
                                 const nn::OptionalTimePoint& deadline,
                                 const nn::OptionalDuration& loopTimeoutDuration) const {
-    return getPreparedModel()->execute(request, measure, deadline, loopTimeoutDuration);
+    const auto fn = [&request, measure, &deadline,
+                     &loopTimeoutDuration](const nn::IPreparedModel& preparedModel) {
+        return preparedModel.execute(request, measure, deadline, loopTimeoutDuration);
+    };
+    return protect(*this, fn);
 }
 
 nn::GeneralResult<std::pair<nn::SyncFence, nn::ExecuteFencedInfoCallback>>
@@ -75,8 +116,12 @@ ResilientPreparedModel::executeFenced(const nn::Request& request,
         const std::vector<nn::SyncFence>& waitFor, nn::MeasureTiming measure,
         const nn::OptionalTimePoint& deadline, const nn::OptionalDuration& loopTimeoutDuration,
         const nn::OptionalDuration& timeoutDurationAfterFence) const {
-    return getPreparedModel()->executeFenced(request, waitFor, measure, deadline,
-                                             loopTimeoutDuration, timeoutDurationAfterFence);
+    const auto fn = [&request, &waitFor, measure, &deadline, &loopTimeoutDuration,
+                     &timeoutDurationAfterFence](const nn::IPreparedModel& preparedModel) {
+        return preparedModel.executeFenced(request, waitFor, measure, deadline, loopTimeoutDuration,
+                                           timeoutDurationAfterFence);
+    };
+    return protect(*this, fn);
 }
 
 std::any ResilientPreparedModel::getUnderlyingResource() const {
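The pointer comparison in recover() above is what makes concurrent recovery cheap: when several callers all observe the same dead object, only the first one to take the mutex re-creates it through the stored factory; the rest see that mPreparedModel no longer matches their failing pointer and simply adopt the replacement. Below is a self-contained sketch of that double-checked update; Model and Resilient are hypothetical stand-ins, and make_shared plays the role of the kMakePreparedModel factory.

#include <iostream>
#include <memory>
#include <mutex>

struct Model {};

class Resilient {
  public:
    std::shared_ptr<Model> get() const {
        std::lock_guard guard(mMutex);
        return mModel;
    }

    std::shared_ptr<Model> recover(const Model* failing) const {
        std::lock_guard guard(mMutex);
        // Another caller already replaced the failing object: reuse its work
        // instead of paying for a second, redundant re-creation.
        if (mModel.get() != failing) {
            return mModel;
        }
        mModel = std::make_shared<Model>();  // imagine kMakePreparedModel() here
        return mModel;
    }

  private:
    mutable std::mutex mMutex;
    mutable std::shared_ptr<Model> mModel = std::make_shared<Model>();
};

int main() {
    Resilient r;
    auto stale = r.get();
    auto fresh1 = r.recover(stale.get());     // first caller: re-creates the object
    auto fresh2 = r.recover(stale.get());     // second caller: adopts fresh1 instead
    std::cout << (fresh1 == fresh2) << '\n';  // prints 1
}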