#include <cmath>
#include <codecvt>

#include <d3dcompiler.h>
#include <directxmath.h>

extern "C" {
#include <libavcodec/avcodec.h>
#include <libavutil/hwcontext_d3d11va.h>
}

#include "display.h"
#include "src/main.h"
#include "src/video.h"

#define SUNSHINE_SHADERS_DIR SUNSHINE_ASSETS_DIR "/shaders/directx"

namespace platf {
using namespace std::literals;
}

// Free an AVFrame; used as the deleter for frame_t below
static void free_frame(AVFrame *frame) {
  av_frame_free(&frame);
}

using frame_t = util::safe_ptr<AVFrame, free_frame>;

namespace platf::dxgi {

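// Upload a CPU-side struct into an immutable D3D11 constant buffer.
// Usage sketch: auto buf = make_buffer(device.get(), ::video::colors[0]);
// Returns nullptr on failure.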
template<class T>
buf_t make_buffer(device_t::pointer device, const T &t) {
  static_assert(sizeof(T) % 16 == 0, "Constant buffer size must be a multiple of 16 bytes");

  D3D11_BUFFER_DESC buffer_desc {
    sizeof(T),
    D3D11_USAGE_IMMUTABLE,
    D3D11_BIND_CONSTANT_BUFFER
  };

  D3D11_SUBRESOURCE_DATA init_data {
    &t
  };

  buf_t::pointer buf_p;
  auto status = device->CreateBuffer(&buffer_desc, &init_data, &buf_p);
  if(status) {
    BOOST_LOG(error) << "Failed to create buffer: [0x"sv << util::hex(status).to_string_view() << ']';
    return nullptr;
  }

  return buf_t { buf_p };
}

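// Create a blend state for cursor compositing: blending disabled (opaque copy),
// regular alpha blending, or color inversion for XOR-style cursors.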
blend_t make_blend(device_t::pointer device, bool enable, bool invert) {
  D3D11_BLEND_DESC bdesc {};
  auto &rt = bdesc.RenderTarget[0];
  rt.BlendEnable = enable;
  rt.RenderTargetWriteMask = D3D11_COLOR_WRITE_ENABLE_ALL;

  if(enable) {
    rt.BlendOp = D3D11_BLEND_OP_ADD;
    rt.BlendOpAlpha = D3D11_BLEND_OP_ADD;

    if(invert) {
      // Invert colors
      rt.SrcBlend = D3D11_BLEND_INV_DEST_COLOR;
      rt.DestBlend = D3D11_BLEND_INV_SRC_COLOR;
    }
    else {
      // Regular alpha blending
      rt.SrcBlend = D3D11_BLEND_SRC_ALPHA;
      rt.DestBlend = D3D11_BLEND_INV_SRC_ALPHA;
    }

    rt.SrcBlendAlpha = D3D11_BLEND_ZERO;
    rt.DestBlendAlpha = D3D11_BLEND_ZERO;
  }

  blend_t blend;
  auto status = device->CreateBlendState(&bdesc, &blend);
  if(status) {
    BOOST_LOG(error) << "Failed to create blend state: [0x"sv << util::hex(status).to_string_view() << ']';
    return nullptr;
  }

  return blend;
}

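// Compiled shader bytecode, populated once by init() at the bottom of this file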
blob_t convert_UV_vs_hlsl;
blob_t convert_UV_ps_hlsl;
blob_t scene_vs_hlsl;
blob_t convert_Y_ps_hlsl;
blob_t scene_ps_hlsl;

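// An image backed by a keyed-mutex shared texture: written by the capture
// device, then opened and sampled by the encoder device via a shared handle.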
struct img_d3d_t : public platf::img_t {
  std::shared_ptr<platf::display_t> display;

  // These objects are owned by the display_t's ID3D11Device
  texture2d_t capture_texture;
  render_target_t capture_rt;
  keyed_mutex_t capture_mutex;

  // These objects are owned by the hwdevice_t's ID3D11Device
  texture2d_t encoder_texture;
  shader_res_t encoder_input_res;
  keyed_mutex_t encoder_mutex;

  // This is the shared handle used by hwdevice_t to open capture_texture
  HANDLE encoder_texture_handle = {};

  bool dummy = false;

  virtual ~img_d3d_t() override {
    if(encoder_texture_handle) {
      CloseHandle(encoder_texture_handle);
    }
  }
};

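// Build the XOR-blended layer of the cursor image. Only pixels that invert the
// screen beneath them survive; everything else is made transparent and is
// handled by the alpha-blended layer instead.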
util::buffer_t<std::uint8_t> make_cursor_xor_image(const util::buffer_t<std::uint8_t> &img_data, DXGI_OUTDUPL_POINTER_SHAPE_INFO shape_info) {
  constexpr std::uint32_t inverted = 0xFFFFFFFF;
  constexpr std::uint32_t transparent = 0;

  switch(shape_info.Type) {
  case DXGI_OUTDUPL_POINTER_SHAPE_TYPE_COLOR:
    // This type doesn't require any XOR-blending
    return {};
  case DXGI_OUTDUPL_POINTER_SHAPE_TYPE_MASKED_COLOR: {
    util::buffer_t<std::uint8_t> cursor_img = img_data;
    std::for_each((std::uint32_t *)std::begin(cursor_img), (std::uint32_t *)std::end(cursor_img), [](auto &pixel) {
      auto alpha = (std::uint8_t)((pixel >> 24) & 0xFF);
      if(alpha == 0xFF) {
        // Pixels with 0xFF alpha will be XOR-blended as is.
      }
      else if(alpha == 0x00) {
        // Pixels with 0x00 alpha will be blended by make_cursor_alpha_image().
        // We make them transparent for the XOR-blended cursor image.
        pixel = transparent;
      }
      else {
        // Other alpha values are illegal in masked color cursors
        BOOST_LOG(warning) << "Illegal alpha value in masked color cursor: " << (int)alpha;
      }
    });
    return cursor_img;
  }
  case DXGI_OUTDUPL_POINTER_SHAPE_TYPE_MONOCHROME:
    // Monochrome is handled below
    break;
  default:
    BOOST_LOG(error) << "Invalid cursor shape type: " << shape_info.Type;
    return {};
  }

  // Monochrome shapes stack the AND mask on top of the XOR mask, so the
  // reported height covers both masks
  shape_info.Height /= 2;

  util::buffer_t<std::uint8_t> cursor_img { shape_info.Width * shape_info.Height * 4 };

  auto bytes = shape_info.Pitch * shape_info.Height;
  auto pixel_begin = (std::uint32_t *)std::begin(cursor_img);
  auto pixel_data = pixel_begin;
  auto and_mask = std::begin(img_data);
  auto xor_mask = std::begin(img_data) + bytes;

  for(auto x = 0; x < bytes; ++x) {
    for(auto c = 7; c >= 0; --c) {
      auto bit = 1 << c;
      auto color_type = ((*and_mask & bit) ? 1 : 0) + ((*xor_mask & bit) ? 2 : 0);

      switch(color_type) {
      case 0: // Opaque black (handled by alpha-blending)
      case 2: // Opaque white (handled by alpha-blending)
      case 1: // Color of screen (transparent)
        *pixel_data = transparent;
        break;
      case 3: // Inverse of screen
        *pixel_data = inverted;
        break;
      }

      ++pixel_data;
    }
    ++and_mask;
    ++xor_mask;
  }

  return cursor_img;
}

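// Build the alpha-blended layer of the cursor image: opaque black/white pixels
// for monochrome cursors, plain ARGB for color cursors. Screen-inverting
// pixels are left to make_cursor_xor_image().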
util::buffer_t<std::uint8_t> make_cursor_alpha_image(const util::buffer_t<std::uint8_t> &img_data, DXGI_OUTDUPL_POINTER_SHAPE_INFO shape_info) {
  constexpr std::uint32_t black = 0xFF000000;
  constexpr std::uint32_t white = 0xFFFFFFFF;
  constexpr std::uint32_t transparent = 0;

  switch(shape_info.Type) {
  case DXGI_OUTDUPL_POINTER_SHAPE_TYPE_MASKED_COLOR: {
    util::buffer_t<std::uint8_t> cursor_img = img_data;
    std::for_each((std::uint32_t *)std::begin(cursor_img), (std::uint32_t *)std::end(cursor_img), [](auto &pixel) {
      auto alpha = (std::uint8_t)((pixel >> 24) & 0xFF);
      if(alpha == 0xFF) {
        // Pixels with 0xFF alpha will be XOR-blended by make_cursor_xor_image().
        // We make them transparent for the alpha-blended cursor image.
        pixel = transparent;
      }
      else if(alpha == 0x00) {
        // Pixels with 0x00 alpha will be blended as opaque with the alpha-blended image.
        pixel |= 0xFF000000;
      }
      else {
        // Other alpha values are illegal in masked color cursors
        BOOST_LOG(warning) << "Illegal alpha value in masked color cursor: " << (int)alpha;
      }
    });
    return cursor_img;
  }
  case DXGI_OUTDUPL_POINTER_SHAPE_TYPE_COLOR:
    // Color cursors are just an ARGB bitmap which requires no processing.
    return img_data;
  case DXGI_OUTDUPL_POINTER_SHAPE_TYPE_MONOCHROME:
    // Monochrome cursors are handled below.
    break;
  default:
    BOOST_LOG(error) << "Invalid cursor shape type: " << shape_info.Type;
    return {};
  }

  // Monochrome shapes stack the AND mask on top of the XOR mask, so the
  // reported height covers both masks
  shape_info.Height /= 2;

  util::buffer_t<std::uint8_t> cursor_img { shape_info.Width * shape_info.Height * 4 };

  auto bytes = shape_info.Pitch * shape_info.Height;
  auto pixel_begin = (std::uint32_t *)std::begin(cursor_img);
  auto pixel_data = pixel_begin;
  auto and_mask = std::begin(img_data);
  auto xor_mask = std::begin(img_data) + bytes;

  for(auto x = 0; x < bytes; ++x) {
    for(auto c = 7; c >= 0; --c) {
      auto bit = 1 << c;
      auto color_type = ((*and_mask & bit) ? 1 : 0) + ((*xor_mask & bit) ? 2 : 0);

      switch(color_type) {
      case 0: // Opaque black
        *pixel_data = black;
        break;
      case 2: // Opaque white
        *pixel_data = white;
        break;
      case 3: // Inverse of screen (handled by XOR blending)
      case 1: // Color of screen (transparent)
        *pixel_data = transparent;
        break;
      }

      ++pixel_data;
    }
    ++and_mask;
    ++xor_mask;
  }

  return cursor_img;
}

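// Compile an HLSL source file at runtime, logging any compiler diagnostics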
blob_t compile_shader(LPCSTR file, LPCSTR entrypoint, LPCSTR shader_model) {
  blob_t::pointer msg_p = nullptr;
  blob_t::pointer compiled_p;

  DWORD flags = D3DCOMPILE_ENABLE_STRICTNESS;

#ifndef NDEBUG
  flags |= D3DCOMPILE_DEBUG | D3DCOMPILE_SKIP_OPTIMIZATION;
#endif
  std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>, wchar_t> converter;

  auto wFile = converter.from_bytes(file);
  auto status = D3DCompileFromFile(wFile.c_str(), nullptr, nullptr, entrypoint, shader_model, flags, 0, &compiled_p, &msg_p);

  if(msg_p) {
    BOOST_LOG(warning) << std::string_view { (const char *)msg_p->GetBufferPointer(), msg_p->GetBufferSize() - 1 };
    msg_p->Release();
  }

  if(status) {
    BOOST_LOG(error) << "Couldn't compile ["sv << file << "] [0x"sv << util::hex(status).to_string_view() << ']';
    return nullptr;
  }

  return blob_t { compiled_p };
}

blob_t compile_pixel_shader(LPCSTR file) {
  return compile_shader(file, "main_ps", "ps_5_0");
}

blob_t compile_vertex_shader(LPCSTR file) {
  return compile_shader(file, "main_vs", "vs_5_0");
}

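// GPU color converter: renders captured frames into NV12/P010 textures on a
// dedicated encoder-side D3D11 device, using shaders built from the blobs above.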
class hwdevice_t : public platf::hwdevice_t {
public:
  int convert(platf::img_t &img_base) override {
    auto &img = (img_d3d_t &)img_base;
    auto back_d3d_img = (img_d3d_t *)back_img.get();

    // Open the shared capture texture with our ID3D11Device
    if(share_img(&img_base)) {
      return -1;
    }

    // Acquire encoder mutex to synchronize with capture code
    auto status = img.encoder_mutex->AcquireSync(0, INFINITE);
    if(status != S_OK) {
      BOOST_LOG(error) << "Failed to acquire encoder mutex [0x"sv << util::hex(status).to_string_view() << ']';
      return -1;
    }

    // Even though this image will never have racing updates, we must acquire the
    // keyed mutex for PSSetShaderResources() to succeed.
    status = back_d3d_img->encoder_mutex->AcquireSync(0, INFINITE);
    if(status != S_OK) {
      img.encoder_mutex->ReleaseSync(0);
      BOOST_LOG(error) << "Failed to acquire back_d3d_img mutex [0x"sv << util::hex(status).to_string_view() << ']';
      return -1;
    }

    device_ctx->IASetInputLayout(input_layout.get());

    // Render the Y plane: the black background first, then the captured image
    _init_view_port(this->img.width, this->img.height);
    device_ctx->OMSetRenderTargets(1, &nv12_Y_rt, nullptr);
    device_ctx->VSSetShader(scene_vs.get(), nullptr, 0);
    device_ctx->PSSetShader(convert_Y_ps.get(), nullptr, 0);
    device_ctx->PSSetShaderResources(0, 1, &back_d3d_img->encoder_input_res);
    device_ctx->Draw(3, 0);

    device_ctx->RSSetViewports(1, &outY_view);
    device_ctx->PSSetShaderResources(0, 1, &img.encoder_input_res);
    device_ctx->Draw(3, 0);

    // Artifacts start appearing on the rendered image if Sunshine doesn't flush
    // before rendering on the UV part of the image.
    device_ctx->Flush();

    // Render the UV plane at half resolution
    _init_view_port(this->img.width / 2, this->img.height / 2);
    device_ctx->OMSetRenderTargets(1, &nv12_UV_rt, nullptr);
    device_ctx->VSSetShader(convert_UV_vs.get(), nullptr, 0);
    device_ctx->PSSetShader(convert_UV_ps.get(), nullptr, 0);
    device_ctx->PSSetShaderResources(0, 1, &back_d3d_img->encoder_input_res);
    device_ctx->Draw(3, 0);

    device_ctx->RSSetViewports(1, &outUV_view);
    device_ctx->PSSetShaderResources(0, 1, &img.encoder_input_res);
    device_ctx->Draw(3, 0);
    device_ctx->Flush();

    // Release encoder mutexes to allow capture code to reuse this image
    back_d3d_img->encoder_mutex->ReleaseSync(0);
    img.encoder_mutex->ReleaseSync(0);

    return 0;
  }

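  // Select the YUV conversion matrix for the requested colorspace. The color
  // table stores the limited- and full-range variants in adjacent entries,
  // which is why full range simply advances the pointer by one.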
  void set_colorspace(std::uint32_t colorspace, std::uint32_t color_range) override {
    switch(colorspace) {
    case 5: // SWS_CS_SMPTE170M
      color_p = &::video::colors[0];
      break;
    case 1: // SWS_CS_ITU709
      color_p = &::video::colors[2];
      break;
    case 9: // SWS_CS_BT2020
      color_p = &::video::colors[4];
      break;
    default:
      BOOST_LOG(warning) << "Colorspace: ["sv << colorspace << "] not yet supported: switching to default"sv;
      color_p = &::video::colors[0];
    }

    if(color_range > 1) {
      // Full range
      ++color_p;
    }

    auto color_matrix = make_buffer((device_t::pointer)data, *color_p);
    if(!color_matrix) {
      BOOST_LOG(warning) << "Failed to create color matrix"sv;
      return;
    }

    device_ctx->VSSetConstantBuffers(0, 1, &info_scene);
    device_ctx->PSSetConstantBuffers(0, 1, &color_matrix);
    this->color_matrix = std::move(color_matrix);
  }

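  // Bind an AVFrame to this device: create the NV12/P010 render target,
  // size the letterboxed viewports to preserve the aspect ratio, and point
  // the frame's D3D11VA descriptor at the new texture.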
  int set_frame(AVFrame *frame) {
    this->hwframe.reset(frame);
    this->frame = frame;

    auto out_width = frame->width;
    auto out_height = frame->height;

    float in_width = img.display->width;
    float in_height = img.display->height;

    // Ensure aspect ratio is maintained
    auto scalar = std::fminf(out_width / in_width, out_height / in_height);
    auto out_width_f = in_width * scalar;
    auto out_height_f = in_height * scalar;

    // The resulting offsets are always non-negative
    auto offsetX = (out_width - out_width_f) / 2;
    auto offsetY = (out_height - out_height_f) / 2;

    outY_view = D3D11_VIEWPORT { offsetX, offsetY, out_width_f, out_height_f, 0.0f, 1.0f };
    outUV_view = D3D11_VIEWPORT { offsetX / 2, offsetY / 2, out_width_f / 2, out_height_f / 2, 0.0f, 1.0f };

    D3D11_TEXTURE2D_DESC t {};
    t.Width = out_width;
    t.Height = out_height;
    t.MipLevels = 1;
    t.ArraySize = 1;
    t.SampleDesc.Count = 1;
    t.Usage = D3D11_USAGE_DEFAULT;
    t.Format = format;
    t.BindFlags = D3D11_BIND_RENDER_TARGET;

    auto status = device->CreateTexture2D(&t, nullptr, &img.encoder_texture);
    if(FAILED(status)) {
      BOOST_LOG(error) << "Failed to create render target texture [0x"sv << util::hex(status).to_string_view() << ']';
      return -1;
    }

    img.width = out_width;
    img.height = out_height;
    img.data = (std::uint8_t *)img.encoder_texture.get();
    img.row_pitch = out_width * 4;
    img.pixel_pitch = 4;

    float info_in[16 / sizeof(float)] { 1.0f / (float)out_width_f }; // aligned to 16 bytes
    info_scene = make_buffer(device.get(), info_in);

    if(!info_scene) {
      BOOST_LOG(error) << "Failed to create info scene buffer"sv;
      return -1;
    }

    D3D11_RENDER_TARGET_VIEW_DESC nv12_rt_desc {
      format == DXGI_FORMAT_P010 ? DXGI_FORMAT_R16_UNORM : DXGI_FORMAT_R8_UNORM,
      D3D11_RTV_DIMENSION_TEXTURE2D
    };

    status = device->CreateRenderTargetView(img.encoder_texture.get(), &nv12_rt_desc, &nv12_Y_rt);
    if(FAILED(status)) {
      BOOST_LOG(error) << "Failed to create render target view [0x"sv << util::hex(status).to_string_view() << ']';
      return -1;
    }

    nv12_rt_desc.Format = (format == DXGI_FORMAT_P010) ? DXGI_FORMAT_R16G16_UNORM : DXGI_FORMAT_R8G8_UNORM;

    status = device->CreateRenderTargetView(img.encoder_texture.get(), &nv12_rt_desc, &nv12_UV_rt);
    if(FAILED(status)) {
      BOOST_LOG(error) << "Failed to create render target view [0x"sv << util::hex(status).to_string_view() << ']';
      return -1;
    }

    // Need to have something refcounted
    if(!frame->buf[0]) {
      frame->buf[0] = av_buffer_allocz(sizeof(AVD3D11FrameDescriptor));
    }

    auto desc = (AVD3D11FrameDescriptor *)frame->buf[0]->data;
    desc->texture = (ID3D11Texture2D *)img.data;
    desc->index = 0;

    // For AV_PIX_FMT_D3D11, data[0] holds the texture and data[1] the array index
    frame->data[0] = img.data;
    frame->data[1] = 0;

    frame->linesize[0] = img.row_pitch;

    frame->height = img.height;
    frame->width = img.width;

    return 0;
  }

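  // Create the encoder-side D3D11 device on the given adapter and set up the
  // shaders, blend/sampler state, and black background image used by convert().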
  int init(
    std::shared_ptr<platf::display_t> display, adapter_t::pointer adapter_p,
    pix_fmt_e pix_fmt) {

    D3D_FEATURE_LEVEL featureLevels[] {
      D3D_FEATURE_LEVEL_11_1,
      D3D_FEATURE_LEVEL_11_0,
      D3D_FEATURE_LEVEL_10_1,
      D3D_FEATURE_LEVEL_10_0,
      D3D_FEATURE_LEVEL_9_3,
      D3D_FEATURE_LEVEL_9_2,
      D3D_FEATURE_LEVEL_9_1
    };

    HRESULT status = D3D11CreateDevice(
      adapter_p,
      D3D_DRIVER_TYPE_UNKNOWN,
      nullptr,
      D3D11_CREATE_DEVICE_FLAGS,
      featureLevels, sizeof(featureLevels) / sizeof(D3D_FEATURE_LEVEL),
      D3D11_SDK_VERSION,
      &device,
      nullptr,
      &device_ctx);

    if(FAILED(status)) {
      BOOST_LOG(error) << "Failed to create encoder D3D11 device [0x"sv << util::hex(status).to_string_view() << ']';
      return -1;
    }

    dxgi::dxgi_t dxgi;
    status = device->QueryInterface(IID_IDXGIDevice, (void **)&dxgi);
    if(FAILED(status)) {
      BOOST_LOG(warning) << "Failed to query DXGI interface from device [0x"sv << util::hex(status).to_string_view() << ']';
      return -1;
    }

    status = dxgi->SetGPUThreadPriority(7);
    if(FAILED(status)) {
      BOOST_LOG(warning) << "Failed to increase encoding GPU thread priority. Please run the application as administrator for optimal performance.";
    }

    data = device.get();

    format = (pix_fmt == pix_fmt_e::nv12 ? DXGI_FORMAT_NV12 : DXGI_FORMAT_P010);
    status = device->CreateVertexShader(scene_vs_hlsl->GetBufferPointer(), scene_vs_hlsl->GetBufferSize(), nullptr, &scene_vs);
    if(status) {
      BOOST_LOG(error) << "Failed to create scene vertex shader [0x"sv << util::hex(status).to_string_view() << ']';
      return -1;
    }

    status = device->CreatePixelShader(convert_Y_ps_hlsl->GetBufferPointer(), convert_Y_ps_hlsl->GetBufferSize(), nullptr, &convert_Y_ps);
    if(status) {
      BOOST_LOG(error) << "Failed to create convertY pixel shader [0x"sv << util::hex(status).to_string_view() << ']';
      return -1;
    }

    status = device->CreatePixelShader(convert_UV_ps_hlsl->GetBufferPointer(), convert_UV_ps_hlsl->GetBufferSize(), nullptr, &convert_UV_ps);
    if(status) {
      BOOST_LOG(error) << "Failed to create convertUV pixel shader [0x"sv << util::hex(status).to_string_view() << ']';
      return -1;
    }

    status = device->CreateVertexShader(convert_UV_vs_hlsl->GetBufferPointer(), convert_UV_vs_hlsl->GetBufferSize(), nullptr, &convert_UV_vs);
    if(status) {
      BOOST_LOG(error) << "Failed to create convertUV vertex shader [0x"sv << util::hex(status).to_string_view() << ']';
      return -1;
    }

    status = device->CreatePixelShader(scene_ps_hlsl->GetBufferPointer(), scene_ps_hlsl->GetBufferSize(), nullptr, &scene_ps);
    if(status) {
      BOOST_LOG(error) << "Failed to create scene pixel shader [0x"sv << util::hex(status).to_string_view() << ']';
      return -1;
    }

    color_matrix = make_buffer(device.get(), ::video::colors[0]);
    if(!color_matrix) {
      BOOST_LOG(error) << "Failed to create color matrix buffer"sv;
      return -1;
    }

    D3D11_INPUT_ELEMENT_DESC layout_desc {
      "SV_Position", 0, DXGI_FORMAT_R32G32B32_FLOAT, 0, 0, D3D11_INPUT_PER_VERTEX_DATA, 0
    };

    status = device->CreateInputLayout(
      &layout_desc, 1,
      convert_UV_vs_hlsl->GetBufferPointer(), convert_UV_vs_hlsl->GetBufferSize(),
      &input_layout);
    if(FAILED(status)) {
      BOOST_LOG(error) << "Failed to create input layout [0x"sv << util::hex(status).to_string_view() << ']';
      return -1;
    }

    img.display = std::move(display);

    // Use a black dummy image as the background so that the padding added to
    // preserve the aspect ratio is black
    back_img = img.display->alloc_img();
    if(img.display->dummy_img(back_img.get()) || share_img(back_img.get())) {
      BOOST_LOG(warning) << "Couldn't create an image to set background color to black"sv;
      return -1;
    }

    blend_disable = make_blend(device.get(), false, false);
    if(!blend_disable) {
      return -1;
    }

    D3D11_SAMPLER_DESC sampler_desc {};
    sampler_desc.Filter = D3D11_FILTER_MIN_MAG_MIP_LINEAR;
    sampler_desc.AddressU = D3D11_TEXTURE_ADDRESS_CLAMP;
    sampler_desc.AddressV = D3D11_TEXTURE_ADDRESS_CLAMP;
    sampler_desc.AddressW = D3D11_TEXTURE_ADDRESS_WRAP;
    sampler_desc.ComparisonFunc = D3D11_COMPARISON_NEVER;
    sampler_desc.MinLOD = 0;
    sampler_desc.MaxLOD = D3D11_FLOAT32_MAX;

    status = device->CreateSamplerState(&sampler_desc, &sampler_linear);
    if(FAILED(status)) {
      BOOST_LOG(error) << "Failed to create linear sampler state [0x"sv << util::hex(status).to_string_view() << ']';
      return -1;
    }

    device_ctx->IASetInputLayout(input_layout.get());
    device_ctx->PSSetConstantBuffers(0, 1, &color_matrix);
    device_ctx->VSSetConstantBuffers(0, 1, &info_scene);

    device_ctx->OMSetBlendState(blend_disable.get(), nullptr, 0xFFFFFFFFu);
    device_ctx->PSSetSamplers(0, 1, &sampler_linear);
    device_ctx->IASetPrimitiveTopology(D3D11_PRIMITIVE_TOPOLOGY_TRIANGLESTRIP);

    return 0;
  }

private:
  void _init_view_port(float x, float y, float width, float height) {
    D3D11_VIEWPORT view {
      x, y,
      width, height,
      0.0f, 1.0f
    };

    device_ctx->RSSetViewports(1, &view);
  }

  void _init_view_port(float width, float height) {
    _init_view_port(0.0f, 0.0f, width, height);
  }

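  // Open an image's shared capture texture on the encoder device, along with
  // the keyed mutex and shader resource view needed to sample from it.
  // Does nothing if the texture was already opened.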
  int share_img(platf::img_t *img_base) {
    auto img = (img_d3d_t *)img_base;

    // If we've already opened the shared texture, we're done
    if(img->encoder_texture) {
      return 0;
    }

    device1_t device1;
    auto status = device->QueryInterface(__uuidof(ID3D11Device1), (void **)&device1);
    if(FAILED(status)) {
      BOOST_LOG(error) << "Failed to query ID3D11Device1 [0x"sv << util::hex(status).to_string_view() << ']';
      return -1;
    }

    // Open a handle to the shared texture
    status = device1->OpenSharedResource1(img->encoder_texture_handle, __uuidof(ID3D11Texture2D), (void **)&img->encoder_texture);
    if(FAILED(status)) {
      BOOST_LOG(error) << "Failed to open shared image texture [0x"sv << util::hex(status).to_string_view() << ']';
      return -1;
    }

    // Get the keyed mutex to synchronize with the capture code
    status = img->encoder_texture->QueryInterface(__uuidof(IDXGIKeyedMutex), (void **)&img->encoder_mutex);
    if(FAILED(status)) {
      BOOST_LOG(error) << "Failed to query IDXGIKeyedMutex [0x"sv << util::hex(status).to_string_view() << ']';
      return -1;
    }

    // Create the SRV for the encoder texture
    status = device->CreateShaderResourceView(img->encoder_texture.get(), nullptr, &img->encoder_input_res);
    if(FAILED(status)) {
      BOOST_LOG(error) << "Failed to create shader resource view for encoding [0x"sv << util::hex(status).to_string_view() << ']';
      return -1;
    }

    return 0;
  }

public:
  frame_t hwframe;

  ::video::color_t *color_p;

  buf_t info_scene;
  buf_t color_matrix;

  input_layout_t input_layout;

  blend_t blend_disable;
  sampler_state_t sampler_linear;

  render_target_t nv12_Y_rt;
  render_target_t nv12_UV_rt;

  // The image referenced by hwframe; the converted output is rendered here
  img_d3d_t img;

  // Black dummy image used to clear the NV12 render target
  std::shared_ptr<img_t> back_img;

  vs_t convert_UV_vs;
  ps_t convert_UV_ps;
  ps_t convert_Y_ps;
  ps_t scene_ps;
  vs_t scene_vs;

  D3D11_VIEWPORT outY_view;
  D3D11_VIEWPORT outUV_view;

  DXGI_FORMAT format;

  device_t device;
  device_ctx_t device_ctx;
};

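// Main capture loop: pace frames to the configured delay, take a snapshot and
// hand the finished image to the callback, which supplies the next image.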
capture_e display_vram_t::capture(snapshot_cb_t &&snapshot_cb, std::shared_ptr<::platf::img_t> img, bool *cursor) {
  auto next_frame = std::chrono::steady_clock::now();

  while(img) {
    // Spin until the next frame is due
    auto now = std::chrono::steady_clock::now();
    while(next_frame > now) {
      now = std::chrono::steady_clock::now();
    }
    next_frame = now + delay;

    auto status = snapshot(img.get(), 1000ms, *cursor);
    switch(status) {
    case platf::capture_e::reinit:
    case platf::capture_e::error:
      return status;
    case platf::capture_e::timeout:
      img = snapshot_cb(img, false);
      std::this_thread::sleep_for(1ms);
      break;
    case platf::capture_e::ok:
      img = snapshot_cb(img, true);
      break;
    default:
      BOOST_LOG(error) << "Unrecognized capture status ["sv << (int)status << ']';
      return status;
    }
  }

  return capture_e::ok;
}

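// Upload one layer of the cursor (alpha or XOR) into an immutable GPU texture
// and create a shader resource view for compositing it onto captured frames.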
bool set_cursor_texture(device_t::pointer device, gpu_cursor_t &cursor, util::buffer_t<std::uint8_t> &&cursor_img, DXGI_OUTDUPL_POINTER_SHAPE_INFO &shape_info) {
  // An empty buffer means this layer of the cursor is unused; clear any existing texture
  if(cursor_img.size() == 0) {
    cursor.input_res.reset();
    cursor.set_texture(0, 0, nullptr);
    return true;
  }

  D3D11_SUBRESOURCE_DATA data {
    std::begin(cursor_img),
    4 * shape_info.Width,
    0
  };

  // Create texture for cursor
  D3D11_TEXTURE2D_DESC t {};
  t.Width = shape_info.Width;
  t.Height = cursor_img.size() / data.SysMemPitch;
  t.MipLevels = 1;
  t.ArraySize = 1;
  t.SampleDesc.Count = 1;
  t.Usage = D3D11_USAGE_IMMUTABLE;
  t.Format = DXGI_FORMAT_B8G8R8A8_UNORM;
  t.BindFlags = D3D11_BIND_SHADER_RESOURCE;

  texture2d_t texture;
  auto status = device->CreateTexture2D(&t, &data, &texture);
  if(FAILED(status)) {
    BOOST_LOG(error) << "Failed to create mouse texture [0x"sv << util::hex(status).to_string_view() << ']';
    return false;
  }

  // Free the old SRV before replacing the texture it references
  cursor.input_res.reset();
  status = device->CreateShaderResourceView(texture.get(), nullptr, &cursor.input_res);
  if(FAILED(status)) {
    BOOST_LOG(error) << "Failed to create cursor shader resource view [0x"sv << util::hex(status).to_string_view() << ']';
    return false;
  }

  cursor.set_texture(t.Width, t.Height, std::move(texture));
  return true;
}

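// Acquire one desktop frame via DXGI desktop duplication, refresh the cursor
// textures if the pointer shape changed, and composite the cursor into the
// image's capture texture under the keyed mutex.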
capture_e display_vram_t::snapshot(platf::img_t *img_base, std::chrono::milliseconds timeout, bool cursor_visible) {
  auto img = (img_d3d_t *)img_base;

  HRESULT status;

  DXGI_OUTDUPL_FRAME_INFO frame_info;

  resource_t::pointer res_p {};
  auto capture_status = dup.next_frame(frame_info, timeout, &res_p);
  resource_t res { res_p };

  if(capture_status != capture_e::ok) {
    return capture_status;
  }

  const bool mouse_update_flag = frame_info.LastMouseUpdateTime.QuadPart != 0 || frame_info.PointerShapeBufferSize > 0;
  const bool frame_update_flag = frame_info.AccumulatedFrames != 0 || frame_info.LastPresentTime.QuadPart != 0;
  const bool update_flag = mouse_update_flag || frame_update_flag;

  if(!update_flag) {
    return capture_e::timeout;
  }

  if(frame_info.PointerShapeBufferSize > 0) {
    DXGI_OUTDUPL_POINTER_SHAPE_INFO shape_info {};

    util::buffer_t<std::uint8_t> img_data { frame_info.PointerShapeBufferSize };

    UINT dummy;
    status = dup.dup->GetFramePointerShape(img_data.size(), std::begin(img_data), &dummy, &shape_info);
    if(FAILED(status)) {
      BOOST_LOG(error) << "Failed to get new pointer shape [0x"sv << util::hex(status).to_string_view() << ']';

      return capture_e::error;
    }

    auto alpha_cursor_img = make_cursor_alpha_image(img_data, shape_info);
    auto xor_cursor_img = make_cursor_xor_image(img_data, shape_info);

    if(!set_cursor_texture(device.get(), cursor_alpha, std::move(alpha_cursor_img), shape_info) ||
       !set_cursor_texture(device.get(), cursor_xor, std::move(xor_cursor_img), shape_info)) {
      return capture_e::error;
    }
  }

  if(frame_info.LastMouseUpdateTime.QuadPart) {
    cursor_alpha.set_pos(frame_info.PointerPosition.Position.x, frame_info.PointerPosition.Position.y, frame_info.PointerPosition.Visible);
    cursor_xor.set_pos(frame_info.PointerPosition.Position.x, frame_info.PointerPosition.Position.y, frame_info.PointerPosition.Visible);
  }

  if(frame_update_flag) {
    texture2d_t src {};

    // Get the texture object from this frame
    status = res->QueryInterface(IID_ID3D11Texture2D, (void **)&src);
    if(FAILED(status)) {
      BOOST_LOG(error) << "Couldn't query interface [0x"sv << util::hex(status).to_string_view() << ']';
      return capture_e::error;
    }

    D3D11_TEXTURE2D_DESC desc;
    src->GetDesc(&desc);

    // It's possible for our display enumeration to race with mode changes and result in
    // mismatched image pool and desktop texture sizes. If this happens, just reinit again.
    if(desc.Width != width || desc.Height != height) {
      BOOST_LOG(info) << "Capture size changed ["sv << width << 'x' << height << " -> "sv << desc.Width << 'x' << desc.Height << ']';
      return capture_e::reinit;
    }

    // If we don't know the capture format yet, grab it from this texture
    if(capture_format == DXGI_FORMAT_UNKNOWN) {
      capture_format = desc.Format;
      BOOST_LOG(info) << "Capture format ["sv << dxgi_format_to_string(capture_format) << ']';

      D3D11_TEXTURE2D_DESC t {};
      t.Width = width;
      t.Height = height;
      t.MipLevels = 1;
      t.ArraySize = 1;
      t.SampleDesc.Count = 1;
      t.Usage = D3D11_USAGE_DEFAULT;
      t.Format = capture_format;
      t.BindFlags = 0;

      // Create a texture to store the most recent copy of the desktop
      status = device->CreateTexture2D(&t, nullptr, &last_frame_copy);
      if(FAILED(status)) {
        BOOST_LOG(error) << "Failed to create frame copy texture [0x"sv << util::hex(status).to_string_view() << ']';
        return capture_e::error;
      }
    }

    // It's also possible for the capture format to change on the fly. If that happens,
    // reinitialize capture to try format detection again and create new images.
    if(capture_format != desc.Format) {
      BOOST_LOG(info) << "Capture format changed ["sv << dxgi_format_to_string(capture_format) << " -> "sv << dxgi_format_to_string(desc.Format) << ']';
      return capture_e::reinit;
    }

    // Now that we know the capture format, we can finish creating the image
    if(complete_img(img, false)) {
      return capture_e::error;
    }

    // Copy the texture to use for cursor-only updates
    device_ctx->CopyResource(last_frame_copy.get(), src.get());

    // Copy into the capture texture on the image with the mutex held
    status = img->capture_mutex->AcquireSync(0, INFINITE);
    if(status != S_OK) {
      BOOST_LOG(error) << "Failed to acquire capture mutex [0x"sv << util::hex(status).to_string_view() << ']';
      return capture_e::error;
    }
    device_ctx->CopyResource(img->capture_texture.get(), src.get());
  }
  else if(capture_format == DXGI_FORMAT_UNKNOWN) {
    // We don't know the final capture format yet, so we will encode a dummy image
    BOOST_LOG(debug) << "Capture format is still unknown. Encoding a blank image"sv;

    // Finish creating the image as a dummy (if it hasn't happened already)
    if(complete_img(img, true)) {
      return capture_e::error;
    }

    auto dummy_data = std::make_unique<std::uint8_t[]>(img->row_pitch * img->height);
    std::fill_n(dummy_data.get(), img->row_pitch * img->height, 0);

    status = img->capture_mutex->AcquireSync(0, INFINITE);
    if(status != S_OK) {
      BOOST_LOG(error) << "Failed to acquire capture mutex [0x"sv << util::hex(status).to_string_view() << ']';
      return capture_e::error;
    }

    // Populate the image with dummy data. This is required because these images could be reused
    // after rendering (in which case they would have a cursor already rendered into them).
    device_ctx->UpdateSubresource(img->capture_texture.get(), 0, nullptr, dummy_data.get(), img->row_pitch, 0);
  }
  else {
    // We must know the capture format in this path or we would have hit the above unknown format case
    if(complete_img(img, false)) {
      return capture_e::error;
    }

    // We have a previously captured frame to reuse. We can't just grab the src texture from
    // the call to AcquireNextFrame() because that won't be valid. It seems to return a texture
    // in the unmodified desktop format (rather than the formats we passed to DuplicateOutput1())
    // if called in that case.
    status = img->capture_mutex->AcquireSync(0, INFINITE);
    if(status != S_OK) {
      BOOST_LOG(error) << "Failed to acquire capture mutex [0x"sv << util::hex(status).to_string_view() << ']';
      return capture_e::error;
    }
    device_ctx->CopyResource(img->capture_texture.get(), last_frame_copy.get());
  }

  if((cursor_alpha.visible || cursor_xor.visible) && cursor_visible) {
    device_ctx->VSSetShader(scene_vs.get(), nullptr, 0);
    device_ctx->PSSetShader(scene_ps.get(), nullptr, 0);
    device_ctx->OMSetRenderTargets(1, &img->capture_rt, nullptr);

    if(cursor_alpha.texture.get()) {
      // Perform an alpha blending operation
      device_ctx->OMSetBlendState(blend_alpha.get(), nullptr, 0xFFFFFFFFu);

      device_ctx->PSSetShaderResources(0, 1, &cursor_alpha.input_res);
      device_ctx->RSSetViewports(1, &cursor_alpha.cursor_view);
      device_ctx->Draw(3, 0);
    }

    if(cursor_xor.texture.get()) {
      // Perform an invert blending without touching alpha values
      device_ctx->OMSetBlendState(blend_invert.get(), nullptr, 0x00FFFFFFu);

      device_ctx->PSSetShaderResources(0, 1, &cursor_xor.input_res);
      device_ctx->RSSetViewports(1, &cursor_xor.cursor_view);
      device_ctx->Draw(3, 0);
    }

    device_ctx->OMSetBlendState(blend_disable.get(), nullptr, 0xFFFFFFFFu);
  }

  // Release the mutex to allow encoding of this frame
  img->capture_mutex->ReleaseSync(0);

  return capture_e::ok;
}

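// Create the capture-side pipeline state: the sampler, scene shaders, and the
// blend states used for cursor compositing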
int display_vram_t::init(int framerate, const std::string &display_name) {
  if(display_base_t::init(framerate, display_name)) {
    return -1;
  }

  D3D11_SAMPLER_DESC sampler_desc {};
  sampler_desc.Filter = D3D11_FILTER_MIN_MAG_MIP_LINEAR;
  sampler_desc.AddressU = D3D11_TEXTURE_ADDRESS_CLAMP;
  sampler_desc.AddressV = D3D11_TEXTURE_ADDRESS_CLAMP;
  sampler_desc.AddressW = D3D11_TEXTURE_ADDRESS_WRAP;
  sampler_desc.ComparisonFunc = D3D11_COMPARISON_NEVER;
  sampler_desc.MinLOD = 0;
  sampler_desc.MaxLOD = D3D11_FLOAT32_MAX;

  auto status = device->CreateSamplerState(&sampler_desc, &sampler_linear);
  if(FAILED(status)) {
    BOOST_LOG(error) << "Failed to create linear sampler state [0x"sv << util::hex(status).to_string_view() << ']';
    return -1;
  }

  status = device->CreateVertexShader(scene_vs_hlsl->GetBufferPointer(), scene_vs_hlsl->GetBufferSize(), nullptr, &scene_vs);
  if(status) {
    BOOST_LOG(error) << "Failed to create scene vertex shader [0x"sv << util::hex(status).to_string_view() << ']';
    return -1;
  }

  status = device->CreatePixelShader(scene_ps_hlsl->GetBufferPointer(), scene_ps_hlsl->GetBufferSize(), nullptr, &scene_ps);
  if(status) {
    BOOST_LOG(error) << "Failed to create scene pixel shader [0x"sv << util::hex(status).to_string_view() << ']';
    return -1;
  }

  blend_alpha = make_blend(device.get(), true, false);
  blend_invert = make_blend(device.get(), true, true);
  blend_disable = make_blend(device.get(), false, false);

  if(!blend_disable || !blend_alpha || !blend_invert) {
    return -1;
  }

  device_ctx->OMSetBlendState(blend_disable.get(), nullptr, 0xFFFFFFFFu);
  device_ctx->PSSetSamplers(0, 1, &sampler_linear);
  device_ctx->IASetPrimitiveTopology(D3D11_PRIMITIVE_TOPOLOGY_TRIANGLESTRIP);

  return 0;
}

std::shared_ptr<platf::img_t> display_vram_t::alloc_img() {
  auto img = std::make_shared<img_d3d_t>();

  // Initialize format-independent fields
  img->width = width;
  img->height = height;
  img->display = shared_from_this();

  return img;
}

// This cannot use ID3D11DeviceContext because it can be called concurrently by the encoding thread
int display_vram_t::complete_img(platf::img_t *img_base, bool dummy) {
  auto img = (img_d3d_t *)img_base;

  // If this already has a capture texture and it's not switching dummy state, nothing to do
  if(img->capture_texture && img->dummy == dummy) {
    return 0;
  }

  // If this is not a dummy image, we must know the format by now
  if(!dummy && capture_format == DXGI_FORMAT_UNKNOWN) {
    BOOST_LOG(error) << "display_vram_t::complete_img() called with unknown capture format!";
    return -1;
  }

  // Reset the image (in case this was previously a dummy)
  img->capture_texture.reset();
  img->capture_rt.reset();
  img->capture_mutex.reset();
  img->encoder_texture.reset();
  img->encoder_input_res.reset();
  img->encoder_mutex.reset();
  img->data = nullptr;
  if(img->encoder_texture_handle) {
    CloseHandle(img->encoder_texture_handle);
    img->encoder_texture_handle = NULL;
  }

  // Initialize format-dependent fields
  img->pixel_pitch = get_pixel_pitch();
  img->row_pitch = img->pixel_pitch * img->width;
  img->dummy = dummy;

  D3D11_TEXTURE2D_DESC t {};
  t.Width = img->width;
  t.Height = img->height;
  t.MipLevels = 1;
  t.ArraySize = 1;
  t.SampleDesc.Count = 1;
  t.Usage = D3D11_USAGE_DEFAULT;
  t.Format = (capture_format == DXGI_FORMAT_UNKNOWN) ? DXGI_FORMAT_B8G8R8A8_UNORM : capture_format;
  t.BindFlags = D3D11_BIND_SHADER_RESOURCE | D3D11_BIND_RENDER_TARGET;
  t.MiscFlags = D3D11_RESOURCE_MISC_SHARED_NTHANDLE | D3D11_RESOURCE_MISC_SHARED_KEYEDMUTEX;

  auto dummy_data = std::make_unique<std::uint8_t[]>(img->row_pitch * img->height);
  std::fill_n(dummy_data.get(), img->row_pitch * img->height, 0);
  D3D11_SUBRESOURCE_DATA initial_data {
    dummy_data.get(),
    (UINT)img->row_pitch,
    0
  };

  auto status = device->CreateTexture2D(&t, &initial_data, &img->capture_texture);
  if(FAILED(status)) {
    BOOST_LOG(error) << "Failed to create img buf texture [0x"sv << util::hex(status).to_string_view() << ']';
    return -1;
  }

  status = device->CreateRenderTargetView(img->capture_texture.get(), nullptr, &img->capture_rt);
  if(FAILED(status)) {
    BOOST_LOG(error) << "Failed to create render target view [0x"sv << util::hex(status).to_string_view() << ']';
    return -1;
  }

  // Get the keyed mutex to synchronize with the encoding code
  status = img->capture_texture->QueryInterface(__uuidof(IDXGIKeyedMutex), (void **)&img->capture_mutex);
  if(FAILED(status)) {
    BOOST_LOG(error) << "Failed to query IDXGIKeyedMutex [0x"sv << util::hex(status).to_string_view() << ']';
    return -1;
  }

  resource1_t resource;
  status = img->capture_texture->QueryInterface(__uuidof(IDXGIResource1), (void **)&resource);
  if(FAILED(status)) {
    BOOST_LOG(error) << "Failed to query IDXGIResource1 [0x"sv << util::hex(status).to_string_view() << ']';
    return -1;
  }

  // Create a handle for the encoder device to use to open this texture
  status = resource->CreateSharedHandle(nullptr, DXGI_SHARED_RESOURCE_READ, nullptr, &img->encoder_texture_handle);
  if(FAILED(status)) {
    BOOST_LOG(error) << "Failed to create shared texture handle [0x"sv << util::hex(status).to_string_view() << ']';
    return -1;
  }

  img->data = (std::uint8_t *)img->capture_texture.get();

  return 0;
}

// This cannot use ID3D11DeviceContext because it can be called concurrently by the encoding thread
int display_vram_t::dummy_img(platf::img_t *img_base) {
  return complete_img(img_base, true);
}

std::vector<DXGI_FORMAT> display_vram_t::get_supported_sdr_capture_formats() {
  return std::vector { DXGI_FORMAT_B8G8R8A8_UNORM, DXGI_FORMAT_R8G8B8A8_UNORM };
}

std::shared_ptr<platf::hwdevice_t> display_vram_t::make_hwdevice(pix_fmt_e pix_fmt) {
  if(pix_fmt != platf::pix_fmt_e::nv12 && pix_fmt != platf::pix_fmt_e::p010) {
    BOOST_LOG(error) << "display_vram_t doesn't support pixel format ["sv << from_pix_fmt(pix_fmt) << ']';

    return nullptr;
  }

  auto hwdevice = std::make_shared<hwdevice_t>();

  auto ret = hwdevice->init(
    shared_from_this(),
    adapter.get(),
    pix_fmt);

  if(ret) {
    return nullptr;
  }

  return hwdevice;
}

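// Compile the HLSL shaders used by the capture and conversion pipelines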
int init() {
  BOOST_LOG(info) << "Compiling shaders..."sv;
  scene_vs_hlsl = compile_vertex_shader(SUNSHINE_SHADERS_DIR "/SceneVS.hlsl");
  if(!scene_vs_hlsl) {
    return -1;
  }

  convert_Y_ps_hlsl = compile_pixel_shader(SUNSHINE_SHADERS_DIR "/ConvertYPS.hlsl");
  if(!convert_Y_ps_hlsl) {
    return -1;
  }

  convert_UV_ps_hlsl = compile_pixel_shader(SUNSHINE_SHADERS_DIR "/ConvertUVPS.hlsl");
  if(!convert_UV_ps_hlsl) {
    return -1;
  }

  convert_UV_vs_hlsl = compile_vertex_shader(SUNSHINE_SHADERS_DIR "/ConvertUVVS.hlsl");
  if(!convert_UV_vs_hlsl) {
    return -1;
  }

  scene_ps_hlsl = compile_pixel_shader(SUNSHINE_SHADERS_DIR "/ScenePS.hlsl");
  if(!scene_ps_hlsl) {
    return -1;
  }
  BOOST_LOG(info) << "Compiled shaders"sv;

  return 0;
}
} // namespace platf::dxgi