Provide SSSE3, AVX2, and AVX512 optimized Reed-Solomon functions (#2828)

* Provide SSSE3, AVX2, and AVX512 optimized Reed-Solomon functions

* Update nanors to fix AVX-512 memory corruption
This commit is contained in:
Cameron Gutman 2024-07-11 20:22:57 -05:00 committed by GitHub
parent 037c61dc99
commit e7c420dd6e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 236 additions and 9 deletions

View File

@ -49,8 +49,6 @@ configure_file("${CMAKE_SOURCE_DIR}/src/version.h.in" version.h @ONLY)
include_directories("${CMAKE_CURRENT_BINARY_DIR}") # required for importing version.h
set(SUNSHINE_TARGET_FILES
"${CMAKE_SOURCE_DIR}/third-party/nanors/rs.c"
"${CMAKE_SOURCE_DIR}/third-party/nanors/rs.h"
"${CMAKE_SOURCE_DIR}/third-party/moonlight-common-c/src/Input.h"
"${CMAKE_SOURCE_DIR}/third-party/moonlight-common-c/src/Rtsp.h"
"${CMAKE_SOURCE_DIR}/third-party/moonlight-common-c/src/RtspParser.c"
@ -108,6 +106,8 @@ set(SUNSHINE_TARGET_FILES
"${CMAKE_SOURCE_DIR}/src/round_robin.h"
"${CMAKE_SOURCE_DIR}/src/stat_trackers.h"
"${CMAKE_SOURCE_DIR}/src/stat_trackers.cpp"
"${CMAKE_SOURCE_DIR}/src/rswrapper.h"
"${CMAKE_SOURCE_DIR}/src/rswrapper.c"
${PLATFORM_TARGET_FILES})
if(NOT SUNSHINE_ASSETS_DIR_DEF)

View File

@ -85,9 +85,9 @@ set_source_files_properties("${CMAKE_SOURCE_DIR}/src/upnp.cpp"
PROPERTIES COMPILE_FLAGS -Wno-pedantic)
# third-party/nanors
set_source_files_properties("${CMAKE_SOURCE_DIR}/third-party/nanors/rs.c"
set_source_files_properties("${CMAKE_SOURCE_DIR}/src/rswrapper.c"
DIRECTORY "${CMAKE_SOURCE_DIR}" "${TEST_DIR}"
PROPERTIES COMPILE_FLAGS "-include deps/obl/autoshim.h -ftree-vectorize")
PROPERTIES COMPILE_FLAGS "-ftree-vectorize -funroll-loops")
# third-party/ViGEmClient
set(VIGEM_COMPILE_FLAGS "")

View File

@ -23,7 +23,7 @@
#include "video.h"
extern "C" {
#include <rs.h>
#include "rswrapper.h"
}
using namespace std::literals;

157
src/rswrapper.c Normal file
View File

@ -0,0 +1,157 @@
/**
* @file src/rswrapper.c
* @brief Wrappers for nanors vectorization with different ISA options
*/
// _FORTIFY_SOURCE can cause some versions of GCC to try to inline
// memset() with incompatible target options when compiling rs.c,
// so it is removed here before any header or source file is included
#ifdef _FORTIFY_SOURCE
#undef _FORTIFY_SOURCE
#endif
// The assert() function is decorated with __cold on macOS which
// is incompatible with Clang's target multiversioning feature.
// Defining NDEBUG makes <assert.h> expand assert() to nothing instead
#ifndef NDEBUG
#define NDEBUG
#endif
// Two-level token pasting: DECORATE_FUNC expands its arguments first (so that
// ISA_SUFFIX is replaced by its current value) and DECORATE_FUNC_I then pastes
// the results together. Pasting directly would produce the literal "ISA_SUFFIX".
#define DECORATE_FUNC_I(a, b) a##b
#define DECORATE_FUNC(a, b) DECORATE_FUNC_I(a, b)
// Append an ISA suffix to the public RS API
// ISA_SUFFIX is looked up at each point of use, so these definitions yield a
// different name every time ISA_SUFFIX is redefined later in this file
#define reed_solomon_init DECORATE_FUNC(reed_solomon_init, ISA_SUFFIX)
#define reed_solomon_new DECORATE_FUNC(reed_solomon_new, ISA_SUFFIX)
#define reed_solomon_new_static DECORATE_FUNC(reed_solomon_new_static, ISA_SUFFIX)
#define reed_solomon_release DECORATE_FUNC(reed_solomon_release, ISA_SUFFIX)
#define reed_solomon_decode DECORATE_FUNC(reed_solomon_decode, ISA_SUFFIX)
#define reed_solomon_encode DECORATE_FUNC(reed_solomon_encode, ISA_SUFFIX)
// Append an ISA suffix to internal functions to prevent multiple definition errors
#define obl_axpy_ref DECORATE_FUNC(obl_axpy_ref, ISA_SUFFIX)
#define obl_scal_ref DECORATE_FUNC(obl_scal_ref, ISA_SUFFIX)
#define obl_axpyb32_ref DECORATE_FUNC(obl_axpyb32_ref, ISA_SUFFIX)
#define obl_axpy DECORATE_FUNC(obl_axpy, ISA_SUFFIX)
#define obl_scal DECORATE_FUNC(obl_scal, ISA_SUFFIX)
#define obl_swap DECORATE_FUNC(obl_swap, ISA_SUFFIX)
#define obl_axpyb32 DECORATE_FUNC(obl_axpyb32, ISA_SUFFIX)
#define axpy DECORATE_FUNC(axpy, ISA_SUFFIX)
#define scal DECORATE_FUNC(scal, ISA_SUFFIX)
#define gemm DECORATE_FUNC(gemm, ISA_SUFFIX)
#define invert_mat DECORATE_FUNC(invert_mat, ISA_SUFFIX)
// On x86 targets, rs.c is included three times below. Each inclusion is wrapped
// in a compiler-specific target override (a function attribute pushed via pragma
// on Clang, push_options/target on GCC) and uses its own ISA_SUFFIX so the
// resulting symbols do not collide.
// NOTE(review): the OBLAS_* defines presumably select the matching SIMD code
// path inside nanors - confirm against third-party/nanors.
#if defined(__x86_64__) || defined(__i386__)
// Compile a variant for SSSE3
#if defined(__clang__)
#pragma clang attribute push(__attribute__((target("ssse3"))), apply_to = function)
#else
#pragma GCC push_options
#pragma GCC target("ssse3")
#endif
#define ISA_SUFFIX _ssse3
#define OBLAS_SSE3
#include "../third-party/nanors/rs.c"
#undef OBLAS_SSE3
#undef ISA_SUFFIX
// Restore the previous target options before starting the next variant
#if defined(__clang__)
#pragma clang attribute pop
#else
#pragma GCC pop_options
#endif
// Compile a variant for AVX2
#if defined(__clang__)
#pragma clang attribute push(__attribute__((target("avx2"))), apply_to = function)
#else
#pragma GCC push_options
#pragma GCC target("avx2")
#endif
#define ISA_SUFFIX _avx2
#define OBLAS_AVX2
#include "../third-party/nanors/rs.c"
#undef OBLAS_AVX2
#undef ISA_SUFFIX
// Restore the previous target options before starting the next variant
#if defined(__clang__)
#pragma clang attribute pop
#else
#pragma GCC pop_options
#endif
// Compile a variant for AVX512BW
// Both the AVX-512 foundation and byte/word extensions are enabled for this variant
#if defined(__clang__)
#pragma clang attribute push(__attribute__((target("avx512f,avx512bw"))), apply_to = function)
#else
#pragma GCC push_options
#pragma GCC target("avx512f,avx512bw")
#endif
#define ISA_SUFFIX _avx512
#define OBLAS_AVX512
#include "../third-party/nanors/rs.c"
#undef OBLAS_AVX512
#undef ISA_SUFFIX
// Restore the previous target options so the code below uses the baseline settings
#if defined(__clang__)
#pragma clang attribute pop
#else
#pragma GCC pop_options
#endif
#endif
// Compile a default variant
// No target override is active here, so this copy is built with the options
// the translation unit itself was compiled with.
// NOTE(review): autoshim.h presumably chooses the OBLAS_* setting from the
// compiler's predefined macros - confirm against third-party/nanors.
#define ISA_SUFFIX _def
#include "../third-party/nanors/deps/obl/autoshim.h"
#include "../third-party/nanors/rs.c"
#undef ISA_SUFFIX
// Drop the suffix-decorating macros for the public API so that rswrapper.h can
// declare reed_solomon_init() under its real name and remap the remaining
// names onto the dispatch pointers
#undef reed_solomon_init
#undef reed_solomon_new
#undef reed_solomon_new_static
#undef reed_solomon_release
#undef reed_solomon_decode
#undef reed_solomon_encode
#include "rswrapper.h"
// Storage for the dispatch pointers declared extern in rswrapper.h.
// They are assigned by reed_solomon_init(); nothing in this file gives them
// another value before that call.
reed_solomon_new_t reed_solomon_new_fn;
reed_solomon_release_t reed_solomon_release_fn;
reed_solomon_encode_t reed_solomon_encode_fn;
reed_solomon_decode_t reed_solomon_decode_fn;
/**
 * @brief Select the Reed-Solomon implementation that matches the host CPU.
 * @details The four reed_solomon_*_fn dispatch pointers are aimed at the most capable
 *          variant the CPU reports support for (AVX-512, then AVX2, then SSSE3 on x86;
 *          the default build everywhere else) and that variant's own init routine is run.
 *          The streaming code invokes the pointers directly during encoding.
 */
void
reed_solomon_init(void) {
#if defined(__x86_64__) || defined(__i386__)
  // Widest vectors first: the AVX-512 build needs both the F and BW feature sets
  if (__builtin_cpu_supports("avx512f") && __builtin_cpu_supports("avx512bw")) {
    reed_solomon_new_fn = reed_solomon_new_avx512;
    reed_solomon_release_fn = reed_solomon_release_avx512;
    reed_solomon_encode_fn = reed_solomon_encode_avx512;
    reed_solomon_decode_fn = reed_solomon_decode_avx512;
    reed_solomon_init_avx512();
    return;
  }

  if (__builtin_cpu_supports("avx2")) {
    reed_solomon_new_fn = reed_solomon_new_avx2;
    reed_solomon_release_fn = reed_solomon_release_avx2;
    reed_solomon_encode_fn = reed_solomon_encode_avx2;
    reed_solomon_decode_fn = reed_solomon_decode_avx2;
    reed_solomon_init_avx2();
    return;
  }

  if (__builtin_cpu_supports("ssse3")) {
    reed_solomon_new_fn = reed_solomon_new_ssse3;
    reed_solomon_release_fn = reed_solomon_release_ssse3;
    reed_solomon_encode_fn = reed_solomon_encode_ssse3;
    reed_solomon_decode_fn = reed_solomon_decode_ssse3;
    reed_solomon_init_ssse3();
    return;
  }
#endif

  // Non-x86 targets, or an x86 CPU reporting none of the ISAs above
  reed_solomon_new_fn = reed_solomon_new_def;
  reed_solomon_release_fn = reed_solomon_release_def;
  reed_solomon_encode_fn = reed_solomon_encode_def;
  reed_solomon_decode_fn = reed_solomon_decode_def;
  reed_solomon_init_def();
}

32
src/rswrapper.h Normal file
View File

@ -0,0 +1,32 @@
/**
 * @file src/rswrapper.h
 * @brief Wrappers for nanors vectorization
 * @details This is a drop-in replacement for nanors rs.h. The usual reed_solomon_*
 *          names are macros that resolve to function pointers, which stay unset
 *          until reed_solomon_init() has been called.
 */
#pragma once

#include <stdint.h>

// The implementation lives in a C translation unit, so give every declaration
// C linkage when this header is consumed from C++. Includers that already wrap
// the header in their own extern "C" block keep working.
#ifdef __cplusplus
extern "C" {
#endif

// Opaque codec handle; only ever used through a pointer in this header
typedef struct _reed_solomon reed_solomon;

// Signatures of the dispatched entry points
typedef reed_solomon *(*reed_solomon_new_t)(int data_shards, int parity_shards);
typedef void (*reed_solomon_release_t)(reed_solomon *rs);
typedef int (*reed_solomon_encode_t)(reed_solomon *rs, uint8_t **shards, int nr_shards, int bs);
typedef int (*reed_solomon_decode_t)(reed_solomon *rs, uint8_t **shards, uint8_t *marks, int nr_shards, int bs);

// Dispatch pointers, defined in rswrapper.c and assigned by reed_solomon_init()
extern reed_solomon_new_t reed_solomon_new_fn;
extern reed_solomon_release_t reed_solomon_release_fn;
extern reed_solomon_encode_t reed_solomon_encode_fn;
extern reed_solomon_decode_t reed_solomon_decode_fn;

// Map the plain API names onto the dispatch pointers so existing call sites compile unchanged.
// Note that these names are therefore pointer objects, not functions.
#define reed_solomon_new reed_solomon_new_fn
#define reed_solomon_release reed_solomon_release_fn
#define reed_solomon_encode reed_solomon_encode_fn
#define reed_solomon_decode reed_solomon_decode_fn

/**
 * @brief This initializes the RS function pointers to the best vectorized version available.
 * @details The streaming code will directly invoke these function pointers during encoding.
 */
void
reed_solomon_init(void);

#ifdef __cplusplus
}
#endif

View File

@ -13,8 +13,10 @@
#include <boost/endian/arithmetic.hpp>
extern "C" {
// clang-format off
#include <moonlight-common-c/src/Limelight-internal.h>
#include <rs.h>
#include "rswrapper.h"
// clang-format on
}
#include "config.h"
@ -236,7 +238,6 @@ namespace stream {
}
constexpr std::size_t MAX_AUDIO_PACKET_SIZE = 1400;
using rh_t = util::safe_ptr<reed_solomon, reed_solomon_release>;
using video_packet_t = util::c_ptr<video_packet_raw_t>;
using audio_packet_t = util::c_ptr<audio_packet_raw_t>;
using audio_fec_packet_t = util::c_ptr<audio_fec_packet_raw_t>;
@ -621,7 +622,7 @@ namespace stream {
}
namespace fec {
using rs_t = util::safe_ptr<reed_solomon, reed_solomon_release>;
using rs_t = util::safe_ptr<reed_solomon, [](reed_solomon *rs) { reed_solomon_release(rs); }>;
struct fec_t {
size_t data_shards;

View File

@ -0,0 +1,37 @@
/**
* @file tests/unit/test_rswrapper.cpp
* @brief Test src/rswrapper.*
*/
extern "C" {
#include <src/rswrapper.h>
}
#include <tests/conftest.cpp>
// Checks that a call to reed_solomon_init() leaves none of the four dispatch pointers null.
TEST(ReedSolomonWrapperTests, InitTest) {
reed_solomon_init();
// Ensure all function pointers were populated
// (rswrapper.h defines these names as macros for the reed_solomon_*_fn pointers,
// so each one is compared as a pointer value rather than called)
ASSERT_NE(reed_solomon_new, nullptr);
ASSERT_NE(reed_solomon_release, nullptr);
ASSERT_NE(reed_solomon_encode, nullptr);
ASSERT_NE(reed_solomon_decode, nullptr);
}
// Smoke test: run one encode through whichever implementation reed_solomon_init() selected.
TEST(ReedSolomonWrapperTests, EncodeTest) {
  reed_solomon_init();

  // One data shard plus one parity shard
  auto codec = reed_solomon_new(1, 1);
  ASSERT_NE(codec, nullptr);

  // Two zero-filled 16-byte shards
  uint8_t payload[16] = {};
  uint8_t parity[16] = {};
  uint8_t *shards[2] = { payload, parity };

  // A wrongly selected ISA is expected to crash inside this call
  auto status = reed_solomon_encode(codec, shards, 2, sizeof(payload));
  ASSERT_EQ(status, 0);

  reed_solomon_release(codec);
}

2
third-party/nanors vendored

@ -1 +1 @@
Subproject commit e9e242e98e27037830490b2a752895ca68f75f8b
Subproject commit 19f07b513e924e471cadd141943c1ec4adc8d0e0