30#error "RF_ARCH should always be defined" 
   // NOTE(review): this listing is garbled by extraction -- original source line
   // numbers (50, 52, ...) are fused into the text and several lines are missing
   // (the opening brace, the scalar store into `buffer`, and the `else` keyword).
   // Do not treat this fragment as compilable code.
   //
   // Purpose (from the visible lines): for each input span in `vars`, configure
   // the corresponding Batch entry. A span of size 1 is treated as a scalar and
   // the Batch is pointed at the i-th slot of the device-side scalar buffer
   // (isVector == false); otherwise the Batch is pointed directly at the span's
   // own data (isVector == true).
   50void fillArrays(
Batch *arrays, 
const VarVector &vars, 
double *buffer, 
double *bufferDevice)
 
   // Loop over every input variable span.
   52   for (
int i = 0; i < vars.size(); i++) {
 
   53      const std::span<const double> &span = vars[i];
 
   // Scalar case: a single value broadcast to all events.
   54      if (span.size() == 1) {
 
   // Point the batch at the i-th slot of the device scalar buffer; `false`
   // marks it as a scalar (non-vector) input. The host-side write into
   // `buffer[i]` is presumably on a line missing from this fragment -- TODO confirm.
   56         arrays[i].
set(bufferDevice + i, 
false);
 
   // Vector case (the `else` keyword is on a missing line): use the span's
   // data pointer directly; `true` marks it as per-event vector data.
   59         arrays[i].
set(span.data(), 
true);
 
   // Compute the CUDA grid size (number of thread blocks) for launching a
   // kernel over `n` elements, capped at `maxGridSize`. The kernels in this
   // file iterate with a stride of blockSize * gridDim.x, so capping the grid
   // is safe for any `n`.
int getGridSize(std::size_t n)
{
   // Fixed upper bound on the number of blocks (file-local tuning constant).
   constexpr int maxGridSize = 84;
   // Integer ceiling division instead of the original
   // int(std::ceil(double(n) / blockSize)): exact for every std::size_t n
   // (no double rounding above 2^53 and no overflow in the double->int
   // conversion). n == 0 yields a grid size of 0, matching the original.
   const std::size_t nBlocks = n == 0 ? 0 : (n - 1) / blockSize + 1;
   return static_cast<int>(std::min<std::size_t>(nBlocks, maxGridSize));
}
 
   // NOTE(review): fragment of a member function whose signature is not visible
   // in this garbled listing (presumably architectureName() -- TODO confirm
   // against the full file). It stringifies the RF_ARCH preprocessor symbol
   // and lower-cases the result.
   101      std::string out = 
_QUOTE_(RF_ARCH);
 
   // Lower-case in place. The lambda parameter is `unsigned char` on purpose:
   // passing a negative plain char to std::tolower is undefined behavior.
   102      std::transform(out.begin(), out.end(), out.begin(), [](
unsigned char c) { return std::tolower(c); });
 
   // NOTE(review): interior fragment of the `compute` method; the signature,
   // the device allocation of `deviceMem`, the kernel launch, and several
   // other lines are missing from this garbled listing. Comments are hedged
   // accordingly.
   //
   // Layout: one contiguous arena holding, in order,
   //   [Batches header][vars.size() x Batch][vars.size() x double scalar
   //   buffer][extraArgs.size() x double extra args].
   // The same layout is mirrored on host (hostMem) and device (deviceMem) so
   // that device pointers can be computed by offsetting into deviceMem.
   119      const std::size_t memSize = 
sizeof(
Batches) + vars.size() * 
sizeof(
Batch) + vars.size() * 
sizeof(
double) +
 
   120                                  extraArgs.size() * 
sizeof(
double);
 
   // Host staging buffer for the whole arena (copied to the device in one go).
   122      std::vector<char> hostMem(memSize);
 
   // Host-side views into the arena, carved out by pointer arithmetic.
   // NOTE(review): this relies on the alignment of the successive types being
   // compatible with the arena layout -- presumably guaranteed elsewhere.
   123      auto batches = 
reinterpret_cast<Batches *
>(hostMem.data());
 
   124      auto arrays = 
reinterpret_cast<Batch *
>(batches + 1);
 
   125      auto scalarBuffer = 
reinterpret_cast<double *
>(arrays + vars.size());
 
   126      auto extraArgsHost = 
reinterpret_cast<double *
>(scalarBuffer + vars.size());
 
   // Device-side views with the identical layout (deviceMem is declared on a
   // line missing from this fragment).
   129      auto batchesDevice = 
reinterpret_cast<Batches *
>(deviceMem.
data());
 
   130      auto arraysDevice = 
reinterpret_cast<Batch *
>(batchesDevice + 1);
 
   131      auto scalarBufferDevice = 
reinterpret_cast<double *
>(arraysDevice + vars.size());
 
   132      auto extraArgsDevice = 
reinterpret_cast<double *
>(scalarBufferDevice + vars.size());
 
   // Populate the host copies of the Batches header and the per-variable
   // Batch descriptors (the latter pointing at DEVICE addresses, since the
   // kernel will dereference them).
   134      fillBatches(*batches, 
output, nEvents, vars.size(), extraArgs.size());
 
   135      fillArrays(arrays, vars, scalarBuffer, scalarBufferDevice);
 
   // Patch the header to reference the device-side Batch array.
   136      batches->_arrays = arraysDevice;
 
   138      if (!extraArgs.empty()) {
 
   // Stage the extra arguments into the host arena and point the header at
   // their device-side location.
   139         std::copy(std::cbegin(extraArgs), std::cend(extraArgs), extraArgsHost);
 
   140         batches->_extraArgs = extraArgsDevice;
 
   // One bulk host-to-device transfer of the entire arena on cfg's stream.
   143      copyHostToDevice(hostMem.data(), deviceMem.
data(), hostMem.size(), cfg.cudaStream());
 
   // Grid size for the compute kernel launch (the launch itself is on lines
   // missing from this fragment).
   145      const int gridSize = getGridSize(nEvents);
 
   // Copy (possibly kernel-modified) extra arguments back to the caller --
   // presumably so stateful computations can return updated parameters;
   // TODO confirm against the full file.
   151      if (!extraArgs.empty()) {
 
   152         copyDeviceToHost(extraArgsDevice, extraArgs.data(), extraArgs.size(), cfg.cudaStream());
 
  158                             std::span<const double> weights, std::span<const double> offsetProbas) 
override;
 
   // NOTE(review): body fragment of kahanSumUpdate(sum, carry, a, otherCarry)
   // (see the index entry below for the full signature). Implements one step
   // of Kahan compensated summation, merging another partial sum `a` together
   // with its accumulated compensation `otherCarry`.
   //
   // Subtract both compensations from the incoming value first.
   164   const double y = 
a - (carry + otherCarry);
 
   // Tentative new sum; low-order bits of y may be lost here.
   165   const double t = 
sum + 
y; 
 
   // Recover the lost low-order bits: (t - sum) is what was actually added,
   // minus y gives the rounding error. The final `sum = t;` assignment is
   // presumably on a line missing from this fragment -- TODO confirm.
   168   carry = (t - 
sum) - 
y;
 
   // NOTE(review): body fragment of kahanSumReduction(shared, n, result,
   // carry_index). Classic shared-memory tree reduction: halve the active
   // stride each iteration, combining pairs of (sum, carry) entries with the
   // compensated update. A __syncthreads() between iterations is presumably on
   // a line missing from this fragment -- without it this would race; TODO
   // confirm against the full file.
   178   for (
int i = blockDim.x / 2; i > 0; i >>= 1) {
 
   // Guard both the active-half condition and the array bound n.
   179      if (threadIdx.x < i && (threadIdx.x + i) < 
n) {
 
   // Fold entry (tid+i) into entry tid, carries included.
   180         kahanSumUpdate(shared[threadIdx.x], shared[carry_index], shared[threadIdx.x + i], shared[carry_index + i]);
 
   // Thread 0 publishes the block's result: sums in result[0..gridDim.x),
   // carries in result[gridDim.x..2*gridDim.x).
   186   if (threadIdx.x == 0) {
 
   187      result[blockIdx.x] = shared[0];
 
   188      result[blockIdx.x + gridDim.x] = shared[carry_index];
 
   // NOTE(review): garbled fragment of the kahanSum kernel; the accumulation
   // into thread-local (sum, carry), the handling of `carries`, the store of
   // the per-thread sum into shared memory, and the call into
   // kahanSumReduction are on lines missing from this listing.
   //
   // Kernel: compensated (Kahan) sum of `input[0..n)`, optionally applying
   // -log(x) per element when `nll` is set. Per-block partial sums and their
   // compensation terms are written to `result` (see kahanSumReduction).
   192__global__ 
void kahanSum(
const double *__restrict__ 
input, 
const double *__restrict__ carries, 
size_t n,
 
   193                         double *__restrict__ 
result, 
bool nll)
 
   195   int thIdx = threadIdx.x;
 
   // Flat global thread index; blockSize is a file-level constant equal to
   // the launch block dimension.
   196   int gthIdx = thIdx + blockIdx.x * 
blockSize;
 
   // Shared memory is split in halves: [0, blockDim.x) holds sums,
   // [blockDim.x, 2*blockDim.x) holds the matching carries.
   197   int carry_index = threadIdx.x + blockDim.x;
 
   198   const int nThreadsTotal = 
blockSize * gridDim.x;
 
   // Dynamically-sized shared memory, provided at launch (2 * blockSize
   // doubles expected -- presumably; TODO confirm shMemSize at the call sites).
   201   extern __shared__ 
double shared[];
 
   // Grid-stride loop: correct for any grid size relative to n.
   206   for (
int i = gthIdx; i < 
n; i += nThreadsTotal) {
 
   // In NLL mode each probability contributes -log(p); otherwise the raw value.
   209      double val = nll == 1 ? -std::log(
input[i]) : 
input[i];
 
   // Publish this thread's compensation term next to its sum before reducing.
   214   shared[carry_index] = carry;
 
   // NOTE(review): garbled fragment of the nllSumKernel; the thread-local
   // accumulation, the conditions guarding the offset/weight lines, and the
   // final reduction call are on lines missing from this listing.
   //
   // Kernel: compensated negative-log-likelihood partial sums. Per element i:
   // val = -log(probas[i]), optionally offset by +log(offsetProbas[i]) and
   // scaled by weights[i] (null pointers presumably disable those terms --
   // TODO confirm the missing guard conditions).
   222__global__ 
void nllSumKernel(
const double *__restrict__ probas, 
const double *__restrict__ weights,
 
   223                             const double *__restrict__ offsetProbas, 
size_t n, 
double *__restrict__ 
result)
 
   225   int thIdx = threadIdx.x;
 
   // Flat global thread index (blockSize is the file-level block dimension).
   226   int gthIdx = thIdx + blockIdx.x * 
blockSize;
 
   // Shared layout mirrors kahanSum: sums first, carries in the upper half.
   227   int carry_index = threadIdx.x + blockDim.x;
 
   228   const int nThreadsTotal = 
blockSize * gridDim.x;
 
   231   extern __shared__ 
double shared[];
 
   // Grid-stride loop over all events.
   236   for (
int i = gthIdx; i < 
n; i += nThreadsTotal) {
 
   239      double val = -std::log(probas[i]);
 
   // Offset by the reference probabilities (guard condition on a missing line).
   241         val += std::log(offsetProbas[i]);
 
   // Apply the per-event weight (guard condition on a missing line).
   243         val = weights[i] * val;
 
   // Publish the compensation term alongside the sum before the block reduce.
   248   shared[carry_index] = carry;
 
   // NOTE(review): interior fragment of reduceSum(cfg, input, n); the
   // signature, the devOut allocation, shMemSize, and the final
   // device-to-host copy of the result are on lines missing from this listing.
   //
   // Two-pass compensated reduction on cfg's stream.
   258   const int gridSize = getGridSize(
n);
 
   259   cudaStream_t stream = *cfg.cudaStream();
 
   // Pass 1: each block writes a partial sum to devOut[0..gridSize) and its
   // carry to devOut[gridSize..2*gridSize). `carries` is nullptr and nll is 0
   // (plain sum, no -log).
   262   kahanSum<<<gridSize, blockSize, shMemSize, stream>>>(
input, 
nullptr, 
n, devOut.
data(), 0);
 
   // Pass 2: a single block folds the gridSize partials together with their
   // carries into devOut[0]. Writing back into devOut while reading it is
   // presumably safe because the single block reads everything into shared
   // memory first -- TODO confirm.
   263   kahanSum<<<1, blockSize, shMemSize, stream>>>(devOut.
data(), devOut.
data() + gridSize, gridSize, devOut.
data(), 0);
 
   // NOTE(review): interior fragment of reduceNLL(cfg, probas, weights,
   // offsetProbas); the start of the signature, the devOut allocation,
   // shMemSize, the device-to-host copy into tmpSum/tmpCarry, and the
   // declaration of `out` are on lines missing from this listing.
   270                                                std::span<const double> weights, std::span<const double> offsetProbas)
 
   273   const int gridSize = getGridSize(probas.size());
 
   275   cudaStream_t stream = *cfg.cudaStream();
 
   // Pass 1: per-block compensated NLL partials. A single scalar weight
   // (weights.size() == 1) is NOT applied in the kernel -- it is factored out
   // and applied after the reduction below; an empty offset span disables the
   // offset term.
   278   nllSumKernel<<<gridSize, blockSize, shMemSize, stream>>>(
 
   279      probas.data(), weights.size() == 1 ? nullptr : weights.data(),
 
   280      offsetProbas.empty() ? nullptr : offsetProbas.data(), probas.size(), devOut.
data());
 
   // Pass 2: single-block fold of the gridSize partials and their carries
   // (stored at devOut + gridSize) into devOut[0].
   282   kahanSum<<<1, blockSize, shMemSize, stream>>>(devOut.
data(), devOut.
data() + gridSize, gridSize, devOut.
data(), 0);
 
   285   double tmpCarry = 0.0;
 
   // Scalar-weight case: sum(w * x_i) == w * sum(x_i), so scale both the sum
   // and its compensation term once, after reducing.
   289   if (weights.size() == 1) {
 
   290      tmpSum *= weights[0];
 
   291      tmpCarry *= weights[0];
 
   // Return the compensation term alongside the sum so callers can keep the
   // Kahan accumulation exact across partial results.
   295      out.nllSumCarry = tmpCarry;
 
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void input
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t Int_t Int_t UInt_t UInt_t Rectangle_t result
Minimal configuration struct to steer the evaluation of a single node with the RooBatchCompute library.
void set(InputArr array, bool isVector)
This class overrides some RooBatchComputeInterface functions, for the purpose of providing a CUDA-specific implementation of the library.
std::string architectureName() const override
const std::vector< void(*)(BatchesHandle)> _computeFunctions
ReduceNLLOutput reduceNLL(RooBatchCompute::Config const &cfg, std::span< const double > probas, std::span< const double > weights, std::span< const double > offsetProbas) override
void compute(RooBatchCompute::Config const &cfg, Computer computer, RestrictArr output, size_t nEvents, const VarVector &vars, ArgVector &extraArgs) override
Compute multiple values using cuda kernels.
double reduceSum(RooBatchCompute::Config const &cfg, InputArr input, size_t n) override
Return the sum of an input array.
Architecture architecture() const override
The interface which should be implemented to provide optimised computation functions for implementations on different hardware architectures.
A templated class for managing an array of data using a specified memory type.
Data_t * data()
Get a pointer to the start of the array.
__global__ void kahanSum(const double *__restrict__ input, const double *__restrict__ carries, size_t n, double *__restrict__ result, bool nll)
__device__ void kahanSumUpdate(double &sum, double &carry, double a, double otherCarry)
__device__ void kahanSumReduction(double *shared, size_t n, double *__restrict__ result, int carry_index)
static RooBatchComputeClass computeObj
Static object to trigger the constructor which overwrites the dispatch pointer.
std::vector< void(*)(BatchesHandle)> getFunctions()
Returns a std::vector of pointers to the compute functions in this file.
__global__ void nllSumKernel(const double *__restrict__ probas, const double *__restrict__ weights, const double *__restrict__ offsetProbas, size_t n, double *__restrict__ result)
Namespace for dispatching RooFit computations to various backends.
R__EXTERN RooBatchComputeInterface * dispatchCUDA
std::vector< std::span< const double > > VarVector
const double *__restrict InputArr
std::vector< double > ArgVector
double *__restrict RestrictArr
void copyDeviceToHost(const T *src, T *dest, std::size_t n, CudaStream *=nullptr)
Copies data from the CUDA device to the host.
static uint64_t sum(uint64_t i)