Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
RNTupleInspector.cxx
Go to the documentation of this file.
1/// \file RNTupleInspector.cxx
2/// \ingroup NTuple ROOT7
3/// \author Florine de Geus <florine.willemijn.de.geus@cern.ch>
4/// \date 2023-01-09
5/// \warning This is part of the ROOT 7 prototype! It will change without notice. It might trigger earthquakes. Feedback
6/// is welcome!
7
8/*************************************************************************
9 * Copyright (C) 1995-2023, Rene Brun and Fons Rademakers. *
10 * All rights reserved. *
11 * *
12 * For the licensing terms see $ROOTSYS/LICENSE. *
13 * For the list of contributors see $ROOTSYS/README/CREDITS. *
14 *************************************************************************/
15
17#include <ROOT/RError.hxx>
21#include <ROOT/RError.hxx>
22
23#include <TFile.h>
24
25#include <algorithm>
26#include <cstring>
27#include <deque>
28#include <exception>
29#include <iomanip>
30#include <iostream>
31
33
/// Construct an inspector that takes ownership of the given page source.
///
/// \param pageSource Page source to inspect; it is moved into fPageSource.
///
/// NOTE(review): listing lines 39, 41 and 42 are absent from this extract, so what the constructor does with
/// `descriptorGuard` after acquiring it is not visible here.
34ROOT::Experimental::RNTupleInspector::RNTupleInspector(std::unique_ptr<ROOT::Internal::RPageSource> pageSource)
35 : fPageSource(std::move(pageSource))
36{
// Attach the page source first, then request the shared descriptor guard from it.
37 fPageSource->Attach();
38 auto descriptorGuard = fPageSource->GetSharedDescriptorGuard();
40
43}
44
45// NOTE: outlined to avoid including RPageStorage in the header
47
49{
50 fCompressedSize = 0;
51 fUncompressedSize = 0;
52
53 for (const auto &colDesc : fDescriptor.GetColumnIterable()) {
54 if (colDesc.IsAliasColumn())
55 continue;
56
57 auto colId = colDesc.GetPhysicalId();
58
59 // We generate the default memory representation for the given column type in order
60 // to report the size _in memory_ of column elements.
61 std::uint32_t elemSize = RColumnElementBase::Generate(colDesc.GetType())->GetSize();
62 std::uint64_t nElems = 0;
63 std::vector<std::uint64_t> compressedPageSizes{};
64
65 for (const auto &clusterDescriptor : fDescriptor.GetClusterIterable()) {
66 if (!clusterDescriptor.ContainsColumn(colId)) {
67 continue;
68 }
69
70 auto columnRange = clusterDescriptor.GetColumnRange(colId);
71 if (columnRange.IsSuppressed())
72 continue;
73
74 nElems += columnRange.GetNElements();
75
76 if (!fCompressionSettings && columnRange.GetCompressionSettings()) {
77 fCompressionSettings = *columnRange.GetCompressionSettings();
78 } else if (fCompressionSettings && columnRange.GetCompressionSettings() &&
79 (*fCompressionSettings != *columnRange.GetCompressionSettings())) {
80 // Note that currently all clusters and columns are compressed with the same settings and it is not yet
81 // possible to do otherwise. This means that currently, this exception should never be thrown, but this
82 // could change in the future.
83 throw RException(R__FAIL("compression setting mismatch between column ranges (" +
84 std::to_string(*fCompressionSettings) + " vs " +
85 std::to_string(*columnRange.GetCompressionSettings()) +
86 ") for column with physical ID " + std::to_string(colId)));
87 }
88
89 const auto &pageRange = clusterDescriptor.GetPageRange(colId);
90
91 for (const auto &page : pageRange.GetPageInfos()) {
92 compressedPageSizes.emplace_back(page.GetLocator().GetNBytesOnStorage());
93 fUncompressedSize += page.GetNElements() * elemSize;
94 }
95 }
96
97 fCompressedSize +=
98 std::accumulate(compressedPageSizes.begin(), compressedPageSizes.end(), static_cast<std::uint64_t>(0));
100 }
101}
102
105{
106 std::uint64_t compressedSize = 0;
107 std::uint64_t uncompressedSize = 0;
108
109 for (const auto &colDescriptor : fDescriptor.GetColumnIterable(fieldId)) {
110 auto colInfo = GetColumnInspector(colDescriptor.GetPhysicalId());
111 compressedSize += colInfo.GetCompressedSize();
112 uncompressedSize += colInfo.GetUncompressedSize();
113 }
114
115 for (const auto &subFieldDescriptor : fDescriptor.GetFieldIterable(fieldId)) {
116 auto subFieldId = subFieldDescriptor.GetId();
117
118 auto subFieldInfo = CollectFieldTreeInfo(subFieldId);
119
120 compressedSize += subFieldInfo.GetCompressedSize();
121 uncompressedSize += subFieldInfo.GetUncompressedSize();
122 }
123
124 auto fieldInfo = RFieldTreeInspector(fDescriptor.GetFieldDescriptor(fieldId), compressedSize, uncompressedSize);
125 fFieldTreeInfo.emplace(fieldId, fieldInfo);
126 return fieldInfo;
127}
128
129std::vector<ROOT::DescriptorId_t>
131{
132 std::vector<ROOT::DescriptorId_t> colIds;
133 std::deque<ROOT::DescriptorId_t> fieldIdQueue{fieldId};
134
135 while (!fieldIdQueue.empty()) {
136 auto currId = fieldIdQueue.front();
137 fieldIdQueue.pop_front();
138
139 for (const auto &col : fDescriptor.GetColumnIterable(currId)) {
140 if (col.IsAliasColumn()) {
141 continue;
142 }
143
144 colIds.emplace_back(col.GetPhysicalId());
145 }
146
147 for (const auto &fld : fDescriptor.GetFieldIterable(currId)) {
148 fieldIdQueue.push_back(fld.GetId());
149 }
150 }
151
152 return colIds;
153}
154
155std::unique_ptr<ROOT::Experimental::RNTupleInspector>
161
162std::unique_ptr<ROOT::Experimental::RNTupleInspector>
164{
166 return std::unique_ptr<RNTupleInspector>(new RNTupleInspector(std::move(pageSource)));
167}
168
170{
171 if (!fCompressionSettings)
172 return "unknown";
173
174 int algorithm = *fCompressionSettings / 100;
175 int level = *fCompressionSettings - (algorithm * 100);
176
178 " (level " + std::to_string(level) + ")";
179}
180
181//------------------------------------------------------------------------------
182
185{
186 if (physicalColumnId > fDescriptor.GetNPhysicalColumns()) {
187 throw RException(R__FAIL("No column with physical ID " + std::to_string(physicalColumnId) + " present"));
188 }
189
190 return fColumnInfo.at(physicalColumnId);
191}
192
194{
195 size_t typeCount = 0;
196
197 for (auto &[colId, colInfo] : fColumnInfo) {
198 if (colInfo.GetType() == colType) {
199 ++typeCount;
200 }
201 }
202
203 return typeCount;
204}
205
206const std::vector<ROOT::DescriptorId_t>
208{
209 std::vector<ROOT::DescriptorId_t> colIds;
210
211 for (const auto &[colId, colInfo] : fColumnInfo) {
212 if (colInfo.GetType() == colType)
213 colIds.emplace_back(colId);
214 }
215
216 return colIds;
217}
218
219const std::vector<ROOT::ENTupleColumnType> ROOT::Experimental::RNTupleInspector::GetColumnTypes()
220{
221 std::set<ROOT::ENTupleColumnType> colTypes;
222
223 for (const auto &[colId, colInfo] : fColumnInfo) {
224 colTypes.emplace(colInfo.GetType());
225 }
226
227 return std::vector(colTypes.begin(), colTypes.end());
228}
229
231{
232 struct ColumnTypeInfo {
233 std::uint64_t nElems = 0;
234 std::uint64_t compressedSize = 0;
235 std::uint64_t uncompressedSize = 0;
236 std::uint64_t nPages = 0;
237 std::uint32_t count = 0;
238
240 {
241 this->count++;
242 this->nElems += colInfo.GetNElements();
243 this->compressedSize += colInfo.GetCompressedSize();
244 this->uncompressedSize += colInfo.GetUncompressedSize();
245 this->nPages += colInfo.GetNPages();
246 }
247
248 // Helper method to calculate compression factor
249 float GetCompressionFactor() const
250 {
251 if (compressedSize == 0)
252 return 1.0;
253 return static_cast<float>(uncompressedSize) / static_cast<float>(compressedSize);
254 }
255 };
256
257 std::map<ENTupleColumnType, ColumnTypeInfo> colTypeInfo;
258
259 // Collect information for each column
260 for (const auto &[colId, colInfo] : fColumnInfo) {
261 colTypeInfo[colInfo.GetType()] += colInfo;
262 }
263
264 switch (format) {
266 output << " column type | count | # elements | compressed bytes | uncompressed bytes | compression ratio | "
267 "# pages \n"
268 << "----------------|---------|-------------|------------------|--------------------|-------------------|-"
269 "------"
270 << std::endl;
271 for (const auto &[colType, typeInfo] : colTypeInfo)
272 output << std::setw(15) << RColumnElementBase::GetColumnTypeName(colType) << " |" << std::setw(8)
273 << typeInfo.count << " |" << std::setw(12) << typeInfo.nElems << " |" << std::setw(17)
274 << typeInfo.compressedSize << " |" << std::setw(19) << typeInfo.uncompressedSize << " |" << std::fixed
275 << std::setprecision(3) << std::setw(18) << typeInfo.GetCompressionFactor() << " |" << std::setw(6)
276 << typeInfo.nPages << " " << std::endl;
277 break;
279 output << "columnType,count,nElements,compressedSize,uncompressedSize,compressionFactor,nPages" << std::endl;
280 for (const auto &[colType, typeInfo] : colTypeInfo) {
281 output << RColumnElementBase::GetColumnTypeName(colType) << "," << typeInfo.count << "," << typeInfo.nElems
282 << "," << typeInfo.compressedSize << "," << typeInfo.uncompressedSize << "," << std::fixed
283 << std::setprecision(3) << typeInfo.GetCompressionFactor() << "," << typeInfo.nPages << std::endl;
284 }
285 break;
286 default: R__ASSERT(false && "Invalid print format");
287 }
288}
289
290std::unique_ptr<TH1D>
292 std::string_view histName, std::string_view histTitle)
293{
294 if (histName.empty()) {
295 switch (histKind) {
296 case ENTupleInspectorHist::kCount: histName = "colTypeCountHist"; break;
297 case ENTupleInspectorHist::kNElems: histName = "colTypeElemCountHist"; break;
298 case ENTupleInspectorHist::kCompressedSize: histName = "colTypeCompSizeHist"; break;
299 case ENTupleInspectorHist::kUncompressedSize: histName = "colTypeUncompSizeHist"; break;
300 default: throw RException(R__FAIL("Unknown histogram type"));
301 }
302 }
303
304 if (histTitle.empty()) {
305 switch (histKind) {
306 case ENTupleInspectorHist::kCount: histTitle = "Column count by type"; break;
307 case ENTupleInspectorHist::kNElems: histTitle = "Number of elements by column type"; break;
308 case ENTupleInspectorHist::kCompressedSize: histTitle = "Compressed size by column type"; break;
309 case ENTupleInspectorHist::kUncompressedSize: histTitle = "Uncompressed size by column type"; break;
310 default: throw RException(R__FAIL("Unknown histogram type"));
311 }
312 }
313
314 auto hist = std::make_unique<TH1D>(std::string(histName).c_str(), std::string(histTitle).c_str(), 1, 0, 1);
315
316 double data;
317 for (const auto &[colId, colInfo] : fColumnInfo) {
318 switch (histKind) {
319 case ENTupleInspectorHist::kCount: data = 1.; break;
320 case ENTupleInspectorHist::kNElems: data = colInfo.GetNElements(); break;
321 case ENTupleInspectorHist::kCompressedSize: data = colInfo.GetCompressedSize(); break;
322 case ENTupleInspectorHist::kUncompressedSize: data = colInfo.GetUncompressedSize(); break;
323 default: throw RException(R__FAIL("Unknown histogram type"));
324 }
325
326 hist->AddBinContent(hist->GetXaxis()->FindBin(RColumnElementBase::GetColumnTypeName(colInfo.GetType())), data);
327 }
328
329 return hist;
330}
331
332std::unique_ptr<TH1D>
334 std::string histName, std::string histTitle, size_t nBins)
335{
336 if (histTitle.empty())
337 histTitle = "Page size distribution for column with ID " + std::to_string(physicalColumnId);
338
339 return GetPageSizeDistribution({physicalColumnId}, histName, histTitle, nBins);
340}
341
343 std::string histName,
344 std::string histTitle, size_t nBins)
345{
346 if (histName.empty())
347 histName = "pageSizeHistCol" + std::string{RColumnElementBase::GetColumnTypeName(colType)};
348 if (histTitle.empty())
349 histTitle =
350 "Page size distribution for columns with type " + std::string{RColumnElementBase::GetColumnTypeName(colType)};
351
352 auto perTypeHist = GetPageSizeDistribution({colType}, histName, histTitle, nBins);
353
354 if (perTypeHist->GetNhists() < 1)
355 return std::make_unique<TH1D>(histName.c_str(), histTitle.c_str(), 64, 0, 0);
356
357 auto hist = std::unique_ptr<TH1D>(dynamic_cast<TH1D *>(perTypeHist->GetHists()->First()));
358
359 hist->SetName(histName.c_str());
360 hist->SetTitle(histTitle.c_str());
361 hist->SetXTitle("Page size (B)");
362 hist->SetYTitle("N_{pages}");
363 return hist;
364}
365
366std::unique_ptr<TH1D>
368 std::string histName, std::string histTitle, size_t nBins)
369{
370 auto hist = std::make_unique<TH1D>();
371
372 if (histName.empty())
373 histName = "pageSizeHist";
374 hist->SetName(histName.c_str());
375 if (histTitle.empty())
376 histTitle = "Page size distribution";
377 hist->SetTitle(histTitle.c_str());
378 hist->SetXTitle("Page size (B)");
379 hist->SetYTitle("N_{pages}");
380
381 std::vector<std::uint64_t> pageSizes;
382 std::for_each(colIds.begin(), colIds.end(), [this, &pageSizes](const auto colId) {
383 auto colInfo = GetColumnInspector(colId);
384 pageSizes.insert(pageSizes.end(), colInfo.GetCompressedPageSizes().begin(),
385 colInfo.GetCompressedPageSizes().end());
386 });
387
388 if (!pageSizes.empty()) {
389 auto histMinMax = std::minmax_element(pageSizes.begin(), pageSizes.end());
390 hist->SetBins(nBins, *histMinMax.first,
391 *histMinMax.second + ((*histMinMax.second - *histMinMax.first) / static_cast<double>(nBins)));
392
393 for (const auto pageSize : pageSizes) {
394 hist->Fill(pageSize);
395 }
396 }
397
398 return hist;
399}
400
/// Build a stack of page size histograms, one per column type.
///
/// \param colTypes Column types to include; when empty, the types returned by GetColumnTypes() are used instead.
/// \param histName Name of the returned THStack; "pageSizeHist" when empty.
/// \param histTitle Title of the returned THStack; "Per-column type page size distribution" when empty.
/// \param nBins Used in the visible code to extend the upper axis bound by (max - min) / nBins; presumably also the
/// bin count of each histogram, but the constructor arguments are on lines missing from this extract.
/// \return The THStack holding the per-type histograms.
///
/// NOTE(review): listing lines 436, 445 and 446 are absent from this extract. Nothing visible here inserts into
/// `pageSizes`, and the TH1D constructor call below is incomplete; both are presumably on the missing lines.
401std::unique_ptr<THStack>
402ROOT::Experimental::RNTupleInspector::GetPageSizeDistribution(std::initializer_list<ROOT::ENTupleColumnType> colTypes,
403 std::string histName, std::string histTitle, size_t nBins)
404{
405 if (histName.empty())
406 histName = "pageSizeHist";
407 if (histTitle.empty())
408 histTitle = "Per-column type page size distribution";
409
410 auto stackedHist = std::make_unique<THStack>(histName.c_str(), histTitle.c_str());
411
// histMin / histMax track the smallest and largest page size seen across all processed column types.
412 double histMin = std::numeric_limits<double>::max();
413 double histMax = 0;
414 std::map<ROOT::ENTupleColumnType, std::vector<std::uint64_t>> pageSizes;
415
// An empty type list selects every column type reported by GetColumnTypes().
416 std::vector<ROOT::ENTupleColumnType> colTypeVec = colTypes;
417 if (std::empty(colTypes)) {
418 colTypeVec = GetColumnTypes();
419 }
420
421 for (const auto colType : colTypeVec) {
422 auto colIds = GetColumnsByType(colType);
423
// Types without any column contribute nothing.
424 if (colIds.empty())
425 continue;
426
// Concatenate the compressed page sizes of all columns of this type.
427 std::vector<std::uint64_t> pageSizesForColType;
428 std::for_each(colIds.cbegin(), colIds.cend(), [this, &pageSizesForColType](const auto colId) {
429 auto colInfo = GetColumnInspector(colId);
430 pageSizesForColType.insert(pageSizesForColType.end(), colInfo.GetCompressedPageSizes().begin(),
431 colInfo.GetCompressedPageSizes().end());
432 });
// Types whose columns have no pages are skipped as well.
433 if (pageSizesForColType.empty())
434 continue;
435
437
// Widen the running minimum / maximum with this type's smallest and largest page size.
438 auto histMinMax = std::minmax_element(pageSizesForColType.begin(), pageSizesForColType.end());
439 histMin = std::min(histMin, static_cast<double>(*histMinMax.first));
440 histMax = std::max(histMax, static_cast<double>(*histMinMax.second));
441 }
442
// One histogram per entry in pageSizes; the last constructor argument visible below is histMax extended by
// (histMax - histMin) / nBins.
443 for (const auto &[colType, pageSizesForColType] : pageSizes) {
444 auto hist = std::make_unique<TH1D>(
447 histMax + ((histMax - histMin) / static_cast<double>(nBins)));
448
449 for (const auto pageSize : pageSizesForColType) {
450 hist->Fill(pageSize);
451 }
452
// release() hands the raw pointer to the stack; the unique_ptr no longer manages the histogram afterwards.
453 stackedHist->Add(hist.release());
454 }
455
456 return stackedHist;
457}
458
459//------------------------------------------------------------------------------
460
463{
464 if (fieldId >= fDescriptor.GetNFields()) {
465 throw RException(R__FAIL("No field with ID " + std::to_string(fieldId) + " present"));
466 }
467
468 return fFieldTreeInfo.at(fieldId);
469}
470
473{
474 auto fieldId = fDescriptor.FindFieldId(fieldName);
475
477 throw RException(R__FAIL("Could not find field `" + std::string(fieldName) + "`"));
478 }
479
480 return GetFieldTreeInspector(fieldId);
481}
482
484 bool includeSubfields) const
485{
486 size_t typeCount = 0;
487
488 for (auto &[fldId, fldInfo] : fFieldTreeInfo) {
489 if (!includeSubfields && fldInfo.GetDescriptor().GetParentId() != fDescriptor.GetFieldZeroId()) {
490 continue;
491 }
492
493 if (std::regex_match(fldInfo.GetDescriptor().GetTypeName(), typeNamePattern)) {
494 typeCount++;
495 }
496 }
497
498 return typeCount;
499}
500
501const std::vector<ROOT::DescriptorId_t>
503{
504 std::vector<ROOT::DescriptorId_t> fieldIds;
505
506 for (auto &[fldId, fldInfo] : fFieldTreeInfo) {
507
508 if (!searchInSubfields && fldInfo.GetDescriptor().GetParentId() != fDescriptor.GetFieldZeroId()) {
509 continue;
510 }
511
512 if (std::regex_match(fldInfo.GetDescriptor().GetFieldName(), fieldNamePattern)) {
513 fieldIds.emplace_back(fldId);
514 }
515 }
516
517 return fieldIds;
518}
#define R__FAIL(msg)
Short-hand to return an RResult<T> in an error state; the RError is implicitly converted into RResult...
Definition RError.hxx:299
ROOT::Detail::TRangeCast< T, true > TRangeDynCast
TRangeDynCast is an adapter class that allows the typed iteration through a TCollection.
#define R__ASSERT(e)
Checks condition e and reports a fatal error if it's false.
Definition TError.h:125
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void data
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t Int_t Int_t UInt_t UInt_t Rectangle_t Int_t Int_t Window_t TString Int_t GCValues_t GetPrimarySelectionOwner GetDisplay GetScreen GetColormap GetNativeEvent const char const char dpyName wid window const char font_name cursor keysym reg const char only_if_exist regb h Point_t winding char text const char depth char const char Int_t count const char ColorStruct_t color const char Pixmap_t Pixmap_t PictureAttributes_t attr const char char ret_data h unsigned char height h Atom_t Int_t ULong_t ULong_t unsigned char prop_list Atom_t Atom_t Atom_t Time_t format
std::string & operator+=(std::string &left, const TString &right)
Definition TString.h:486
The available trivial, native content types of a column.
Provides column-level storage information.
Inspect on-disk and storage-related information of an RNTuple.
std::vector< ROOT::DescriptorId_t > GetColumnsByFieldId(ROOT::DescriptorId_t fieldId) const
Get the columns that make up the given field, including its subfields.
const RFieldTreeInspector & GetFieldTreeInspector(ROOT::DescriptorId_t fieldId) const
Get storage information for a given (sub)field by ID.
std::unique_ptr< TH1D > GetPageSizeDistribution(ROOT::DescriptorId_t physicalColumnId, std::string histName="", std::string histTitle="", size_t nBins=64)
Get a histogram containing the size distribution of the compressed pages for an individual column.
size_t GetColumnCountByType(ROOT::ENTupleColumnType colType) const
Get the number of columns of a given type present in the RNTuple.
const std::vector< ROOT::ENTupleColumnType > GetColumnTypes()
Get all column types present in the RNTuple being inspected.
size_t GetFieldCountByType(const std::regex &typeNamePattern, bool searchInSubfields=true) const
Get the number of fields of a given type or class present in the RNTuple.
const std::vector< ROOT::DescriptorId_t > GetFieldsByName(const std::regex &fieldNamePattern, bool searchInSubfields=true) const
Get the IDs of (sub-)fields whose name matches the given string.
std::string GetCompressionSettingsAsString() const
Get a string describing compression settings of the RNTuple being inspected.
RFieldTreeInspector CollectFieldTreeInfo(ROOT::DescriptorId_t fieldId)
Recursively gather field-level information.
RNTupleInspector(std::unique_ptr< ROOT::Internal::RPageSource > pageSource)
void PrintColumnTypeInfo(ENTupleInspectorPrintFormat format=ENTupleInspectorPrintFormat::kTable, std::ostream &output=std::cout)
Print storage information per column type.
const RColumnInspector & GetColumnInspector(ROOT::DescriptorId_t physicalColumnId) const
Get storage information for a given column.
std::unique_ptr< ROOT::Internal::RPageSource > fPageSource
static std::unique_ptr< RNTupleInspector > Create(const RNTuple &sourceNTuple)
Create a new RNTupleInspector.
const std::vector< ROOT::DescriptorId_t > GetColumnsByType(ROOT::ENTupleColumnType colType)
Get the IDs of all columns with the given type.
void CollectColumnInfo()
Gather column-level and RNTuple-level information.
std::unique_ptr< TH1D > GetColumnTypeInfoAsHist(ENTupleInspectorHist histKind, std::string_view histName="", std::string_view histTitle="")
Get a histogram showing information for each column type present.
A column element encapsulates the translation between basic C++ types and their column representation...
static const char * GetColumnTypeName(ROOT::ENTupleColumnType type)
static std::unique_ptr< RColumnElementBase > Generate(ROOT::ENTupleColumnType type)
If CppT == void, use the default C++ type for the given column type.
static std::unique_ptr< RPageSourceFile > CreateFromAnchor(const RNTuple &anchor, const ROOT::RNTupleReadOptions &options=ROOT::RNTupleReadOptions())
Used from the RNTuple class to build a datasource if the anchor is already available.
static std::unique_ptr< RPageSource > Create(std::string_view ntupleName, std::string_view location, const ROOT::RNTupleReadOptions &options=ROOT::RNTupleReadOptions())
Guess the concrete derived page source from the file name (location)
Base class for all ROOT issued exceptions.
Definition RError.hxx:79
ROOT::DescriptorId_t GetFieldZeroId() const
Returns the logical parent of all top-level RNTuple data fields.
Representation of an RNTuple data set in a ROOT file.
Definition RNTuple.hxx:65
const_iterator begin() const
const_iterator end() const
1-D histogram with a double per channel (see TH1 documentation)
Definition TH1.h:925
static TString Format(const char *fmt,...)
Static method which formats a string using a printf style format descriptor and return a TString.
Definition TString.cxx:2378
std::uint64_t DescriptorId_t
Distinguishes elements of the same type within a descriptor, e.g. different fields.
constexpr DescriptorId_t kInvalidDescriptorId
EValues
Note: this is only temporarily a struct and will become a enum class hence the name convention used.
Definition Compression.h:88
static std::string AlgorithmToString(EAlgorithm::EValues algorithm)
static void output()