|
27 | 27 | #include "util/InputRangeUtils.h" |
28 | 28 | #include "util/Iterators.h" |
29 | 29 | #include "util/JoinAlgorithms/JoinAlgorithms.h" |
| 30 | +#include "util/ParallelExecutor.h" |
30 | 31 | #include "util/ProgressBar.h" |
31 | 32 | #include "util/ThreadSafeQueue.h" |
32 | 33 | #include "util/Timer.h" |
@@ -1787,3 +1788,107 @@ void IndexImpl::setPrefixesForEncodedValues( |
1787 | 1788 | encodedIriManager_ = |
1788 | 1789 | EncodedIriManager{std::move(prefixesWithoutAngleBrackets)}; |
1789 | 1790 | } |
| 1791 | + |
| 1792 | +// _____________________________________________________________________________ |
| 1793 | +void IndexImpl::countDistinct(std::optional<Id>& lastId, size_t& counter, |
| 1794 | + const IdTable& table) { |
| 1795 | + AD_CORRECTNESS_CHECK( |
| 1796 | + !table.empty(), "Empty tables should never be yielded by the lazy scan."); |
| 1797 | + auto col = table.getColumn(0); |
| 1798 | + counter += ql::ranges::distance(col | ::ranges::views::unique([](Id a, Id b) { |
| 1799 | + return a.getBits() == b.getBits(); |
| 1800 | + })); |
| 1801 | + if (lastId == col.front()) { |
| 1802 | + // Avoid double counting in case the last id of the previous block is the |
| 1803 | + // same as the first id of this block. |
| 1804 | + counter--; |
| 1805 | + } |
| 1806 | + lastId = col.back(); |
| 1807 | +} |
| 1808 | + |
| 1809 | +namespace { |
| 1810 | +// Helper function that returns a packaged task that computes distinct counts |
| 1811 | +// over all tables produced by scanning the given permutation. The customAction |
| 1812 | +// is invoked for each table to allow for additional computations while |
| 1813 | +// scanning. |
| 1814 | +std::packaged_task<void()> computeStatistics( |
| 1815 | + const LocatedTriplesSharedState& locatedTriplesSharedState, size_t& counter, |
| 1816 | + const Permutation& permutation, auto customAction) { |
| 1817 | + return std::packaged_task{[&counter, &permutation, &locatedTriplesSharedState, |
| 1818 | + customAction = std::move(customAction)]() { |
| 1819 | + auto cancellationHandle = |
| 1820 | + std::make_shared<ad_utility::SharedCancellationHandle::element_type>(); |
| 1821 | + ScanSpecification scanSpec{std::nullopt, std::nullopt, std::nullopt}; |
| 1822 | + auto tables = permutation.lazyScan( |
| 1823 | + permutation.getScanSpecAndBlocks(scanSpec, *locatedTriplesSharedState), |
| 1824 | + std::nullopt, CompressedRelationReader::ColumnIndicesRef{}, |
| 1825 | + cancellationHandle, *locatedTriplesSharedState); |
| 1826 | + std::optional<Id> lastCol0 = std::nullopt; |
| 1827 | + for (const auto& table : tables) { |
| 1828 | + std::invoke(customAction, table); |
| 1829 | + IndexImpl::countDistinct(lastCol0, counter, table); |
| 1830 | + } |
| 1831 | + }}; |
| 1832 | +} |
| 1833 | +} // namespace |
| 1834 | + |
| 1835 | +// _____________________________________________________________________________ |
| 1836 | +nlohmann::json IndexImpl::recomputeStatistics( |
| 1837 | + const LocatedTriplesSharedState& locatedTriplesSharedState) const { |
| 1838 | + size_t numTriples = 0; |
| 1839 | + size_t numTriplesInternal = 0; |
| 1840 | + size_t numSubjects = 0; |
| 1841 | + size_t numPredicates = 0; |
| 1842 | + size_t numPredicatesInternal = 0; |
| 1843 | + size_t numObjects = 0; |
| 1844 | + uint64_t nextBlankNode = 0; |
| 1845 | + |
| 1846 | + std::vector<std::packaged_task<void()>> tasks; |
| 1847 | + |
| 1848 | + auto getCounterTask = [&locatedTriplesSharedState]( |
| 1849 | + size_t& counter, const Permutation& permutation, |
| 1850 | + auto customAction) { |
| 1851 | + return computeStatistics(locatedTriplesSharedState, counter, permutation, |
| 1852 | + customAction); |
| 1853 | + }; |
| 1854 | + |
| 1855 | + tasks.push_back(getCounterTask( |
| 1856 | + numPredicates, *pso_, |
| 1857 | + [&numTriples, &nextBlankNode](const IdTable& table) { |
| 1858 | + numTriples += table.numRows(); |
| 1859 | + for (auto col : table.getColumns()) { |
| 1860 | + for (auto id : col) { |
| 1861 | + if (id.getDatatype() == Datatype::BlankNodeIndex) { |
| 1862 | + nextBlankNode = |
| 1863 | + std::max(nextBlankNode, id.getBlankNodeIndex().get() + 1); |
| 1864 | + } |
| 1865 | + } |
| 1866 | + } |
| 1867 | + })); |
| 1868 | + |
| 1869 | + tasks.push_back(getCounterTask(numPredicatesInternal, |
| 1870 | + pso_->internalPermutation(), |
| 1871 | + [&numTriplesInternal](const IdTable& table) { |
| 1872 | + numTriplesInternal += table.numRows(); |
| 1873 | + })); |
| 1874 | + |
| 1875 | + if (hasAllPermutations()) { |
| 1876 | + tasks.push_back(getCounterTask(numSubjects, *spo_, ad_utility::noop)); |
| 1877 | + tasks.push_back(getCounterTask(numObjects, *osp_, ad_utility::noop)); |
| 1878 | + } |
| 1879 | + ad_utility::runTasksInParallel(std::move(tasks)); |
| 1880 | + auto configuration = configurationJson_; |
| 1881 | + configuration["num-triples"] = |
| 1882 | + NumNormalAndInternal{numTriples, numTriplesInternal}; |
| 1883 | + configuration["num-predicates"] = |
| 1884 | + NumNormalAndInternal{numPredicates, numPredicatesInternal}; |
| 1885 | + if (hasAllPermutations()) { |
| 1886 | + // These are unused. |
| 1887 | + AD_CORRECTNESS_CHECK(numSubjects_.internal == 0); |
| 1888 | + AD_CORRECTNESS_CHECK(numObjects_.internal == 0); |
| 1889 | + configuration["num-subjects"] = NumNormalAndInternal{numSubjects, 0}; |
| 1890 | + configuration["num-objects"] = NumNormalAndInternal{numObjects, 0}; |
| 1891 | + } |
| 1892 | + configuration["num-blank-nodes-total"] = nextBlankNode; |
| 1893 | + return configuration; |
| 1894 | +} |
0 commit comments