pax_global_header00006660000000000000000000000064136520457410014521gustar00rootroot0000000000000052 comment=dba95c1469a111cfda1e7938d5887d512b41d7b4 mmmulti-0.1/000077500000000000000000000000001365204574100130455ustar00rootroot00000000000000mmmulti-0.1/.gitignore000066400000000000000000000000521365204574100150320ustar00rootroot00000000000000\#* .#* *~ build/ bin/ test/ .gdb_history mmmulti-0.1/CMakeLists.txt000066400000000000000000000123431365204574100156100ustar00rootroot00000000000000# Specify the minimum version for CMake cmake_minimum_required(VERSION 3.1) # Project's name project(mmmulti) # We build using c++14 set(CMAKE_CXX_STANDARD 14) set(THREADS_PREFER_PTHREAD_FLAG ON) find_package(Threads REQUIRED) # Use all standard-compliant optimizations set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O3 -mcx16 -g") set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -mcx16 -g") # Set the output folder where your program will be created set(CMAKE_BINARY_DIR ${CMAKE_SOURCE_DIR}/bin) set(EXECUTABLE_OUTPUT_PATH ${CMAKE_BINARY_DIR}) set(LIBRARY_OUTPUT_PATH ${CMAKE_SOURCE_DIR}/lib) # The following folder will be included include_directories("${PROJECT_SOURCE_DIR}") # Add external projects include(${CMAKE_ROOT}/Modules/ExternalProject.cmake) # TODO: We're using INSTALL_DIR very wrong. We *should* be actually installing # the external projects into their prefixes and working with the installed # files. Instead we're building but not installing them and trying to work with # the non-installed build trees. # # Hence the blanked out INSTALL_COMMANDs to suppress the install step. # # We need to NOT blank out UPDATE_COMMAND or we can never change the Git revision we point to. # The cost of this is that we have to re-configure on every build if we do update. 
# sdsl-lite (full build using its cmake config)
# Only the configure and build steps run: headers and static archives are
# consumed straight from the external project's build tree, so the install step
# stays disabled.
ExternalProject_Add(sdsl-lite
  GIT_REPOSITORY "https://github.com/simongog/sdsl-lite.git"
  GIT_TAG "ddb0fbbc33bb183baa616f17eb48e261ac2a3672"
  # <INSTALL_DIR> is a placeholder that ExternalProject substitutes itself.
  # A plain variable reference here is expanded before any
  # ExternalProject_Get_Property call has defined it, which hands an empty
  # install prefix to sdsl-lite.
  # TODO ADD static build flag
  CMAKE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=<INSTALL_DIR>
  UPDATE_COMMAND ""
  INSTALL_COMMAND "")
# Derive the paths from the project's real build directory rather than assuming
# the default <prefix>/src/<name>-build layout.
ExternalProject_Get_Property(sdsl-lite BINARY_DIR)
set(sdsl-lite_INCLUDE "${BINARY_DIR}/include")
set(sdsl-lite-divsufsort_INCLUDE "${BINARY_DIR}/external/libdivsufsort/include")
set(sdsl-lite_LIB "${BINARY_DIR}/lib")
set(sdsl-lite-divsufsort_LIB "${BINARY_DIR}/external/libdivsufsort/lib")

# DYNAMIC, header only: fetched but never configured, built or installed
ExternalProject_Add(dynamic
  GIT_REPOSITORY "https://github.com/vgteam/DYNAMIC.git"
  GIT_TAG "615d8be5276bcd4c5a3d8e31679b4f8e81b2eefc"
  UPDATE_COMMAND ""
  INSTALL_COMMAND ""
  BUILD_COMMAND ""
  CONFIGURE_COMMAND "")
ExternalProject_Get_Property(dynamic SOURCE_DIR)
set(dynamic_INCLUDE "${SOURCE_DIR}/include")

# In-place Parallel Super Scalar Samplesort (IPS⁴o), header only
# Alternative fork previously used:
#   GIT_REPOSITORY "https://github.com/ekg/ips4o.git"
#   GIT_TAG "f7eccc5a26928b5c0772922f7b76072414ef1801"
ExternalProject_Add(ips4o
  GIT_REPOSITORY "https://github.com/SaschaWitt/ips4o.git"
  GIT_TAG "a34d7d40c0f1279510e35e0dc2c69637b3c5d0b6"
  UPDATE_COMMAND ""
  INSTALL_COMMAND ""
  BUILD_COMMAND ""
  CONFIGURE_COMMAND "")
ExternalProject_Get_Property(ips4o SOURCE_DIR)
set(ips4o_INCLUDE "${SOURCE_DIR}")

# atomic queue, header only
ExternalProject_Add(atomicqueue
  GIT_REPOSITORY "https://github.com/max0x7ba/atomic_queue.git"
  GIT_TAG "430f732da0889b090705ad00ce15d4463fe7b536"
  UPDATE_COMMAND ""
  INSTALL_COMMAND ""
  BUILD_COMMAND ""
  CONFIGURE_COMMAND "")
ExternalProject_Get_Property(atomicqueue SOURCE_DIR)
set(atomicqueue_INCLUDE "${SOURCE_DIR}")

# paryfor parallel_for
ExternalProject_Add(paryfor
  GIT_REPOSITORY "https://github.com/ekg/paryfor"
  GIT_TAG "509b28a092f732a068e2908bb9e359a8562cd32f"
  UPDATE_COMMAND ""
  INSTALL_COMMAND ""
  BUILD_COMMAND ""
  CONFIGURE_COMMAND "")
ExternalProject_Get_Property(paryfor SOURCE_DIR)
set(paryfor_INCLUDE "${SOURCE_DIR}")

# taywee's C++ args library, header only
# Only args.hxx from the source tree is used, so the configure and build steps
# are disabled like for the other header-only dependencies.
ExternalProject_Add(tayweeargs
  GIT_REPOSITORY "https://github.com/Taywee/args.git"
  GIT_TAG "3de44ec671db452cc0c4ef86399b108939768abb"
  UPDATE_COMMAND ""
  INSTALL_COMMAND ""
  BUILD_COMMAND ""
  CONFIGURE_COMMAND "")
ExternalProject_Get_Property(tayweeargs SOURCE_DIR)
set(tayweeargs_INCLUDE "${SOURCE_DIR}")

# set up our target executable and specify its dependencies and includes
add_executable(mmmulti
  "${PROJECT_SOURCE_DIR}/src/main.cpp")

# add_dependencies only orders the build: every external project must have been
# fetched (and sdsl-lite built) before main.cpp is compiled.
add_dependencies(mmmulti
  sdsl-lite
  dynamic
  ips4o
  atomicqueue
  paryfor
  tayweeargs)

# An executable has no consumers, so its usage requirements are PRIVATE.
target_include_directories(mmmulti PRIVATE
  "${PROJECT_SOURCE_DIR}/src"
  "${sdsl-lite_INCLUDE}"
  "${sdsl-lite-divsufsort_INCLUDE}"
  "${dynamic_INCLUDE}"
  "${ips4o_INCLUDE}"
  "${atomicqueue_INCLUDE}"
  "${paryfor_INCLUDE}"
  "${tayweeargs_INCLUDE}")

# Static archives produced in the sdsl-lite build tree, needed on every platform.
target_link_libraries(mmmulti PRIVATE
  "${sdsl-lite_LIB}/libsdsl.a"
  "${sdsl-lite-divsufsort_LIB}/libdivsufsort.a"
  "${sdsl-lite-divsufsort_LIB}/libdivsufsort64.a")

# macOS doesn't want you to link in libatomic this way
if(NOT APPLE)
  target_link_libraries(mmmulti PRIVATE "-latomic")
endif()

target_link_libraries(mmmulti PRIVATE Threads::Threads)

if(NOT APPLE)
  # this was hard to track down
  # https://stackoverflow.com/questions/35116327/when-g-static-link-pthread-cause-segmentation-fault-why
  # Appended rather than overwritten so that linker flags supplied by the user
  # (command line or LDFLAGS) are kept.
  set(CMAKE_EXE_LINKER_FLAGS
    "${CMAKE_EXE_LINKER_FLAGS} -static -Wl,--whole-archive -lpthread -Wl,--no-whole-archive")
endif()
mmmulti-0.1/LICENSE000066400000000000000000000020701365204574100140510ustar00rootroot00000000000000The MIT License (MIT) Copyright (c) 2019 Erik Garrison Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. mmmulti-0.1/README.md000066400000000000000000000175471365204574100143420ustar00rootroot00000000000000# mmmulti memory-mapped multimap, multiset, and (implicit) interval tree ## rationale Sometimes you have a lot of plain-old data, but you need random access to it. These header-only classes combine memory-mapped files with high-performance parallel sorting and appropriate indexing strategies to support very large (>memory but mmap("temp.dat"); ``` We can then add key pairs: ```c++ mmap.open_writer(); // required before adding keys, opens a writer/coordinator thread mmap.append(key1, value1); mmap.append(key1, value2); mmap.append(key1, value3); ``` Calls to `append` are threadsafe once `open_writer` has been called. 
To query the `mmmulti::map`, first index it, providing the maximum key to expect (remember, we're working on dense keys!): ```c++ mmap.index(num_threads, max_key); ``` If we index without providing a maximum key, like this: ```c++ mmap.index(num_threads); ``` ... then we don't pad the multimap, and we can only enumerate the keys inside using e.g. `for_each_pair`. This can have some advantages, such as not requiring that our values be non-null (positive integers for instance, or structs with non-null entries). This allows us to simply use the sorted array functionality of the `mmmulti::map`. #### indexing algorithm If `max_key` is specified, we first pad the records with one key/null record per key in the range of `[0,max_key]`. Then, we memory map this file and apply the [ips4o](https://github.com/SaschaWitt/ips4o) in-place parallel super scalar samplesort to the memory mapped buffer to order the records. When padded (`max_key` is specified), the index is completed by building a bitvector of length `max_key`, marking 1 for the first instance of a key in the sorted order, and building an auxiliary data structure to support `select_1(n)` queries on it that allow us to find the records associated with a given key. Without this index, we can only iterate through the key/value pairs. #### supported queries It is now possible to iterate through the records in order with `for_each_pair`. We can look up the nth key or value with `nth_key` and `nth_value`, which can be used to enable parallel traversal of the keys externally. And we can iterate over the values or unique values of a given key with `for_values_of` and `for_unique_values_of`. ## mmmulti::set memory-mapped multiset with iteration This is similar to `mmmulti::map`, but useful where random access to values is not required, and where contiguity of keys is not possible. It drops the index structures and padding, saving space, but preserves the same API. The `mmmulti::set` only provides iteration across its key space. 
As such, it's useful where we need to collect and count a set of entities. Random access is not currently supported (TODO: it would be easy to implement using binary search, but current applications do not require it). ### usage To construct the `mmmulti::set`: ```c++ #include "mmmultiset.hpp" mmmulti::set mset("temp.dat"); ``` We can then add values: ```c++ mset.open_writer(); // required before adding values, opens a writer/coordinator thread mset.append(value1); mset.append(value2); mset.append(value3); ``` Calls to `append` are threadsafe once `open_writer` has been called. To use the `mmmulti::set`, first index it: ```c++ mset.index(num_threads); ``` #### indexing algorithm Indexing closes the writer, memory maps the backing file and applies the [ips4o](https://github.com/SaschaWitt/ips4o) in-place parallel super scalar samplesort to the memory mapped buffer to order the records. #### supported queries It is now possible to iterate through the records in order with `for_each_value`, along with their counts with `for_each_value_count`, or just unique values with `for_each_unique_value`. ## mmmulti::iitree memory-mapped implicit interval tree The implicit interval tree data structure sorts a collection of intervals into a linear array ([cgranges](https://github.com/lh3/cgranges)). Tree traversal is achieved by jumping between array indexes. `mmmulti::iitree` implements this model on top of a disk-backed memory-mapped array, and uses the [ips4o](https://github.com/SaschaWitt/ips4o) in-place parallel super scalar samplesort to speed up sorting and index generation. Usage is similar to other classes in `mmmulti`. 
### usage To construct the `mmmulti::iitree`: ```c++ #include "mmiitree.hpp" // template arguments are range integer type and stored value mmmulti::iitree tree("temp.dat"); ``` We can then add ranges: ```c++ tree.open_writer(); // required before adding intervals, opens a writer/coordinator thread tree.add(start1, end1, data1); tree.add(start2, end2, data2); tree.add(start3, end3, data3); ``` Calls to `add` are threadsafe once `open_writer` has been called. To use the `mmmulti::iitree`, first index it: ```c++ tree.index(num_threads); ``` #### indexing algorithm Indexing closes the writer, memory maps the backing file and applies the [ips4o](https://github.com/SaschaWitt/ips4o) in-place parallel super scalar samplesort to the memory mapped buffer to order the records. The indexing procedure from [cgranges](https://github.com/lh3/cgranges) is then applied to set up the implicit interval tree. #### supported queries To find overlaps for a given query, use `mmmulti::iitree::overlap`. This returns a vector of range ranks in the sorted set of ranges. We can then look up the start, end, and data fields of these in the backing tree. For efficiency, this is done by callback. ```c++ tree.overlap( n, m, [&](const uint64_t& start, const uint64_t& end, const Data& data) { // process record std::cout << "start " << start << std::endl; std::cout << "end " << end << std::endl; std::cout << "data " << data << std::endl; }); ``` It's also possible to iterate through the ranges with `for_each_entry`, and also with their counts where duplicates are present, using `for_each_entry_count`. ## building and testing `mmmulti`'s classes are intended to be used as libraries in other applications. The namespace can be included easily as a CMake ExternalProject. A test utility is included to demonstrate usage. To build it: ``` cmake -H. 
-Bbuild && cmake --build build -- -j 4 ``` And to run some tests: ``` bin/mmmulti -t x -s 10000000 -t 4 10040123 keys 15099011 values 10688272 unique pairs rm -f x # removes test file ``` ## development By adding a PMHF to the frontend, it should be possible to project arbitrary key sets into the dense range required by `mmmulti::map` with only a few bits of overhead per entry. This would also obviate the need to pad our key space. ## author Erik Garrison ## license MIT mmmulti-0.1/src/000077500000000000000000000000001365204574100136345ustar00rootroot00000000000000mmmulti-0.1/src/main.cpp000066400000000000000000000276711365204574100153010ustar00rootroot00000000000000#include #include #include #include #include #include "ips4o.hpp" #include "args.hxx" #include #include "mmmultimap.hpp" #include "mmmultiset.hpp" #include "mmiitree.hpp" #include "paryfor.hpp" int main(int argc, char** argv) { args::ArgumentParser parser("memmapped multimap interface"); args::HelpFlag help(parser, "help", "display this help summary", {'h', "help"}); //args::ValueFlag in_file(parser, "FILE", "use this input file for a uint64_t sort", {'i', "in"}); args::ValueFlag test_file(parser, "FILE", "test mmmultimap with random data in this file", {'T', "test-file"}); args::ValueFlag test_size(parser, "N", "test this many pairs", {'s', "test-size"}); args::ValueFlag max_val(parser, "N", "generate test data in the range [1,max_value]", {'M', "max-value"}); args::ValueFlag threads(parser, "N", "number of threads to use", {'t', "threads"}); args::ValueFlag unique_value_tests(parser, "N", "number of unique value calls to make", {'u', "unique-vals"}); args::Flag test_multiset(parser, "multiset", "test the multiset", {'m', "test-multiset"}); args::Flag test_complex(parser, "complex", "test the multimap with complex values", {'c', "test-complex-values"}); args::Flag test_unpadded(parser, "unpadded", "test the multimap without padding for random access", {'P', "test-unpadded"}); args::Flag 
test_iitree(parser, "iitree", "test the iitree from cranges", {'I', "test-iitree"}); try { parser.ParseCLI(argc, argv); } catch (args::Help) { std::cout << parser; return 0; } catch (args::ParseError e) { std::cerr << e.what() << std::endl; std::cerr << parser; return 1; } if (argc==1) { std::cout << parser; return 1; } int num_threads = 1; if (args::get(threads)) { num_threads = args::get(threads); } if (!args::get(test_file).empty() && !args::get(test_multiset) && !args::get(test_iitree)) { if (args::get(test_complex)) { std::random_device rd; //Will be used to obtain a seed for the random number engine std::mt19937 gen(rd()); //Standard mersenne_twister_engine seeded with rd() uint64_t max_key = args::get(max_val); std::uniform_int_distribution dis(1, max_key); //std::vector x; x.reserve(1e8); std::remove(args::get(test_file).c_str()); mmmulti::map> mm(args::get(test_file)); uint64_t x_len = args::get(test_size); mm.open_writer(); paryfor::parallel_for( 0, x_len, num_threads, 1000, [&](uint64_t n) { mm.append(dis(gen), std::make_pair(dis(gen), dis(gen))); }); mm.index(num_threads, max_key); uint64_t i = 0; uint64_t key_count = 0; uint64_t value_count = 0; uint64_t unique_value_count = 0; bool first = true; uint64_t last = 0; mm.for_each_pair([&](const uint64_t& a, const std::pair& b) { if (first || a > last) { ++key_count; last = a; first = false; mm.for_unique_values_of(a, [&](const std::pair& v) { ++unique_value_count; }); } ++value_count; }); // exercise unique value search uint64_t unique_value_test_count = args::get(unique_value_tests); auto start = std::chrono::system_clock::now(); uint64_t x = 0; for (uint64_t i = 0; i < unique_value_test_count; ++i) { uint64_t q = dis(gen); mm.for_unique_values_of(q, [&](const std::pair& v) { ++x; }); } auto end = std::chrono::system_clock::now(); std::chrono::duration elapsed_seconds = end-start; std::cerr << key_count << " keys" << std::endl; std::cerr << value_count << " values" << std::endl; std::cerr << 
unique_value_count << " unique pairs" << std::endl; if (unique_value_test_count) std::cerr << elapsed_seconds.count()/unique_value_test_count << "s per unique value call" << std::endl; } else if (args::get(test_unpadded)) { std::random_device rd; //Will be used to obtain a seed for the random number engine std::mt19937 gen(rd()); //Standard mersenne_twister_engine seeded with rd() uint64_t max_key = args::get(max_val); std::uniform_int_distribution dis(1, max_key); //std::vector x; x.reserve(1e8); std::remove(args::get(test_file).c_str()); mmmulti::map mm(args::get(test_file)); uint64_t x_len = args::get(test_size); mm.open_writer(); paryfor::parallel_for( 0, x_len, num_threads, 1000, [&](uint64_t n) { mm.append(dis(gen), dis(gen)); }); mm.index(num_threads); uint64_t i = 0; uint64_t key_count = 0; uint64_t value_count = 0; uint64_t unique_value_count = 0; bool first = true; uint64_t last = 0; mm.for_each_pair([&](const uint64_t& a, const uint64_t& b) { if (first || a > last) { ++key_count; last = a; first = false; mm.for_unique_values_of(a, [&](const uint64_t& v) { ++unique_value_count; }); } ++value_count; }); std::cerr << key_count << " keys" << std::endl; std::cerr << value_count << " values" << std::endl; std::cerr << unique_value_count << " unique pairs (these can't be found with unpadded mmmultimaps)" << std::endl; } else { std::random_device rd; //Will be used to obtain a seed for the random number engine std::mt19937 gen(rd()); //Standard mersenne_twister_engine seeded with rd() uint64_t max_key = args::get(max_val); std::uniform_int_distribution dis(1, max_key); //std::vector x; x.reserve(1e8); std::remove(args::get(test_file).c_str()); mmmulti::map mm(args::get(test_file)); uint64_t x_len = args::get(test_size); mm.open_writer(); paryfor::parallel_for( 0, x_len, num_threads, 1000, [&](uint64_t n) { mm.append(dis(gen), dis(gen)); }); mm.index(num_threads, max_key); uint64_t i = 0; uint64_t key_count = 0; uint64_t value_count = 0; uint64_t 
unique_value_count = 0; bool first = true; uint64_t last = 0; mm.for_each_pair([&](const uint64_t& a, const uint64_t& b) { if (first || a > last) { ++key_count; last = a; first = false; mm.for_unique_values_of(a, [&](const uint64_t& v) { ++unique_value_count; }); } ++value_count; }); // exercise unique value search uint64_t unique_value_test_count = args::get(unique_value_tests); auto start = std::chrono::system_clock::now(); uint64_t x = 0; for (uint64_t i = 0; i < unique_value_test_count; ++i) { uint64_t q = dis(gen); mm.for_unique_values_of(q, [&](const uint64_t& v) { ++x; }); } auto end = std::chrono::system_clock::now(); std::chrono::duration elapsed_seconds = end-start; std::cerr << key_count << " keys" << std::endl; std::cerr << value_count << " values" << std::endl; std::cerr << unique_value_count << " unique pairs" << std::endl; if (unique_value_test_count) std::cerr << elapsed_seconds.count()/unique_value_test_count << "s per unique value call" << std::endl; } } else if (!args::get(test_file).empty() && args::get(test_multiset)) { std::random_device rd; //Will be used to obtain a seed for the random number engine std::mt19937 gen(rd()); //Standard mersenne_twister_engine seeded with rd() uint64_t max_value = args::get(max_val); std::uniform_int_distribution dis(1, max_value); //std::vector x; x.reserve(1e8); std::remove(args::get(test_file).c_str()); mmmulti::set ms(args::get(test_file)); uint64_t x_len = args::get(test_size); ms.open_writer(); paryfor::parallel_for( 0, x_len, num_threads, 1000, [&](uint64_t n) { //Use dis to transform the random unsigned int generated by gen into an int in the range ms.append(dis(gen)); }); ms.index(num_threads); uint64_t i = 0; uint64_t value_count = 0; uint64_t unique_value_count = 0; uint64_t sum1 = 0; // exercise unique value search ms.for_each_value_count([&](const uint64_t& value, const uint64_t& count) { ++unique_value_count; value_count += count; sum1 += count * value; }); uint64_t second_count = 0; uint64_t sum2 
= 0; for (auto v = ms.begin(); v != ms.end(); ++v) { ++second_count; sum2 += *v; } std::cerr << value_count << " values, expected " << x_len << std::endl; std::cerr << unique_value_count << " unique pairs" << std::endl; std::cerr << "sums " << sum1 << " " << sum2 << std::endl; } else if (!args::get(test_file).empty() && args::get(test_iitree)) { //} else if (args::get(test_iitree)) { std::remove(args::get(test_file).c_str()); mmmulti::iitree tree(args::get(test_file)); std::random_device rd; //Will be used to obtain a seed for the random number engine std::mt19937 gen(rd()); //Standard mersenne_twister_engine seeded with rd() uint64_t max_value = args::get(max_val); std::uniform_int_distribution dis(0, max_value); uint64_t x_len = args::get(test_size); tree.open_writer(); paryfor::parallel_for( 0, x_len, num_threads, 1000, [&](uint64_t n) { uint64_t q = dis(gen); uint64_t r = std::min(q + 100, max_value); tree.add(q, r, dis(gen)); }); tree.index(num_threads); for (int n=0; n n || end < n) { std::cerr << "tree broken at " << n << std::endl; } }); } std::cerr << std::endl; /* tree.for_each_entry([&](const mmmulti::iitree::Interval& ival) { std::cerr << ival.st << ".." << ival.en << " " << ival.data << std::endl; }); */ //std::cerr << std::endl; } return 0; } mmmulti-0.1/src/mmiitree.hpp000066400000000000000000000407271365204574100161720ustar00rootroot00000000000000#pragma once #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "ips4o.hpp" #include "atomic_queue.h" /* Suppose there are N=2^(K+1)-1 sorted numbers in an array a[]. They * implicitly form a complete binary tree of height K+1. We consider leaves to * be at level 0. The binary tree has the following properties: * * 1. The lowest k-1 bits of nodes at level k are all 1. The k-th bit is 0. * The first node at level k is indexed by 2^k-1. The root of the tree is * indexed by 2^K-1. * * 2. 
For a node x at level k, its left child is x-2^(k-1) and the right child * is x+2^(k-1). * * 3. For a node x at level k, it is a left child if its (k+1)-th bit is 0. Its * parent node is x+2^k. Similarly, if the (k+1)-th bit is 1, x is a right * child and its parent is x-2^k. * * 4. For a node x at level k, there are 2^(k+1)-1 nodes in the subtree * descending from x, including x. The left-most leaf is x&~(2^k-1) (masking * the lowest k bits to 0). * * When numbers can't fill a complete binary tree, the parent of a node may not * be present in the array. The implementation here still mimics a complete * tree, though getting the special casing right is a little complex. There may * be alternative solutions. * * As a sorted array can be considered as a binary search tree, we can * implement an interval tree on top of the idea. We only need to record, for * each node, the maximum value in the subtree descending from the node. * * This implementation allows the interval array to be stored in a memory * mapped file on disk. I've got a real lot of intervals to process. 
-EG */ namespace mmmulti { template // "S" is a scalar type; "T" is the type of data associated with each interval class iitree { struct StackCell { size_t x; // node int k, w; // k: level; w: 0 if left child hasn't been processed StackCell() {}; StackCell(int k_, size_t x_, int w_) : x(x_), k(k_), w(w_) {}; }; public: struct Interval { S st, en, max; T data; }; // note that we have to set max to end initially Interval make_interval(const S &s, const S &e, const T &d) { return {s, e, e, d}; } private: struct IntervalLess { bool operator()(const Interval &a, const Interval &b) const { return a.st < b.st; } }; // memory mapped buffer struct struct mmap_buffer_t { int fd; off_t size; void *data; }; // utilities used by mmmultimap int open_mmap_buffer(const char* path, mmap_buffer_t* buffer) { buffer->data = nullptr; buffer->fd = open(path, O_RDWR); if (buffer->fd == -1) { goto error; } struct stat stats; if (-1 == fstat(buffer->fd, &stats)) { goto error; } if (!(buffer->data = mmap(nullptr, stats.st_size, PROT_READ | PROT_WRITE, MAP_SHARED, buffer->fd, 0 ))) { goto error; } madvise(buffer, stats.st_size, POSIX_MADV_WILLNEED | POSIX_MADV_SEQUENTIAL); buffer->size = stats.st_size; return 0; error: perror(path); if (buffer->data) munmap(buffer->data, stats.st_size); if (buffer->fd != -1) close(buffer->fd); buffer->data = 0; buffer->fd = 0; return -1; } void close_mmap_buffer(mmap_buffer_t* buffer) { if (buffer->data) { munmap(buffer->data, buffer->size); buffer->data = 0; buffer->size = 0; } if (buffer->fd) { close(buffer->fd); buffer->fd = 0; } } std::ofstream writer; char* reader = nullptr; int reader_fd = 0; std::string filename; std::string index_filename; bool sorted = false; // key information uint64_t n_records = 0; bool indexed = false; std::thread writer_thread; // = nullptr; atomic_queue::AtomicQueue2* interval_queue = nullptr; std::atomic work_todo; //std::vector a; uint64_t max_level = 0; uint64_t index_core(Interval* a, size_t a_size) { size_t i, last_i; // 
last_i points to the rightmost node in the tree S last; // last is the max value at node last_i int64_t k; if (a_size == 0) return -1; for (i = 0; i < a_size; i += 2) last_i = i, last = a[i].max = a[i].en; // leaves (i.e. at level 0) for (k = 1; ((int64_t)1)< el? e : el; e = e > er? e : er; a[i].max = e; // set the max value for node i } last_i = last_i>>k&1? last_i - x : last_i + x; // last_i now points to the parent of the original last_i if (last_i < a_size && a[last_i].max > last) // update last accordingly last = a[last_i].max; } return k - 1; } public: // forward declaration for iterator types class iterator; class const_iterator; // constructor iitree(void) { } iitree(const std::string& f) : filename(f) { } ~iitree(void) { close_writer(); close_reader(); } void set_base_filename(const std::string& f) { filename = f; } void writer_func(void) { Interval ival; while (work_todo.load() || !interval_queue->was_empty()) { if (interval_queue->try_pop(ival)) { do { writer.write((char*)&ival, sizeof(Interval)); } while (interval_queue->try_pop(ival)); } else { std::this_thread::sleep_for(std::chrono::nanoseconds(1)); } } writer.close(); } // close/open backing file void open_writer(void) { if (writer.is_open()) { writer.seekp(0, std::ios_base::end); // seek to the end for appending return; } assert(!filename.empty()); // remove the file; we only call this when making an index, and it's done once writer.open(filename.c_str(), std::ios::binary | std::ios::trunc); if (writer.fail()) { throw std::ios_base::failure(std::strerror(errno)); } interval_queue = new atomic_queue::AtomicQueue2; work_todo.store(true); writer_thread = std::thread(&iitree::writer_func, this); } void close_writer(void) { if (work_todo.load()) { work_todo.store(false); std::this_thread::sleep_for(std::chrono::milliseconds(1)); if (writer_thread.joinable()) { writer_thread.join(); } delete interval_queue; interval_queue = nullptr; } } void open_reader(void) { if (reader_fd) return; //open 
assert(!filename.empty()); // open in binary mode as we are reading from this interface reader_fd = open(filename.c_str(), O_RDWR); if (reader_fd == -1) { assert(false); } struct stat stats; if (-1 == fstat(reader_fd, &stats)) { assert(false); } if (!(reader = (char*) mmap(NULL, stats.st_size, PROT_READ | PROT_WRITE, MAP_SHARED, reader_fd, 0))) { assert(false); } madvise((void*)reader, stats.st_size, POSIX_MADV_WILLNEED | POSIX_MADV_SEQUENTIAL); } void close_reader(void) { if (reader) { size_t c = record_count(); munmap(reader, c); reader = 0; } if (reader_fd) { close(reader_fd); reader_fd = 0; } } /// write the pair to end of backing file /// return the number of records, which will only work after indexing size_t size(void) const { return n_records; } /// iterator to first value iterator begin(void) { return iterator((Interval*)reader); } /// iterator to one past end iterator end(void) { return iterator(((Interval*)reader)+n_records); } /// const iterator to first value const_iterator begin(void) const { return const_iterator((Interval*)reader); } /// const iterator to one past end const_iterator end(void) const { return const_iterator(((Interval*)reader)+n_records); } /// return the backing buffer char* get_buffer(void) const { return reader; } /// get the record count size_t record_count(void) { int fd = open(filename.c_str(), O_RDWR); if (fd == -1) { assert(false); } struct stat stats; if (-1 == fstat(fd, &stats)) { assert(false); } assert(stats.st_size % sizeof(Interval) == 0); // must be even records size_t count = stats.st_size / sizeof(Interval); return count; } std::ifstream::pos_type filesize(const char* filename) { std::ifstream in(filename, std::ifstream::ate | std::ifstream::binary); return in.tellg(); } /// sort the record in the backing file by key void sort(int num_threads) { close_writer(); close_reader(); if (sorted) return; //std::cerr << "sorting!" 
<< std::endl; mmap_buffer_t buffer; open_mmap_buffer(filename.c_str(), &buffer); uint64_t data_len = buffer.size/sizeof(Interval); // sort in parallel (uses OpenMP if available, std::thread otherwise) ips4o::parallel::sort((Interval*)buffer.data, ((Interval*)buffer.data)+data_len, IntervalLess(), num_threads); close_mmap_buffer(&buffer); sorted = true; } Interval read_value(size_t i) const { Interval v; memcpy(&v, &reader[i*sizeof(Interval)], sizeof(Interval)); return v; } void for_each_entry(const std::function& lambda) const { for (size_t i = 0; i < n_records; ++i) { lambda(read_value(i)); } } void for_each_entry_count(const std::function& lambda) const { assert(sorted); uint64_t curr_count = 0; bool first = true; Interval last; for_each_value([&](const Interval& v) { if (first) { last = v; first = false; } else if (v != last) { lambda(last, curr_count); curr_count = 0; last = v; } ++curr_count; }); lambda(last, curr_count); } /// a local reimplementation of a pointer iterator class iterator { public: iterator(Interval* ptr) : ptr(ptr) {} iterator() : ptr(nullptr) {} iterator(const iitree::iterator& other) : ptr(other.ptr) {} iterator& operator=(const iitree::iterator& other) { ptr = other.ptr; } Interval& operator*() { return *ptr; } bool operator==(const iitree::iterator& other) { return ptr == other.ptr; } bool operator!=(const iitree::iterator& other) { return !(*this == other); } iterator& operator++() { ++ptr; return *this; } iterator operator++(int) { return iterator(ptr++); } iterator& operator--() { --ptr; return *this; } iterator operator--(int) { return iterator(ptr--); } private: Interval* ptr; friend class const_iterator; }; /// a local reimplementation of a const pointer iterator class const_iterator { public: const_iterator(const Interval* ptr) : ptr(ptr) {} const_iterator() : ptr(nullptr) {} const_iterator(const iitree::const_iterator& other) : ptr(other.ptr) {} const_iterator& operator=(const iitree::const_iterator& other) { ptr = other.ptr; } 
const_iterator(const iitree::iterator& other) : ptr(other.ptr) {} const_iterator& operator=(const iitree::iterator& other) { ptr = other.ptr; } const Interval& operator*() { return *ptr; } bool operator==(const iitree::const_iterator& other) { return ptr == other.ptr; } bool operator!=(const iitree::const_iterator& other) { return !(*this == other); } const_iterator& operator++() { ++ptr; return *this; } const_iterator operator++(int) { return const_iterator(ptr++); } const_iterator& operator--() { --ptr; return *this; } const_iterator operator--(int) { return const_iterator(ptr--); } private: const Interval* ptr; }; Interval* get_array(void) const { return (Interval*)reader; } public: /// write into our write buffer /// open_writer() must be called first to set up our buffer and writer void add(const S &s, const S &e, const T &d) { interval_queue->push(make_interval(s, e, d)); } void index(int num_threads) { sort(num_threads); open_reader(); n_records = record_count(); indexed = true; close_reader(); open_reader(); max_level = index_core(get_array(), n_records); } /// get overlaps, callback takes index, start, end, data void overlap(const S &st, const S &en, const std::function& func) const { int64_t t = 0; StackCell stack[64]; Interval* a = get_array(); stack[t++] = StackCell(max_level, (1LL<> z.k << z.k, i1 = i0 + (1LL<<(z.k+1)) - 1; if (i1 >= n_records) i1 = n_records; for (i = i0; i < i1 && a[i].st < en; ++i) if (st < a[i].en) // if overlap, append to out[] func(i, a[i].st, a[i].en, a[i].data); } else if (z.w == 0) { // if left child not processed size_t y = z.x - (1LL<<(z.k-1)); // the left child of z.x; NB: y may be out of range (i.e. 
y>=n_records) stack[t++] = StackCell(z.k, z.x, 1); // re-add node z.x, but mark the left child having been processed if (y >= n_records || a[y].max > st) // push the left child if y is out of range or may overlap with the query stack[t++] = StackCell(z.k - 1, y, 0); } else if (z.x < n_records && a[z.x].st < en) { // need to push the right child if (st < a[z.x].en) func(z.x, a[z.x].st, a[z.x].en, a[z.x].data); // test if z.x overlaps the query; if yes, append to out[] stack[t++] = StackCell(z.k - 1, z.x + (1LL<<(z.k-1)), 0); // push the right child } } } /// callback takes only start, end, and data void overlap(const S &st, const S &en, const std::function& func) const { overlap( st, en, [&func](const size_t& idx, const S& start, const S& end, const T& data) { func(start, end, data); }); } //size_t size(void) const { return a.size(); } const S &start(size_t i) const { return get_array()[i].st; } const S &end(size_t i) const { return get_array()[i].en; } const T &data(size_t i) const { return get_array()[i].data; } }; } mmmulti-0.1/src/mmmultimap.hpp000066400000000000000000000347101365204574100165340ustar00rootroot00000000000000#pragma once #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "sdsl/bit_vectors.hpp" #include "ips4o.hpp" #include "atomic_queue.h" namespace mmmulti { /* 'mmmulti::map' is a disk-backed multimap where keys and values are stored in a binary file. The key space is assumed to be numeric, but values may be of arbitrary size. To build the multimap we first append key/value pairs. To query the multimap we must first index it. We first sort by key using bsort. Then we pad the key space so that we have one entry per integer in the range [0, max(keys)], sorting again to put the padding pairs in the right positions. 
We record the key space by marking a bitvector of length equal to max(keys) with 1 at those positions corresponding to the first record of each key in the sorted array. We compress this bitvector and build select supports on it We are now able to traverse the sorted array using select queries on this bitvector. */ template class map { private: // memory mapped buffer struct struct mmap_buffer_t { int fd; off_t size; void *data; }; // utilities used by mmmultimap int open_mmap_buffer(const char* path, mmap_buffer_t* buffer) { buffer->data = nullptr; buffer->fd = open(path, O_RDWR); if (buffer->fd == -1) { goto error; } struct stat stats; if (-1 == fstat(buffer->fd, &stats)) { goto error; } if (!(buffer->data = mmap(nullptr, stats.st_size, PROT_READ | PROT_WRITE, MAP_SHARED, buffer->fd, 0 ))) { goto error; } madvise(buffer, stats.st_size, POSIX_MADV_WILLNEED | POSIX_MADV_SEQUENTIAL); buffer->size = stats.st_size; return 0; error: perror(path); if (buffer->data) munmap(buffer->data, stats.st_size); if (buffer->fd != -1) close(buffer->fd); buffer->data = 0; buffer->fd = 0; return -1; } void close_mmap_buffer(mmap_buffer_t* buffer) { if (buffer->data) { munmap(buffer->data, buffer->size); buffer->data = 0; buffer->size = 0; } if (buffer->fd) { close(buffer->fd); buffer->fd = 0; } } typedef struct { Key key; Value value; } Entry; struct EntryLess { bool operator()(const std::pair& a, const std::pair& b) const { return a < b; } }; std::ofstream writer; std::vector writers; char* reader = nullptr; int reader_fd = 0; std::string filename; std::string index_filename; bool sorted = false; bool padded = false; size_t record_size = 0; // key information Key max_key = 0; uint64_t n_records = 0; // null key and value Key nullkey; Value nullvalue; // compressed bitvector marked at key starts sdsl::sd_vector<> key_cbv; // select support for the key cbv sdsl::sd_vector<>::select_1_type key_cbv_select; bool indexed = false; uint32_t OUTPUT_VERSION = 1; // update as we change our 
format std::thread writer_thread; atomic_queue::AtomicQueue2* entry_queue = nullptr; std::atomic work_todo; void init(void) { record_size = sizeof(Key) + sizeof(Value); nullkey = 0; for (size_t i = 0; i < sizeof(Value); ++i) { ((uint8_t*)&nullvalue)[i] = 0; } } public: // forward declaration for iterator types class iterator; class const_iterator; // constructor map(void) { init(); } map(const std::string& f) : filename(f) { init(); } ~map(void) { close_writer(); close_reader(); } void set_base_filename(const std::string& f) { filename = f; index_filename = filename+".idx"; } // load from base file name void load(const std::string& f) { open_reader(); set_base_filename(f); std::ifstream in(index_filename.c_str()); std::string magic; in.read((char*)magic.c_str(), 9); uint32_t version; in.read((char*) &version, sizeof(version)); assert(version == OUTPUT_VERSION); size_t record_size_in_bytes; sdsl::read_member(record_size_in_bytes, in); assert(record_size_in_bytes == record_size); sdsl::read_member(n_records, in); assert(n_records == record_count()); sdsl::read_member(max_key, in); assert(max_key == nth_key(n_records)); key_cbv.load(in); key_cbv_select.load(in); } // save indexes size_t save(sdsl::structure_tree_node* s = NULL, std::string name = "") { assert(max_key && indexed); sdsl::structure_tree_node* child = sdsl::structure_tree::add_child(s, name, sdsl::util::class_name(*this)); // open the sdsl index std::ofstream out(index_filename.c_str()); size_t written = 0; out << "mmmultimap"; written += 9; uint32_t version_buffer = OUTPUT_VERSION; out.write((char*) &version_buffer, sizeof(version_buffer)); written += sdsl::write_member(record_size, out, child, "record_size"); written += sdsl::write_member(record_count(), out, child, "record_count"); written += sdsl::write_member(max_key, out, child, "max_key"); written += key_cbv.serialize(out, child, "key_cbv"); written += key_cbv_select.serialize(out, child, "key_cbv_select"); out.close(); return written; } void 
writer_func(void) { Entry entry; while (work_todo.load() || !entry_queue->was_empty()) { if (entry_queue->try_pop(entry)) { do { writer.write((char*)&entry, sizeof(Entry)); } while (entry_queue->try_pop(entry)); } else { std::this_thread::sleep_for(std::chrono::nanoseconds(1)); } } writer.close(); } // close/open backing file void open_writer(void) { if (writer.is_open()) { writer.seekp(0, std::ios_base::end); // seek to the end for appending return; } assert(!filename.empty()); // remove the file; we only call this when making an index, and it's done once writer.open(filename.c_str(), std::ios::binary | std::ios::trunc); if (writer.fail()) { throw std::ios_base::failure(std::strerror(errno)); } entry_queue = new atomic_queue::AtomicQueue2; work_todo.store(true); writer_thread = std::thread(&map::writer_func, this); } void close_writer(void) { if (work_todo.load()) { work_todo.store(false); std::this_thread::sleep_for(std::chrono::milliseconds(1)); if (writer_thread.joinable()) { writer_thread.join(); } delete entry_queue; entry_queue = nullptr; } } void open_reader(void) { if (reader_fd) return; //open assert(!filename.empty()); // open in binary mode as we are reading from this interface reader_fd = open(filename.c_str(), O_RDWR); if (reader_fd == -1) { assert(false); } struct stat stats; if (-1 == fstat(reader_fd, &stats)) { assert(false); } if (!(reader = (char*) mmap(NULL, stats.st_size, PROT_READ | PROT_WRITE, MAP_SHARED, reader_fd, 0))) { assert(false); } madvise((void*)reader, stats.st_size, POSIX_MADV_WILLNEED | POSIX_MADV_SEQUENTIAL); } void close_reader(void) { if (reader) { size_t c = record_count(); munmap(reader, c); reader = 0; } if (reader_fd) { close(reader_fd); reader_fd = 0; } } /// write the pair to the backing file /// open_writer() must be called first to set up our buffer and writer void append(const Key& k, const Value& v) { entry_queue->push((Entry){k, v}); } /// return the number of records, which will only work after indexing size_t 
size(void) const { return n_records; } /// return the size of each combined record size_t get_record_size(void) const { return record_size; } /// return the backing buffer char* get_buffer(void) const { return reader; } /// get the record count size_t record_count(void) { int fd = open(filename.c_str(), O_RDWR); if (fd == -1) { assert(false); } struct stat stats; if (-1 == fstat(fd, &stats)) { assert(false); } assert(stats.st_size % record_size == 0); // must be even records size_t count = stats.st_size / record_size; close(fd); return count; } std::ifstream::pos_type filesize(const char* filename) { std::ifstream in(filename, std::ifstream::ate | std::ifstream::binary); return in.tellg(); } /// sort the record in the backing file by key void sort(int num_threads) { if (sorted) return; //std::cerr << "sorting!" << std::endl; mmap_buffer_t buffer; open_mmap_buffer(filename.c_str(), &buffer); uint64_t data_len = buffer.size/record_size; // sort in parallel (uses OpenMP if available, std::thread otherwise) ips4o::parallel::sort((std::pair*)buffer.data, ((std::pair*)buffer.data)+data_len, EntryLess(), num_threads); close_mmap_buffer(&buffer); sorted = true; } Entry read_entry(size_t i) const { Entry e; memcpy(&e, &reader[i*record_size], sizeof(Entry)); return e; } // pad our key space with empty records so that we can query it directly with select operations void padsort(int num_threads) { close_reader(); // blindly fill with a single key/value pair for each entity in the key space // running this in parallel causes a strange race condition and segfaults at high thread counts // and it does not provide any performance benefit for (size_t i = 1; i <= max_key; ++i) { append(i, nullvalue); } close_writer(); sort(num_threads); padded = true; } void simplesort(int num_threads) { close_reader(); close_writer(); sort(num_threads); padded = false; } // index void index(int num_threads, Key new_max = 0) { if (new_max) { max_key = new_max; padsort(num_threads); } else { 
simplesort(num_threads); } open_reader(); n_records = record_count(); if (padded) { sdsl::bit_vector key_bv(n_records+1); // record the key starts Key last = nullkey, curr = nullkey; Value val = nullvalue; Entry entry; //reader.read((char*)&last, sizeof(Key)); for (size_t i = 0; i < n_records; ++i) { entry = read_entry(i); curr = entry.key; val = entry.value; if (curr != last) { key_bv[i] = 1; } last = curr; } // the last key in the sort is our max key max_key = nth_key(n_records-1); key_bv[n_records] = 1; // sentinel // build the compressed bitvector sdsl::util::assign(key_cbv, sdsl::sd_vector<>(key_bv)); key_bv.resize(0); // memory could be tight // build the select supports on the key bitvector sdsl::util::assign(key_cbv_select, sdsl::sd_vector<>::select_1_type(&key_cbv)); } indexed = true; close_reader(); open_reader(); } Key nth_key(size_t n) const { Entry e = read_entry(n); return e.key; } Value nth_value(size_t n) const { Entry e = read_entry(n); return e.value; } void for_each_pair(const std::function& lambda) const { Entry entry; for (size_t i = 0; i < n_records; ++i) { entry = read_entry(i); if (!padded || !is_null(entry.value)) { lambda(entry.key, entry.value); } } } std::vector values(const Key& key) const { std::vector values; for_values_of(key, [&values](const Value& v) { values.push_back(v); }); return values; } std::vector unique_values(const Key& key) const { std::vector values; for_unique_values_of(key, [&values](const Value& v) { values.push_back(v); }); return values; } void for_unique_values_of(const Key& key, const std::function& lambda) const { // quirk: if we've sorted by the whole binary record, // then we can do a simple 'uniq' operation to get the unique values Value last = nullvalue; for_values_of(key, [this,&lambda,&last](const Value& value) { if (value != last) { lambda(value); last = value; } }); } bool is_null(const Value& value) const { for (size_t i = 0; i < sizeof(Value); ++i) { if (((uint8_t*)&value)[i] != 0) { return false; } } 
return true; } void for_values_of(const Key& key, const std::function& lambda) const { if (!padded || key == 0 || key > max_key) { return; } size_t i = key_cbv_select(key); for ( ; i < n_records; ++i) { Entry entry = read_entry(i); if (entry.key != key) break; if (!is_null(entry.value)) { lambda(entry.value); } } } }; } mmmulti-0.1/src/mmmultiset.hpp000066400000000000000000000275641365204574100165630ustar00rootroot00000000000000#pragma once #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "sdsl/bit_vectors.hpp" #include "ips4o.hpp" #include "atomic_queue.h" namespace mmmulti { /* 'mmmulti::set' is a disk-backed multiset values are stored in a binary file. The key space is assumed to be numeric, but values may be of arbitrary size. To build the multiset we first append values. We then sort the values. We can now iterate over the unique values of the multiset, such as to obtain their counts. */ template class set { private: // memory mapped buffer struct struct mmap_buffer_t { int fd; off_t size; void *data; }; // utilities used by mmmultimap int open_mmap_buffer(const char* path, mmap_buffer_t* buffer) { buffer->data = nullptr; buffer->fd = open(path, O_RDWR); if (buffer->fd == -1) { goto error; } struct stat stats; if (-1 == fstat(buffer->fd, &stats)) { goto error; } if (!(buffer->data = mmap(nullptr, stats.st_size, PROT_READ | PROT_WRITE, MAP_SHARED, buffer->fd, 0 ))) { goto error; } madvise(buffer, stats.st_size, POSIX_MADV_WILLNEED | POSIX_MADV_SEQUENTIAL); buffer->size = stats.st_size; return 0; error: perror(path); if (buffer->data) munmap(buffer->data, stats.st_size); if (buffer->fd != -1) close(buffer->fd); buffer->data = 0; buffer->fd = 0; return -1; } void close_mmap_buffer(mmap_buffer_t* buffer) { if (buffer->data) { munmap(buffer->data, buffer->size); buffer->data = 0; buffer->size = 0; } if (buffer->fd) { close(buffer->fd); buffer->fd = 0; } } struct ValueLess { bool 
operator()(const Value &a, const Value &b) const { return a < b; } }; std::ofstream writer; char* reader = nullptr; int reader_fd = 0; std::string filename; std::string index_filename; bool sorted = false; // key information uint64_t n_records = 0; bool indexed = false; std::thread writer_thread; atomic_queue::AtomicQueue2* value_queue = nullptr; std::atomic work_todo; public: // forward declaration for iterator types class iterator; class const_iterator; // constructor set(void) { } set(const std::string& f) : filename(f) { } ~set(void) { close_writer(); close_reader(); } void set_base_filename(const std::string& f) { filename = f; } void writer_func(void) { Value value; while (work_todo.load() || !value_queue->was_empty()) { if (value_queue->try_pop(value)) { do { writer.write((char*)&value, sizeof(Value)); } while (value_queue->try_pop(value)); } else { std::this_thread::sleep_for(std::chrono::nanoseconds(1)); } } writer.close(); } // close/open backing file void open_writer(void) { if (writer.is_open()) { writer.seekp(0, std::ios_base::end); // seek to the end for appending return; } assert(!filename.empty()); // remove the file; we only call this when making an index, and it's done once writer.open(filename.c_str(), std::ios::binary | std::ios::trunc); if (writer.fail()) { throw std::ios_base::failure(std::strerror(errno)); } value_queue = new atomic_queue::AtomicQueue2; work_todo.store(true); writer_thread = std::thread(&set::writer_func, this); } void close_writer(void) { if (work_todo.load()) { work_todo.store(false); std::this_thread::sleep_for(std::chrono::milliseconds(1)); if (writer_thread.joinable()) { writer_thread.join(); } delete value_queue; value_queue = nullptr; } } void open_reader(void) { if (reader_fd) return; //open assert(!filename.empty()); // open in binary mode as we are reading from this interface reader_fd = open(filename.c_str(), O_RDWR); if (reader_fd == -1) { assert(false); } struct stat stats; if (-1 == fstat(reader_fd, &stats)) { 
assert(false); } if (!(reader = (char*) mmap(NULL, stats.st_size, PROT_READ | PROT_WRITE, MAP_SHARED, reader_fd, 0))) { assert(false); } madvise((void*)reader, stats.st_size, POSIX_MADV_WILLNEED | POSIX_MADV_SEQUENTIAL); } void close_reader(void) { if (reader) { size_t c = record_count(); munmap(reader, c); reader = 0; } if (reader_fd) { close(reader_fd); reader_fd = 0; } } /// write the pair to end of backing file /// open_writer() must be called first to set up our buffer and writer void append(const Value& v) { value_queue->push(v); } /// return the number of records, which will only work after indexing size_t size(void) const { return n_records; } /// iterator to first value iterator begin(void) { return iterator((Value*)reader); } /// iterator to one past end iterator end(void) { return iterator(((Value*)reader)+n_records); } /// const iterator to first value const_iterator begin(void) const { return const_iterator((Value*)reader); } /// const iterator to one past end const_iterator end(void) const { return const_iterator(((Value*)reader)+n_records); } /// return the size of each combined record size_t get_record_size(void) const { return sizeof(Value); } /// return the backing buffer char* get_buffer(void) const { return reader; } /// get the record count size_t record_count(void) { int fd = open(filename.c_str(), O_RDWR); if (fd == -1) { assert(false); } struct stat stats; if (-1 == fstat(fd, &stats)) { assert(false); } assert(stats.st_size % get_record_size() == 0); // must be even records size_t count = stats.st_size / sizeof(Value); close(fd); return count; } std::ifstream::pos_type filesize(const char* filename) { std::ifstream in(filename, std::ifstream::ate | std::ifstream::binary); return in.tellg(); } /// sort the record in the backing file by key void sort(int num_threads) { close_writer(); close_reader(); if (sorted) return; //std::cerr << "sorting!" 
<< std::endl; mmap_buffer_t buffer; open_mmap_buffer(filename.c_str(), &buffer); uint64_t data_len = buffer.size/sizeof(Value); // sort in parallel (uses OpenMP if available, std::thread otherwise) ips4o::parallel::sort((Value*)buffer.data, ((Value*)buffer.data)+data_len, ValueLess(), num_threads); close_mmap_buffer(&buffer); sorted = true; } Value read_value(size_t i) const { Value v; memcpy(&v, &reader[i*sizeof(Value)], sizeof(Value)); return v; } // index void index(int num_threads) { sort(num_threads); open_reader(); n_records = record_count(); indexed = true; close_reader(); open_reader(); } void for_each_value(const std::function& lambda) const { for (size_t i = 0; i < n_records; ++i) { lambda(read_value(i)); } } void for_each_value_count(const std::function& lambda) const { assert(sorted); uint64_t curr_count = 0; bool first = true; Value last; for_each_value([&](const Value& v) { if (first) { last = v; first = false; } else if (v != last) { lambda(last, curr_count); curr_count = 0; last = v; } ++curr_count; }); lambda(last, curr_count); } void for_each_unique_value(const std::function& lambda) const { assert(sorted); bool first = true; Value last; for_each_value([&](const Value& v) { if (first) { last = v; first = false; } else if (v != last) { lambda(last); last = v; } }); lambda(last); } /// a local reimplementation of a pointer iterator class iterator { public: iterator(Value* ptr) : ptr(ptr) {} iterator() : ptr(nullptr) {} iterator(const set::iterator& other) : ptr(other.ptr) {} iterator& operator=(const set::iterator& other) { ptr = other.ptr; } Value& operator*() { return *ptr; } bool operator==(const set::iterator& other) { return ptr == other.ptr; } bool operator!=(const set::iterator& other) { return !(*this == other); } iterator& operator++() { ++ptr; return *this; } iterator operator++(int) { return iterator(ptr++); } iterator& operator--() { --ptr; return *this; } iterator operator--(int) { return iterator(ptr--); } private: Value* ptr; friend 
class const_iterator; }; /// a local reimplementation of a const pointer iterator class const_iterator { public: const_iterator(const Value* ptr) : ptr(ptr) {} const_iterator() : ptr(nullptr) {} const_iterator(const set::const_iterator& other) : ptr(other.ptr) {} const_iterator& operator=(const set::const_iterator& other) { ptr = other.ptr; } const_iterator(const set::iterator& other) : ptr(other.ptr) {} const_iterator& operator=(const set::iterator& other) { ptr = other.ptr; } const Value& operator*() { return *ptr; } bool operator==(const set::const_iterator& other) { return ptr == other.ptr; } bool operator!=(const set::const_iterator& other) { return !(*this == other); } const_iterator& operator++() { ++ptr; return *this; } const_iterator operator++(int) { return const_iterator(ptr++); } const_iterator& operator--() { --ptr; return *this; } const_iterator operator--(int) { return const_iterator(ptr--); } private: const Value* ptr; }; }; }