pax_global_header00006660000000000000000000000064133435306040014513gustar00rootroot0000000000000052 comment=684c355a6c56208c3d9e0be7990c4923b7ed8c50 libzim-4.0.4/000077500000000000000000000000001334353060400130065ustar00rootroot00000000000000libzim-4.0.4/.gitignore000066400000000000000000000005141334353060400147760ustar00rootroot00000000000000*~ *#* autom4te.cache build compile config.h configure depcomp .deps .dirstamp INSTALL install-sh *.kate-swp *.la .libs libtool *.lo ltmain.sh *.m4 Makefile Makefile.in missing *.o stamp-h1 .svn .*.swp *.zim examples/createZimExample src/tools/zimdump src/tools/zimsearch libzim.pc test-driver test/zimlib-test* test/test-suite.log libzim-4.0.4/.travis.yml000066400000000000000000000013421334353060400151170ustar00rootroot00000000000000language: cpp dist: trusty sudo: required cache: ccache before_install: - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then eval "${MATRIX_EVAL}"; fi - PATH=$PATH:$HOME/bin install: travis/install_deps.sh script: travis/compile.sh env: matrix: - PLATFORM="native_static" - PLATFORM="native_dyn" - PLATFORM="win32_static" - PLATFORM="win32_dyn" - PLATFORM="android_arm" - PLATFORM="android_arm64" addons: apt: packages: - cmake - python3-pip - libbz2-dev - ccache - zlib1g-dev - uuid-dev - cython3 - g++-mingw-w64-i686 - gcc-mingw-w64-i686 - gcc-mingw-w64-base - mingw-w64-tools matrix: include: - env: PLATFORM="native_dyn" os: osx libzim-4.0.4/AUTHORS000066400000000000000000000000431334353060400140530ustar00rootroot00000000000000Tommi Maekitalo libzim-4.0.4/COPYING000066400000000000000000000354341334353060400140520ustar00rootroot00000000000000 GNU GENERAL PUBLIC LICENSE Version 2, June 1991 Copyright (C) 1989, 1991 Free Software Foundation, Inc. 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. 
Preamble The licenses for most software are designed to take away your freedom to share and change it. By contrast, the GNU General Public License is intended to guarantee your freedom to share and change free software--to make sure the software is free for all its users. This General Public License applies to most of the Free Software Foundation's software and to any other program whose authors commit to using it. (Some other Free Software Foundation software is covered by the GNU Library General Public License instead.) You can apply it to your programs, too. When we speak of free software, we are referring to freedom, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for this service if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs; and that you know you can do these things. To protect your rights, we need to make restrictions that forbid anyone to deny you these rights or to ask you to surrender the rights. These restrictions translate to certain responsibilities for you if you distribute copies of the software, or if you modify it. For example, if you distribute copies of such a program, whether gratis or for a fee, you must give the recipients all the rights that you have. You must make sure that they, too, receive or can get the source code. And you must show them these terms so they know their rights. We protect your rights with two steps: (1) copyright the software, and (2) offer you this license which gives you legal permission to copy, distribute and/or modify the software. Also, for each author's protection and ours, we want to make certain that everyone understands that there is no warranty for this free software. 
If the software is modified by someone else and passed on, we want its recipients to know that what they have is not the original, so that any problems introduced by others will not reflect on the original authors' reputations. Finally, any free program is threatened constantly by software patents. We wish to avoid the danger that redistributors of a free program will individually obtain patent licenses, in effect making the program proprietary. To prevent this, we have made it clear that any patent must be licensed for everyone's free use or not licensed at all. The precise terms and conditions for copying, distribution and modification follow. GNU GENERAL PUBLIC LICENSE TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 0. This License applies to any program or other work which contains a notice placed by the copyright holder saying it may be distributed under the terms of this General Public License. The "Program", below, refers to any such program or work, and a "work based on the Program" means either the Program or any derivative work under copyright law: that is to say, a work containing the Program or a portion of it, either verbatim or with modifications and/or translated into another language. (Hereinafter, translation is included without limitation in the term "modification".) Each licensee is addressed as "you". Activities other than copying, distribution and modification are not covered by this License; they are outside its scope. The act of running the Program is not restricted, and the output from the Program is covered only if its contents constitute a work based on the Program (independent of having been made by running the Program). Whether that is true depends on what the Program does. 1. 
You may copy and distribute verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice and disclaimer of warranty; keep intact all the notices that refer to this License and to the absence of any warranty; and give any other recipients of the Program a copy of this License along with the Program. You may charge a fee for the physical act of transferring a copy, and you may at your option offer warranty protection in exchange for a fee. 2. You may modify your copy or copies of the Program or any portion of it, thus forming a work based on the Program, and copy and distribute such modifications or work under the terms of Section 1 above, provided that you also meet all of these conditions: a) You must cause the modified files to carry prominent notices stating that you changed the files and the date of any change. b) You must cause any work that you distribute or publish, that in whole or in part contains or is derived from the Program or any part thereof, to be licensed as a whole at no charge to all third parties under the terms of this License. c) If the modified program normally reads commands interactively when run, you must cause it, when started running for such interactive use in the most ordinary way, to print or display an announcement including an appropriate copyright notice and a notice that there is no warranty (or else, saying that you provide a warranty) and that users may redistribute the program under these conditions, and telling the user how to view a copy of this License. (Exception: if the Program itself is interactive but does not normally print such an announcement, your work based on the Program is not required to print an announcement.) These requirements apply to the modified work as a whole. 
If identifiable sections of that work are not derived from the Program, and can be reasonably considered independent and separate works in themselves, then this License, and its terms, do not apply to those sections when you distribute them as separate works. But when you distribute the same sections as part of a whole which is a work based on the Program, the distribution of the whole must be on the terms of this License, whose permissions for other licensees extend to the entire whole, and thus to each and every part regardless of who wrote it. Thus, it is not the intent of this section to claim rights or contest your rights to work written entirely by you; rather, the intent is to exercise the right to control the distribution of derivative or collective works based on the Program. In addition, mere aggregation of another work not based on the Program with the Program (or with a work based on the Program) on a volume of a storage or distribution medium does not bring the other work under the scope of this License. 3. You may copy and distribute the Program (or a work based on it, under Section 2) in object code or executable form under the terms of Sections 1 and 2 above provided that you also do one of the following: a) Accompany it with the complete corresponding machine-readable source code, which must be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange; or, b) Accompany it with a written offer, valid for at least three years, to give any third party, for a charge no more than your cost of physically performing source distribution, a complete machine-readable copy of the corresponding source code, to be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange; or, c) Accompany it with the information you received as to the offer to distribute corresponding source code. 
(This alternative is allowed only for noncommercial distribution and only if you received the program in object code or executable form with such an offer, in accord with Subsection b above.) The source code for a work means the preferred form of the work for making modifications to it. For an executable work, complete source code means all the source code for all modules it contains, plus any associated interface definition files, plus the scripts used to control compilation and installation of the executable. However, as a special exception, the source code distributed need not include anything that is normally distributed (in either source or binary form) with the major components (compiler, kernel, and so on) of the operating system on which the executable runs, unless that component itself accompanies the executable. If distribution of executable or object code is made by offering access to copy from a designated place, then offering equivalent access to copy the source code from the same place counts as distribution of the source code, even though third parties are not compelled to copy the source along with the object code. 4. You may not copy, modify, sublicense, or distribute the Program except as expressly provided under this License. Any attempt otherwise to copy, modify, sublicense or distribute the Program is void, and will automatically terminate your rights under this License. However, parties who have received copies, or rights, from you under this License will not have their licenses terminated so long as such parties remain in full compliance. 5. You are not required to accept this License, since you have not signed it. However, nothing else grants you permission to modify or distribute the Program or its derivative works. These actions are prohibited by law if you do not accept this License. 
Therefore, by modifying or distributing the Program (or any work based on the Program), you indicate your acceptance of this License to do so, and all its terms and conditions for copying, distributing or modifying the Program or works based on it. 6. Each time you redistribute the Program (or any work based on the Program), the recipient automatically receives a license from the original licensor to copy, distribute or modify the Program subject to these terms and conditions. You may not impose any further restrictions on the recipients' exercise of the rights granted herein. You are not responsible for enforcing compliance by third parties to this License. 7. If, as a consequence of a court judgment or allegation of patent infringement or for any other reason (not limited to patent issues), conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot distribute so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not distribute the Program at all. For example, if a patent license would not permit royalty-free redistribution of the Program by all those who receive copies directly or indirectly through you, then the only way you could satisfy both it and this License would be to refrain entirely from distribution of the Program. If any portion of this section is held invalid or unenforceable under any particular circumstance, the balance of the section is intended to apply and the section as a whole is intended to apply in other circumstances. It is not the purpose of this section to induce you to infringe any patents or other property right claims or to contest validity of any such claims; this section has the sole purpose of protecting the integrity of the free software distribution system, which is implemented by public license practices. 
Many people have made generous contributions to the wide range of software distributed through that system in reliance on consistent application of that system; it is up to the author/donor to decide if he or she is willing to distribute software through any other system and a licensee cannot impose that choice. This section is intended to make thoroughly clear what is believed to be a consequence of the rest of this License. 8. If the distribution and/or use of the Program is restricted in certain countries either by patents or by copyrighted interfaces, the original copyright holder who places the Program under this License may add an explicit geographical distribution limitation excluding those countries, so that distribution is permitted only in or among countries not thus excluded. In such case, this License incorporates the limitation as if written in the body of this License. 9. The Free Software Foundation may publish revised and/or new versions of the General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Program specifies a version number of this License which applies to it and "any later version", you have the option of following the terms and conditions either of that version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of this License, you may choose any version ever published by the Free Software Foundation. 10. If you wish to incorporate parts of the Program into other free programs whose distribution conditions are different, write to the author to ask for permission. For software which is copyrighted by the Free Software Foundation, write to the Free Software Foundation; we sometimes make exceptions for this. 
Our decision will be guided by the two goals of preserving the free status of all derivatives of our free software and of promoting the sharing and reuse of software generally. NO WARRANTY 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. END OF TERMS AND CONDITIONS libzim-4.0.4/ChangeLog000066400000000000000000000120701334353060400145600ustar00rootroot00000000000000libzim 4.0.4 ============ * Fix opening of multi-part zim. * Fix convertion of path to wpath on Windows. libzim 4.0.3 ============ * Implement low level file manipilation using different backends libzim 4.0.2 ============ * [Windows] Fix opening of zim file bigger than 4GiB libzim 4.0.1 ============ * [Writer] Fix wrong redirectyon log message * Make libzim compile natively on windows using MSVC * Better message when failing to read a zim file. 
* Make libzim on windows correctly open unicode path. * Add compilation option to use less memory (but more I/O). Usefull on low memory devices (android) * Small fixes libzim 4.0.0 ============ * [Writer] Remove a lot of memory copy. * [Writer] Add xapian indexing directly in libzim. * [Writer] Better API. * [Writer] Use multi-threading to write clusters. * [Writer] Ensure mimetype of articles article is not null. * Extend test timeout for cluster's test. * Less memory copy for cluster's test. * Allow skipping test using a lot memory using env variable `SKIP_BIG_MEMORY_TEST=1` * Explicitly use the icu namespace to allow using of packaged icu lib. * Use a temporary file name as long as the ZIM writting process is not finished (#163) * [Travis] Do no compile using gcc-5 (but the default trusty's one 4.8) libzim 3.3.0 ============ * Fix handling of big cluster (>4GiB) on 32 bits architecture. This is mainly done by : * Do not mmap the whole cluster by default. * MMap only the memory asociated to an article. * If an article is > 4GiB, the blob associated to it is invalid (data==size==0). * Other information are still valid (directAccessInformation, ...) * Fix writing of extended cluster in writer. * Compile libzim on macos. * Build libzim setting RPATH. * Search result urls are now what is stored in the zim file. They should not start with a `/`. This is a revert of the change made in last release. (See kiwix/kiwix-lib#123) * Spelling corrections in README. libzim 3.2.0 ============ * Support geo query if the xapian database has indexed localisation. * Handle articles bigger than 4Go in the zim file (#110). * Use AND operator between search term. * Fix compilation with recent clang (#95). * Add method to get article's data localisation in the zim file. * Be able to get only a part of article (#77). * Do not crash if we cannot open the xapian Database for some reasons. (kiwix/kiwix-tools#153) * Do not assumen there is always a checksum in the zim file. 
(kiwix/kiwix-tools#150) * Try to do some sanity checks when opening a zim file. * Use pytest to do some tests (when cython is available). * Use levenshtein distance to sort and have better suggestion results. * Search result urls are now always absolute (starts with a '/'). (kiwix/kiwix-lib#110) * Open the file readonly when checking the zim file (and so be able to check read only file). * Accept absolute url starting with '/' when searching for article. * Fix various bugs libzim 3.1.0 ============ * Lzma is not a optional dependency anymore. * Better handle (report and not crash) invalid zim file. * Embed source of gtest (used only if gtest is not available on the system) * Move zimDump tools out of libzim repository to zim-tools * ZimCreator tools doesn't not read command line to set options. libzim 3.0.0 ============ This is a major change of the libzim. Expect a lot new improvement and API changes. * Add a suggestion mode to the search * Fix licensing issues * Fix wrong stemming of the query when searching * Deactivate searching (and so crash) in the embedded database if the zim is splitted * Rewrite the low level memory management of libzim when reading a zim file: * We use a buffer base entity to handle memory and reading file instead of reading file using stream. * MMap the memory when posible to avoid memory copy. * Use const when posible (API break) * Move to googletest instead of cxxtools for unit-tests. * Fix endiannes bug on arm. * Do not install private headers. Those headers declare private structure and should not be visible (API break) * Compile libzim with `-Werror` and `-Wall` options. * Make libzim thread safe for reading article. The search part is not thread safe, and all search operation must be protected by a lock. * Add method to get only a part of a article. * Move some tools to zim-tools repository. 
libzim 2.0.0 ============ * Move to meson build system `libzim` now use `meson` as build system instead of `autotools` * Move to C++11 standard. * Fulltext search in zim file. We have integrated the xapian fulltext search in libzim. So now, libzim provide an API to search in a zim containing embeded fulltext index. This means that : *libzim need xapian as (optional) dependencies (if you want compile with xapian support). * The old and unused search API has been removed. * Remove bzip2 support. * Remove Symbian support. * Few API hanges * Make some header files private (not installed); * A `Blob` can now be cast to a `string` directly; * Change a lot of `File` methods to const methods. libzim-4.0.4/README.md000066400000000000000000000101021334353060400142570ustar00rootroot00000000000000ZIM library =========== The ZIM library is the reference implementation for the ZIM file format. It's a solution to read and write ZIM files on many systems and architectures. More information about the ZIM format and the openZIM project at http://www.openzim.org/ Disclaimer ---------- This document assumes you have a little knowledge about software compilation. If you experience difficulties with the dependencies or with the ZIM library compilation itself, we recommend to have a look to [kiwix-build](https://github.com/kiwix/kiwix-build). Preamble -------- Although the ZIM library can be compiled/cross-compiled on/for many systems, the following documentation explains how to do it on POSIX ones. It is primarily though for GNU/Linux systems and has been tested on recent releases of Ubuntu and Fedora. Dependencies ------------ The ZIM library relies on many third parts software libraries. They are prerequisites to the Kiwix library compilation. Following libraries need to be available: * Z ................................................. http://zlib.net/ (package zlib1g-dev on Ubuntu) * LZMA ...................................... 
http://tukaani.org/lzma/ (package lzma-dev on Ubuntu) * ICU ................................... http://site.icu-project.org/ (package libicu-dev on Ubuntu) * Xapian (optional) .............................. https://xapian.org/ (package libxapian-dev on Ubuntu) * Google test (optional) ........ https://github.com/google/googletest (No valid package on Ubuntu, if gtest is not present, libzim will use embedded one) These dependencies may or may not be packaged by your operating system. They may also be packaged but only in an older version. The compilation script will tell you if one of them is missing or too old. In the worse case, you will have to download and compile a more recent version by hand. If you want to install these dependencies locally, then ensure that meson (through pkg-config) will properly find them. Environment ------------- The ZIM library builds using [Meson](http://mesonbuild.com/) version 0.39 or higher. Meson relies itself on Ninja, pkg-config and few other compilation tools. Install first the few common compilation tools: * Meson * Ninja * Pkg-config These tools should be packaged if you use a cutting edge operating system. If not, have a look to the "Troubleshooting" section. Compilation ----------- Once all dependencies are installed, you can compile ZIM library with: ``` meson . build ninja -C build ``` By default, it will compile dynamic linked libraries. All binary files will be created in the "build" directory created automatically by Meson. If you want statically linked libraries, you can add `--default-library=static` option to the Meson command. Depending of you system, `ninja` may be called `ninja-build`. Installation ------------ If you want to install the libzim and the headers you just have compiled on your system, here we go: ``` ninja -C build install ``` You might need to run the command as root (or using 'sudo'), depending where you want to install the libraries. 
After the installation succeeded, you may need to run ldconfig (as root). Uninstallation ------------ If you want to uninstall the libzim: ``` ninja -C build uninstall ``` Like for the installation, you might need to run the command as root (or using 'sudo'). Troubleshooting --------------- If you need to install Meson "manually": ``` virtualenv -p python3 ./ # Create virtualenv source bin/activate # Activate the virtualenv pip3 install meson # Install Meson hash -r # Refresh bash paths ``` If you need to install Ninja "manually": ``` git clone git://github.com/ninja-build/ninja.git cd ninja git checkout release ./configure.py --bootstrap mkdir ../bin cp ninja ../bin cd .. ``` If the compilation still fails, you might need to get a more recent version of a dependency than the one packaged by your Linux distribution. Try then with a source tarball distributed by the problematic upstream project or even directly from the source code repository. License ------- GPLv2 or later, see COPYING for more details. libzim-4.0.4/examples/000077500000000000000000000000001334353060400146245ustar00rootroot00000000000000libzim-4.0.4/examples/createZimExample.cpp000066400000000000000000000054731334353060400206000ustar00rootroot00000000000000/* * Copyright (C) 2012 Tommi Maekitalo * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #include #include #include #include #include class TestArticle : public zim::writer::Article { std::string _id; std::string _data; public: TestArticle() { } explicit TestArticle(const std::string& id); virtual ~TestArticle() = default; virtual std::string getAid() const; virtual char getNamespace() const; virtual std::string getUrl() const; virtual std::string getTitle() const; virtual bool isRedirect() const; virtual bool shouldCompress() const { return true; } virtual std::string getMimeType() const; virtual std::string getRedirectAid() const; virtual bool shouldIndex() const { return false; } virtual zim::size_type getSize() const { return _data.size(); } virtual std::string getFilename() const { return ""; } virtual zim::Blob getData() const { return zim::Blob(&_data[0], _data.size()); } }; TestArticle::TestArticle(const std::string& id) : _id(id) { std::ostringstream data; data << "this is article " << id << std::endl; _data = data.str(); } std::string TestArticle::getAid() const { return _id; } char TestArticle::getNamespace() const { return 'A'; } std::string TestArticle::getUrl() const { return _id; } std::string TestArticle::getTitle() const { return _id; } bool TestArticle::isRedirect() const { return false; } std::string TestArticle::getMimeType() const { return "text/plain"; } std::string TestArticle::getRedirectAid() const { return ""; } int main(int argc, char* argv[]) { std::vector _articles; unsigned max = 16; _articles.resize(max); for (unsigned n = 0; n < max; ++n) { std::ostringstream id; id << (n + 1); _articles[n] = TestArticle(id.str()); } try { zim::writer::ZimCreator c; c.startZimCreation("foo.zim"); for (auto& article:_articles) { c.addArticle(article); } c.finishZimCreation(); } catch (const std::exception& e) { std::cerr << 
e.what() << std::endl; } } libzim-4.0.4/examples/meson.build000066400000000000000000000004001334353060400167600ustar00rootroot00000000000000 executable('createZimExample', 'createZimExample.cpp', link_with: libzim, link_args: extra_link_args, include_directories: include_directory, dependencies: [thread_dep, xapian_dep, icu_dep, zlib_dep, lzma_dep]) libzim-4.0.4/include/000077500000000000000000000000001334353060400144315ustar00rootroot00000000000000libzim-4.0.4/include/meson.build000066400000000000000000000006141334353060400165740ustar00rootroot00000000000000include_directory = include_directories('.') install_headers( 'zim/article.h', 'zim/blob.h', 'zim/error.h', 'zim/file.h', 'zim/fileheader.h', 'zim/fileiterator.h', 'zim/search.h', 'zim/search_iterator.h', 'zim/uuid.h', 'zim/zim.h', subdir:'zim' ) install_headers( 'zim/writer/article.h', 'zim/writer/zimcreator.h', subdir:'zim/writer' ) libzim-4.0.4/include/zim/000077500000000000000000000000001334353060400152305ustar00rootroot00000000000000libzim-4.0.4/include/zim/article.h000066400000000000000000000053241334353060400170300ustar00rootroot00000000000000/* * Copyright (C) 2006 Tommi Maekitalo * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #ifndef ZIM_ARTICLE_H #define ZIM_ARTICLE_H #include #include "zim.h" #include "blob.h" #include #include namespace zim { class Cluster; class Dirent; class FileImpl; class Article { private: std::shared_ptr file; article_index_type idx; std::shared_ptr getDirent() const; public: Article() : idx(std::numeric_limits::max()) { } Article(std::shared_ptr file_, article_index_type idx_) : file(file_), idx(idx_) { } std::string getParameter() const; std::string getTitle() const; std::string getUrl() const; std::string getLongUrl() const; uint16_t getLibraryMimeType() const; const std::string& getMimeType() const; bool isRedirect() const; bool isLinktarget() const; bool isDeleted() const; char getNamespace() const; article_index_type getRedirectIndex() const; Article getRedirectArticle() const; size_type getArticleSize() const; bool operator< (const Article& a) const { return getNamespace() < a.getNamespace() || (getNamespace() == a.getNamespace() && getTitle() < a.getTitle()); } std::shared_ptr getCluster() const; Blob getData(offset_type offset=0) const; Blob getData(offset_type offset, size_type size) const; offset_type getOffset() const; std::pair getDirectAccessInformation() const; std::string getPage(bool layout = true, unsigned maxRecurse = 10); void getPage(std::ostream&, bool layout = true, unsigned maxRecurse = 10); article_index_type getIndex() const { return idx; } bool good() const { return idx != std::numeric_limits::max(); } }; } #endif // ZIM_ARTICLE_H libzim-4.0.4/include/zim/blob.h000066400000000000000000000034041334353060400163200ustar00rootroot00000000000000/* * Copyright (C) 2009 Tommi Maekitalo * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the 
Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #ifndef ZIM_BLOB_H #define ZIM_BLOB_H #include "zim.h" #include #include #include #include namespace zim { class Buffer; class Blob { const char* _data; size_type _size; std::shared_ptr _buffer; public: Blob(); Blob(const char* data, size_type size); Blob(std::shared_ptr buffer); operator std::string() const { return std::string(_data, _size); } const char* data() const { return _data; } const char* end() const { return _data + _size; } size_type size() const { return _size; } }; inline std::ostream& operator<< (std::ostream& out, const Blob& blob) { if (blob.data()) out.write(blob.data(), blob.size()); return out; } inline bool operator== (const Blob& b1, const Blob& b2) { return b1.size() == b2.size() && std::equal(b1.data(), b1.data() + b1.size(), b2.data()); } } #endif // ZIM_BLOB_H libzim-4.0.4/include/zim/error.h000066400000000000000000000021021334353060400165250ustar00rootroot00000000000000/* * Copyright (C) 2006 Tommi Maekitalo * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. 
* * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #ifndef ZIM_ERROR_H #define ZIM_ERROR_H #include namespace zim { class ZimFileFormatError : public std::runtime_error { public: explicit ZimFileFormatError(const std::string& msg) : std::runtime_error(msg) { } }; } #endif // ZIM_ERROR_H libzim-4.0.4/include/zim/file.h000066400000000000000000000061211334353060400163200ustar00rootroot00000000000000/* * Copyright (C) 2006,2009 Tommi Maekitalo * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #ifndef ZIM_FILE_H #define ZIM_FILE_H #include #include #include #include "zim.h" #include "article.h" #include "blob.h" #include "fileheader.h" class ZimDumper; namespace zim { class Search; class FileImpl; class Cluster; class File { friend class ::ZimDumper; std::shared_ptr impl; public: File() { } explicit File(const std::string& fname); const std::string& getFilename() const; const Fileheader& getFileheader() const; offset_type getFilesize() const; article_index_type getCountArticles() const; Article getArticle(article_index_type idx) const; Article getArticle(char ns, const std::string& url) const; Article getArticleByUrl(const std::string& url) const; Article getArticleByTitle(article_index_type idx) const; Article getArticleByTitle(char ns, const std::string& title) const; std::shared_ptr getCluster(cluster_index_type idx) const; cluster_index_type getCountClusters() const; offset_type getClusterOffset(cluster_index_type idx) const; Blob getBlob(cluster_index_type clusterIdx, blob_index_type blobIdx) const; offset_type getOffset(cluster_index_type clusterIdx, blob_index_type blobIdx) const; article_index_type getNamespaceBeginOffset(char ch) const; article_index_type getNamespaceEndOffset(char ch) const; article_index_type getNamespaceCount(char ns) const; std::string getNamespaces() const; bool hasNamespace(char ch) const; class const_iterator; const_iterator begin() const; const_iterator beginByTitle() const; const_iterator end() const; const_iterator findByTitle(char ns, const std::string& title) const; const_iterator find(char ns, const std::string& url) const; const_iterator find(const std::string& url) const; const Search* search(const std::string& query, int start, int end) const; const Search* suggestions(const std::string& query, int start, 
int end) const; time_t getMTime() const; const std::string& getMimeType(uint16_t idx) const; std::string getChecksum(); bool verify(); bool is_multiPart() const; }; std::string urldecode(const std::string& url); } #endif // ZIM_FILE_H libzim-4.0.4/include/zim/fileheader.h000066400000000000000000000110301334353060400174640ustar00rootroot00000000000000/* * Copyright (C) 2008 Tommi Maekitalo * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #ifndef ZIM_FILEHEADER_H #define ZIM_FILEHEADER_H #include #include "zim.h" #include "uuid.h" #include #include #ifdef _WIN32 #define NOMINMAX 1 #include #undef NOMINMAX #undef max #endif namespace zim { class Buffer; class Fileheader { public: static const uint32_t zimMagic; static const uint16_t zimClassicMajorVersion; static const uint16_t zimExtendedMajorVersion; static const uint16_t zimMinorVersion; static const size_type size; private: uint16_t majorVersion; uint16_t minorVersion; Uuid uuid; article_index_type articleCount; offset_type titleIdxPos; offset_type urlPtrPos; offset_type mimeListPos; cluster_index_type clusterCount; offset_type clusterPtrPos; article_index_type mainPage; article_index_type layoutPage; offset_type checksumPos; public: Fileheader() : majorVersion(zimClassicMajorVersion), minorVersion(zimMinorVersion), articleCount(0), 
titleIdxPos(0), urlPtrPos(0), clusterCount(0), clusterPtrPos(0), mainPage(std::numeric_limits::max()), layoutPage(std::numeric_limits::max()), checksumPos(std::numeric_limits::max()) {} void read(std::shared_ptr buffer); // Do some sanity check, raise a ZimFileFormateError is // something is wrong. void sanity_check() const; uint16_t getMajorVersion() const { return majorVersion; } void setMajorVersion(uint16_t v) { majorVersion = v; } uint16_t getMinorVersion() const { return minorVersion; } void setMinorVersion(uint16_t v) { minorVersion = v; } const Uuid& getUuid() const { return uuid; } void setUuid(const Uuid& uuid_) { uuid = uuid_; } article_index_type getArticleCount() const { return articleCount; } void setArticleCount(article_index_type s) { articleCount = s; } offset_type getTitleIdxPos() const { return titleIdxPos; } void setTitleIdxPos(offset_type p) { titleIdxPos = p; } offset_type getUrlPtrPos() const { return urlPtrPos; } void setUrlPtrPos(offset_type p) { urlPtrPos = p; } offset_type getMimeListPos() const { return mimeListPos; } void setMimeListPos(offset_type p) { mimeListPos = p; } cluster_index_type getClusterCount() const { return clusterCount; } void setClusterCount(cluster_index_type s) { clusterCount = s; } offset_type getClusterPtrPos() const { return clusterPtrPos; } void setClusterPtrPos(offset_type p) { clusterPtrPos = p; } bool hasMainPage() const { return mainPage != std::numeric_limits::max(); } article_index_type getMainPage() const { return mainPage; } void setMainPage(article_index_type s){ mainPage = s; } bool hasLayoutPage() const { return layoutPage != std::numeric_limits::max(); } article_index_type getLayoutPage() const { return layoutPage; } void setLayoutPage(article_index_type s) { layoutPage = s; } bool hasChecksum() const { return getMimeListPos() >= 80; } offset_type getChecksumPos() const { return hasChecksum() ? 
checksumPos : 0; } void setChecksumPos(offset_type p) { checksumPos = p; } }; std::ostream& operator<< (std::ostream& out, const Fileheader& fh); } #endif // ZIM_FILEHEADER_H libzim-4.0.4/include/zim/fileiterator.h000066400000000000000000000052551334353060400201010ustar00rootroot00000000000000/* * Copyright (C) 2006 Tommi Maekitalo * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #ifndef ZIM_FILEITERATOR_H #define ZIM_FILEITERATOR_H #include #include "article.h" namespace zim { class File::const_iterator : public std::iterator { public: enum Mode { UrlIterator, ArticleIterator }; private: const File* file; article_index_type idx; mutable Article article; Mode mode; bool is_end() const { return file == 0 || idx >= file->getCountArticles(); } public: explicit const_iterator(const File* file_ = 0, article_index_type idx_ = 0, Mode mode_ = UrlIterator) : file(file_), idx(idx_), mode(mode_) { } article_index_type getIndex() const { return idx; } const File& getFile() const { return *file; } bool operator== (const const_iterator& it) const { return (is_end() && it.is_end()) || (file == it.file && idx == it.idx); } bool operator!= (const const_iterator& it) const { return !operator==(it); } const_iterator& operator++() { ++idx; article = Article(); return *this; } const_iterator 
operator++(int) { const_iterator it = *this; operator++(); return it; } const_iterator& operator--() { --idx; article = Article(); return *this; } const_iterator operator--(int) { const_iterator it = *this; operator--(); return it; } const Article& operator*() const { if (!article.good()) article = mode == UrlIterator ? file->getArticle(idx) : file->getArticleByTitle(idx); return article; } pointer operator->() const { operator*(); return &article; } }; } #endif // ZIM_FILEITERATOR_H libzim-4.0.4/include/zim/search.h000066400000000000000000000046571334353060400166620ustar00rootroot00000000000000/* * Copyright (C) 2007 Tommi Maekitalo * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #ifndef ZIM_SEARCH_H #define ZIM_SEARCH_H #include "search_iterator.h" #include #include #include namespace zim { class File; class Search { friend class search_iterator; friend struct search_iterator::InternalData; public: typedef search_iterator iterator; explicit Search(const std::vector zimfiles); explicit Search(const File* zimfile); Search(const Search& it); Search& operator=(const Search& it); Search(Search&& it); Search& operator=(Search&& it); ~Search(); void set_verbose(bool verbose); Search& add_zimfile(const File* zimfile); Search& set_query(const std::string& query); Search& set_georange(float latitude, float longitude, float distance); Search& set_range(int start, int end); Search& set_suggestion_mode(bool suggestion_mode); search_iterator begin() const; search_iterator end() const; int get_matches_estimated() const; private: struct InternalData; std::unique_ptr internal; std::vector zimfiles; mutable std::map valuesmap; mutable std::string prefixes; std::string query; float latitude; float longitude; float distance; int range_start; int range_end; bool suggestion_mode; bool geo_query; mutable bool search_started; mutable bool has_database; mutable bool verbose; mutable int estimated_matches_number; }; } //namespace zim #endif // ZIM_SEARCH_H libzim-4.0.4/include/zim/search_iterator.h000066400000000000000000000042231334353060400205600ustar00rootroot00000000000000/* * Copyright (C) 2006 Tommi Maekitalo * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. 
* * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #ifndef ZIM_SEARCH_ITERATOR_H #define ZIM_SEARCH_ITERATOR_H #include #include #include "article.h" namespace zim { class Search; class search_iterator : public std::iterator { friend class zim::Search; public: search_iterator(); search_iterator(const search_iterator& it); search_iterator& operator=(const search_iterator& it); search_iterator(search_iterator&& it); search_iterator& operator=(search_iterator&& it); ~search_iterator(); bool operator== (const search_iterator& it) const; bool operator!= (const search_iterator& it) const; search_iterator& operator++(); search_iterator operator++(int); search_iterator& operator--(); search_iterator operator--(int); std::string get_url() const; std::string get_title() const; int get_score() const; std::string get_snippet() const; int get_wordCount() const; int get_size() const; int get_fileIndex() const; reference operator*() const; pointer operator->() const; private: struct InternalData; std::unique_ptr internal; search_iterator(InternalData* internal_data); bool is_end() const; }; } // namespace ziç #endif // ZIM_SEARCH_ITERATOR_H libzim-4.0.4/include/zim/uuid.h000066400000000000000000000027151334353060400163540ustar00rootroot00000000000000/* * Copyright (C) 2009 Tommi Maekitalo * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. 
* * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #ifndef ZIM_UUID_H #define ZIM_UUID_H #include #include #include #include namespace zim { struct Uuid { Uuid() { std::memset(data, 0, 16); } Uuid(const char uuid[16]) { std::copy(uuid, uuid+16, data); } static Uuid generate(std::string value = ""); bool operator== (const Uuid& other) const { return std::equal(data, data+16, other.data); } bool operator!= (const Uuid& other) const { return !(*this == other); } unsigned size() const { return 16; } char data[16]; }; std::ostream& operator<< (std::ostream& out, const Uuid& uuid); } #endif // ZIM_UUID_H libzim-4.0.4/include/zim/writer/000077500000000000000000000000001334353060400165445ustar00rootroot00000000000000libzim-4.0.4/include/zim/writer/article.h000066400000000000000000000053211334353060400203410ustar00rootroot00000000000000/* * Copyright (C) 2009 Tommi Maekitalo * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #ifndef ZIM_WRITER_ARTICLESOURCE_H #define ZIM_WRITER_ARTICLESOURCE_H #include #include #include #include #include namespace zim { namespace writer { class ArticleSource; class Article { public: virtual std::string getAid() const = 0; virtual char getNamespace() const = 0; virtual std::string getUrl() const = 0; virtual std::string getTitle() const = 0; virtual bool isRedirect() const = 0; virtual bool isLinktarget() const; virtual bool isDeleted() const; virtual std::string getMimeType() const = 0; virtual bool shouldCompress() const = 0; virtual bool shouldIndex() const = 0; virtual std::string getRedirectAid() const = 0; virtual std::string getParameter() const; virtual zim::size_type getSize() const = 0; virtual Blob getData() const = 0; virtual std::string getFilename() const = 0; virtual ~Article() = default; // returns the next category id, to which the article is assigned to virtual std::string getNextCategory(); }; class Category { public: virtual Blob getData() = 0; virtual std::string getUrl() const = 0; virtual std::string getTitle() const = 0; virtual ~Category() = default; }; class ArticleSource { public: virtual void setFilename(const std::string& fname) { } virtual const Article* getNextArticle() = 0; // After fetching the articles and for each article the category ids // using Article::getNextCategory, the writer has a list of category // ids. Using this list, the writer fetches the category data using // this method. 
virtual Category* getCategory(const std::string& cid); virtual ~ArticleSource() = default; }; } } #endif // ZIM_WRITER_ARTICLESOURCE_H libzim-4.0.4/include/zim/writer/zimcreator.h000066400000000000000000000043551334353060400211030ustar00rootroot00000000000000/* * Copyright (C) 2009 Tommi Maekitalo * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #ifndef ZIM_WRITER_ZIMCREATOR_H #define ZIM_WRITER_ZIMCREATOR_H #include #include #include namespace zim { class Fileheader; namespace writer { class ZimCreatorData; class ZimCreator { public: ZimCreator(bool verbose = false); virtual ~ZimCreator(); zim::size_type getMinChunkSize() const { return minChunkSize; } void setMinChunkSize(zim::size_type s) { minChunkSize = s; } void setIndexing(bool indexing, std::string language) { withIndex = indexing; indexingLanguage = language; } void setCompressionThreads(unsigned ct) { compressionThreads = ct; } virtual void startZimCreation(const std::string& fname); virtual void addArticle(const Article& article); virtual void finishZimCreation(); virtual std::string getMainPage() { return ""; } virtual std::string getLayoutPage() { return ""; } virtual zim::Uuid getUuid() { return Uuid::generate(); } private: std::unique_ptr data; bool verbose; bool withIndex = false; size_t minChunkSize = 1024-64; 
std::string indexingLanguage; unsigned compressionThreads = 4; void fillHeader(Fileheader* header); void write(const Fileheader& header, const std::string& fname) const; static void* clusterWriter(void* arg); }; } } #endif // ZIM_WRITER_ZIMCREATOR_H libzim-4.0.4/include/zim/zim.h000066400000000000000000000027541334353060400162100ustar00rootroot00000000000000/* * Copyright (C) 2006 Tommi Maekitalo * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #ifndef ZIM_ZIM_H #define ZIM_ZIM_H #include namespace zim { // An index of an article (in a zim file) typedef uint32_t article_index_type; // An index of an cluster (in a zim file) typedef uint32_t cluster_index_type; // An index of a blog (in a cluster) typedef uint32_t blob_index_type; // The size of something (article, zim, cluster, blob, ...) typedef uint64_t size_type; // An offset. 
typedef uint64_t offset_type; enum CompressionType { zimcompDefault, zimcompNone, zimcompZip, zimcompBzip2, // Not supported anymore in the libzim zimcompLzma }; static const char MimeHtmlTemplate[] = "text/x-zim-htmltemplate"; } #endif // ZIM_ZIM_H libzim-4.0.4/meson.build000066400000000000000000000043041334353060400151510ustar00rootroot00000000000000project('libzim', ['c', 'cpp'], version : '4.0.4', license : 'GPL2', default_options : ['c_std=c11', 'cpp_std=c++11', 'werror=true']) if build_machine.system() != 'windows' add_project_arguments('-D_LARGEFILE64_SOURCE=1', '-D_FILE_OFFSET_BITS=64', language: 'cpp') endif sizeof_off_t = meson.get_compiler('cpp').sizeof('off_t') conf = configuration_data() conf.set('VERSION', '"@0@"'.format(meson.project_version())) conf.set('DIRENT_CACHE_SIZE', get_option('DIRENT_CACHE_SIZE')) conf.set('CLUSTER_CACHE_SIZE', get_option('CLUSTER_CACHE_SIZE')) conf.set('LZMA_MEMORY_SIZE', get_option('LZMA_MEMORY_SIZE')) conf.set10('MMAP_SUPPORT_64', sizeof_off_t==8) if target_machine.system() == 'windows' conf.set('ENABLE_USE_MMAP', false) else conf.set('ENABLE_USE_MMAP', get_option('USE_MMAP')) endif conf.set('ENABLE_USE_BUFFER_HEADER', get_option('USE_BUFFER_HEADER')) zlib_dep = dependency('zlib', required:false) conf.set('ENABLE_ZLIB', zlib_dep.found()) lzma_dep = dependency('liblzma') xapian_dep = dependency('xapian-core', required:false, static:(get_option('default_library')=='static')) conf.set('ENABLE_XAPIAN', xapian_dep.found()) pkg_requires = ['liblzma'] if build_machine.system() == 'windows' thread_dep = dependency('libpthreadVC2') pkg_requires += ['libpthreadVC2'] extra_link_args = ['-lRpcrt4', '-lWs2_32', '-lwinmm', '-licuuc', '-licuin'] extra_cpp_args = ['-DSORTPP_PASS'] else thread_dep = dependency('threads') extra_link_args = [] extra_cpp_args = [] endif if zlib_dep.found() pkg_requires += ['zlib'] endif if xapian_dep.found() pkg_requires += ['xapian-core'] icu_dep = dependency('icu-i18n') pkg_requires += ['icu-i18n'] else 
icu_dep = dependency('icu-i18n', required:false) endif inc = include_directories('include') subdir('include') subdir('scripts') subdir('static') subdir('src') subdir('examples') subdir('test') pkg_mod = import('pkgconfig') pkg_mod.generate(libraries : libzim, version : meson.project_version(), name : 'libzim', filebase : 'libzim', description : 'A Library to zim.', requires : pkg_requires) libzim-4.0.4/meson_options.txt000066400000000000000000000015371334353060400164510ustar00rootroot00000000000000option('CLUSTER_CACHE_SIZE', type : 'string', value : '16', description : 'set cluster cache size to number (default:16)') option('DIRENT_CACHE_SIZE', type : 'string', value : '512', description : 'set dirent cache size to number (default:512)') option('LZMA_MEMORY_SIZE', type : 'string', value : '128', description : 'set lzma uncompress memory in MB (default:128)') option('USE_MMAP', type: 'boolean', value: true, description: 'Use mmap to avoid copy from file. (default:true, always false on windows)') option('USE_BUFFER_HEADER', type: 'boolean', value: true, description: 'Copy (or use mmap) header index buffers. (default:true) Header index are used to access articles, having them in memory can improve access speed but on low memory devices it may use to many memory. If false, we directly read the index in the file at each article access.') libzim-4.0.4/scripts/000077500000000000000000000000001334353060400144755ustar00rootroot00000000000000libzim-4.0.4/scripts/libzim-compile-resources000077500000000000000000000142431334353060400213530ustar00rootroot00000000000000#!/usr/bin/env python3 ''' Copyright 2016 Matthieu Gautier This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3 of the License, or any later version. 
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. ''' import argparse import os.path import re def full_identifier(filename): parts = os.path.normpath(filename).split(os.sep) parts = [to_identifier(part) for part in parts] print(filename, parts) return parts def to_identifier(name): ident = re.sub(r'[^0-9a-zA-Z]', '_', name) if ident[0].isnumeric(): return "_"+ident return ident resource_impl_template = """ static const unsigned char {data_identifier}[] = {{ {resource_content} }}; namespace RESOURCE {{ {namespaces_open} const std::string {identifier} = init_resource("{env_identifier}", {data_identifier}, {resource_len}); {namespaces_close} }} """ resource_getter_template = """ if (name == "{common_name}") return RESOURCE::{identifier}; """ resource_decl_template = """{namespaces_open} extern const std::string {identifier}; {namespaces_close}""" class Resource: def __init__(self, base_dirs, filename): filename = filename.strip() self.filename = filename self.identifier = full_identifier(filename) found = False for base_dir in base_dirs: try: with open(os.path.join(base_dir, filename), 'rb') as f: self.data = f.read() found = True break except FileNotFoundError: continue if not found: raise Exception("Impossible to found {}".format(filename)) def dump_impl(self): nb_row = len(self.data)//16 + (1 if len(self.data) % 16 else 0) sliced = (self.data[i*16:(i+1)*16] for i in range(nb_row)) return resource_impl_template.format( data_identifier="_".join([""]+self.identifier), resource_content=",\n ".join(", ".join("{:#04x}".format(i) for i in r) for r in sliced), resource_len=len(self.data), 
namespaces_open=" ".join("namespace {} {{".format(id) for id in self.identifier[:-1]), namespaces_close=" ".join(["}"]*(len(self.identifier)-1)), identifier=self.identifier[-1], env_identifier="RES_"+"_".join(self.identifier)+"_PATH" ) def dump_getter(self): return resource_getter_template.format( common_name=self.filename, identifier="::".join(self.identifier) ) def dump_decl(self): return resource_decl_template.format( namespaces_open=" ".join("namespace {} {{".format(id) for id in self.identifier[:-1]), namespaces_close=" ".join(["}"]*(len(self.identifier)-1)), identifier=self.identifier[-1] ) master_c_template = """//This file is automaically generated. Do not modify it. #include #include #include "{include_file}" static std::string init_resource(const char* name, const unsigned char* content, int len) {{ char * resPath = getenv(name); if (NULL == resPath) return std::string(reinterpret_cast(content), len); std::ifstream ifs(resPath); if (!ifs.good()) return std::string(reinterpret_cast(content), len); return std::string( (std::istreambuf_iterator(ifs)), (std::istreambuf_iterator() )); }} const std::string& getResource_{basename}(const std::string& name) {{ {RESOURCES_GETTER} throw ResourceNotFound("Resource not found."); }} {RESOURCES} """ def gen_c_file(resources, basename): return master_c_template.format( RESOURCES="\n\n".join(r.dump_impl() for r in resources), RESOURCES_GETTER="\n\n".join(r.dump_getter() for r in resources), include_file=basename, basename=to_identifier(basename) ) master_h_template = """//This file is automaically generated. Do not modify it. 
#ifndef KIWIX_{BASENAME} #define KIWIX_{BASENAME} #include #include namespace RESOURCE {{ {RESOURCES} }}; class ResourceNotFound : public std::runtime_error {{ public: ResourceNotFound(const std::string& what_arg): std::runtime_error(what_arg) {{ }}; }}; const std::string& getResource_{basename}(const std::string& name); #define getResource(a) (getResource_{basename}(a)) #endif // KIWIX_{BASENAME} """ def gen_h_file(resources, basename): return master_h_template.format( RESOURCES="\n ".join(r.dump_decl() for r in resources), BASENAME=basename.upper(), basename=basename, ) if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument('--cxxfile', help='The Cpp file name to generate') parser.add_argument('--hfile', help='The h file name to generate') parser.add_argument('--source_dir', help="Additional directory where to look for resources.", action='append') parser.add_argument('resource_file', help='The list of resources to compile.') args = parser.parse_args() base_dir = os.path.dirname(os.path.realpath(args.resource_file)) source_dir = args.source_dir or [] with open(args.resource_file, 'r') as f: resources = [Resource([base_dir]+source_dir, filename) for filename in f.readlines()] h_identifier = to_identifier(os.path.basename(args.hfile)) with open(args.hfile, 'w') as f: f.write(gen_h_file(resources, h_identifier)) with open(args.cxxfile, 'w') as f: f.write(gen_c_file(resources, os.path.basename(args.hfile))) libzim-4.0.4/scripts/meson.build000066400000000000000000000000711334353060400166350ustar00rootroot00000000000000 res_compiler = find_program('libzim-compile-resources') libzim-4.0.4/src/000077500000000000000000000000001334353060400135755ustar00rootroot00000000000000libzim-4.0.4/src/_dirent.h000066400000000000000000000103701334353060400153730ustar00rootroot00000000000000/* * Copyright (C) 2006 Tommi Maekitalo * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as 
* published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #ifndef ZIM_DIRENT_H #define ZIM_DIRENT_H #include #include #include #include #include "zim_types.h" #include "debug.h" namespace zim { class Buffer; class InvalidSize : public std::exception {}; class Dirent { protected: uint16_t mimeType; uint32_t version; cluster_index_t clusterNumber; // only used when redirect is false blob_index_t blobNumber; // only used when redirect is false article_index_t redirectIndex; // only used when redirect is true char ns; std::string title; std::string url; std::string parameter; public: // these constants are put into mimeType field static const uint16_t redirectMimeType = 0xffff; static const uint16_t linktargetMimeType = 0xfffe; static const uint16_t deletedMimeType = 0xfffd; Dirent() : mimeType(0), version(0), clusterNumber(0), blobNumber(0), redirectIndex(0), ns('\0') {} Dirent(std::unique_ptr buffer); bool isRedirect() const { return mimeType == redirectMimeType; } bool isLinktarget() const { return mimeType == linktargetMimeType; } bool isDeleted() const { return mimeType == deletedMimeType; } bool isArticle() const { return !isRedirect() && !isLinktarget() && !isDeleted(); } uint16_t getMimeType() const { return mimeType; } uint32_t getVersion() const { return version; } void setVersion(uint32_t v) { version = v; } cluster_index_t getClusterNumber() const { return isRedirect() ? 
cluster_index_t(0) : clusterNumber; } blob_index_t getBlobNumber() const { return isRedirect() ? blob_index_t(0) : blobNumber; } article_index_t getRedirectIndex() const { return isRedirect() ? redirectIndex : article_index_t(0); } char getNamespace() const { return ns; } const std::string& getTitle() const { return title.empty() ? url : title; } const std::string& getUrl() const { return url; } std::string getLongUrl() const; const std::string& getParameter() const { return parameter; } size_t getDirentSize() const { size_t ret = (isRedirect() ? 12 : 16) + url.size() + parameter.size() + 2; if (title != url) ret += title.size(); return ret; } void setTitle(const std::string& title_) { title = title_; } void setUrl(char ns_, const std::string& url_) { ns = ns_; url = url_; } void setParameter(const std::string& parameter_) { parameter = parameter_; } void setRedirect(article_index_t idx) { redirectIndex = idx; mimeType = redirectMimeType; } void setMimeType(uint16_t mime) { mimeType = mime; } void setLinktarget() { ASSERT(mimeType, ==, 0); mimeType = linktargetMimeType; } void setDeleted() { ASSERT(mimeType, ==, 0); mimeType = deletedMimeType; } void setArticle(uint16_t mimeType_, cluster_index_t clusterNumber_, blob_index_t blobNumber_) { ASSERT(mimeType, ==, 0); mimeType = mimeType_; clusterNumber = clusterNumber_; blobNumber = blobNumber_; } }; } #endif // ZIM_DIRENT_H libzim-4.0.4/src/article.cpp000066400000000000000000000165161334353060400157350ustar00rootroot00000000000000/* * Copyright (C) 2006 Tommi Maekitalo * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. 
* * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #include #include "template.h" #include "_dirent.h" #include "cluster.h" #include #include "fileimpl.h" #include "file_part.h" #include #include #include #include "log.h" log_define("zim.article") namespace zim { size_type Article::getArticleSize() const { auto dirent = getDirent(); return size_type(file->getCluster(dirent->getClusterNumber()) ->getBlobSize(dirent->getBlobNumber())); } namespace { class Ev : public TemplateParser::Event { std::ostream& out; Article& article; std::shared_ptr file; unsigned maxRecurse; public: Ev(std::ostream& out_, Article& article_, std::shared_ptr file_, unsigned maxRecurse_) : out(out_), article(article_), file(file_), maxRecurse(maxRecurse_) { } void onData(const std::string& data); void onToken(const std::string& token); void onLink(char ns, const std::string& title); }; void Ev::onData(const std::string& data) { out << data; } void Ev::onToken(const std::string& token) { log_trace("onToken(\"" << token << "\")"); if (token == "title") out << article.getTitle(); else if (token == "url") out << article.getUrl(); else if (token == "namespace") out << article.getNamespace(); else if (token == "content") { if (maxRecurse <= 0) throw std::runtime_error("maximum recursive limit is reached"); article.getPage(out, false, maxRecurse - 1); } else { log_warn("unknown token \"" << token << "\" found in template"); out << "<%" << token << "%>"; } } void Ev::onLink(char ns, const std::string& url) { if (maxRecurse <= 0) throw std::runtime_error("maximum 
recursive limit is reached"); std::pair r = file->findx(ns, url); if (r.first) { Article(file, article_index_type(r.second)).getPage(out, false, maxRecurse - 1); } else { throw std::runtime_error(std::string("impossible to find article ") + std::string(1, ns) + std::string("/") + url); } } } std::shared_ptr Article::getDirent() const { return file->getDirent(article_index_t(idx)); } std::string Article::getParameter() const { return getDirent()->getParameter(); } std::string Article::getTitle() const { return getDirent()->getTitle(); } std::string Article::getUrl() const { return getDirent()->getUrl(); } std::string Article::getLongUrl() const { return getDirent()->getLongUrl(); } uint16_t Article::getLibraryMimeType() const { return getDirent()->getMimeType(); } const std::string& Article::getMimeType() const { return file->getMimeType(getLibraryMimeType()); } bool Article::isRedirect() const { return getDirent()->isRedirect(); } bool Article::isLinktarget() const { return getDirent()->isLinktarget(); } bool Article::isDeleted() const { return getDirent()->isDeleted(); } char Article::getNamespace() const { return getDirent()->getNamespace(); } article_index_type Article::getRedirectIndex() const { return article_index_type(getDirent()->getRedirectIndex()); } Article Article::getRedirectArticle() const { return Article(file, getRedirectIndex()); } std::shared_ptr Article::getCluster() const { auto dirent = getDirent(); if ( dirent->isRedirect() || dirent->isLinktarget() || dirent->isDeleted() ) { return std::shared_ptr(); } return file->getCluster(dirent->getClusterNumber()); } Blob Article::getData(offset_type offset) const { auto size = getArticleSize()-offset; return getData(offset, size); } Blob Article::getData(offset_type offset, size_type size) const { std::shared_ptr cluster = getCluster(); if (!cluster) { return Blob(); } return cluster->getBlob(getDirent()->getBlobNumber(), offset_t(offset), zsize_t(size)); } offset_type Article::getOffset() const { auto 
dirent = getDirent(); if (dirent->isRedirect() || dirent->isLinktarget() || dirent->isDeleted()) return 0; return offset_type(file->getBlobOffset(dirent->getClusterNumber(), dirent->getBlobNumber())); } std::pair Article::getDirectAccessInformation() const { auto dirent = getDirent(); if ( dirent->isRedirect() || dirent->isLinktarget() || dirent->isDeleted() ) { return std::make_pair("", 0); } auto full_offset = file->getBlobOffset(dirent->getClusterNumber(), dirent->getBlobNumber()); if (!full_offset) { // cluster is compressed return std::make_pair("", 0); } auto part_its = file->getFileParts(full_offset, zsize_t(getArticleSize())); auto range = part_its.first->first; auto part = part_its.first->second; if (++part_its.first != part_its.second) { return std::make_pair("", 0); } auto local_offset = full_offset - range.min; return std::make_pair(part->filename(), offset_type(local_offset)); } std::string Article::getPage(bool layout, unsigned maxRecurse) { std::ostringstream s; getPage(s, layout, maxRecurse); return s.str(); } void Article::getPage(std::ostream& out, bool layout, unsigned maxRecurse) { log_trace("Article::getPage(" << layout << ", " << maxRecurse << ')'); if (getMimeType().compare(0, 9, "text/html") == 0 || getMimeType() == MimeHtmlTemplate) { if (layout && file->getFileheader().hasLayoutPage()) { Article layoutPage(file, file->getFileheader().getLayoutPage()); Blob data = layoutPage.getData(); Ev ev(out, *this, file, maxRecurse); log_debug("call template parser"); TemplateParser parser(&ev); for (const char* p = data.data(); p != data.end(); ++p) parser.parse(*p); parser.flush(); return; } else if (getMimeType() == MimeHtmlTemplate) { Blob data = getData(); Ev ev(out, *this, file, maxRecurse); TemplateParser parser(&ev); for (const char* p = data.data(); p != data.end(); ++p) parser.parse(*p); parser.flush(); return; } } // default case - template cases has return above out << getData(); } } 
libzim-4.0.4/src/blob.cpp000066400000000000000000000023401334353060400152160ustar00rootroot00000000000000/* * Copyright (C) 2017 Matthieu Gautier * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #include "zim/blob.h" #include "debug.h" #include "buffer.h" namespace zim { Blob::Blob() : _data(0), _size(0) {} Blob::Blob(const char* data, size_type size) : _data(data), _size(size) { ASSERT(size, <, SIZE_MAX); ASSERT(data, <, (void*)(SIZE_MAX-size)); } Blob::Blob(std::shared_ptr buffer) : _data(buffer->data()), _size(size_type(buffer->size())), _buffer(buffer) {} } //zim libzim-4.0.4/src/buffer.cpp000066400000000000000000000037661334353060400155660ustar00rootroot00000000000000/* * Copyright (C) 2017 Matthieu Gautier * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #include "buffer.h" #include #include #include #include #include #include #include #ifndef _WIN32 # include # include #endif namespace zim { std::shared_ptr Buffer::sub_buffer(offset_t offset, zsize_t size) const { return std::make_shared(shared_from_this(), offset, size); } #ifdef ENABLE_USE_MMAP MMapBuffer::MMapBuffer(int fd, offset_t offset, zsize_t size): Buffer(size), _offset(0) { offset_t pa_offset(offset.v & ~(sysconf(_SC_PAGE_SIZE) - 1)); _offset = offset-pa_offset; #if defined(__APPLE__) #define MAP_FLAGS MAP_PRIVATE #else #define MAP_FLAGS MAP_PRIVATE|MAP_POPULATE #endif #if !MMAP_SUPPORT_64 if(pa_offset.v >= INT32_MAX) { throw MMapException(); } #endif _data = (char*)mmap(NULL, size.v + _offset.v, PROT_READ, MAP_FLAGS, fd, pa_offset.v); if (_data == MAP_FAILED ) { std::ostringstream s; s << "Cannot mmap size " << size.v << " at off " << offset.v << " : " << strerror(errno); throw std::runtime_error(s.str()); } #undef MAP_FLAGS } MMapBuffer::~MMapBuffer() { munmap(_data, size_.v + _offset.v); } #endif } //zim libzim-4.0.4/src/buffer.h000066400000000000000000000057371334353060400152330ustar00rootroot00000000000000/* * Copyright (C) 2017 Matthieu Gautier * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #ifndef ZIM_BUFFER_H_ #define ZIM_BUFFER_H_ #include #include #include #include #include "config.h" #include "zim_types.h" #include "endian_tools.h" #include "debug.h" namespace zim { class MMapException : std::exception {}; class Buffer : public std::enable_shared_from_this { public: Buffer(zsize_t size) : size_(size) { ASSERT(size_.v, <, SIZE_MAX); }; virtual ~Buffer() {}; virtual const char* data(offset_t offset=offset_t(0)) const = 0; virtual char at(offset_t offset) const { return *(data(offset)); } zsize_t size() const { return size_; } virtual std::shared_ptr sub_buffer(offset_t offset, zsize_t size) const; template T as(offset_t offset) const { ASSERT(offset.v, <, size_.v); ASSERT(offset.v+sizeof(T), <=, size_.v); return fromLittleEndian(data(offset)); } protected: const zsize_t size_; }; template class MemoryBuffer : public Buffer { public: MemoryBuffer(const char* buffer, zsize_t size) : Buffer(size), _data(buffer) {} virtual ~MemoryBuffer() { if ( CLEAN_AT_END ) { delete [] _data; } } const char* data(offset_t offset) const { ASSERT(offset.v, <=, size_.v); return _data + offset.v; } private: const char* _data; }; #ifdef ENABLE_USE_MMAP class MMapBuffer : public Buffer { public: MMapBuffer(int fd, offset_t offset, zsize_t size); ~MMapBuffer(); const char* data(offset_t offset) const { offset += _offset; return _data + offset.v; } private: offset_t _offset; char* _data; }; #endif class SubBuffer : public Buffer { public: SubBuffer(const std::shared_ptr src, offset_t offset, zsize_t size) : Buffer(size), _data(src, src->data(offset)) { ASSERT(offset.v+size.v, <=, src->size().v); } const char* data(offset_t offset) const { ASSERT(offset.v, <=, size_.v); return _data.get() + offset.v; } private: std::shared_ptr _data; }; }; #endif //ZIM_BUFFER_H_ 
libzim-4.0.4/src/cache.h000066400000000000000000000253211334353060400150140ustar00rootroot00000000000000/* * Copyright (C) 2008 Tommi Maekitalo * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #ifndef ZIM_CACHE_H #define ZIM_CACHE_H #include #include #include #ifdef _WIN32 #define NOMINMAX #include #undef NOMINMAX #undef max #endif namespace zim { /** Implements a container for caching elements. The cache holds a list of key-value-pairs. There are 2 main operations for accessing the cache: put and get. Put takes a key and a value and puts the element into the list. Get takes a key and optional a value. If the value for the key is found, it is returned. The passed value otherwise. By default the value is constructed with the empty ctor of the value-type. The cache has a maximum size, after which key-value-pairs are dropped, when a new item is put into the cache. The algorithm for this cache is as follows: - when the cache is not full, new elements are appended - new elements are put into the middle of the list otherwise - the last element of the list is then dropped - when getting a value and the value is found, it is put to the beginning of the list When elements are searched, a linear search is done using the ==-operator of the key type. 
The caching algorithm keeps elements, which are fetched more than once in the first half of the list. In the second half the elements are either new or the elements are pushed from the first half to the second half by other elements, which are found in the cache. You should be aware, that the key type should be simple. Comparing keys must be cheap. Copying elements (both key and value) must be possible and should be cheap, since they are moved in the underlying container. */ template class Cache { struct Data { bool winner; unsigned serial; Value value; Data() { } Data(bool winner_, unsigned serial_, const Value& value_) : winner(winner_), serial(serial_), value(value_) { } }; typedef std::map DataType; DataType data; typename DataType::size_type maxElements; unsigned serial; unsigned hits; unsigned misses; unsigned _nextSerial() { if (serial == std::numeric_limits::max()) { for (typename DataType::iterator it = data.begin(); it != data.end(); ++it) it->second.serial = 0; serial = 1; } return serial++; } typename DataType::iterator _getOldest(bool winner) { typename DataType::iterator foundElement = data.begin(); typename DataType::iterator it = data.begin(); for (++it; it != data.end(); ++it) if (it->second.winner == winner && (foundElement->second.winner != winner || it->second.serial < foundElement->second.serial)) foundElement = it; return foundElement; } typename DataType::iterator _getNewest(bool winner) { typename DataType::iterator foundElement = data.begin(); typename DataType::iterator it = data.begin(); for (++it; it != data.end(); ++it) if (it->second.winner == winner && (foundElement->second.winner != winner || it->second.serial > foundElement->second.serial)) foundElement = it; return foundElement; } // drop one element void _dropLooser() { // look for the oldest element in the list of loosers to drop it data.erase(_getOldest(false)); } void _makeLooser() { // look for the oldest element in the list of winners to make it a looser typename 
DataType::iterator it = _getOldest(true); it->second.winner = false; it->second.serial = _nextSerial(); } public: typedef typename DataType::size_type size_type; typedef Value value_type; explicit Cache(size_type maxElements_) : maxElements(maxElements_ + (maxElements_ & 1)), serial(0), hits(0), misses(0) { } /// returns the number of elements currently in the cache size_type size() const { return data.size(); } /// returns the maximum number of elements in the cache size_type getMaxElements() const { return maxElements; } void setMaxElements(size_type maxElements_) { size_type numWinners = size() < maxElements / 2 ? size() : maxElements / 2; maxElements_ += (maxElements_ & 1); if (maxElements_ > maxElements) { maxElements = maxElements_; while (numWinners < maxElements / 2) { _getNewest(false)->winner = true; ++numWinners; } } else { while (maxElements > maxElements_) { _dropLooser(); _dropLooser(); _makeLooser(); maxElements -= 2; } while (numWinners > maxElements / 2) { _getNewest(true)->winner = false; --numWinners; } } } /// removes a element from the cache and returns true, if found bool erase(const Key& key) { typename DataType::iterator it = data.find(key); if (it == data.end()) return false; if (it->second.winner) _getNewest(false)->winner=true; data.erase(it); return true; } /// clears the cache. void clear(bool stats = false) { data.clear(); if (stats) hits = misses = 0; } /// puts a new element in the cache. If the element is already found in /// the cache, it is considered a cache hit and pushed to the top of the /// list. 
void put(const Key& key, const Value& value) { typename DataType::iterator it; if (data.size() < maxElements) { data.insert(data.begin(), typename DataType::value_type(key, Data(data.size() < maxElements / 2, _nextSerial(), value))); } else if ((it = data.find(key)) == data.end()) { // element not found _dropLooser(); data.insert(data.begin(), typename DataType::value_type(key, Data(false, _nextSerial(), value))); } else { // element found it->second.serial = _nextSerial(); if (!it->second.winner) { // move element to the winner part it->second.winner = true; _makeLooser(); } } } /// puts a new element on the top of the cache. If the element is already /// found in the cache, it is considered a cache hit and pushed to the /// top of the list. This method actually overrides the need, that a element /// needs a hit to get to the top of the cache. void put_top(const Key& key, const Value& value) { typename DataType::iterator it; if (data.size() < maxElements) { if (data.size() >= maxElements / 2) _makeLooser(); data.insert(data.begin(), typename DataType::value_type(key, Data(true, _nextSerial(), value))); } else if ((it = data.find(key)) == data.end()) { // element not found _dropLooser(); _makeLooser(); data.insert(data.begin(), typename DataType::value_type(key, Data(true, _nextSerial(), value))); } else { // element found it->second.serial = _nextSerial(); if (!it->second.winner) { // move element to the winner part it->second.winner = true; _makeLooser(); } } } Value* getptr(const Key& key) { typename DataType::iterator it = data.find(key); if (it == data.end()) return 0; it->second.serial = _nextSerial(); if (!it->second.winner) { // move element to the winner part it->second.winner = true; _makeLooser(); } return &it->second.value; } /// returns a pair of values - a flag, if the value was found and the /// value if found or the passed default otherwise. If the value is /// found it is a cahce hit and pushed to the top of the list. 
std::pair getx(const Key& key, Value def = Value()) { Value* v = getptr(key); return v ? std::pair(true, *v) : std::pair(false, def); } /// returns the value to a key or the passed default value if not found. /// If the value is found it is a cahce hit and pushed to the top of the /// list. Value get(const Key& key, Value def = Value()) { return getx(key, def).second; } /// returns the number of hits. unsigned getHits() const { return hits; } /// returns the number of misses. unsigned getMisses() const { return misses; } /// returns the cache hit ratio between 0 and 1. double hitRatio() const { return hits+misses > 0 ? static_cast(hits)/static_cast(hits+misses) : 0; } /// returns the ratio, between held elements and maximum elements. double fillfactor() const { return static_cast(data.size()) / static_cast(maxElements); } /* void dump(std::ostream& out) const { out << "cache max size=" << maxElements << " current size=" << size() << '\n'; for (typename DataType::const_iterator it = data.begin(); it != data.end(); ++it) { out << "\tkey=\"" << it->first << "\" value=\"" << it->second.value << "\" serial=" << it->second.serial << " winner=" << it->second.winner << '\n'; } out << "--------\n"; } */ }; } #endif // ZIM_CACHE_H libzim-4.0.4/src/cluster.cpp000066400000000000000000000067031334353060400157700ustar00rootroot00000000000000/* * Copyright (C) 2009 Tommi Maekitalo * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #include "cluster.h" #include #include #include "file_reader.h" #include "endian_tools.h" #include #include #include #include "log.h" #include "config.h" log_define("zim.cluster") #define log_debug1(e) namespace zim { Cluster::Cluster(std::shared_ptr reader_, CompressionType comp, bool isExtended) : compression(comp), isExtended(isExtended), reader(reader_), startOffset(0) { auto d = reader->offset(); if (isExtended) { startOffset = read_header(); } else { startOffset = read_header(); } reader = reader->sub_reader(startOffset); auto d1 = reader->offset(); ASSERT(d+startOffset, ==, d1); } /* This return the number of char read */ template offset_t Cluster::read_header() { // read first offset, which specifies, how many offsets we need to read OFFSET_TYPE offset; offset = reader->read(offset_t(0)); size_t n_offset = offset / sizeof(OFFSET_TYPE); offset_t data_address(offset); // read offsets offsets.clear(); offsets.reserve(n_offset); offsets.push_back(offset_t(0)); auto buffer = reader->get_buffer(offset_t(0), zsize_t(offset)); offset_t current = offset_t(sizeof(OFFSET_TYPE)); while (--n_offset) { OFFSET_TYPE new_offset = buffer->as(current); ASSERT(new_offset, >=, offset); ASSERT(offset, >=, data_address.v); ASSERT(offset, <=, reader->size().v); offset = new_offset; offsets.push_back(offset_t(offset - data_address.v)); current += sizeof(OFFSET_TYPE); } ASSERT(offset, ==, reader->size().v); return data_address; } Blob Cluster::getBlob(blob_index_t n) const { if (size()) { auto blobSize = getBlobSize(n); if (blobSize.v > SIZE_MAX) { return Blob(); } auto buffer = reader->get_buffer(offsets[blob_index_type(n)], getBlobSize(n)); return Blob(buffer); } else { return Blob(); } } Blob Cluster::getBlob(blob_index_t n, offset_t offset, zsize_t size) const { if 
(this->size()) { offset += offsets[blob_index_type(n)]; size = std::min(size, getBlobSize(n)); if (size.v > SIZE_MAX) { return Blob(); } auto buffer = reader->get_buffer(offset, size); return Blob(buffer); } else { return Blob(); } } zsize_t Cluster::size() const { if (isExtended) return zsize_t(offsets.size() * sizeof(uint64_t) + reader->size().v); else return zsize_t(offsets.size() * sizeof(uint32_t) + reader->size().v); } } libzim-4.0.4/src/cluster.h000066400000000000000000000044501334353060400154320ustar00rootroot00000000000000/* * Copyright (C) 2009 Tommi Maekitalo * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #ifndef ZIM_CLUSTER_H #define ZIM_CLUSTER_H #include #include "buffer.h" #include "zim_types.h" #include #include #include #include "zim_types.h" namespace zim { class Blob; class Reader; class Cluster : public std::enable_shared_from_this { typedef std::vector Offsets; const CompressionType compression; const bool isExtended; Offsets offsets; std::shared_ptr reader; offset_t startOffset; template offset_t read_header(); public: Cluster(std::shared_ptr reader, CompressionType comp, bool isExtended); CompressionType getCompression() const { return compression; } bool isCompressed() const { return compression == zimcompZip || compression == zimcompBzip2 || compression == zimcompLzma; } blob_index_t count() const { return blob_index_t(offsets.size() - 1); } zsize_t size() const; zsize_t getBlobSize(blob_index_t n) const { return zsize_t(offsets[blob_index_type(n)+1].v - offsets[blob_index_type(n)].v); } offset_t getBlobOffset(blob_index_t n) const { return startOffset + offsets[blob_index_type(n)]; } Blob getBlob(blob_index_t n) const; Blob getBlob(blob_index_t n, offset_t offset, zsize_t size) const; void clear(); void init_from_buffer(Buffer& buffer); }; } #endif // ZIM_CLUSTER_H libzim-4.0.4/src/config.h.in000066400000000000000000000004171334353060400156220ustar00rootroot00000000000000 #mesondefine VERSION #mesondefine DIRENT_CACHE_SIZE #mesondefine CLUSTER_CACHE_SIZE #mesondefine LZMA_MEMORY_SIZE #mesondefine ENABLE_ZLIB #mesondefine ENABLE_XAPIAN #mesondefine ENABLE_USE_MMAP #mesondefine ENABLE_USE_BUFFER_HEADER #mesondefine MMAP_SUPPORT_64 libzim-4.0.4/src/debug.h000066400000000000000000000036401334353060400150370ustar00rootroot00000000000000/* * Copyright (C) 2017 Matthieu Gautier * * This program is free software; you can redistribute it and/or * 
modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #ifndef DEBUG_H_ #define DEBUG_H_ #include #include #if defined (NDEBUG) # define ASSERT(left, operator, right) (void(0)) #else #if !defined(_WIN32) && !defined(__APPLE__) && !defined(__ANDROID__) #include #endif template void _on_assert_fail(const char* vara, const char* op, const char* varb, T a, U b, const char* file, int line) { std::cerr << "\nAssertion failed at "<< file << ":" << line << "\n " << vara << "[" << a << "] " << op << " " << varb << "[" << b << "]" << std::endl; #if !defined(_WIN32) && !defined(__APPLE__) && !defined(__ANDROID__) void *callstack[64]; size_t size; size = backtrace(callstack, 64); char** strings = backtrace_symbols(callstack, size); for (size_t i=0; i #include "buffer.h" #include "endian_tools.h" #include "log.h" #include #include log_define("zim.dirent") namespace zim { ////////////////////////////////////////////////////////////////////// // Dirent // const uint16_t Dirent::redirectMimeType; const uint16_t Dirent::linktargetMimeType; const uint16_t Dirent::deletedMimeType; Dirent::Dirent(std::unique_ptr buffer) : Dirent() { uint16_t mimeType = buffer->as(offset_t(0)); bool redirect = (mimeType == Dirent::redirectMimeType); bool linktarget = (mimeType == Dirent::linktargetMimeType); bool deleted = (mimeType == Dirent::deletedMimeType); uint8_t extraLen = 
buffer->data()[2]; char ns = buffer->data()[3]; uint32_t version = buffer->as(offset_t(4)); setVersion(version); offset_t current = offset_t(8); if (redirect) { article_index_t redirectIndex(buffer->as(current)); current += sizeof(article_index_t); log_debug("redirectIndex=" << redirectIndex); setRedirect(article_index_t(redirectIndex)); } else if (linktarget || deleted) { log_debug("linktarget or deleted entry"); setArticle(mimeType, cluster_index_t(0), blob_index_t(0)); } else { log_debug("read article entry"); uint32_t clusterNumber = buffer->as(current); current += sizeof(uint32_t); uint32_t blobNumber = buffer->as(current); current += sizeof(uint32_t); log_debug("mimeType=" << mimeType << " clusterNumber=" << clusterNumber << " blobNumber=" << blobNumber); setArticle(mimeType, cluster_index_t(clusterNumber), blob_index_t(blobNumber)); } std::string url; std::string title; std::string parameter; log_debug("read url, title and parameters"); offset_type url_size = strlen(buffer->data(current)); if (current.v + url_size >= buffer->size().v) { throw(InvalidSize()); } url = std::string(buffer->data(current), url_size); current += url_size + 1; offset_type title_size = strlen(buffer->data(current)); if (current.v + title_size >= buffer->size().v) { throw(InvalidSize()); } title = std::string(buffer->data(current), title_size); current += title_size + 1; if (current.v + extraLen > buffer->size().v) { throw(InvalidSize()); } parameter = std::string(buffer->data(current), extraLen); setUrl(ns, url); setTitle(title); setParameter(parameter); } std::string Dirent::getLongUrl() const { log_trace("Dirent::getLongUrl()"); log_debug("namespace=" << getNamespace() << " title=" << getTitle()); return std::string(1, getNamespace()) + '/' + getUrl(); } } libzim-4.0.4/src/endian_tools.h000066400000000000000000000045671334353060400164400ustar00rootroot00000000000000/* * Copyright (C) 2006 Tommi Maekitalo * * This program is free software; you can redistribute it and/or * modify it 
under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #ifndef ENDIAN_H #define ENDIAN_H #include #include #include namespace zim { template struct ToLittleEndianImpl; template struct ToLittleEndianImpl{ static void write(const T& d, char* dst) { uint16_t v = static_cast(d); dst[0] = static_cast(v); dst[1] = static_cast(v>>8); } }; template struct ToLittleEndianImpl{ static void write(const T& d, char* dst) { uint32_t v = static_cast(d); dst[0] = static_cast(v); dst[1] = static_cast(v>>8); dst[2] = static_cast(v>>16); dst[3] = static_cast(v>>24); } }; template struct ToLittleEndianImpl{ static void write(const T& d, char* dst) { uint64_t v = static_cast(d); dst[0] = static_cast(v); dst[1] = static_cast(v>>8); dst[2] = static_cast(v>>16); dst[3] = static_cast(v>>24); dst[4] = static_cast(v>>32); dst[5] = static_cast(v>>40); dst[6] = static_cast(v>>48); dst[7] = static_cast(v>>56); } }; //////////////////////////////////////////////////////////////////////// template inline void toLittleEndian(T d, char* dst) { ToLittleEndianImpl::write(d, dst); } template inline T fromLittleEndian(const char* ptr) { T ret = 0; for(size_t i=0; i(static_cast(ptr[i])) << (i*8)); } return ret; } } #endif // ENDIAN_H libzim-4.0.4/src/envvalue.cpp000066400000000000000000000027141334353060400161320ustar00rootroot00000000000000/* * Copyright (C) 2009 Tommi Maekitalo * * This program is free 
software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #include #include namespace zim { unsigned envValue(const char* env, unsigned def) { const char* v = ::getenv(env); if (v) { std::istringstream s(v); s >> def; } return def; } unsigned envMemSize(const char* env, unsigned def) { const char* v = ::getenv(env); if (v) { char unit = '\0'; std::istringstream s(v); s >> def >> unit; switch (unit) { case 'k': case 'K': def *= 1024; break; case 'm': case 'M': def *= 1024 * 1024; break; case 'g': case 'G': def *= 1024 * 1024 * 1024; break; } } return def; } } libzim-4.0.4/src/envvalue.h000066400000000000000000000017511334353060400155770ustar00rootroot00000000000000/* * Copyright (C) 2009 Tommi Maekitalo * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #ifndef ZIM_ENVVALUE_H #define ZIM_ENVVALUE_H namespace zim { unsigned envValue(const char* env, unsigned def); unsigned envMemSize(const char* env, unsigned def); } #endif // ZIM_ENVVALUE_H libzim-4.0.4/src/file.cpp000066400000000000000000000173111334353060400152230ustar00rootroot00000000000000/* * Copyright (C) 2006,2009 Tommi Maekitalo * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #include #include "fileimpl.h" #include #include #include "log.h" #include #include log_define("zim.file") namespace zim { namespace { int hexval(char ch) { if (ch >= '0' && ch <= '9') return ch - '0'; if (ch >= 'a' && ch <= 'f') return ch - 'a' + 10; if (ch >= 'A' && ch <= 'F') return ch - 'A' + 10; return -1; } } File::File(const std::string& fname) : impl(new FileImpl(fname)) { } const std::string& File::getFilename() const { return impl->getFilename(); } const Fileheader& File::getFileheader() const { return impl->getFileheader(); } size_type File::getFilesize() const { return impl->getFilesize().v; } article_index_type File::getCountArticles() const { return article_index_type(impl->getCountArticles()); } Article File::getArticle(article_index_type idx) const { if (idx >= article_index_type(impl->getCountArticles())) throw ZimFileFormatError("article index out of range"); return Article(impl, idx); } Article File::getArticle(char ns, const std::string& url) const { log_trace("File::getArticle('" << ns << "', \"" << url << ')'); std::pair r = impl->findx(ns, url); return r.first ? Article(impl, article_index_type(r.second)) : Article(); } Article File::getArticleByUrl(const std::string& url) const { log_trace("File::getArticle(\"" << url << ')'); std::pair r = impl->findx(url); return r.first ? Article(impl, article_index_type(r.second)) : Article(); } Article File::getArticleByTitle(article_index_type idx) const { return Article(impl, article_index_type(impl->getIndexByTitle(article_index_t(idx)))); } Article File::getArticleByTitle(char ns, const std::string& title) const { log_trace("File::getArticleByTitle('" << ns << "', \"" << title << ')'); std::pair r = impl->findxByTitle(ns, title); return r.first ? 
Article(impl, article_index_type(impl->getIndexByTitle(r.second))) : Article(); } std::shared_ptr File::getCluster(cluster_index_type idx) const { return impl->getCluster(cluster_index_t(idx)); } cluster_index_type File::getCountClusters() const { return cluster_index_type(impl->getCountClusters()); } offset_type File::getClusterOffset(cluster_index_type idx) const { return offset_type(impl->getClusterOffset(cluster_index_t(idx))); } Blob File::getBlob(cluster_index_type clusterIdx, blob_index_type blobIdx) const { return impl->getCluster(cluster_index_t(clusterIdx))->getBlob(blob_index_t(blobIdx)); } article_index_type File::getNamespaceBeginOffset(char ch) const { return article_index_type(impl->getNamespaceBeginOffset(ch)); } article_index_type File::getNamespaceEndOffset(char ch) const { return article_index_type(impl->getNamespaceEndOffset(ch)); } article_index_type File::getNamespaceCount(char ns) const { return getNamespaceEndOffset(ns) - getNamespaceBeginOffset(ns); } std::string File::getNamespaces() const { return impl->getNamespaces(); } bool File::hasNamespace(char ch) const { article_index_t off = impl->getNamespaceBeginOffset(ch); return off < impl->getCountArticles() && impl->getDirent(off)->getNamespace() == ch; } File::const_iterator File::begin() const { return const_iterator(this, 0); } File::const_iterator File::beginByTitle() const { return const_iterator(this, 0, const_iterator::ArticleIterator); } File::const_iterator File::end() const { return const_iterator(this, getCountArticles()); } File::const_iterator File::find(char ns, const std::string& url) const { std::pair r = impl->findx(ns, url); return File::const_iterator(this, article_index_type(r.second)); } File::const_iterator File::find(const std::string& url) const { std::pair r = impl->findx(url); return File::const_iterator(this, article_index_type(r.second)); } File::const_iterator File::findByTitle(char ns, const std::string& title) const { std::pair r = impl->findxByTitle(ns, 
title); return File::const_iterator(this, article_index_type(r.second), const_iterator::ArticleIterator); } const Search* File::search(const std::string& query, int start, int end) const { Search* search = new Search(this); search->set_query(query); search->set_range(start, end); return search; } const Search* File::suggestions(const std::string& query, int start, int end) const { Search* search = new Search(this); search->set_query(query); search->set_range(start, end); search->set_suggestion_mode(true); return search; } offset_type File::getOffset(cluster_index_type clusterIdx, blob_index_type blobIdx) const { return offset_type(impl->getBlobOffset( cluster_index_t(clusterIdx), blob_index_t(blobIdx))); } time_t File::getMTime() const { return impl->getMTime(); } const std::string& File::getMimeType(uint16_t idx) const { return impl->getMimeType(idx); } std::string File::getChecksum() { return impl->getChecksum(); } bool File::verify() { return impl->verify(); } bool File::is_multiPart() const { return impl->is_multiPart(); } std::string urldecode(const std::string& url) { std::string ret; enum { state_0, state_h1, state_h2 } state = state_0; char ch = '\0'; for (std::string::const_iterator it = url.begin(); it != url.end(); ++it) { switch (state) { case state_0: if (*it == '+') ret += ' '; else if (*it == '%') state = state_h1; else ret += *it; break; case state_h1: if ( (*it >= '0' && *it <= '9') || (*it >= 'A' && *it <= 'F') || (*it >= 'a' && *it <= 'f')) { ch = *it; state = state_h2; } else { ret += '%'; ret += *it; state = state_0; } break; case state_h2: if ( (*it >= '0' && *it <= '9') || (*it >= 'A' && *it <= 'F') || (*it >= 'a' && *it <= 'f')) { ret += static_cast(hexval(ch) * 16 + hexval(*it)); } else { ret += static_cast(hexval(ch)); ret += *it; } state = state_0; break; } } switch (state) { case state_0: break; case state_h1: ret += '%'; break; case state_h2: ret += '%'; ret += ch; break; } return ret; } } 
libzim-4.0.4/src/file_compound.cpp000066400000000000000000000053751334353060400171360ustar00rootroot00000000000000/* * Copyright (C) 2017 Matthieu Gautier * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #include "file_compound.h" #include "buffer.h" #include #include #include #include #ifdef _WIN32 # include #else # include #endif namespace zim { FileCompound::FileCompound(const std::string& filename): _fsize(0) { try { auto part = new FilePart<>(filename); emplace(Range(offset_t(0), offset_t(part->size().v)), part); _fsize = part->size(); } catch(...) { int errnoSave = errno; _fsize = zsize_t(0); for (char ch0 = 'a'; ch0 <= 'z'; ++ch0) { std::string fname0 = filename + ch0; for (char ch1 = 'a'; ch1 <= 'z'; ++ch1) { std::string fname1 = fname0 + ch1; try { auto currentPart = new FilePart<>(fname1); emplace(Range(offset_t(_fsize.v), offset_t((_fsize+currentPart->size()).v)), currentPart); _fsize += currentPart->size(); } catch (...) 
{ break; } } } if (empty()) { std::ostringstream msg; msg << "error " << errnoSave << " opening file \"" << filename; throw std::runtime_error(msg.str()); } } } FileCompound::FileCompound(FilePart<>* filePart): _fsize(0) { emplace(Range(offset_t(0), offset_t(filePart->size().v)), filePart); _fsize = filePart->size(); } FileCompound::~FileCompound() { for(auto it=begin(); it!=end(); it++) { auto filepart = it->second; delete filepart; } } time_t FileCompound::getMTime() const { if (mtime || empty()) return mtime; const char* fname = begin()->second->filename().c_str(); #if defined(HAVE_STAT64) && ! defined(__APPLE__) struct stat64 st; int ret = ::stat64(fname, &st); #else struct stat st; int ret = ::stat(fname, &st); #endif if (ret != 0) { std::ostringstream msg; msg << "stat failed with errno " << errno << " : " << strerror(errno); throw std::runtime_error(msg.str()); } mtime = st.st_mtime; return mtime; } } // zim libzim-4.0.4/src/file_compound.h000066400000000000000000000040041334353060400165670ustar00rootroot00000000000000/* * Copyright (C) 2017 Matthieu Gautier * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #ifndef ZIM_FILE_COMPOUND_H_ #define ZIM_FILE_COMPOUND_H_ #include "file_part.h" #include "zim_types.h" #include #include #include namespace zim { class FileReader; struct Range { Range(const offset_t point ) : min(point), max(point) {} Range(const offset_t min, const offset_t max) : min(min), max(max) {} const offset_t min; const offset_t max; }; struct less_range : public std::binary_function< Range, Range, bool> { bool operator()(const Range& lhs, const Range& rhs) const { return lhs.min < rhs.min && lhs.max <= rhs.min; } }; class FileCompound : public std::map*, less_range> { public: FileCompound(const std::string& filename); FileCompound(FilePart<>* fpart); ~FileCompound(); zsize_t fsize() const { return _fsize; }; time_t getMTime() const; bool fail() const { return empty(); }; bool is_multiPart() const { return size() > 1; }; std::pair locate(offset_t offset, zsize_t size) const { return equal_range(Range(offset, offset+size)); } private: zsize_t _fsize; mutable time_t mtime; }; }; #endif //ZIM_FILE_COMPOUND_H_ libzim-4.0.4/src/file_part.h000066400000000000000000000033171334353060400157170ustar00rootroot00000000000000/* * Copyright (C) 2017 Matthieu Gautier * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #ifndef ZIM_FILE_PART_H_ #define ZIM_FILE_PART_H_ #include #include #include #include "zim_types.h" #include "fs.h" namespace zim { template class FilePart { public: FilePart(const std::string& filename) : m_filename(filename), m_fhandle(FS::openFile(filename)), m_size(m_fhandle.getSize()) {} FilePart(int fd) : m_filename(""), m_fhandle(fd), m_size(m_fhandle.getSize()) {} ~FilePart() = default; const std::string& filename() const { return m_filename; }; const typename FS::FD& fhandle() const { return m_fhandle; }; zsize_t size() const { return m_size; }; bool fail() const { return !m_size; }; bool good() const { return bool(m_size); }; private: const std::string m_filename; typename FS::FD m_fhandle; zsize_t m_size; }; }; #endif //ZIM_FILE_PART_H_ libzim-4.0.4/src/file_reader.cpp000066400000000000000000000302051334353060400165420ustar00rootroot00000000000000/* * Copyright (C) 2017 Matthieu Gautier * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #include #include #include "file_reader.h" #include "file_compound.h" #include "buffer.h" #include "config.h" #include "envvalue.h" #include #include #include #include #include #include #include #include #if defined(_MSC_VER) # include # include typedef SSIZE_T ssize_t; #endif #if defined(ENABLE_ZLIB) #include #endif namespace zim { FileReader::FileReader(std::shared_ptr source) : FileReader(source, offset_t(0), source->fsize()) {} FileReader::FileReader(std::shared_ptr source, offset_t offset) : FileReader(source, offset, zsize_t(source->fsize().v-offset.v)) {} FileReader::FileReader(std::shared_ptr source, offset_t offset, zsize_t size) : source(source), _offset(offset), _size(size) { ASSERT(offset.v, <, source->fsize().v); ASSERT(offset.v+size.v, <=, source->fsize().v); } char FileReader::read(offset_t offset) const { ASSERT(offset.v, <, _size.v); offset += _offset; auto part_pair = source->lower_bound(offset); auto& fhandle = part_pair->second->fhandle(); offset_t local_offset = offset - part_pair->first.min; ASSERT(local_offset, <=, part_pair->first.max); char ret; try { fhandle.readAt(&ret, zsize_t(1), local_offset); } catch (std::runtime_error& e) { //Error while reading. 
std::ostringstream s; s << "Cannot read a char.\n"; s << " - File part is " << part_pair->second->filename() << "\n"; s << " - File part size is " << part_pair->second->size().v << "\n"; s << " - File part range is " << part_pair->first.min << "-" << part_pair->first.max << "\n"; s << " - Reading offset at " << offset.v << "\n"; s << " - local offset is " << local_offset.v << "\n"; s << " - error is " << strerror(errno) << "\n"; std::error_code ec(errno, std::generic_category()); throw std::system_error(ec, s.str()); }; return ret; } void FileReader::read(char* dest, offset_t offset, zsize_t size) const { ASSERT(offset.v, <, _size.v); ASSERT(offset.v+size.v, <=, _size.v); if (! size ) { return; } offset += _offset; auto found_range = source->locate(offset, size); for(auto current = found_range.first; current!=found_range.second; current++){ auto part = current->second; Range partRange = current->first; offset_t local_offset = offset-partRange.min; ASSERT(size.v, >, 0U); zsize_t size_to_get = zsize_t(std::min(size.v, part->size().v-local_offset.v)); try { part->fhandle().readAt(dest, size_to_get, local_offset); } catch (std::runtime_error& e) { std::ostringstream s; s << "Cannot read chars.\n"; s << " - File part is " << part->filename() << "\n"; s << " - File part size is " << part->size().v << "\n"; s << " - File part range is " << partRange.min << "-" << partRange.max << "\n"; s << " - size_to_get is " << size_to_get.v << "\n"; s << " - total size is " << size.v << "\n"; s << " - Reading offset at " << offset.v << "\n"; s << " - local offset is " << local_offset.v << "\n"; s << " - error is " << strerror(errno) << "\n"; std::error_code ec(errno, std::generic_category()); throw std::system_error(ec, s.str()); }; ASSERT(size_to_get, <=, size); dest += size_to_get.v; size -= size_to_get; offset += size_to_get; } ASSERT(size.v, ==, 0U); } std::shared_ptr FileReader::get_buffer(offset_t offset, zsize_t size) const { ASSERT(size, <=, _size); #ifdef ENABLE_USE_MMAP try 
{ auto found_range = source->locate(_offset+offset, size); auto first_part_containing_it = found_range.first; if (++first_part_containing_it != found_range.second) { throw MMapException(); } // The range is in only one part auto range = found_range.first->first; auto part = found_range.first->second; auto local_offset = offset + _offset - range.min; ASSERT(size, <=, part->size()); int fd = part->fhandle().getNativeHandle(); auto buffer = std::shared_ptr(new MMapBuffer(fd, local_offset, size)); return buffer; } catch(MMapException& e) #endif { // The range is several part, or we are on Windows. // We will have to do some memory copies :/ // [TODO] Use Windows equivalent for mmap. char* p = new char[size.v]; auto ret_buffer = std::shared_ptr(new MemoryBuffer(p, size)); read(p, offset, size); return ret_buffer; } } bool Reader::can_read(offset_t offset, zsize_t size) { return (offset.v <= this->size().v && (offset.v+size.v) <= this->size().v); } char* lzma_uncompress(const char* raw_data, zsize_t raw_size, zsize_t* dest_size) { // We don't know what will be the result size. // Let's assume it will be something like the minChunkSize used at creation zsize_t _dest_size = zsize_t(1024*1024); char* ret_data = new char[_dest_size.v]; lzma_stream stream = LZMA_STREAM_INIT; unsigned memsize = envMemSize("ZIM_LZMA_MEMORY_SIZE", LZMA_MEMORY_SIZE * 1024 * 1024); auto errcode = lzma_stream_decoder(&stream, memsize, 0); if (errcode != LZMA_OK) { throw std::runtime_error("Impossible to allocated needed memory to uncompress lzma stream"); } stream.next_in = (const unsigned char*)raw_data; stream.avail_in = raw_size.v; stream.next_out = (unsigned char*) ret_data; stream.avail_out = _dest_size.v; do { errcode = lzma_code(&stream, LZMA_FINISH); if (errcode == LZMA_BUF_ERROR) { if (stream.avail_in == 0 && stream.avail_out != 0) { // End of input stream. // lzma haven't recognize the end of the input stream but there is no // more input. 
// As we know that we should have all the input stream, it is probably // because the stream has not been close correctly at zim creation. // It means that the lzma stream is not full and this is an error in the // zim file. } else { //Not enought output size _dest_size.v *= 2; char * new_ret_data = new char[_dest_size.v]; memcpy(new_ret_data, ret_data, stream.total_out); stream.next_out = (unsigned char*)(new_ret_data + stream.total_out); stream.avail_out = _dest_size.v - stream.total_out; delete [] ret_data; ret_data = new_ret_data; continue; } } if (errcode != LZMA_STREAM_END && errcode != LZMA_OK) { throw ZimFileFormatError("Invalid lzma stream for cluster."); } } while (errcode != LZMA_STREAM_END); dest_size->v = stream.total_out; lzma_end(&stream); return ret_data; } #if defined(ENABLE_ZLIB) char* zip_uncompress(const char* raw_data, zsize_t raw_size, zsize_t* dest_size) { zsize_t _dest_size = zsize_t(1024*1024); char* ret_data = new char[_dest_size.v]; z_stream stream; memset(&stream, 0, sizeof(stream)); stream.next_in = (unsigned char*) raw_data; stream.avail_in = raw_size.v; stream.next_out = (unsigned char*) ret_data; stream.avail_out = _dest_size.v; auto errcode = ::inflateInit(&stream); if (errcode != Z_OK) { throw std::runtime_error("Impossible to allocated needed memory to uncompress zlib stream"); } do { errcode = ::inflate(&stream, Z_FINISH); if (errcode == Z_BUF_ERROR ) { if (stream.avail_in == 0 && stream.avail_out != 0) { // End of input stream. // zlib haven't recognize the end of the input stream but there is no // more input. // As we know that we should have all the input stream, it is probably // because the stream has not been close correctly at zim creation. // It means that the zlib stream is not full and this is an error in the // zim file. 
} else { //Not enought output size _dest_size.v *= 2; char * new_ret_data = new char[_dest_size.v]; memcpy(new_ret_data, ret_data, stream.total_out); stream.next_out = (unsigned char*)(new_ret_data + stream.total_out); stream.avail_out = _dest_size.v - stream.total_out; delete [] ret_data; ret_data = new_ret_data; continue; } } if (errcode != Z_STREAM_END && errcode != Z_OK) { throw ZimFileFormatError("Invalid zlib stream for cluster."); } } while ( errcode != Z_STREAM_END ); dest_size->v = stream.total_out; ::inflateEnd(&stream); return ret_data; } #endif std::shared_ptr Reader::get_clusterBuffer(offset_t offset, zsize_t size, CompressionType comp) const { auto raw_buffer = get_buffer(offset, size); zsize_t uncompressed_size(0); char* uncompressed_data = nullptr; switch (comp) { case zimcompLzma: uncompressed_data = lzma_uncompress(raw_buffer->data(), size, &uncompressed_size); break; case zimcompZip: #if defined(ENABLE_ZLIB) uncompressed_data = zip_uncompress(raw_buffer->data(), size, &uncompressed_size); #else throw std::runtime_error("zlib not enabled in this library"); #endif break; default: throw std::logic_error("compressions should not be something else than zimcompLzma or zimComZip."); } return std::shared_ptr(new MemoryBuffer(uncompressed_data, uncompressed_size)); } std::unique_ptr Reader::sub_clusterReader(offset_t offset, zsize_t size, CompressionType* comp, bool* extended) const { uint8_t clusterInfo = read(offset); *comp = static_cast(clusterInfo & 0x0F); *extended = clusterInfo & 0x10; switch (*comp) { case zimcompDefault: case zimcompNone: { // No compression, just a sub_reader return sub_reader(offset+offset_t(1), size-zsize_t(1)); } break; case zimcompLzma: case zimcompZip: { auto buffer = get_clusterBuffer(offset+offset_t(1), size-zsize_t(1), *comp); return std::unique_ptr(new BufferReader(buffer)); } break; case zimcompBzip2: throw std::runtime_error("bzip2 not enabled in this library"); default: throw ZimFileFormatError("Invalid compression 
flag"); } } std::unique_ptr FileReader::sub_reader(offset_t offset, zsize_t size) const { ASSERT(size, <=, _size); return std::unique_ptr(new FileReader(source, _offset+offset, size)); } //BufferReader::BufferReader(std::shared_ptr source) // : source(source) {} std::shared_ptr BufferReader::get_buffer(offset_t offset, zsize_t size) const { return source->sub_buffer(offset, size); } std::unique_ptr BufferReader::sub_reader(offset_t offset, zsize_t size) const { //auto source_addr = source->data(0); auto sub_buff = get_buffer(offset, size); //auto buff_addr = sub_buff->data(0); std::unique_ptr sub_read(new BufferReader(sub_buff)); return sub_read; } zsize_t BufferReader::size() const { return source->size(); } offset_t BufferReader::offset() const { return offset_t((offset_type)(static_cast(source->data(offset_t(0))))); } void BufferReader::read(char* dest, offset_t offset, zsize_t size) const { ASSERT(offset.v, <, source->size().v); ASSERT(offset+offset_t(size.v), <=, offset_t(source->size().v)); if (! size ) { return; } memcpy(dest, source->data(offset), size.v); } char BufferReader::read(offset_t offset) const { ASSERT(offset.v, <, source->size().v); char dest; dest = *source->data(offset); return dest; } } // zim libzim-4.0.4/src/file_reader.h000066400000000000000000000074021334353060400162120ustar00rootroot00000000000000/* * Copyright (C) 2017 Matthieu Gautier * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #ifndef ZIM_FILE_READER_H_ #define ZIM_FILE_READER_H_ #include #include "zim_types.h" #include "endian_tools.h" #include "debug.h" namespace zim { class Buffer; class FileCompound; class Reader { public: Reader() {}; virtual zsize_t size() const = 0; virtual ~Reader() {}; virtual void read(char* dest, offset_t offset, zsize_t size) const = 0; template T read(offset_t offset) const { ASSERT(offset.v, <, size().v); ASSERT(offset.v+sizeof(T), <=, size().v); char tmp_buf[sizeof(T)]; read(tmp_buf, offset, zsize_t(sizeof(T))); return fromLittleEndian(tmp_buf); } virtual char read(offset_t offset) const = 0; virtual std::shared_ptr get_buffer(offset_t offset, zsize_t size) const = 0; std::shared_ptr get_buffer(offset_t offset) const { return get_buffer(offset, zsize_t(size().v-offset.v)); } virtual std::unique_ptr sub_reader(offset_t offset, zsize_t size) const = 0; std::unique_ptr sub_reader(offset_t offset) const { return sub_reader(offset, zsize_t(size().v-offset.v)); } virtual offset_t offset() const = 0; std::unique_ptr sub_clusterReader(offset_t offset, zsize_t size, CompressionType* comp, bool* extented) const; bool can_read(offset_t offset, zsize_t size); private: std::shared_ptr get_clusterBuffer(offset_t offset, zsize_t size, CompressionType comp) const; }; class FileReader : public Reader { public: FileReader(std::shared_ptr source); ~FileReader() {}; zsize_t size() const { return _size; }; offset_t offset() const { return _offset; }; char read(offset_t offset) const; void read(char* dest, offset_t offset, zsize_t size) const; std::shared_ptr get_buffer(offset_t offset, zsize_t size) const; std::unique_ptr sub_reader(offset_t offest, zsize_t size) const; private: FileReader(std::shared_ptr source, offset_t offset); FileReader(std::shared_ptr source, 
offset_t offset, zsize_t size); std::shared_ptr source; offset_t _offset; zsize_t _size; }; class BufferReader : public Reader { public: BufferReader(std::shared_ptr source) : source(source) {} virtual ~BufferReader() {}; zsize_t size() const; offset_t offset() const; void read(char* dest, offset_t offset, zsize_t size) const; char read(offset_t offset) const; std::shared_ptr get_buffer(offset_t offset, zsize_t size) const; std::unique_ptr sub_reader(offset_t offset, zsize_t size) const; private: std::shared_ptr source; }; }; #endif // ZIM_FILE_READER_H_ libzim-4.0.4/src/fileheader.cpp000066400000000000000000000107211334353060400163720ustar00rootroot00000000000000/* * Copyright (C) 2008 Tommi Maekitalo * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #include #include #include #include #include "log.h" #include "endian_tools.h" #include "buffer.h" log_define("zim.file.header") namespace zim { const uint32_t Fileheader::zimMagic = 0x044d495a; // ="ZIM^d" const uint16_t Fileheader::zimClassicMajorVersion = 5; const uint16_t Fileheader::zimExtendedMajorVersion = 6; const uint16_t Fileheader::zimMinorVersion = 0; const offset_type Fileheader::size = 80; // This is also mimeListPos (so an offset) std::ostream& operator<< (std::ostream& out, const Fileheader& fh) { char header[Fileheader::size]; toLittleEndian(Fileheader::zimMagic, header); toLittleEndian(fh.getMajorVersion(), header + 4); toLittleEndian(fh.getMinorVersion(), header + 6); std::copy(fh.getUuid().data, fh.getUuid().data + sizeof(Uuid), header + 8); toLittleEndian(fh.getArticleCount(), header + 24); toLittleEndian(fh.getClusterCount(), header + 28); toLittleEndian(fh.getUrlPtrPos(), header + 32); toLittleEndian(fh.getTitleIdxPos(), header + 40); toLittleEndian(fh.getClusterPtrPos(), header + 48); toLittleEndian(fh.getMimeListPos(), header + 56); toLittleEndian(fh.getMainPage(), header + 64); toLittleEndian(fh.getLayoutPage(), header + 68); toLittleEndian(fh.getChecksumPos(), header + 72); out.write(header, Fileheader::size); return out; } void Fileheader::read(std::shared_ptr buffer) { uint32_t magicNumber = buffer->as(offset_t(0)); if (magicNumber != Fileheader::zimMagic) { log_error("invalid magic number " << magicNumber << " found - " << Fileheader::zimMagic << " expected"); throw ZimFileFormatError("Invalid magic number"); } uint16_t major_version = buffer->as(offset_t(4)); if (major_version != zimClassicMajorVersion && major_version != zimExtendedMajorVersion) { log_error("invalid zimfile major version " << major_version << " found - " << 
Fileheader::zimMajorVersion << " expected"); throw ZimFileFormatError("Invalid version"); } setMajorVersion(major_version); setMinorVersion(buffer->as(offset_t(6))); Uuid uuid; std::copy(buffer->data(offset_t(8)), buffer->data(offset_t(24)), uuid.data); setUuid(uuid); setArticleCount(buffer->as(offset_t(24))); setClusterCount(buffer->as(offset_t(28))); setUrlPtrPos(buffer->as(offset_t(32))); setTitleIdxPos(buffer->as(offset_t(40))); setClusterPtrPos(buffer->as(offset_t(48))); setMimeListPos(buffer->as(offset_t(56))); setMainPage(buffer->as(offset_t(64))); setLayoutPage(buffer->as(offset_t(68))); setChecksumPos(buffer->as(offset_t(72))); sanity_check(); } void Fileheader::sanity_check() const { if (!!articleCount != !!clusterCount) { throw ZimFileFormatError("No article <=> No cluster"); } if (mimeListPos != size && mimeListPos != 72) { throw ZimFileFormatError("mimelistPos must be 80."); } if (urlPtrPos < mimeListPos) { throw ZimFileFormatError("urlPtrPos must be > mimelistPos."); } if (titleIdxPos < mimeListPos) { throw ZimFileFormatError("titleIdxPos must be > mimelistPos."); } if (clusterPtrPos < mimeListPos) { throw ZimFileFormatError("clusterPtrPos must be > mimelistPos."); } if (clusterCount > articleCount) { throw ZimFileFormatError("Cluster count cannot be higher than article count."); } if (checksumPos != 0 && checksumPos < mimeListPos) { throw ZimFileFormatError("checksumPos must be > mimeListPos."); } } } libzim-4.0.4/src/fileimpl.cpp000066400000000000000000000456161334353060400161160ustar00rootroot00000000000000/* * Copyright (C) 2006,2009 Tommi Maekitalo * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. 
* * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #include "fileimpl.h" #include #include "_dirent.h" #include "file_compound.h" #include "file_reader.h" #include #include #include #include #include #include #include #include "config.h" #include "log.h" #include "envvalue.h" #include "md5stream.h" log_define("zim.file.impl") namespace zim { ////////////////////////////////////////////////////////////////////// // FileImpl // FileImpl::FileImpl(const std::string& fname) : zimFile(new FileCompound(fname)), zimReader(new FileReader(zimFile)), bufferDirentZone(256), bufferDirentLock(PTHREAD_MUTEX_INITIALIZER), filename(fname), direntCache(envValue("ZIM_DIRENTCACHE", DIRENT_CACHE_SIZE)), direntCacheLock(PTHREAD_MUTEX_INITIALIZER), clusterCache(envValue("ZIM_CLUSTERCACHE", CLUSTER_CACHE_SIZE)), clusterCacheLock(PTHREAD_MUTEX_INITIALIZER), cacheUncompressedCluster(envValue("ZIM_CACHEUNCOMPRESSEDCLUSTER", false)), namespaceBeginLock(PTHREAD_MUTEX_INITIALIZER), namespaceEndLock(PTHREAD_MUTEX_INITIALIZER) { log_trace("read file \"" << fname << '"'); if (zimFile->fail()) throw ZimFileFormatError(std::string("can't open zim-file \"") + fname + '"'); filename = fname; // read header if (size_type(zimReader->size()) < Fileheader::size) { throw ZimFileFormatError("zim-file is too small to contain a header"); } try { header.read(zimReader->get_buffer(offset_t(0), zsize_t(Fileheader::size))); } catch (ZimFileFormatError& e) { throw e; } catch (...) 
{ throw ZimFileFormatError("error reading zim-file header."); } // urlPtrOffsetReader zsize_t size(header.getArticleCount() * 8); if (!zimReader->can_read(offset_t(header.getUrlPtrPos()), size)) { throw ZimFileFormatError("Reading out of zim file."); } #ifdef ENABLE_USE_BUFFER_HEADER urlPtrOffsetReader = std::unique_ptr(new BufferReader( zimReader->get_buffer(offset_t(header.getUrlPtrPos()), size))); #else urlPtrOffsetReader = zimReader->sub_reader(offset_t(header.getUrlPtrPos()), size); #endif // Create titleIndexBuffer size = zsize_t(header.getArticleCount() * 4); if (!zimReader->can_read(offset_t(header.getTitleIdxPos()), size)) { throw ZimFileFormatError("Reading out of zim file."); } #ifdef ENABLE_USE_BUFFER_HEADER titleIndexReader = std::unique_ptr(new BufferReader( zimReader->get_buffer(offset_t(header.getTitleIdxPos()), size))); #else titleIndexReader = zimReader->sub_reader(offset_t(header.getTitleIdxPos()), size); #endif // clusterOffsetBuffer size = zsize_t(header.getClusterCount() * 8); if (!zimReader->can_read(offset_t(header.getClusterPtrPos()), size)) { throw ZimFileFormatError("Reading out of zim file."); } #ifdef ENABLE_USE_BUFFER_HEADER clusterOffsetReader = std::unique_ptr(new BufferReader( zimReader->get_buffer(offset_t(header.getClusterPtrPos()), size))); #else clusterOffsetReader = zimReader->sub_reader(offset_t(header.getClusterPtrPos()), size); #endif if (!getCountClusters()) log_warn("no clusters found"); else { offset_t lastOffset = getClusterOffset(cluster_index_t(cluster_index_type(getCountClusters()) - 1)); log_debug("last offset=" << lastOffset.v << " file size=" << zimFile->fsize().v); if (lastOffset.v > zimFile->fsize().v) { log_fatal("last offset (" << lastOffset << ") larger than file size (" << zimFile->fsize() << ')'); throw ZimFileFormatError("last cluster offset larger than file size; file corrupt"); } } if (header.hasChecksum() && header.getChecksumPos() != (zimFile->fsize().v-16) ) { throw ZimFileFormatError("Checksum 
position is not valid"); } // read mime types size = zsize_t(header.getUrlPtrPos() - header.getMimeListPos()); // No need to check access, getUrlPtrPos is in the zim file, and we are // sure that getMimeListPos is 80. auto buffer = zimReader->get_buffer(offset_t(header.getMimeListPos()), size); offset_t current = offset_t(0); while (current.v < size.v) { offset_type len = strlen(buffer->data(current)); if (len == 0) { break; } if (current.v + len >= size.v) { throw(ZimFileFormatError("Error getting mimelists.")); } std::string mimeType(buffer->data(current), len); mimeTypes.push_back(mimeType); current += (len + 1); } } std::pair FileImpl::findx(char ns, const std::string& url) { log_debug("find article by url " << ns << " \"" << url << "\", in file \"" << getFilename() << '"'); article_index_type l = article_index_type(getNamespaceBeginOffset(ns)); article_index_type u = article_index_type(getNamespaceEndOffset(ns)); if (l == u) { log_debug("namespace " << ns << " not found"); return std::pair(false, article_index_t(0)); } unsigned itcount = 0; while (u - l > 1) { ++itcount; article_index_type p = l + (u - l) / 2; auto d = getDirent(article_index_t(p)); int c = ns < d->getNamespace() ? -1 : ns > d->getNamespace() ? 1 : url.compare(d->getUrl()); if (c < 0) u = p; else if (c > 0) l = p; else { log_debug("article found after " << itcount << " iterations in file \"" << getFilename() << "\" at index " << p); return std::pair(true, article_index_t(p)); } } auto d = getDirent(article_index_t(l)); int c = url.compare(d->getUrl()); if (c == 0) { log_debug("article found after " << itcount << " iterations in file \"" << getFilename() << "\" at index " << l); return std::pair(true, article_index_t(l)); } log_debug("article not found after " << itcount << " iterations (\"" << d.getUrl() << "\" does not match)"); return std::pair(false, article_index_t(c < 0 ? 
l : u)); } std::pair FileImpl::findx(const std::string& url) { size_t start = 0; if (url[0] == '/') { start = 1; } if (url.size() < (2+start) || url[1+start] != '/') return std::pair(false, article_index_t(0)); return findx(url[start], url.substr(2+start)); } std::pair FileImpl::findxByTitle(char ns, const std::string& title) { log_debug("find article by title " << ns << " \"" << title << "\", in file \"" << getFilename() << '"'); article_index_type l = article_index_type(getNamespaceBeginOffset(ns)); article_index_type u = article_index_type(getNamespaceEndOffset(ns)); if (l == u) { log_debug("namespace " << ns << " not found"); return std::pair(false, article_index_t(0)); } unsigned itcount = 0; while (u - l > 1) { ++itcount; article_index_type p = l + (u - l) / 2; auto d = getDirentByTitle(article_index_t(p)); int c = ns < d->getNamespace() ? -1 : ns > d->getNamespace() ? 1 : title.compare(d->getTitle()); if (c < 0) u = p; else if (c > 0) l = p; else { log_debug("article found after " << itcount << " iterations in file \"" << getFilename() << "\" at index " << p); return std::pair(true, article_index_t(p)); } } auto d = getDirentByTitle(article_index_t(l)); int c = title.compare(d->getTitle()); if (c == 0) { log_debug("article found after " << itcount << " iterations in file \"" << getFilename() << "\" at index " << l); return std::pair(true, article_index_t(l)); } log_debug("article not found after " << itcount << " iterations (\"" << d.getTitle() << "\" does not match)"); return std::pair(false, article_index_t(c < 0 ? 
l : u)); } std::pair FileImpl::getFileParts(offset_t offset, zsize_t size) { return zimFile->locate(offset, size); } std::shared_ptr FileImpl::getDirent(article_index_t idx) { log_trace("FileImpl::getDirent(" << idx << ')'); if (idx >= getCountArticles()) throw ZimFileFormatError("article index out of range"); pthread_mutex_lock(&direntCacheLock); auto v = direntCache.getx(idx); if (v.first) { log_debug("dirent " << idx << " found in cache; hits " << direntCache.getHits() << " misses " << direntCache.getMisses() << " ratio " << direntCache.hitRatio() * 100 << "% fillfactor " << direntCache.fillfactor()); pthread_mutex_unlock(&direntCacheLock); return v.second; } log_debug("dirent " << idx << " not found in cache; hits " << direntCache.getHits() << " misses " << direntCache.getMisses() << " ratio " << direntCache.hitRatio() * 100 << "% fillfactor " << direntCache.fillfactor()); pthread_mutex_unlock(&direntCacheLock); offset_t indexOffset = getOffset(urlPtrOffsetReader.get(), idx.v); // We don't know the size of the dirent because it depends of the size of // the title, url and extra parameters. // This is a pitty but we have no choices. // We cannot take a buffer of the size of the file, it would be really inefficient. // Let's do try, catch and retry while chosing a smart value for the buffer size. // Most dirent will be "Article" entry (header's size == 16) without extra parameters. // Let's hope that url + title size will be < 256 and if not try again with a bigger size. pthread_mutex_lock(&bufferDirentLock); zsize_t bufferSize = zsize_t(256); std::shared_ptr dirent; while (true) { bufferDirentZone.reserve(size_type(bufferSize)); zimReader->read(bufferDirentZone.data(), indexOffset, bufferSize); auto direntBuffer = std::unique_ptr(new MemoryBuffer(bufferDirentZone.data(), bufferSize)); try { dirent = std::make_shared(std::move(direntBuffer)); } catch (InvalidSize&) { // buffer size is not enougth, try again : bufferSize += 256; continue; } // Success ! 
break; } pthread_mutex_unlock(&bufferDirentLock); log_debug("dirent read from " << indexOffset); pthread_mutex_lock(&direntCacheLock); direntCache.put(idx, dirent); pthread_mutex_unlock(&direntCacheLock); return dirent; } std::shared_ptr FileImpl::getDirentByTitle(article_index_t idx) { if (idx >= getCountArticles()) throw ZimFileFormatError("article index out of range"); return getDirent(getIndexByTitle(idx)); } article_index_t FileImpl::getIndexByTitle(article_index_t idx) { if (idx >= getCountArticles()) throw ZimFileFormatError("article index out of range"); article_index_t ret(titleIndexReader->read( offset_t(sizeof(article_index_t)*idx.v))); return ret; } std::shared_ptr FileImpl::getCluster(cluster_index_t idx) { if (idx >= getCountClusters()) throw ZimFileFormatError("cluster index out of range"); pthread_mutex_lock(&clusterCacheLock); auto cluster(clusterCache.get(idx)); pthread_mutex_unlock(&clusterCacheLock); if (cluster) { log_debug("cluster " << idx << " found in cache; hits " << clusterCache.getHits() << " misses " << clusterCache.getMisses() << " ratio " << clusterCache.hitRatio() * 100 << "% fillfactor " << clusterCache.fillfactor()); return cluster; } offset_t clusterOffset(getClusterOffset(idx)); cluster_index_t next_idx(idx.v + 1); offset_t nextClusterOffset( (next_idx < getCountClusters()) ? getClusterOffset(next_idx).v : (header.hasChecksum()) ? 
header.getChecksumPos() : zimFile->fsize().v ); zsize_t clusterSize(nextClusterOffset.v - clusterOffset.v); log_debug("read cluster " << idx << " from offset " << clusterOffset); CompressionType comp; bool extended; std::shared_ptr reader = zimReader->sub_clusterReader(clusterOffset, clusterSize, &comp, &extended); cluster = std::shared_ptr(new Cluster(reader, comp, extended)); log_debug("put cluster " << idx << " into cluster cache; hits " << clusterCache.getHits() << " misses " << clusterCache.getMisses() << " ratio " << clusterCache.hitRatio() * 100 << "% fillfactor " << clusterCache.fillfactor()); pthread_mutex_lock(&clusterCacheLock); clusterCache.put(idx, cluster); pthread_mutex_unlock(&clusterCacheLock); return cluster; } offset_t FileImpl::getOffset(const Reader* reader, size_t idx) { offset_t offset(reader->read(offset_t(sizeof(offset_type)*idx))); return offset; } offset_t FileImpl::getClusterOffset(cluster_index_t idx) { return getOffset(clusterOffsetReader.get(), idx.v); } offset_t FileImpl::getBlobOffset(cluster_index_t clusterIdx, blob_index_t blobIdx) { auto cluster = getCluster(clusterIdx); if (cluster->isCompressed()) return offset_t(0); return getClusterOffset(clusterIdx) + offset_t(1) + cluster->getBlobOffset(blobIdx); } article_index_t FileImpl::getNamespaceBeginOffset(char ch) { log_trace("getNamespaceBeginOffset(" << ch << ')'); pthread_mutex_lock(&namespaceBeginLock); NamespaceCache::const_iterator it = namespaceBeginCache.find(ch); if (it != namespaceBeginCache.end()) { article_index_t ret(it->second); pthread_mutex_unlock(&namespaceBeginLock); return ret; } pthread_mutex_unlock(&namespaceBeginLock); article_index_type lower = 0; article_index_type upper = article_index_type(getCountArticles()); auto d = getDirent(article_index_t(0)); while (upper - lower > 1) { article_index_type m = lower + (upper - lower) / 2; auto d = getDirent(article_index_t(m)); if (d->getNamespace() >= ch) upper = m; else lower = m; } article_index_t ret = 
article_index_t(d->getNamespace() < ch ? upper : lower); pthread_mutex_lock(&namespaceBeginLock); namespaceBeginCache[ch] = ret; pthread_mutex_unlock(&namespaceBeginLock); return ret; } article_index_t FileImpl::getNamespaceEndOffset(char ch) { log_trace("getNamespaceEndOffset(" << ch << ')'); pthread_mutex_lock(&namespaceEndLock); NamespaceCache::const_iterator it = namespaceEndCache.find(ch); if (it != namespaceEndCache.end()) { article_index_t ret = it->second; pthread_mutex_unlock(&namespaceEndLock); return ret; } pthread_mutex_unlock(&namespaceEndLock); article_index_type lower = 0; article_index_type upper = article_index_type(getCountArticles()); log_debug("namespace " << ch << " lower=" << lower << " upper=" << upper); while (upper - lower > 1) { article_index_type m = lower + (upper - lower) / 2; auto d = getDirent(article_index_t(m)); if (d->getNamespace() > ch) upper = m; else lower = m; log_debug("namespace " << d->getNamespace() << " m=" << m << " lower=" << lower << " upper=" << upper); } pthread_mutex_lock(&namespaceEndLock); namespaceEndCache[ch] = article_index_t(upper); pthread_mutex_unlock(&namespaceEndLock); return article_index_t(upper); } std::string FileImpl::getNamespaces() { std::string namespaces; auto d = getDirent(article_index_t(0)); namespaces = d->getNamespace(); article_index_t idx(0); while ((idx = getNamespaceEndOffset(d->getNamespace())) < getCountArticles()) { d = getDirent(idx); namespaces += d->getNamespace(); } return namespaces; } const std::string& FileImpl::getMimeType(uint16_t idx) const { if (idx > mimeTypes.size()) { std::ostringstream msg; msg << "unknown mime type code " << idx; throw std::runtime_error(msg.str()); } return mimeTypes[idx]; } std::string FileImpl::getChecksum() { if (!header.hasChecksum()) return std::string(); std::shared_ptr chksum; try { chksum = zimReader->get_buffer(offset_t(header.getChecksumPos()), zsize_t(16)); } catch (...) 
{ log_warn("error reading checksum"); return std::string(); } char hexdigest[33]; hexdigest[32] = '\0'; static const char hex[] = "0123456789abcdef"; char* p = hexdigest; for (int i = 0; i < 16; ++i) { uint8_t v = chksum->at(offset_t(i)); *p++ = hex[v >> 4]; *p++ = hex[v & 0xf]; } log_debug("chksum=" << hexdigest); return hexdigest; } bool FileImpl::verify() { if (!header.hasChecksum()) return false; Md5stream md5; offset_type checksumPos = header.getChecksumPos(); offset_type currentPos = 0; for(auto part = zimFile->begin(); part != zimFile->end(); part++) { std::ifstream stream(part->second->filename()); char ch; for(/*NOTHING*/ ; currentPos < checksumPos && stream.get(ch).good(); currentPos++) { md5 << ch; } if (stream.bad()) { perror("error while reading file"); return false; } if (currentPos == checksumPos) { break; } } if (currentPos != checksumPos) { return false; } unsigned char chksumCalc[16]; auto chksumFile = zimReader->get_buffer(offset_t(header.getChecksumPos()), zsize_t(16)); md5.getDigest(chksumCalc); if (std::memcmp(chksumFile->data(), chksumCalc, 16) != 0) { return false; } return true; } time_t FileImpl::getMTime() const { return zimFile->getMTime(); } zim::zsize_t FileImpl::getFilesize() const { return zimFile->fsize(); } bool FileImpl::is_multiPart() const { return zimFile->is_multiPart(); } } libzim-4.0.4/src/fileimpl.h000066400000000000000000000075541334353060400155620ustar00rootroot00000000000000/* * Copyright (C) 2006 Tommi Maekitalo * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. 
See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #ifndef ZIM_FILEIMPL_H #define ZIM_FILEIMPL_H #include #include #include #include #include #include #include #include "cache.h" #include "_dirent.h" #include "cluster.h" #include "buffer.h" #include "file_reader.h" #include "file_compound.h" #include "zim_types.h" namespace zim { class FileImpl { std::shared_ptr zimFile; std::shared_ptr zimReader; std::vector bufferDirentZone; pthread_mutex_t bufferDirentLock; Fileheader header; std::string filename; std::unique_ptr titleIndexReader; std::unique_ptr urlPtrOffsetReader; std::unique_ptr clusterOffsetReader; offset_t getOffset(const Reader* reader, size_t idx); Cache> direntCache; pthread_mutex_t direntCacheLock; Cache> clusterCache; pthread_mutex_t clusterCacheLock; bool cacheUncompressedCluster; typedef std::map NamespaceCache; NamespaceCache namespaceBeginCache; pthread_mutex_t namespaceBeginLock; NamespaceCache namespaceEndCache; pthread_mutex_t namespaceEndLock; typedef std::vector MimeTypes; MimeTypes mimeTypes; public: explicit FileImpl(const std::string& fname); time_t getMTime() const; const std::string& getFilename() const { return filename; } const Fileheader& getFileheader() const { return header; } zsize_t getFilesize() const; std::pair getFileParts(offset_t offset, zsize_t size); std::shared_ptr getDirent(article_index_t idx); std::shared_ptr getDirentByTitle(article_index_t idx); article_index_t getIndexByTitle(article_index_t idx); article_index_t getCountArticles() const { return article_index_t(header.getArticleCount()); } std::pair findx(char ns, const std::string& url); std::pair findx(const std::string& url); std::pair findxByTitle(char ns, const std::string& title); std::shared_ptr getCluster(cluster_index_t idx); cluster_index_t 
getCountClusters() const { return cluster_index_t(header.getClusterCount()); } offset_t getClusterOffset(cluster_index_t idx); offset_t getBlobOffset(cluster_index_t clusterIdx, blob_index_t blobIdx); article_index_t getNamespaceBeginOffset(char ch); article_index_t getNamespaceEndOffset(char ch); article_index_t getNamespaceCount(char ns) { return getNamespaceEndOffset(ns) - getNamespaceBeginOffset(ns); } std::string getNamespaces(); bool hasNamespace(char ch) const; const std::string& getMimeType(uint16_t idx) const; std::string getChecksum(); bool verify(); bool is_multiPart() const; }; } #endif // ZIM_FILEIMPL_H libzim-4.0.4/src/fs.h000066400000000000000000000020511334353060400143540ustar00rootroot00000000000000/* * Copyright (C) 2018 Matthieu Gautier * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #ifndef ZIM_FS_H_ #define ZIM_FS_H_ #ifdef _WIN32 # include "fs_windows.h" #else # include "fs_unix.h" #endif namespace zim { #ifdef _WIN32 using DEFAULTFS = windows::FS; #else using DEFAULTFS = unix::FS; #endif }; #endif //ZIM_FS_H_ libzim-4.0.4/src/fs_unix.cpp000066400000000000000000000057321334353060400157630ustar00rootroot00000000000000/* * Copyright (C) 2018 Matthieu Gautier * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #include "fs_unix.h" #include #include #include #include #include #include #include namespace zim { namespace unix { zsize_t FD::readAt(char* dest, zsize_t size, offset_t offset) const { #ifdef __APPLE__ # define PREAD pread #else # define PREAD pread64 #endif ssize_t full_size_read = 0; auto size_to_read = size.v; auto current_offset = offset.v; errno = 0; while (size_to_read > 0) { auto size_read = PREAD(m_fd, dest, size_to_read, current_offset); if (size_read == -1) { return zsize_t(-1); } size_to_read -= size_read; current_offset += size_read; full_size_read += size_read; } return zsize_t(full_size_read); #undef PREAD } zsize_t FD::getSize() const { struct stat sb; fstat(m_fd, &sb); return zsize_t(sb.st_size); } bool FD::seek(offset_t offset) { return static_cast(offset.v) == lseek(m_fd, offset.v, SEEK_SET); } bool FD::close() { if (m_fd != -1) { return ::close(m_fd); } return -1; } FD FS::openFile(path_t filepath) { int fd = open(filepath.c_str(), O_RDONLY); if (fd == -1) { throw std::runtime_error(""); } return FD(fd); } bool FS::makeDirectory(path_t path) { return !mkdir(path.c_str(), S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH); } void FS::rename(path_t old_path, path_t new_path) { ::rename(old_path.c_str(), new_path.c_str()); } std::string FS::join(path_t base, path_t name) { return base + "/" + name; } bool FS::remove(path_t path) { DIR* dir; /* It's a directory, remove all its entries first */ if ((dir = opendir(path.c_str())) != NULL) { struct dirent* ent; while ((ent = readdir(dir)) != NULL) { std::string childName = ent->d_name; if (childName != "." 
&& childName != "..") { auto childPath = join(path, childName); remove(childPath); } } closedir(dir); return removeDir(path); } /* It's a file */ else { return removeFile(path); } } bool FS::removeDir(path_t path) { return rmdir(path.c_str()); } bool FS::removeFile(path_t path) { return ::remove(path.c_str()); } }; // unix namespace }; // zim namespace libzim-4.0.4/src/fs_unix.h000066400000000000000000000042261334353060400154250ustar00rootroot00000000000000/* * Copyright (C) 2018 Matthieu Gautier * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #ifndef ZIM_FS_UNIX_H_ #define ZIM_FS_UNIX_H_ #include "zim_types.h" #include #include #include #include #include #include namespace zim { namespace unix { using path_t = const std::string&; class FD { public: using fd_t = int; private: fd_t m_fd = -1; public: FD() = default; FD(fd_t fd): m_fd(fd) {}; FD(const FD& o) = delete; FD(FD&& o) : m_fd(o.m_fd) { o.m_fd = -1; } FD& operator=(FD&& o) { m_fd = o.m_fd; o.m_fd = -1; return *this; } ~FD() { close(); } zsize_t readAt(char* dest, zsize_t size, offset_t offset) const; zsize_t getSize() const; fd_t getNativeHandle() const { return m_fd; } fd_t release() { int ret = m_fd; m_fd = -1; return ret; } bool seek(offset_t offset); bool close(); }; struct FS { using FD = zim::unix::FD; static std::string join(path_t base, path_t name); static FD openFile(path_t filepath); static bool makeDirectory(path_t path); static void rename(path_t old_path, path_t new_path); static bool remove(path_t path); static bool removeDir(path_t path); static bool removeFile(path_t path); }; }; // unix namespace }; // zim namespace #endif //ZIM_FS_UNIX_H_ libzim-4.0.4/src/fs_windows.cpp000066400000000000000000000107241334353060400164670ustar00rootroot00000000000000/* * Copyright (C) 2018 Matthieu Gautier * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #include "fs_windows.h" #include #include #include #include #include #include #include #include namespace zim { namespace windows { struct ImplFD { HANDLE m_handle = INVALID_HANDLE_VALUE; CRITICAL_SECTION m_criticalSection; ImplFD() { InitializeCriticalSection(&m_criticalSection); } ImplFD(HANDLE handle) : m_handle(handle) { InitializeCriticalSection(&m_criticalSection); } ~ImplFD() { DeleteCriticalSection(&m_criticalSection); } }; FD::FD() : mp_impl(new ImplFD()) {} FD::FD(fd_t handle) : mp_impl(new ImplFD(handle)) {} FD::FD(int fd): mp_impl(new ImplFD(reinterpret_cast(_get_osfhandle(fd)))) {} FD::FD(FD&& o) = default; FD& FD::operator=(FD&& o) = default; FD::~FD() { if (mp_impl) close(); } zsize_t FD::readAt(char* dest, zsize_t size, offset_t offset) const { if (!mp_impl) return zsize_t(-1); EnterCriticalSection(&mp_impl->m_criticalSection); LARGE_INTEGER off; off.QuadPart = offset.v; if (!SetFilePointerEx(mp_impl->m_handle, off, NULL, FILE_BEGIN)) { goto err; } DWORD size_read; if (!ReadFile(mp_impl->m_handle, dest, size.v, &size_read, NULL)) { goto err; } if (size_read != size.v) { goto err; } LeaveCriticalSection(&mp_impl->m_criticalSection); return size; err: LeaveCriticalSection(&mp_impl->m_criticalSection); return zsize_t(-1); } bool FD::seek(offset_t offset) { if(!mp_impl) return false; LARGE_INTEGER off; off.QuadPart = offset.v; return SetFilePointerEx(mp_impl->m_handle, off, NULL, FILE_BEGIN); } zsize_t FD::getSize() const { if(!mp_impl) return zsize_t(0); LARGE_INTEGER size; if (!GetFileSizeEx(mp_impl->m_handle, &size)) { size.QuadPart = 0; } return zsize_t(size.QuadPart); } int FD::release() { if(!mp_impl) return -1; int ret = _open_osfhandle(reinterpret_cast(mp_impl->m_handle), 0); mp_impl->m_handle = INVALID_HANDLE_VALUE; return ret; } bool 
FD::close() { if (!mp_impl || mp_impl->m_handle == INVALID_HANDLE_VALUE) { return false; } return CloseHandle(mp_impl->m_handle); } std::unique_ptr FS::toWideChar(path_t path) { auto size = MultiByteToWideChar(CP_UTF8, 0, path.c_str(), -1, nullptr, 0); auto wdata = std::unique_ptr(new wchar_t[size]); auto ret = MultiByteToWideChar(CP_UTF8, 0, path.c_str(), -1, wdata.get(), size); if (0 == ret) { std::ostringstream oss; oss << "Cannot convert path to wchar : " << GetLastError(); throw std::runtime_error(oss.str()); } return wdata; } FD FS::openFile(path_t filepath) { auto wpath = toWideChar(filepath); FD::fd_t handle; handle = CreateFileW(wpath.get(), GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_READONLY|FILE_FLAG_RANDOM_ACCESS, NULL); if (handle == INVALID_HANDLE_VALUE) { std::ostringstream oss; oss << "Cannot open file : " << GetLastError(); throw std::runtime_error(oss.str()); } return FD(handle); } bool FS::makeDirectory(path_t path) { auto wpath = toWideChar(path); auto ret = CreateDirectoryW(wpath.get(), NULL); return ret; } void FS::rename(path_t old_path, path_t new_path) { MoveFileW(toWideChar(old_path).get(), toWideChar(new_path).get()); } std::string FS::join(path_t base, path_t name) { return base + "\\" + name; } bool FS::removeDir(path_t path) { return RemoveDirectoryW(toWideChar(path).get()); } bool FS::removeFile(path_t path) { return DeleteFileW(toWideChar(path).get()); } }; // windows namespace }; // zim namespace libzim-4.0.4/src/fs_windows.h000066400000000000000000000037421334353060400161360ustar00rootroot00000000000000/* * Copyright (C) 2018 Matthieu Gautier * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. 
* * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #ifndef ZIM_FS_WINDOWS_H_ #define ZIM_FS_WINDOWS_H_ #include "zim_types.h" #include #include typedef void* HANDLE; namespace zim { namespace windows { using path_t = const std::string&; struct ImplFD; class FD { public: typedef HANDLE fd_t; private: std::unique_ptr mp_impl; public: FD(); FD(fd_t handle); FD(int fd); FD(const FD& o) = delete; FD(FD&& o); FD& operator=(FD&& o); FD& operator=(const FD& o) = delete; ~FD(); zsize_t readAt(char* dest, zsize_t size, offset_t offset) const; zsize_t getSize() const; int release(); bool seek(offset_t offset); bool close(); }; struct FS { using FD = zim::windows::FD; static std::string join(path_t base, path_t name); static std::unique_ptr toWideChar(path_t path); static FD openFile(path_t filepath); static bool makeDirectory(path_t path); static void rename(path_t old_path, path_t new_path); static bool remove(path_t path); static bool removeDir(path_t path); static bool removeFile(path_t path); }; }; // windows namespace }; // zim namespace #endif //ZIM_FS_WINDOWS_H_ libzim-4.0.4/src/levenshtein.cpp000077500000000000000000000015401334353060400166300ustar00rootroot00000000000000 #include "levenshtein.h" #include #include int levenshtein_distance(const std::string &s1, const std::string &s2) { int s1len = s1.size(); int s2len = s2.size(); auto column_start = (decltype(s1len))1; auto column = new decltype(s1len)[s1len + 1]; std::iota(column + column_start - 1, column + s1len + 1, column_start - 1); for (auto x = column_start; x <= s2len; x++) 
{ column[0] = x; auto last_diagonal = x - column_start; for (auto y = column_start; y <= s1len; y++) { auto old_diagonal = column[y]; auto possibilities = { column[y] + 1, column[y - 1] + 1, last_diagonal + (s1[y - 1] == s2[x - 1]? 0 : 1) }; column[y] = std::min(possibilities); last_diagonal = old_diagonal; } } auto result = column[s1len]; delete[] column; return result; } libzim-4.0.4/src/levenshtein.h000066400000000000000000000002421334353060400162700ustar00rootroot00000000000000 #ifndef LEVENSHTEIN_H #define LEVENSHTEIN_H #include int levenshtein_distance(const std::string &s1, const std::string &s2); #endif // LEVENSHTEIN_H libzim-4.0.4/src/log.h000066400000000000000000000020361334353060400145300ustar00rootroot00000000000000/* * Copyright (C) 2009 Tommi Maekitalo * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #include "config.h" #ifdef WITH_CXXTOOLS #include #else #define log_define(e) #define log_fatal(e) #define log_error(e) #define log_warn(e) #define log_info(e) #define log_debug(e) #define log_trace(e) #define log_init() #endif libzim-4.0.4/src/md5.c000066400000000000000000000241311334353060400144270ustar00rootroot00000000000000/* MD5C.C - RSA Data Security, Inc., MD5 message-digest algorithm */ /* Copyright (C) 1991-2, RSA Data Security, Inc. Created 1991. 
All rights reserved. License to copy and use this software is granted provided that it is identified as the "RSA Data Security, Inc. MD5 Message-Digest Algorithm" in all material mentioning or referencing this software or this function. License is also granted to make and use derivative works provided that such works are identified as "derived from the RSA Data Security, Inc. MD5 Message-Digest Algorithm" in all material mentioning or referencing the derived work. RSA Data Security, Inc. makes no representations concerning either the merchantability of this software or the suitability of this software for any particular purpose. It is provided "as is" without express or implied warranty of any kind. These notices must be retained in any copies of any part of this documentation and/or software. */ #include "md5.h" #include #define MD5_CTX struct zim_MD5_CTX /* Constants for MD5Transform routine. */ #define S11 7 #define S12 12 #define S13 17 #define S14 22 #define S21 5 #define S22 9 #define S23 14 #define S24 20 #define S31 4 #define S32 11 #define S33 16 #define S34 23 #define S41 6 #define S42 10 #define S43 15 #define S44 21 static void MD5Transform PROTO_LIST ((UINT4 [4], const unsigned char [64])); static void Encode PROTO_LIST ((unsigned char *, UINT4 *, unsigned int)); static void Decode PROTO_LIST ((UINT4 *, const unsigned char *, unsigned int)); /* static void MD5_memcpy PROTO_LIST ((POINTER, POINTER, unsigned int)); static void MD5_memset PROTO_LIST ((POINTER, int, unsigned int)); */ #define MD5_memcpy memcpy #define MD5_memset memset static unsigned char PADDING[64] = { 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; /* F, G, H and I are basic MD5 functions. 
*/ #define F(x, y, z) (((x) & (y)) | ((~x) & (z))) #define G(x, y, z) (((x) & (z)) | ((y) & (~z))) #define H(x, y, z) ((x) ^ (y) ^ (z)) #define I(x, y, z) ((y) ^ ((x) | (~z))) /* ROTATE_LEFT rotates x left n bits. */ #define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32-(n)))) /* FF, GG, HH, and II transformations for rounds 1, 2, 3, and 4. Rotation is separate from addition to prevent recomputation. */ #define FF(a, b, c, d, x, s, ac) { \ (a) += F ((b), (c), (d)) + (x) + (UINT4)(ac); \ (a) = ROTATE_LEFT ((a), (s)); \ (a) += (b); \ } #define GG(a, b, c, d, x, s, ac) { \ (a) += G ((b), (c), (d)) + (x) + (UINT4)(ac); \ (a) = ROTATE_LEFT ((a), (s)); \ (a) += (b); \ } #define HH(a, b, c, d, x, s, ac) { \ (a) += H ((b), (c), (d)) + (x) + (UINT4)(ac); \ (a) = ROTATE_LEFT ((a), (s)); \ (a) += (b); \ } #define II(a, b, c, d, x, s, ac) { \ (a) += I ((b), (c), (d)) + (x) + (UINT4)(ac); \ (a) = ROTATE_LEFT ((a), (s)); \ (a) += (b); \ } /* MD5 initialization. Begins an MD5 operation, writing a new context. */ void zim_MD5Init (MD5_CTX* context) { context->count[0] = context->count[1] = 0; /* Load magic initialization constants. */ context->state[0] = 0x67452301; context->state[1] = 0xefcdab89; context->state[2] = 0x98badcfe; context->state[3] = 0x10325476; } /* MD5 block update operation. Continues an MD5 message-digest operation, processing another message block, and updating the context. */ void zim_MD5Update ( MD5_CTX *context, const unsigned char *input, /* input block */ unsigned int inputLen) /* length of input block */ { unsigned int i, index, partLen; /* Compute number of bytes mod 64 */ index = (unsigned int)((context->count[0] >> 3) & 0x3F); /* Update number of bits */ if ((context->count[0] += ((UINT4)inputLen << 3)) < ((UINT4)inputLen << 3)) context->count[1]++; context->count[1] += ((UINT4)inputLen >> 29); partLen = 64 - index; /* Transform as many times as possible. 
*/ if (inputLen >= partLen) { MD5_memcpy ((POINTER)&context->buffer[index], (POINTER)input, partLen); MD5Transform (context->state, context->buffer); for (i = partLen; i + 63 < inputLen; i += 64) MD5Transform (context->state, &input[i]); index = 0; } else i = 0; /* Buffer remaining input */ MD5_memcpy ((POINTER)&context->buffer[index], (POINTER)&input[i], inputLen-i); } /* MD5 finalization. Ends an MD5 message-digest operation, writing the the message digest and zeroizing the context. */ void zim_MD5Final ( unsigned char digest[16], /* message digest */ MD5_CTX *context) /* context */ { unsigned char bits[8]; unsigned int index, padLen; /* Save number of bits */ Encode (bits, context->count, 8); /* Pad out to 56 mod 64. */ index = (unsigned int)((context->count[0] >> 3) & 0x3f); padLen = (index < 56) ? (56 - index) : (120 - index); zim_MD5Update (context, PADDING, padLen); /* Append length (before padding) */ zim_MD5Update (context, bits, 8); /* Store state in digest */ Encode (digest, context->state, 16); /* Zeroize sensitive information. */ MD5_memset ((POINTER)context, 0, sizeof (*context)); } /* MD5 basic transformation. Transforms state based on block. 
*/ static void MD5Transform ( UINT4 state[4], const unsigned char block[64]) { UINT4 a = state[0], b = state[1], c = state[2], d = state[3], x[16]; Decode (x, block, 64); /* Round 1 */ FF (a, b, c, d, x[ 0], S11, 0xd76aa478); /* 1 */ FF (d, a, b, c, x[ 1], S12, 0xe8c7b756); /* 2 */ FF (c, d, a, b, x[ 2], S13, 0x242070db); /* 3 */ FF (b, c, d, a, x[ 3], S14, 0xc1bdceee); /* 4 */ FF (a, b, c, d, x[ 4], S11, 0xf57c0faf); /* 5 */ FF (d, a, b, c, x[ 5], S12, 0x4787c62a); /* 6 */ FF (c, d, a, b, x[ 6], S13, 0xa8304613); /* 7 */ FF (b, c, d, a, x[ 7], S14, 0xfd469501); /* 8 */ FF (a, b, c, d, x[ 8], S11, 0x698098d8); /* 9 */ FF (d, a, b, c, x[ 9], S12, 0x8b44f7af); /* 10 */ FF (c, d, a, b, x[10], S13, 0xffff5bb1); /* 11 */ FF (b, c, d, a, x[11], S14, 0x895cd7be); /* 12 */ FF (a, b, c, d, x[12], S11, 0x6b901122); /* 13 */ FF (d, a, b, c, x[13], S12, 0xfd987193); /* 14 */ FF (c, d, a, b, x[14], S13, 0xa679438e); /* 15 */ FF (b, c, d, a, x[15], S14, 0x49b40821); /* 16 */ /* Round 2 */ GG (a, b, c, d, x[ 1], S21, 0xf61e2562); /* 17 */ GG (d, a, b, c, x[ 6], S22, 0xc040b340); /* 18 */ GG (c, d, a, b, x[11], S23, 0x265e5a51); /* 19 */ GG (b, c, d, a, x[ 0], S24, 0xe9b6c7aa); /* 20 */ GG (a, b, c, d, x[ 5], S21, 0xd62f105d); /* 21 */ GG (d, a, b, c, x[10], S22, 0x2441453); /* 22 */ GG (c, d, a, b, x[15], S23, 0xd8a1e681); /* 23 */ GG (b, c, d, a, x[ 4], S24, 0xe7d3fbc8); /* 24 */ GG (a, b, c, d, x[ 9], S21, 0x21e1cde6); /* 25 */ GG (d, a, b, c, x[14], S22, 0xc33707d6); /* 26 */ GG (c, d, a, b, x[ 3], S23, 0xf4d50d87); /* 27 */ GG (b, c, d, a, x[ 8], S24, 0x455a14ed); /* 28 */ GG (a, b, c, d, x[13], S21, 0xa9e3e905); /* 29 */ GG (d, a, b, c, x[ 2], S22, 0xfcefa3f8); /* 30 */ GG (c, d, a, b, x[ 7], S23, 0x676f02d9); /* 31 */ GG (b, c, d, a, x[12], S24, 0x8d2a4c8a); /* 32 */ /* Round 3 */ HH (a, b, c, d, x[ 5], S31, 0xfffa3942); /* 33 */ HH (d, a, b, c, x[ 8], S32, 0x8771f681); /* 34 */ HH (c, d, a, b, x[11], S33, 0x6d9d6122); /* 35 */ HH (b, c, d, a, x[14], S34, 0xfde5380c); /* 36 
*/ HH (a, b, c, d, x[ 1], S31, 0xa4beea44); /* 37 */ HH (d, a, b, c, x[ 4], S32, 0x4bdecfa9); /* 38 */ HH (c, d, a, b, x[ 7], S33, 0xf6bb4b60); /* 39 */ HH (b, c, d, a, x[10], S34, 0xbebfbc70); /* 40 */ HH (a, b, c, d, x[13], S31, 0x289b7ec6); /* 41 */ HH (d, a, b, c, x[ 0], S32, 0xeaa127fa); /* 42 */ HH (c, d, a, b, x[ 3], S33, 0xd4ef3085); /* 43 */ HH (b, c, d, a, x[ 6], S34, 0x4881d05); /* 44 */ HH (a, b, c, d, x[ 9], S31, 0xd9d4d039); /* 45 */ HH (d, a, b, c, x[12], S32, 0xe6db99e5); /* 46 */ HH (c, d, a, b, x[15], S33, 0x1fa27cf8); /* 47 */ HH (b, c, d, a, x[ 2], S34, 0xc4ac5665); /* 48 */ /* Round 4 */ II (a, b, c, d, x[ 0], S41, 0xf4292244); /* 49 */ II (d, a, b, c, x[ 7], S42, 0x432aff97); /* 50 */ II (c, d, a, b, x[14], S43, 0xab9423a7); /* 51 */ II (b, c, d, a, x[ 5], S44, 0xfc93a039); /* 52 */ II (a, b, c, d, x[12], S41, 0x655b59c3); /* 53 */ II (d, a, b, c, x[ 3], S42, 0x8f0ccc92); /* 54 */ II (c, d, a, b, x[10], S43, 0xffeff47d); /* 55 */ II (b, c, d, a, x[ 1], S44, 0x85845dd1); /* 56 */ II (a, b, c, d, x[ 8], S41, 0x6fa87e4f); /* 57 */ II (d, a, b, c, x[15], S42, 0xfe2ce6e0); /* 58 */ II (c, d, a, b, x[ 6], S43, 0xa3014314); /* 59 */ II (b, c, d, a, x[13], S44, 0x4e0811a1); /* 60 */ II (a, b, c, d, x[ 4], S41, 0xf7537e82); /* 61 */ II (d, a, b, c, x[11], S42, 0xbd3af235); /* 62 */ II (c, d, a, b, x[ 2], S43, 0x2ad7d2bb); /* 63 */ II (b, c, d, a, x[ 9], S44, 0xeb86d391); /* 64 */ state[0] += a; state[1] += b; state[2] += c; state[3] += d; /* Zeroize sensitive information. */ MD5_memset ((POINTER)x, 0, sizeof (x)); } /* Encodes input (UINT4) into output (unsigned char). Assumes len is a multiple of 4. 
*/ static void Encode ( unsigned char *output, UINT4 *input, unsigned int len) { unsigned int i, j; for (i = 0, j = 0; j < len; i++, j += 4) { output[j] = (unsigned char)(input[i] & 0xff); output[j+1] = (unsigned char)((input[i] >> 8) & 0xff); output[j+2] = (unsigned char)((input[i] >> 16) & 0xff); output[j+3] = (unsigned char)((input[i] >> 24) & 0xff); } } /* Decodes input (unsigned char) into output (UINT4). Assumes len is a multiple of 4. */ static void Decode ( UINT4 *output, const unsigned char *input, unsigned int len) { unsigned int i, j; for (i = 0, j = 0; j < len; i++, j += 4) output[i] = ((UINT4)input[j]) | (((UINT4)input[j+1]) << 8) | (((UINT4)input[j+2]) << 16) | (((UINT4)input[j+3]) << 24); } #if 0 /* Note: Replace "for loop" with standard memcpy if possible. */ static void MD5_memcpy ( POINTER output, POINTER input, unsigned int len) { unsigned int i; for (i = 0; i < len; i++) output[i] = input[i]; } /* Note: Replace "for loop" with standard memset if possible. */ static void MD5_memset ( POINTER output, int value, unsigned int len) { unsigned int i; for (i = 0; i < len; i++) ((char *)output)[i] = (char)value; } #endif libzim-4.0.4/src/md5.h000066400000000000000000000070441334353060400144400ustar00rootroot00000000000000/* * Copyright (C) 2003 Tommi Maekitalo * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * As a special exception, you may use this file as part of a free * software library without restriction. Specifically, if other files * instantiate templates or use macros or inline functions from this * file, or you compile this file and link it with other files to * produce an executable, this file does not by itself cause the * resulting executable to be covered by the GNU General Public * License. 
This exception does not however invalidate any other * reasons why the executable file might be covered by the GNU Library * General Public License. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ /* Copyright (C) 1991-2, RSA Data Security, Inc. Created 1991. All rights reserved. License to copy and use this software is granted provided that it is identified as the "RSA Data Security, Inc. MD5 Message-Digest Algorithm" in all material mentioning or referencing this software or this function. License is also granted to make and use derivative works provided that such works are identified as "derived from the RSA Data Security, Inc. MD5 Message-Digest Algorithm" in all material mentioning or referencing the derived work. RSA Data Security, Inc. makes no representations concerning either the merchantability of this software or the suitability of this software for any particular purpose. It is provided "as is" without express or implied warranty of any kind. These notices must be retained in any copies of any part of this documentation and/or software. */ /* RSAREF types and constants */ /* PROTOTYPES should be set to one if and only if the compiler supports function argument prototyping. The following makes PROTOTYPES default to 0 if it has not already been defined with C compiler flags. 
*/ #ifndef ZIM_MD5_H #define ZIM_MD5_H #ifndef PROTOTYPES #define PROTOTYPES 1 #endif /* POINTER defines a generic pointer type */ typedef unsigned char *POINTER; /* UINT2 defines a two byte word */ typedef unsigned short int UINT2; /* UINT4 defines a four byte word */ typedef unsigned int UINT4; /* PROTO_LIST is defined depending on how PROTOTYPES is defined above. If using PROTOTYPES, then PROTO_LIST returns the list, otherwise it returns an empty list. */ #if PROTOTYPES #define PROTO_LIST(list) list #else #define PROTO_LIST(list) () #endif /* MD5 context. */ struct zim_MD5_CTX { UINT4 state[4]; /* state (ABCD) */ UINT4 count[2]; /* number of bits, modulo 2^64 (lsb first) */ unsigned char buffer[64]; /* input buffer */ }; #ifdef __cplusplus extern "C" { #endif void zim_MD5Init PROTO_LIST ((struct zim_MD5_CTX *)); void zim_MD5Update PROTO_LIST ((struct zim_MD5_CTX *, const unsigned char *, unsigned int)); void zim_MD5Final PROTO_LIST ((unsigned char [16], struct zim_MD5_CTX *)); #ifdef __cplusplus } #endif #endif /* ZIM_MD5_H */ libzim-4.0.4/src/md5stream.cpp000066400000000000000000000065601334353060400162110ustar00rootroot00000000000000/* * Copyright (C) 2003 Tommi Maekitalo * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * As a special exception, you may use this file as part of a free * software library without restriction. Specifically, if other files * instantiate templates or use macros or inline functions from this * file, or you compile this file and link it with other files to * produce an executable, this file does not by itself cause the * resulting executable to be covered by the GNU General Public * License. 
This exception does not however invalidate any other * reasons why the executable file might be covered by the GNU Library * General Public License. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ #include "md5stream.h" #include "md5.h" #include namespace zim { //////////////////////////////////////////////////////////////////////// // Md5streambuf // Md5streambuf::Md5streambuf() : context(new zim_MD5_CTX()) { zim_MD5Init(context); } Md5streambuf::~Md5streambuf() { delete context; } std::streambuf::int_type Md5streambuf::overflow( std::streambuf::int_type ch) { if (pptr() == 0) { // Ausgabepuffer ist leer - initialisieren zim_MD5Init(context); } else { // konsumiere Zeichen aus dem Puffer zim_MD5Update(context, (const unsigned char*)pbase(), pptr() - pbase()); } // setze Ausgabepuffer setp(buffer, buffer + bufsize); if (ch != traits_type::eof()) { // das Zeichen, welches den overflow ausgelst hat, stecken // wir in den Puffer. 
*pptr() = traits_type::to_char_type(ch); pbump(1); } return 0; } std::streambuf::int_type Md5streambuf::underflow() { // nur Ausgabestrom return traits_type::eof(); } int Md5streambuf::sync() { if (pptr() != pbase()) { // konsumiere Zeichen aus dem Puffer zim_MD5Update(context, (const unsigned char*)pbase(), pptr() - pbase()); // leere Ausgabepuffer setp(buffer, buffer + bufsize); } return 0; } void Md5streambuf::getDigest(unsigned char digest_[16]) { if (pptr()) { if (pptr() != pbase()) { // konsumiere Zeichen aus dem Puffer zim_MD5Update(context, (const unsigned char*)pbase(), pptr() - pbase()); } // deinitialisiere Ausgabepuffer setp(0, 0); } else { zim_MD5Init(context); } zim_MD5Final(digest, context); std::memcpy(digest_, digest, 16); } //////////////////////////////////////////////////////////////////////// // Md5stream // const char* Md5stream::getHexDigest() { static const char hex[] = "0123456789abcdef"; unsigned char md5[16]; getDigest(md5); int i; char* p = hexdigest; for (i = 0; i < 16; ++i) { *p++ = hex[md5[i] >> 4]; *p++ = hex[md5[i] & 0xf]; } *p = '\0'; return hexdigest; } } libzim-4.0.4/src/md5stream.h000066400000000000000000000061471334353060400156570ustar00rootroot00000000000000/* * Copyright (C) 2003 Tommi Maekitalo * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * As a special exception, you may use this file as part of a free * software library without restriction. Specifically, if other files * instantiate templates or use macros or inline functions from this * file, or you compile this file and link it with other files to * produce an executable, this file does not by itself cause the * resulting executable to be covered by the GNU General Public * License. 
This exception does not however invalidate any other * reasons why the executable file might be covered by the GNU Library * General Public License. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef ZIM_MD5STREAM_H #define ZIM_MD5STREAM_H #include struct zim_MD5_CTX; namespace zim { class Md5streambuf : public std::streambuf { public: Md5streambuf(); ~Md5streambuf(); void getDigest(unsigned char digest[16]); private: static const unsigned int bufsize = 64; char buffer[bufsize]; zim_MD5_CTX* context; unsigned char digest[16]; std::streambuf::int_type overflow(std::streambuf::int_type ch); std::streambuf::int_type underflow(); int sync(); }; /** This is a easy and safe interface to MD5-calculation. To get a MD5-sum of data, instantiate a md5stream, copy your data into it and read the digest. After calling getDigest or getHexDigest, the class can be reused for another md5-calculation. The algorithm is automatically reinitialized when the first character is received. 
example: \code int main(int argc, char* argv[]) { Md5stream s; for (int i = 1; i < argc; ++i) { std::ifstream in(argv[i]); if (in) { s << in.rdbuf(); std::cout << s.getHexDigest() << " " << argv[i] << std::endl; } } } \endcode */ class Md5stream : public std::ostream { public: typedef std::ostreambuf_iterator iterator; private: Md5streambuf streambuf; char hexdigest[33]; public: /// initializes md5-calculation Md5stream() : std::ostream(0) { init(&streambuf); } /// ends md5-calculation and returns 16 bytes digest void getDigest(unsigned char digest[16]) { streambuf.getDigest(digest); } /// ends md5-calculation and digest as 32 bytes hex const char* getHexDigest(); /// returns output-iterator to Md5stream iterator begin() { return iterator(&streambuf); } }; } #endif // ZIM_MD5STREAM_H libzim-4.0.4/src/meson.build000066400000000000000000000034351334353060400157440ustar00rootroot00000000000000 configure_file(output : 'config.h', configuration : conf, input : 'config.h.in') src_directory = include_directories('.') common_sources = [ # 'config.h', 'article.cpp', 'cluster.cpp', 'dirent.cpp', 'envvalue.cpp', 'file.cpp', 'fileheader.cpp', 'fileimpl.cpp', 'file_compound.cpp', 'file_reader.cpp', 'blob.cpp', 'buffer.cpp', 'md5.c', 'md5stream.cpp', 'search.cpp', 'search_iterator.cpp', 'template.cpp', 'uuid.cpp', 'levenshtein.cpp', 'tools.cpp', 'writer/zimcreator.cpp', 'writer/lzmastream.cpp', 'writer/article.cpp', 'writer/cluster.cpp', 'writer/dirent.cpp', 'writer/xapianIndexer.cpp', 'writer/tee.cpp' ] if host_machine.system() == 'windows' common_sources += 'fs_windows.cpp' else common_sources += 'fs_unix.cpp' endif zlib_sources = [ 'writer/deflatestream.cpp' ] xapian_sources = [ 'xapian/htmlparse.cc', 'xapian/myhtmlparse.cc' ] sources = common_sources deps = [thread_dep, lzma_dep] if zlib_dep.found() sources += zlib_sources deps += [zlib_dep] endif if xapian_dep.found() sources += xapian_sources sources += lib_resources deps += [xapian_dep, icu_dep] endif libzim = 
library('zim', sources, include_directories : inc, dependencies : deps, link_args : extra_link_args, cpp_args : extra_cpp_args, version: meson.project_version(), install : true, build_rpath : join_paths(get_option('prefix'), get_option('libdir')), install_rpath: '$ORIGIN') libzim_dep = declare_dependency(link_with: libzim, include_directories: include_directory) libzim-4.0.4/src/search.cpp000066400000000000000000000322171334353060400155530ustar00rootroot00000000000000/* * Copyright (C) 2007 Tommi Maekitalo * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #include #include #include "search_internal.h" #include "levenshtein.h" #include "fs.h" #include #include #include #if !defined(_WIN32) # include #else # include #endif #include #if defined(ENABLE_XAPIAN) #include "xapian.h" #include #endif #define MAX_MATCHES_TO_SORT 10000 namespace zim { #if defined(ENABLE_XAPIAN) namespace { /* Split string in a token array */ std::vector split(const std::string & str, const std::string & delims=" *-") { std::string::size_type lastPos = str.find_first_not_of(delims, 0); std::string::size_type pos = str.find_first_of(delims, lastPos); std::vector tokens; while (std::string::npos != pos || std::string::npos != lastPos) { tokens.push_back(str.substr(lastPos, pos - lastPos)); lastPos = str.find_first_not_of(delims, pos); pos = str.find_first_of(delims, lastPos); } return tokens; } std::map read_valuesmap(const std::string &s) { std::map result; std::vector elems = split(s, ";"); for(std::vector::iterator elem = elems.begin(); elem != elems.end(); elem++) { std::vector tmp_elems = split(*elem, ":"); result.insert( std::pair(tmp_elems[0], atoi(tmp_elems[1].c_str())) ); } return result; } void setup_queryParser(Xapian::QueryParser* queryparser, Xapian::Database& database, const std::string& language, const std::string& stopwords) { queryparser->set_database(database); if ( ! language.empty() ) { /* Build ICU Local object to retrieve ISO-639 language code (from ISO-639-3) */ icu::Locale languageLocale(language.c_str()); /* Configuring language base steemming */ try { Xapian::Stem stemmer = Xapian::Stem(languageLocale.getLanguage()); queryparser->set_stemmer(stemmer); queryparser->set_stemming_strategy(Xapian::QueryParser::STEM_ALL); } catch (...) 
{ std::cout << "No steemming for language '" << languageLocale.getLanguage() << "'" << std::endl; } } if ( ! stopwords.empty() ) { std::string stopWord; std::istringstream file(stopwords); Xapian::SimpleStopper* stopper = new Xapian::SimpleStopper(); while (std::getline(file, stopWord, '\n')) { stopper->add(stopWord); } stopper->release(); queryparser->set_stopper(stopper); } } class LevenshteinDistanceMaker : public Xapian::KeyMaker { public: LevenshteinDistanceMaker(const std::string& query, size_t value_index): query(query), value_index(value_index) {} ~LevenshteinDistanceMaker() = default; virtual std::string operator() (const Xapian::Document &doc) const { auto document_value = doc.get_value(value_index); return Xapian::sortable_serialise( levenshtein_distance(document_value, query)); } private: std::string query; size_t value_index; }; } #endif Search::Search(const std::vector zimfiles) : internal(new InternalData), zimfiles(zimfiles), prefixes(""), query(""), latitude(0), longitude(0), distance(0), range_start(0), range_end(0), suggestion_mode(false), geo_query(false), search_started(false), has_database(false), verbose(false), estimated_matches_number(0) {} Search::Search(const File* zimfile) : internal(new InternalData), prefixes(""), query(""), latitude(0), longitude(0), distance(0), range_start(0), range_end(0), suggestion_mode(false), geo_query(false), search_started(false), has_database(false), verbose(false), estimated_matches_number(0) { zimfiles.push_back(zimfile); } Search::Search(const Search& it) : internal(new InternalData), zimfiles(it.zimfiles), prefixes(it.prefixes), query(it.query), latitude(it.latitude), longitude(it.longitude), distance(it.distance), range_start(it.range_start), range_end(it.range_end), suggestion_mode(it.suggestion_mode), geo_query(it.geo_query), search_started(false), has_database(false), verbose(it.verbose), estimated_matches_number(0) { } Search& Search::operator=(const Search& it) { if ( internal ) internal.reset(); 
zimfiles = it.zimfiles; prefixes = it.prefixes; query = it.query; latitude = it.latitude; longitude = it.longitude; distance = it.distance; range_start = it.range_start; range_end = it.range_end; suggestion_mode = it.suggestion_mode; geo_query = it.geo_query; search_started = false; has_database = false; verbose = it.verbose; estimated_matches_number = 0; return *this; } Search::Search(Search&& it) = default; Search& Search::operator=(Search&& it) = default; Search::~Search() = default; void Search::set_verbose(bool verbose) { std::cout << "set verbose" << std::endl; this->verbose = verbose; } Search& Search::add_zimfile(const File* zimfile) { zimfiles.push_back(zimfile); return *this; } Search& Search::set_query(const std::string& query) { this->query = query; return *this; } Search& Search::set_georange(float latitude, float longitude, float distance) { this->latitude = latitude; this->longitude = longitude; this->distance = distance; geo_query = true; return *this; } Search& Search::set_range(int start, int end) { this->range_start = start; this->range_end = end; return *this; } Search& Search::set_suggestion_mode(const bool suggestion_mode) { this->suggestion_mode = suggestion_mode; return *this; } Search::iterator Search::begin() const { #if defined(ENABLE_XAPIAN) if ( this->search_started ) { return new search_iterator::InternalData(this, internal->results.begin()); } std::vector::const_iterator it; bool first = true; std::string language; std::string stopwords; for(it=zimfiles.begin(); it!=zimfiles.end(); it++) { const File* zimfile = *it; if (zimfile->is_multiPart()) { continue; } zim::Article xapianArticle = zimfile->getArticle('X', "fulltext/xapian"); if (!xapianArticle.good()) { xapianArticle = zimfile->getArticle('Z', "/fulltextIndex/xapian"); } if (!xapianArticle.good()) { continue; } auto dbOffset = xapianArticle.getOffset(); if (dbOffset == 0) { continue; } std::cerr << "Try to open " << zimfile->getFilename() << " at offset " << dbOffset; 
DEFAULTFS::FD databasefd; try { databasefd = DEFAULTFS::openFile(zimfile->getFilename()); } catch (...) { std::cerr << "Impossible to open " << zimfile->getFilename() << std::endl; std::cerr << strerror(errno) << std::endl; continue; } if (!databasefd.seek(offset_t(dbOffset))) { std::cerr << "Something went wrong seeking databasedb " << zimfile->getFilename() << std::endl; std::cerr << "dbOffest = " << dbOffset << std::endl; continue; } Xapian::Database database; try { database = Xapian::Database(databasefd.release()); } catch( Xapian::DatabaseError& e) { std::cerr << "Something went wrong opening xapian database for zimfile " << zimfile->getFilename() << std::endl; std::cerr << "dbOffest = " << dbOffset << std::endl; std::cerr << "error = " << e.get_msg() << std::endl; continue; } if ( first ) { this->valuesmap = read_valuesmap(database.get_metadata("valuesmap")); language = database.get_metadata("language"); if (language.empty() ) { // Database created before 2017/03 has no language metadata. // However, term were stemmed anyway and we need to stem our // search query the same the database was created. // So we need a language, let's use the one of the zim. // If zimfile has no language metadata, we can't do lot more here :/ auto article = zimfile->getArticle('M', "Language"); if ( article.good() ) { language = article.getData(); } } stopwords = database.get_metadata("stopwords"); this->prefixes = database.get_metadata("prefixes"); } else { std::map valuesmap = read_valuesmap(database.get_metadata("valuesmap")); if (this->valuesmap != valuesmap ) { // [TODO] Ignore the database, raise a error ? } } internal->xapian_databases.push_back(database); internal->database.add_database(database); has_database = true; } if ( ! 
has_database ) { if (verbose) { std::cout << "No database, no result" << std::endl; } estimated_matches_number = 0; return nullptr; } Xapian::QueryParser* queryParser = new Xapian::QueryParser(); if (verbose) { std::cout << "Setup queryparser using language " << language << std::endl; } queryParser->set_default_op(Xapian::Query::op::OP_AND); setup_queryParser(queryParser, internal->database, language, stopwords); std::string prefix = ""; unsigned flags = Xapian::QueryParser::FLAG_DEFAULT; if (suggestion_mode) { if (verbose) { std::cout << "Mark query as 'partial'" << std::endl; } flags |= Xapian::QueryParser::FLAG_PARTIAL; if (this->prefixes.find("S") != std::string::npos ) { if (verbose) { std::cout << "Searching in title namespace" << std::endl; } prefix = "S"; } } Xapian::Query query; try { query = queryParser->parse_query(this->query, flags, prefix); } catch (Xapian::QueryParserError& e) { estimated_matches_number = 0; return nullptr; } if (verbose) { std::cout << "Parsed query '" << this->query << "' to " << query.get_description() << std::endl; } delete queryParser; Xapian::Enquire enquire(internal->database); Xapian::KeyMaker* keyMaker(nullptr); if (geo_query && valuesmap.find("geo.position") != valuesmap.end()) { Xapian::GreatCircleMetric metric; Xapian::LatLongCoord centre(latitude, longitude); Xapian::LatLongDistancePostingSource ps(valuesmap["geo.position"], centre, metric, distance); if ( this->query.empty()) { query = Xapian::Query(&ps); } else { query = Xapian::Query(Xapian::Query::OP_FILTER, query, Xapian::Query(&ps)); } } enquire.set_query(query); if (suggestion_mode) { size_t value_index = 0; bool has_custom_distance_maker = true; if ( !valuesmap.empty() ) { if ( valuesmap.find("title") != valuesmap.end() ) { value_index = valuesmap["title"]; } else { // This should not happen as valuesmap has a title entry, but let's // be tolerent. 
has_custom_distance_maker = false; } } auto temp_results = enquire.get_mset(0,0); if ( has_custom_distance_maker && temp_results.get_matches_estimated() <= MAX_MATCHES_TO_SORT ) { keyMaker = new LevenshteinDistanceMaker(this->query, value_index); enquire.set_sort_by_key(keyMaker, false); } } internal->results = enquire.get_mset(this->range_start, this->range_end-this->range_start); search_started = true; estimated_matches_number = internal->results.get_matches_estimated(); delete keyMaker; return new search_iterator::InternalData(this, internal->results.begin()); #else estimated_matches_number = 0; return nullptr; #endif } Search::iterator Search::end() const { #if defined(ENABLE_XAPIAN) if ( ! has_database ) { return nullptr; } return new search_iterator::InternalData(this, internal->results.end()); #else return nullptr; #endif } int Search::get_matches_estimated() const { // Ensure that the search as begin begin(); return estimated_matches_number; } } //namespace zim libzim-4.0.4/src/search_internal.h000066400000000000000000000050371334353060400171140ustar00rootroot00000000000000/* * Copyright (C) 2006 Tommi Maekitalo * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #ifndef ZIM_SEARCH_INTERNAL_H #define ZIM_SEARCH_INTERNAL_H #include "config.h" #if defined(ENABLE_XAPIAN) #include #endif namespace zim { struct Search::InternalData { #if defined(ENABLE_XAPIAN) std::vector xapian_databases; Xapian::Database database; Xapian::MSet results; #endif }; struct search_iterator::InternalData { #if defined(ENABLE_XAPIAN) const Search* search; Xapian::MSetIterator iterator; Xapian::Document _document; bool document_fetched; #endif Article _article; bool article_fetched; #if defined(ENABLE_XAPIAN) InternalData(const Search* search, Xapian::MSetIterator iterator) : search(search), iterator(iterator), document_fetched(false), article_fetched(false) {}; Xapian::Document get_document() { if ( !document_fetched ) { if (iterator != search->internal->results.end()) { _document = iterator.get_document(); } document_fetched = true; } return _document; } #endif int get_databasenumber() { #if defined(ENABLE_XAPIAN) Xapian::docid docid = *iterator; return (docid - 1) % search->zimfiles.size(); #endif return 0; } Article& get_article() { #if defined(ENABLE_XAPIAN) if ( !article_fetched ) { int databasenumber = get_databasenumber(); const File* file = search->zimfiles[databasenumber]; if ( ! 
file ) _article = Article(); else _article = file->getArticleByUrl(get_document().get_data()); article_fetched = true; } #endif return _article; } }; }; //namespace zim #endif //ZIM_SEARCH_INTERNAL_H libzim-4.0.4/src/search_iterator.cpp000066400000000000000000000155371334353060400174720ustar00rootroot00000000000000/* * Copyright (C) 2017 Matthieu Gautier * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #include "xapian/myhtmlparse.h" #include #include #include #include "search_internal.h" namespace zim { search_iterator::~search_iterator() = default; search_iterator::search_iterator(search_iterator&& it) = default; search_iterator& search_iterator::operator=(search_iterator&& it) = default; search_iterator::search_iterator() : search_iterator(nullptr) {}; search_iterator::search_iterator(InternalData* internal_data) : internal(internal_data) {} search_iterator::search_iterator(const search_iterator& it) : internal(nullptr) { if (it.internal) internal = std::unique_ptr(new InternalData(*it.internal)); } search_iterator & search_iterator::operator=(const search_iterator& it) { if ( ! it.internal ) internal.reset(); else if ( ! 
internal ) internal = std::unique_ptr(new InternalData(*it.internal)); else *internal = *it.internal; return *this; } bool search_iterator::operator==(const search_iterator& it) const { #if defined(ENABLE_XAPIAN) if ( ! internal && ! it.internal) return true; if ( ! internal || ! it.internal) return false; return (internal->search == it.internal->search && internal->iterator == it.internal->iterator); #else // If there is no xapian, there is no search. There is only one iterator: end. // So all iterators are equal. return true; #endif } bool search_iterator::operator!=(const search_iterator& it) const { return ! (*this == it); } search_iterator& search_iterator::operator++() { #if defined(ENABLE_XAPIAN) if ( ! internal ) { return *this; } ++(internal->iterator); internal->document_fetched = false; internal->article_fetched = false; #endif return *this; } search_iterator search_iterator::operator++(int) { search_iterator it = *this; operator++(); return it; } search_iterator& search_iterator::operator--() { #if defined(ENABLE_XAPIAN) if ( ! internal ) { return *this; } --(internal->iterator); internal->document_fetched = false; internal->article_fetched = false; #endif return *this; } search_iterator search_iterator::operator--(int) { search_iterator it = *this; operator--(); return it; } std::string search_iterator::get_url() const { #if defined(ENABLE_XAPIAN) if ( ! internal ) { return ""; } return internal->get_document().get_data(); #else return ""; #endif } std::string search_iterator::get_title() const { #if defined(ENABLE_XAPIAN) if ( ! internal ) { return ""; } if ( internal->search->valuesmap.empty() ) { /* This is the old legacy version. 
Guess and try */ return internal->get_document().get_value(0); } else if ( internal->search->valuesmap.find("title") != internal->search->valuesmap.end() ) { return internal->get_document().get_value(internal->search->valuesmap["title"]); } #endif return ""; } int search_iterator::get_score() const { #if defined(ENABLE_XAPIAN) if ( ! internal ) { return 0; } return internal->iterator.get_percent(); #else return 0; #endif } std::string search_iterator::get_snippet() const { #if defined(ENABLE_XAPIAN) if ( ! internal ) { return ""; } if ( internal->search->valuesmap.empty() ) { /* This is the old legacy version. Guess and try */ std::string stored_snippet = internal->get_document().get_value(1); if ( ! stored_snippet.empty() ) return stored_snippet; /* Let's continue here, and see if we can genenate one */ } else if ( internal->search->valuesmap.find("snippet") != internal->search->valuesmap.end() ) { return internal->get_document().get_value(internal->search->valuesmap["snippet"]); } /* No reader, no snippet */ Article& article = internal->get_article(); if ( ! article.good() ) return ""; /* Get the content of the article to generate a snippet. We parse it and use the html dump to avoid remove html tags in the content and be able to nicely cut the text at random place. */ zim::MyHtmlParser htmlParser; std::string content = article.getData(); try { htmlParser.parse_html(content, "UTF-8", true); } catch (...) {} return internal->search->internal->results.snippet(htmlParser.dump, 500); #else return ""; #endif } int search_iterator::get_size() const { #if defined(ENABLE_XAPIAN) if ( ! internal ) { return -1; } if ( internal->search->valuesmap.empty() ) { /* This is the old legacy version. Guess and try */ return internal->get_document().get_value(2).empty() == true ? 
-1 : atoi(internal->get_document().get_value(2).c_str()); } else if ( internal->search->valuesmap.find("size") != internal->search->valuesmap.end() ) { return atoi(internal->get_document().get_value(internal->search->valuesmap["size"]).c_str()); } #endif /* The size is never used. Do we really want to get the content and calculate the size ? */ return -1; } int search_iterator::get_wordCount() const { #if defined(ENABLE_XAPIAN) if ( ! internal ) { return -1; } if ( internal->search->valuesmap.empty() ) { /* This is the old legacy version. Guess and try */ return internal->get_document().get_value(3).empty() == true ? -1 : atoi(internal->get_document().get_value(3).c_str()); } else if ( internal->search->valuesmap.find("wordcount") != internal->search->valuesmap.end() ) { return atoi(internal->get_document().get_value(internal->search->valuesmap["wordcount"]).c_str()); } #endif return -1; } int search_iterator::get_fileIndex() const { #if defined(ENABLE_XAPIAN) if ( internal ) { return internal->get_databasenumber(); } #endif return 0; } search_iterator::reference search_iterator::operator*() const { return internal->get_article(); } search_iterator::pointer search_iterator::operator->() const { return &internal->get_article(); } } // namespace zim libzim-4.0.4/src/template.cpp000066400000000000000000000055611334353060400161230ustar00rootroot00000000000000/* * Copyright (C) 2009 Tommi Maekitalo * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #include "template.h" namespace zim { void TemplateParser::state_data(char ch) { data += ch; if (ch == '<') { state = &TemplateParser::state_lt; save = data.size() - 1; } } void TemplateParser::state_lt(char ch) { data += ch; if (ch == '%') state = &TemplateParser::state_token0; else state = &TemplateParser::state_data; } void TemplateParser::state_token0(char ch) { data += ch; if (ch == '/') state = &TemplateParser::state_link0; else { token = data.size() - 1; state = &TemplateParser::state_token; } } void TemplateParser::state_token(char ch) { data += ch; if (ch == '%') state = &TemplateParser::state_token_end; } void TemplateParser::state_token_end(char ch) { if (ch == '>') { if (event) { event->onData(data.substr(0, save)); event->onToken(data.substr(token, data.size() - token - 1)); data.clear(); } state = &TemplateParser::state_data; } else { data += ch; state = &TemplateParser::state_data; } } void TemplateParser::state_link0(char ch) { data += ch; ns = ch; state = &TemplateParser::state_link; } void TemplateParser::state_link(char ch) { data += ch; if (ch == '/') { token = data.size(); state = &TemplateParser::state_title; } else state = &TemplateParser::state_data; } void TemplateParser::state_title(char ch) { data += ch; if (ch == '%') { token_e = data.size() - 1; state = &TemplateParser::state_title_end; } } void TemplateParser::state_title_end(char ch) { data += ch; if (ch == '>') { if (event) { event->onData(data.substr(0, save)); event->onLink(ns, data.substr(token, token_e - token)); } data.clear(); state = &TemplateParser::state_data; } } void TemplateParser::flush() { if (event) event->onData(data); data.clear(); state = &TemplateParser::state_data; } } 
libzim-4.0.4/src/template.h000066400000000000000000000042111334353060400155570ustar00rootroot00000000000000/* * Copyright (C) 2009 Tommi Maekitalo * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #ifndef ZIM_TEMPLATE_H #define ZIM_TEMPLATE_H #include namespace zim { class TemplateParser { public: class Event { public: virtual void onData(const std::string& data) = 0; virtual void onToken(const std::string& token) = 0; virtual void onLink(char ns, const std::string& url) = 0; virtual ~Event() = default; }; private: Event* event; std::string data; std::string::size_type save; std::string::size_type token; std::string::size_type token_e; char ns; typedef void (TemplateParser::*state_type)(char); state_type state; void state_data(char ch); void state_lt(char ch); void state_token0(char ch); void state_token(char ch); void state_token_end(char ch); void state_link0(char ch); void state_link(char ch); void state_title(char ch); void state_title_end(char ch); public: explicit TemplateParser(Event* ev) : event(ev), state(&TemplateParser::state_data) { } void parse(char ch) { (this->*state)(ch); } void parse(const std::string& s) { for (std::string::const_iterator ch = s.begin(); ch != s.end(); ++ch) parse(*ch); } void flush(); }; } #endif // ZIM_TEMPLATE_H 
libzim-4.0.4/src/tools.cpp000066400000000000000000000033161334353060400154440ustar00rootroot00000000000000/* * Copyright 2013-2016 Emmanuel Engelhart * Copyright 2016 Matthieu Gautier * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 3 of the License, or * any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, * MA 02110-1301, USA. */ #include "tools.h" #include #include #include #include #include #include #include #include #include #include #ifdef _WIN32 # include # include # include # include # define SEPARATOR "\\" #else # include # define SEPARATOR "/" #endif std::string zim::removeAccents(const std::string& text) { ucnv_setDefaultName("UTF-8"); static UErrorCode status = U_ZERO_ERROR; static std::unique_ptr removeAccentsTrans(icu::Transliterator::createInstance( "Lower; NFD; [:M:] remove; NFC", UTRANS_FORWARD, status)); icu::UnicodeString ustring(text.c_str()); removeAccentsTrans->transliterate(ustring); std::string unaccentedText; ustring.toUTF8String(unaccentedText); return unaccentedText; } libzim-4.0.4/src/tools.h000066400000000000000000000020011334353060400150770ustar00rootroot00000000000000/* * Copyright 2013-2016 Emmanuel Engelhart * Copyright 2016 Matthieu Gautier * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 3 of the License, or * any later version. 
* * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, * MA 02110-1301, USA. */ #ifndef OPENZIM_LIBZIM_TOOLS_H #define OPENZIM_LIBZIM_TOOLS_H #include namespace zim { std::string removeAccents(const std::string& text); } #endif // OPENZIM_LIBZIM_TOOLS_H libzim-4.0.4/src/uuid.cpp000066400000000000000000000046431334353060400152560ustar00rootroot00000000000000/* * Copyright (C) 2009 Tommi Maekitalo * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #include #include #include #include // necessary to have the new types #include "log.h" #include "md5stream.h" #ifdef _WIN32 # include # include int gettimeofday(struct timeval* tp, void* tzp) { DWORD t; t = timeGetTime(); tp->tv_sec = t / 1000; tp->tv_usec = t % 1000; return 0; } #define getpid GetCurrentProcessId #else # include #endif log_define("zim.uuid") namespace zim { namespace { char hex[] = "0123456789abcdef"; inline char hi(char v) { return hex[(v >> 4) & 0xf]; } inline char lo(char v) { return hex[v & 0xf]; } } Uuid Uuid::generate(std::string value) { Uuid ret; Md5stream m; if ( value.empty() ) { struct timeval tv; gettimeofday(&tv, 0); clock_t c = clock(); m << c << tv.tv_sec << tv.tv_usec; } else { m << value; } m.getDigest(reinterpret_cast(&ret.data[0])); log_debug("generated uuid: " << ret.data); return ret; } std::ostream& operator<< (std::ostream& out, const Uuid& uuid) { for (unsigned n = 0; n < 4; ++n) out << hi(uuid.data[n]) << lo(uuid.data[n]); out << '-'; for (unsigned n = 4; n < 6; ++n) out << hi(uuid.data[n]) << lo(uuid.data[n]); out << '-'; for (unsigned n = 6; n < 8; ++n) out << hi(uuid.data[n]) << lo(uuid.data[n]); out << '-'; for (unsigned n = 8; n < 10; ++n) out << hi(uuid.data[n]) << lo(uuid.data[n]); out << '-'; for (unsigned n = 10; n < 16; ++n) out << hi(uuid.data[n]) << lo(uuid.data[n]); return out; } } libzim-4.0.4/src/writer/000077500000000000000000000000001334353060400151115ustar00rootroot00000000000000libzim-4.0.4/src/writer/_dirent.h000066400000000000000000000046071334353060400167150ustar00rootroot00000000000000/* * Copyright (C) 2009 Tommi Maekitalo * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software 
Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #ifndef ZIM_WRITER_DIRENT_H #define ZIM_WRITER_DIRENT_H #include "../_dirent.h" #include "cluster.h" namespace zim { namespace writer { class Dirent : public zim::Dirent { Cluster* cluster = nullptr; std::string aid; std::string redirectAid; article_index_t idx = article_index_t(0); public: Dirent() {} Dirent(const std::string& aid_) : aid(aid_) {} Dirent(char ns, const std::string& url) { setUrl(ns, url); } void setAid(const std::string& aid_) { aid = aid_; } const std::string& getAid() const { return aid; } void setRedirectAid(const std::string& aid_) { redirectAid = aid_; } const std::string& getRedirectAid() const { return redirectAid; } void setIdx(article_index_t idx_) { idx = idx_; } article_index_t getIdx() const { return idx; } void setCluster(zim::writer::Cluster* _cluster) { cluster = _cluster; blobNumber = _cluster->count(); } cluster_index_t getClusterNumber() const { return cluster ? 
cluster->getClusterIndex() : clusterNumber; } }; std::ostream& operator<< (std::ostream& out, const Dirent& d); inline bool compareUrl(const Dirent& d1, const Dirent& d2) { return d1.getNamespace() < d2.getNamespace() || (d1.getNamespace() == d2.getNamespace() && d1.getUrl() < d2.getUrl()); } inline bool compareAid(const Dirent& d1, const Dirent& d2) { return d1.getAid() < d2.getAid(); } } } #endif // ZIM_WRITER_DIRENT_H libzim-4.0.4/src/writer/article.cpp000066400000000000000000000022771334353060400172500ustar00rootroot00000000000000/* * Copyright (C) 2009 Tommi Maekitalo * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #include #include namespace zim { namespace writer { bool Article::isLinktarget() const { return false; } bool Article::isDeleted() const { return false; } std::string Article::getParameter() const { return std::string(); } std::string Article::getNextCategory() { return std::string(); } } } libzim-4.0.4/src/writer/cluster.cpp000066400000000000000000000162231334353060400173020ustar00rootroot00000000000000/* * Copyright (C) 2017 Matthieu Gautier * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #include "cluster.h" #include "../log.h" #include "../endian_tools.h" #include "../debug.h" #include #include #if defined(ENABLE_ZLIB) #include "deflatestream.h" #endif #include "lzmastream.h" #ifdef _WIN32 #define SEPARATOR "\\" #else #define SEPARATOR "/" #endif namespace zim { namespace writer { Cluster::Cluster(CompressionType compression) : compression(compression), isExtended(false), _size(0) { offsets.push_back(offset_t(0)); pthread_mutex_init(&m_closedMutex,NULL); } void Cluster::clear() { offsets.clear(); _data.clear(); } void Cluster::close() { pthread_mutex_lock(&m_closedMutex); closed = true; pthread_mutex_unlock(&m_closedMutex); } bool Cluster::isClosed() const{ bool v; pthread_mutex_lock(&m_closedMutex); v = closed; pthread_mutex_unlock(&m_closedMutex); return v; } zsize_t Cluster::size() const { if (isClosed()) { throw std::runtime_error("oups"); } if (isExtended) { return zsize_t(offsets.size() * sizeof(uint64_t)) + _size; } else { return zsize_t(offsets.size() * sizeof(uint32_t)) + _size; } } zsize_t Cluster::getFinalSize() const { return finalSize; } template void Cluster::write_offsets(std::ostream& out) const { size_type delta = offsets.size() * sizeof(OFFSET_TYPE); for (auto offset : offsets) { offset.v += delta; char out_buf[sizeof(OFFSET_TYPE)]; toLittleEndian(static_cast(offset.v), out_buf); out.write(out_buf, sizeof(OFFSET_TYPE)); } } void Cluster::write_final(std::ostream& out) const { if(getCompression() == zim::zimcompNone) { dump(out); } else { std::ifstream clustersFile(tmp_filename, std::ios::binary); out << clustersFile.rdbuf(); } if (!out) { throw std::runtime_error("failed to write cluster"); } } void Cluster::dump_tmp(const std::string& directoryPath) { if(getCompression() == zim::zimcompNone) { //No real dump, store 
inmemory data in file size_t file_index = 0; for (auto& data: _data) { ASSERT(data.value.empty(), ==, false); if (data.type == DataType::plain) { std::ostringstream ss; ss << directoryPath << SEPARATOR << "file_" << index << "_" << file_index << ".tmp"; auto filename = ss.str(); { std::ofstream out(filename, std::ios::binary); out << data.value; if (!out) { throw std::runtime_error( std::string("failed to write temporary cluster file ") + filename); } } data.type = DataType::file; data.value = filename; } file_index++; } finalSize = zsize_t(size().v+1); } else { std::ostringstream ss; ss << directoryPath << SEPARATOR << "cluster_" << index << ".clt"; tmp_filename = ss.str(); std::ofstream out(tmp_filename, std::ios::binary); dump(out); if (!out) { throw std::runtime_error( std::string("failed to write temporary cluster file ") + tmp_filename); } finalSize = zsize_t(out.tellp()); clear(); } } void Cluster::write(std::ostream& out) const { if (isExtended) { write_offsets(out); } else { write_offsets(out); } write_data(out); } void Cluster::dump(std::ostream& out) const { // write clusterInfo char clusterInfo = 0; if (isExtended) { clusterInfo = 0x10; } clusterInfo += getCompression(); out.put(clusterInfo); // Open a comprestion stream if needed switch(getCompression()) { case zim::zimcompDefault: case zim::zimcompNone: write(out); break; case zim::zimcompZip: { #if defined(ENABLE_ZLIB) log_debug("compress data (zlib)"); zim::writer::DeflateStream os(out); os.exceptions(std::ios::failbit | std::ios::badbit); write(os); os.flush(); os.end(); #else throw std::runtime_error("zlib not enabled in this library"); #endif break; } case zim::zimcompBzip2: { throw std::runtime_error("bzip2 not enabled in this library"); break; } case zim::zimcompLzma: { uint32_t lzmaPreset = 3 | LZMA_PRESET_EXTREME; /** * read lzma preset from environment * ZIM_LZMA_PRESET is a number followed optionally by a * suffix 'e'. 
The number gives the preset and the suffix tells, * if LZMA_PRESET_EXTREME should be set. * e.g.: * ZIM_LZMA_LEVEL=9 => 9 * ZIM_LZMA_LEVEL=3e => 3 + extreme */ const char* e = ::getenv("ZIM_LZMA_LEVEL"); if (e) { char flag = '\0'; std::istringstream s(e); s >> lzmaPreset >> flag; if (flag == 'e') lzmaPreset |= LZMA_PRESET_EXTREME; } log_debug("compress data (lzma, " << std::hex << lzmaPreset << ")"); zim::writer::LzmaStream os(out, lzmaPreset); os.exceptions(std::ios::failbit | std::ios::badbit); write(os); os.end(); break; } default: std::ostringstream msg; msg << "invalid compression flag " << getCompression(); log_error(msg.str()); throw std::runtime_error(msg.str()); } } void Cluster::addArticle(const zim::writer::Article* article) { auto filename = article->getFilename(); auto size = article->getSize(); _size += size; offsets.push_back(offset_t(_size.v)); isExtended |= (size>UINT32_MAX); if (size == 0) return; if (filename.empty()) { _data.emplace_back(DataType::plain, article->getData()); } else { _data.emplace_back(DataType::file, filename); } } void Cluster::addData(const char* data, zsize_t size) { _size += size; offsets.push_back(offset_t(_size.v)); isExtended |= (size.v>UINT32_MAX); if (size.v == 0) return; _data.emplace_back(DataType::plain, data, size.v); } void Cluster::write_data(std::ostream& out) const { for (auto& data: _data) { ASSERT(data.value.empty(), ==, false); if (data.type == DataType::plain) { out << data.value; } else { std::ifstream stream(data.value, std::ios::binary); if (!stream) { throw std::runtime_error(std::string("cannot open ") + data.value); } out << stream.rdbuf(); if (!out) { throw std::runtime_error(std::string("failed to write file ") + data.value); } } } } } // writer } // zim libzim-4.0.4/src/writer/cluster.h000066400000000000000000000057011334353060400167460ustar00rootroot00000000000000/* * Copyright (C) 2017 Matthieu Gautier * * This program is free software; you can redistribute it and/or * modify it under the terms 
of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #ifndef ZIM_WRITER_CLUSTER_H_ #define ZIM_WRITER_CLUSTER_H_ #include #include #include #include #include #include #include "../zim_types.h" namespace zim { namespace writer { enum class DataType { plain, file }; struct Data { Data(zim::writer::DataType type, const std::string& value) : type(type), value(value) {} Data(zim::writer::DataType type, const char* data, zim::size_type size) : type(type), value(data, size) {} DataType type; std::string value; }; class Cluster { typedef std::vector Offsets; typedef std::vector ClusterData; public: Cluster(CompressionType compression); virtual ~Cluster() { pthread_mutex_destroy(&m_closedMutex);} void setCompression(CompressionType c) { compression = c; } CompressionType getCompression() const { return compression; } void addArticle(const zim::writer::Article* article); void addData(const char* data, zsize_t size); blob_index_t count() const { return blob_index_t(offsets.size() - 1); } zsize_t size() const; zsize_t getFinalSize() const; bool is_extended() const { return isExtended; } void clear(); void close(); bool isClosed() const; void setClusterIndex(cluster_index_t idx) { index = idx; } cluster_index_t getClusterIndex() const { return index; } zsize_t getBlobSize(blob_index_t n) const { return zsize_t(offsets[blob_index_type(n)+1].v - offsets[blob_index_type(n)].v); } void 
write_final(std::ostream& out) const; void dump_tmp(const std::string& directoryPath); void dump(std::ostream& out) const; protected: CompressionType compression; cluster_index_t index; bool isExtended; Offsets offsets; zsize_t _size; zsize_t finalSize; ClusterData _data; std::string tmp_filename; mutable pthread_mutex_t m_closedMutex; bool closed = false; private: void write(std::ostream& out) const; template void write_offsets(std::ostream& out) const; void write_data(std::ostream& out) const; }; }; }; #endif //ZIM_WRITER_CLUSTER_H_ libzim-4.0.4/src/writer/deflatestream.cpp000066400000000000000000000113271334353060400204410ustar00rootroot00000000000000/* * Copyright (C) 2003-2005 Tommi Maekitalo * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * */ #include "deflatestream.h" #include "log.h" #include #include log_define("zim.deflatestream") namespace zim { namespace writer { namespace { int checkError(int ret, z_stream& stream) { if (ret != Z_OK && ret != Z_STREAM_END) { log_error("DeflateError " << ret << ": \"" << (stream.msg ? 
stream.msg : "") << '"'); std::ostringstream msg; msg << "deflate-error " << ret; if (stream.msg) msg << ": " << stream.msg; throw DeflateError(ret, msg.str()); } return ret; } } DeflateStreamBuf::DeflateStreamBuf(std::streambuf* sink_, int level, unsigned bufsize_) : obuffer(bufsize_), sink(sink_) { memset(&stream, 0, sizeof(z_stream)); stream.zalloc = Z_NULL; stream.zfree = Z_NULL; stream.opaque = 0; stream.total_out = 0; stream.total_in = 0; stream.next_in = Z_NULL; stream.next_out = Z_NULL; stream.avail_in = 0; stream.avail_out = 0; checkError(::deflateInit(&stream, level), stream); setp(&obuffer[0], &obuffer[0] + obuffer.size()); } DeflateStreamBuf::~DeflateStreamBuf() { ::deflateEnd(&stream); } DeflateStreamBuf::int_type DeflateStreamBuf::overflow(int_type c) { // initialize input-stream stream.next_in = reinterpret_cast(&obuffer[0]); stream.avail_in = pptr() - &obuffer[0]; // initialize zbuffer for deflated data char zbuffer[8192]; stream.next_out = reinterpret_cast(zbuffer); stream.avail_out = sizeof(zbuffer); // deflate checkError(::deflate(&stream, Z_NO_FLUSH), stream); // copy zbuffer to sink / consume deflated data std::streamsize count = sizeof(zbuffer) - stream.avail_out; if (count > 0) { std::streamsize n = sink->sputn(zbuffer, count); if (n < count) return traits_type::eof(); } // move remaining characters to start of obuffer if (stream.avail_in > 0) memmove(&obuffer[0], stream.next_in, stream.avail_in); // reset outbuffer setp(&obuffer[0] + stream.avail_in, &obuffer[0] + obuffer.size()); if (c != traits_type::eof()) sputc(traits_type::to_char_type(c)); return 0; } DeflateStreamBuf::int_type DeflateStreamBuf::underflow() { return traits_type::eof(); } int DeflateStreamBuf::sync() { // initialize input-stream for stream.next_in = reinterpret_cast(&obuffer[0]); stream.avail_in = pptr() - &obuffer[0]; char zbuffer[8192]; while (stream.avail_in > 0) { // initialize zbuffer stream.next_out = (Bytef*)zbuffer; stream.avail_out = sizeof(zbuffer); 
checkError(::deflate(&stream, Z_SYNC_FLUSH), stream); // copy zbuffer to sink std::streamsize count = sizeof(zbuffer) - stream.avail_out; if (count > 0) { std::streamsize n = sink->sputn(zbuffer, count); if (n < count) return -1; } }; // reset outbuffer setp(&obuffer[0], &obuffer[0] + obuffer.size()); return 0; } int DeflateStreamBuf::end() { char zbuffer[8192]; // initialize input-stream for stream.next_in = reinterpret_cast(&obuffer[0]); stream.avail_in = pptr() - &obuffer[0]; while (true) { // initialize zbuffer stream.next_out = (Bytef*)zbuffer; stream.avail_out = sizeof(zbuffer); int ret = checkError(::deflate(&stream, Z_FINISH), stream); // copy zbuffer to sink std::streamsize count = sizeof(zbuffer) - stream.avail_out; if (count > 0) { std::streamsize n = sink->sputn(zbuffer, count); if (n < count) throw DeflateError(0, "failed to send compressed data to sink in deflatestream"); } if (ret == Z_STREAM_END) break; }; // reset outbuffer setp(&obuffer[0], &obuffer[0] + obuffer.size()); return 0; } void DeflateStream::end() { if (streambuf.end() != 0) setstate(failbit); } } } libzim-4.0.4/src/writer/deflatestream.h000066400000000000000000000051201334353060400201000ustar00rootroot00000000000000/* * Copyright (C) 2005-2008 Tommi Maekitalo * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * */ #ifndef ZIM_WRITER_DEFLATESTREAM_H #define ZIM_WRITER_DEFLATESTREAM_H #include #include #include #include namespace zim { namespace writer { class DeflateError : public std::runtime_error { int zRet; public: DeflateError(int zRet_, const std::string& msg) : std::runtime_error(msg), zRet(zRet_) { } int getRet() const { return zRet; } }; class DeflateStreamBuf : public std::streambuf { z_stream stream; std::vector obuffer; std::streambuf* sink; public: explicit DeflateStreamBuf(std::streambuf* sink_, int level = Z_DEFAULT_COMPRESSION, unsigned bufsize = 8192); ~DeflateStreamBuf(); /// see std::streambuf int_type overflow(int_type c); /// see std::streambuf int_type underflow(); /// see std::streambuf int sync(); /// end deflate-stream int end(); void setSink(std::streambuf* sink_) { sink = sink_; } uLong getAdler() const { return stream.adler; } }; class DeflateStream : public std::ostream { DeflateStreamBuf streambuf; public: explicit DeflateStream(std::streambuf* sink, int level = Z_DEFAULT_COMPRESSION) : std::ostream(0), streambuf(sink, level) { init(&streambuf); } explicit DeflateStream(std::ostream& sink, int level = Z_DEFAULT_COMPRESSION) : std::ostream(0), streambuf(sink.rdbuf(), level) { init(&streambuf); } void end(); void setSink(std::streambuf* sink) { streambuf.setSink(sink); } void setSink(std::ostream& sink) { streambuf.setSink(sink.rdbuf()); } uLong getAdler() const { return streambuf.getAdler(); } }; } } #endif // ZIM_WRITER_DEFLATESTREAM_H libzim-4.0.4/src/writer/dirent.cpp000066400000000000000000000040041334353060400171000ustar00rootroot00000000000000/* * Copyright (C) 2006 Tommi Maekitalo * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published 
by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #include "_dirent.h" #include #include "buffer.h" #include "endian_tools.h" #include "log.h" #include #include log_define("zim.dirent") std::ostream& zim::writer::operator<< (std::ostream& out, const zim::writer::Dirent& dirent) { union { char d[16]; long a; } header; zim::toLittleEndian(dirent.getMimeType(), header.d); header.d[2] = static_cast(dirent.getParameter().size()); header.d[3] = dirent.getNamespace(); log_debug("title=" << dirent.getTitle() << " title.size()=" << dirent.getTitle().size()); zim::toLittleEndian(dirent.getVersion(), header.d + 4); if (dirent.isRedirect()) { zim::toLittleEndian(dirent.getRedirectIndex().v, header.d + 8); out.write(header.d, 12); } else if (dirent.isLinktarget() || dirent.isDeleted()) { out.write(header.d, 8); } else { zim::toLittleEndian(zim::cluster_index_type(dirent.getClusterNumber()), header.d + 8); zim::toLittleEndian(zim::blob_index_type(dirent.getBlobNumber()), header.d + 12); out.write(header.d, 16); } out << dirent.getUrl() << '\0'; std::string t = dirent.getTitle(); if (t != dirent.getUrl()) out << t; out << '\0' << dirent.getParameter(); return out; } libzim-4.0.4/src/writer/lzmastream.cpp000066400000000000000000000124271334353060400200020ustar00rootroot00000000000000/* * Copyright (C) 2009 Tommi Maekitalo * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public 
License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #include "lzmastream.h" #include #include "log.h" #include #include log_define("zim.lzma.compress") namespace zim { namespace writer { namespace { lzma_ret checkError(lzma_ret ret) { if (ret != LZMA_OK && ret != LZMA_STREAM_END) { std::ostringstream msg; msg << "lzma-error " << ret; switch (ret) { case LZMA_OK: msg << ": LZMA_OK"; break; case LZMA_STREAM_END: msg << ": LZMA_STREAM_END"; break; case LZMA_NO_CHECK: msg << ": LZMA_NO_CHECK"; break; case LZMA_UNSUPPORTED_CHECK: msg << ": LZMA_UNSUPPORTED_CHECK"; break; case LZMA_GET_CHECK: msg << ": LZMA_GET_CHECK"; break; case LZMA_MEM_ERROR: msg << ": LZMA_MEM_ERROR"; break; case LZMA_MEMLIMIT_ERROR: msg << ": LZMA_MEMLIMIT_ERROR"; break; case LZMA_FORMAT_ERROR: msg << ": LZMA_FORMAT_ERROR"; break; case LZMA_OPTIONS_ERROR: msg << ": LZMA_OPTIONS_ERROR"; break; case LZMA_DATA_ERROR: msg << ": LZMA_DATA_ERROR"; break; case LZMA_BUF_ERROR: msg << ": LZMA_BUF_ERROR"; break; case LZMA_PROG_ERROR: msg << ": LZMA_PROG_ERROR"; break; } log_error(msg.str()); throw LzmaError(ret, msg.str()); } return ret; } } LzmaStreamBuf::LzmaStreamBuf(std::streambuf* sink_, uint32_t preset, lzma_check check, unsigned bufsize_) : obuffer(bufsize_), sink(sink_) { std::memset(reinterpret_cast(&stream), 0, sizeof(stream)); checkError( ::lzma_easy_encoder(&stream, preset, check)); setp(&obuffer[0], &obuffer[0] + obuffer.size()); } 
LzmaStreamBuf::~LzmaStreamBuf() { ::lzma_end(&stream); } LzmaStreamBuf::int_type LzmaStreamBuf::overflow(int_type c) { // initialize input-stream stream.next_in = reinterpret_cast(&obuffer[0]); stream.avail_in = pptr() - &obuffer[0]; // initialize zbuffer for compressed data char zbuffer[8192]; stream.next_out = reinterpret_cast(zbuffer); stream.avail_out = sizeof(zbuffer); // compress checkError(::lzma_code(&stream, LZMA_RUN)); // copy zbuffer to sink / consume deflated data std::streamsize count = sizeof(zbuffer) - stream.avail_out; if (count > 0) { std::streamsize n = sink->sputn(zbuffer, count); if (n < count) return traits_type::eof(); } // move remaining characters to start of obuffer if (stream.avail_in > 0) memmove(&obuffer[0], stream.next_in, stream.avail_in); // reset outbuffer setp(&obuffer[0] + stream.avail_in, &obuffer[0] + obuffer.size()); if (c != traits_type::eof()) sputc(traits_type::to_char_type(c)); return 0; } LzmaStreamBuf::int_type LzmaStreamBuf::underflow() { return traits_type::eof(); } int LzmaStreamBuf::sync() { // initialize input-stream for stream.next_in = reinterpret_cast(&obuffer[0]); stream.avail_in = pptr() - &obuffer[0]; char zbuffer[8192]; while (stream.avail_in > 0) { // initialize zbuffer stream.next_out = (uint8_t*)zbuffer; stream.avail_out = sizeof(zbuffer); checkError(::lzma_code(&stream, LZMA_FINISH)); // copy zbuffer to sink std::streamsize count = sizeof(zbuffer) - stream.avail_out; if (count > 0) { std::streamsize n = sink->sputn(zbuffer, count); if (n < count) return -1; } }; // reset outbuffer setp(&obuffer[0], &obuffer[0] + obuffer.size()); return 0; } int LzmaStreamBuf::end() { char zbuffer[8192]; // initialize input-stream for stream.next_in = reinterpret_cast(&obuffer[0]); stream.avail_in = pptr() - &obuffer[0]; lzma_ret ret; do { // initialize zbuffer stream.next_out = (uint8_t*)zbuffer; stream.avail_out = sizeof(zbuffer); ret = checkError(::lzma_code(&stream, LZMA_FINISH)); // copy zbuffer to sink std::streamsize 
count = sizeof(zbuffer) - stream.avail_out; if (count > 0) { std::streamsize n = sink->sputn(zbuffer, count); if (n < count) throw LzmaError(static_cast(0), "failed to send compressed data to sink in lzmastream"); } } while (ret != LZMA_STREAM_END); // reset outbuffer setp(&obuffer[0], &obuffer[0] + obuffer.size()); return 0; } void LzmaStream::end() { if (streambuf.end() != 0) setstate(failbit); } } } libzim-4.0.4/src/writer/lzmastream.h000066400000000000000000000054131334353060400174440ustar00rootroot00000000000000/* * Copyright (C) 2009 Tommi Maekitalo * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #ifndef ZIM_WRITER_LZMASTREAM_H #define ZIM_WRITER_LZMASTREAM_H #include #include #include #include namespace zim { namespace writer { class LzmaError : public std::runtime_error { lzma_ret ret; public: LzmaError(lzma_ret ret_, const std::string& msg) : std::runtime_error(msg), ret(ret_) { } lzma_ret getRetcode() const { return ret; } }; class LzmaStreamBuf : public std::streambuf { lzma_stream stream; std::vector obuffer; std::streambuf* sink; public: LzmaStreamBuf(std::streambuf* sink_, uint32_t preset = 3 | LZMA_PRESET_EXTREME, lzma_check check = LZMA_CHECK_CRC32 /* LZMA_CHECK_NONE */, unsigned bufsize = 8192); ~LzmaStreamBuf(); /// see std::streambuf int_type overflow(int_type c); /// see std::streambuf int_type underflow(); /// see std::streambuf int sync(); /// end stream int end(); void setSink(std::streambuf* sink_) { sink = sink_; } }; class LzmaStream : public std::ostream { LzmaStreamBuf streambuf; public: explicit LzmaStream(std::streambuf* sink, uint32_t preset = 3 | LZMA_PRESET_EXTREME, lzma_check check = LZMA_CHECK_CRC32 /* LZMA_CHECK_NONE */, unsigned bufsize = 8192) : std::ostream(0), streambuf(sink, preset, check, bufsize) { init(&streambuf); } explicit LzmaStream(std::ostream& sink, uint32_t preset = 3 | LZMA_PRESET_EXTREME, lzma_check check = LZMA_CHECK_CRC32 /* LZMA_CHECK_NONE */, unsigned bufsize = 8192) : std::ostream(0), streambuf(sink.rdbuf(), preset, check, bufsize) { init(&streambuf); } void end(); void setSink(std::streambuf* sink) { streambuf.setSink(sink); } void setSink(std::ostream& sink) { streambuf.setSink(sink.rdbuf()); } }; } } #endif // ZIM_WRITER_LZMASTREAM_H libzim-4.0.4/src/writer/queue.h000066400000000000000000000047261334353060400164170ustar00rootroot00000000000000/* * Copyright 2016 Matthieu Gautier * * This 
program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 3 of the License, or * any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, * MA 02110-1301, USA. */ #ifndef OPENZIM_LIBZIM_QUEUE_H #define OPENZIM_LIBZIM_QUEUE_H #define MAX_QUEUE_SIZE 10 #include #include #include #include template class Queue { public: Queue() {pthread_mutex_init(&m_queueMutex,NULL);}; virtual ~Queue() {pthread_mutex_destroy(&m_queueMutex);}; virtual bool isEmpty(); virtual void pushToQueue(const T& element); virtual bool popFromQueue(T &filename); protected: std::queue m_realQueue; pthread_mutex_t m_queueMutex; private: // Make this queue non copyable Queue(const Queue&); Queue& operator=(const Queue&); }; template bool Queue::isEmpty() { pthread_mutex_lock(&m_queueMutex); bool retVal = m_realQueue.empty(); pthread_mutex_unlock(&m_queueMutex); return retVal; } template void Queue::pushToQueue(const T &element) { unsigned int wait = 0; unsigned int queueSize = 0; do { std::this_thread::sleep_for(std::chrono::microseconds(wait)); pthread_mutex_lock(&m_queueMutex); queueSize = m_realQueue.size(); pthread_mutex_unlock(&m_queueMutex); wait += 10; } while (queueSize > MAX_QUEUE_SIZE); pthread_mutex_lock(&m_queueMutex); m_realQueue.push(element); pthread_mutex_unlock(&m_queueMutex); } template bool Queue::popFromQueue(T &element) { pthread_mutex_lock(&m_queueMutex); if (m_realQueue.empty()) { pthread_mutex_unlock(&m_queueMutex); return false; } element = m_realQueue.front(); 
m_realQueue.pop(); pthread_mutex_unlock(&m_queueMutex); return true; } #endif // OPENZIM_LIBZIM_QUEUE_H libzim-4.0.4/src/writer/tee.cpp000066400000000000000000000046031334353060400163750ustar00rootroot00000000000000/* * Copyright (C) 2003 Tommi Maekitalo * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * As a special exception, you may use this file as part of a free * software library without restriction. Specifically, if other files * instantiate templates or use macros or inline functions from this * file, or you compile this file and link it with other files to * produce an executable, this file does not by itself cause the * resulting executable to be covered by the GNU General Public * License. This exception does not however invalidate any other * reasons why the executable file might be covered by the GNU Library * General Public License. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ #include "tee.h" namespace zim { std::streambuf::int_type Teestreambuf::overflow(std::streambuf::int_type ch) { if(ch != traits_type::eof()) { if(streambuf1 && streambuf1->sputc(ch) == traits_type::eof()) return traits_type::eof(); if(streambuf2 && streambuf2->sputc(ch) == traits_type::eof()) return traits_type::eof(); } return 0; } std::streambuf::int_type Teestreambuf::underflow() { return traits_type::eof(); } int Teestreambuf::sync() { if(streambuf1 && streambuf1->pubsync() == traits_type::eof()) return traits_type::eof(); if(streambuf2 && streambuf2->pubsync() == traits_type::eof()) return traits_type::eof(); return 0; } ///////////////////////////////////////////////////////////////////////////// void Tee::assign(std::ostream& s1, std::ostream& s2) { Teestreambuf* buf = dynamic_cast(rdbuf()); if(buf) buf->tie(s1.rdbuf(), s2.rdbuf()); } void Tee::assign_single(std::ostream& s) { Teestreambuf* buf = dynamic_cast(rdbuf()); if(buf) buf->tie(s.rdbuf()); } } libzim-4.0.4/src/writer/tee.h000066400000000000000000000052521334353060400160430ustar00rootroot00000000000000/* * Copyright (C) 2003 Tommi Maekitalo * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * As a special exception, you may use this file as part of a free * software library without restriction. 
Specifically, if other files * instantiate templates or use macros or inline functions from this * file, or you compile this file and link it with other files to * produce an executable, this file does not by itself cause the * resulting executable to be covered by the GNU General Public * License. This exception does not however invalidate any other * reasons why the executable file might be covered by the GNU Library * General Public License. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef ZIM_TEE_H #define ZIM_TEE_H #include namespace zim { class Teestreambuf : public std::streambuf { public: Teestreambuf(std::streambuf* buf1 = 0, std::streambuf* buf2 = 0) : streambuf1(buf1), streambuf2(buf2) { setp(0, 0); } void tie(std::streambuf* buf1, std::streambuf* buf2 = 0) { streambuf1 = buf1; streambuf2 = buf2; } private: std::streambuf::int_type overflow(std::streambuf::int_type ch); std::streambuf::int_type underflow(); int sync(); std::streambuf* streambuf1; std::streambuf* streambuf2; }; ///////////////////////////////////////////////////////////////////////////// class Tee : public std::ostream { typedef std::ostream base_class; Teestreambuf streambuf; public: Tee() : std::ostream(0), streambuf(std::cout.rdbuf()) { init(&streambuf); } Tee(std::ostream& s1, std::ostream& s2) : std::ostream(0), streambuf(s1.rdbuf(), s2.rdbuf()) { init(&streambuf); } Tee(std::ostream& s) : std::ostream(0), streambuf(s.rdbuf(), std::cout.rdbuf()) { init(&streambuf); } void assign(std::ostream& s1, std::ostream& s2); void assign(std::ostream& s) { assign(s, 
std::cout); } void assign_single(std::ostream& s); }; } #endif // ZIM_TEE_H libzim-4.0.4/src/writer/xapianIndexer.cpp000066400000000000000000000127741334353060400204270ustar00rootroot00000000000000/* * Copyright 2011 Emmanuel Engelhart * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 3 of the License, or * any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, * MA 02110-1301, USA. */ #include "xapianIndexer.h" #include "libzim-resources.h" #include "fs.h" #include "tools.h" #include #include #include /* Count word */ unsigned int countWords(const string& text) { unsigned int numWords = 1; unsigned int length = text.size(); for (unsigned int i = 0; i < length;) { while (i < length && text[i] != ' ') { i++; } numWords++; i++; } return numWords; } /* Constructor */ XapianIndexer::XapianIndexer(const std::string& language, const bool verbose) : language(language) { /* Build ICU Local object to retrieve ISO-639 language code (from ISO-639-3) */ icu::Locale languageLocale(language.c_str()); /* Configuring language base steemming */ try { this->stemmer = Xapian::Stem(languageLocale.getLanguage()); this->indexer.set_stemmer(this->stemmer); this->indexer.set_stemming_strategy(Xapian::TermGenerator::STEM_ALL); } catch (...) 
{ std::cout << "No steemming for language '" << languageLocale.getLanguage() << "'" << std::endl; } /* Read the stopwords */ std::string stopWord; try { this->stopwords = getResource("stopwords/" + language); } catch(ResourceNotFound& e) {} std::istringstream file(this->stopwords); while (std::getline(file, stopWord, '\n')) { this->stopper.add(stopWord); } this->indexer.set_stopper(&(this->stopper)); this->indexer.set_stopper_strategy(Xapian::TermGenerator::STOP_ALL); } XapianIndexer::~XapianIndexer() { if (!indexPath.empty()) { try { #ifndef _WIN32 //[TODO] Implement remove for windows zim::DEFAULTFS::remove(indexPath + ".tmp"); zim::DEFAULTFS::remove(indexPath); #endif } catch (...) { /* Do not raise */ } } } void XapianIndexer::indexingPrelude(const string indexPath_) { indexPath = indexPath_; this->writableDatabase = Xapian::WritableDatabase( indexPath + ".tmp", Xapian::DB_CREATE_OR_OVERWRITE); this->writableDatabase.set_metadata("valuesmap", "title:0;wordcount:1;geo.position:2"); this->writableDatabase.set_metadata("language", language); this->writableDatabase.set_metadata("stopwords", stopwords); this->writableDatabase.set_metadata("prefixes", "S"); this->writableDatabase.begin_transaction(true); } void XapianIndexer::index(const zim::writer::Article* article) { /* Put the data in the document */ Xapian::Document currentDocument; currentDocument.clear_values(); currentDocument.set_data(std::string(1, article->getNamespace()) + "/" + article->getUrl()); indexer.set_document(currentDocument); zim::MyHtmlParser htmlParser; try { htmlParser.parse_html(article->getData(), "UTF-8", true); } catch (...) { } if (htmlParser.dump.find("NOINDEX") != string::npos) { return; } std::string accentedTitle = (htmlParser.title.empty() ? 
article->getTitle() : htmlParser.title); std::string title = zim::removeAccents(accentedTitle); std::string keywords = zim::removeAccents(htmlParser.keywords); std::string content = zim::removeAccents(htmlParser.dump); currentDocument.add_value(0, title); std::stringstream countWordStringStream; countWordStringStream << countWords(htmlParser.dump); currentDocument.add_value(1, countWordStringStream.str()); if (htmlParser.has_geoPosition) { auto geoPosition = Xapian::LatLongCoord( htmlParser.latitude, htmlParser.longitude).serialise(); currentDocument.add_value(2, geoPosition); } /* Index the title */ if (!title.empty()) { this->indexer.index_text_without_positions( title, this->getTitleBoostFactor(content.size())); this->indexer.index_text(title, 1, "S"); } /* Index the keywords */ if (!keywords.empty()) { this->indexer.index_text_without_positions(keywords, keywordsBoostFactor); } /* Index the content */ if (!content.empty()) { this->indexer.index_text_without_positions(content); } /* add to the database */ this->writableDatabase.add_document(currentDocument); } void XapianIndexer::flush() { this->writableDatabase.commit_transaction(); this->writableDatabase.begin_transaction(true); } void XapianIndexer::indexingPostlude() { this->flush(); this->writableDatabase.commit_transaction(); this->writableDatabase.commit(); this->writableDatabase.compact(indexPath, Xapian::DBCOMPACT_SINGLE_FILE); this->writableDatabase.close(); } XapianMetaArticle* XapianIndexer::getMetaArticle() { return new XapianMetaArticle(this); } zim::size_type XapianMetaArticle::getSize() const { std::ifstream in(indexer->getIndexPath(), std::ios::binary|std::ios::ate); return in.tellg(); } std::string XapianMetaArticle::getFilename() const { return indexer->getIndexPath(); } zim::Blob XapianMetaArticle::getData() const { throw std::logic_error("We should not pass here."); return zim::Blob(); } 
libzim-4.0.4/src/writer/xapianIndexer.h000066400000000000000000000053001334353060400200570ustar00rootroot00000000000000/* * Copyright 2011 Emmanuel Engelhart * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 3 of the License, or * any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, * MA 02110-1301, USA. */

#ifndef LIBZIM_WRITER_XAPIANINDEXER_H
#define LIBZIM_WRITER_XAPIANINDEXER_H

// NOTE(review): the next four directives have lost their header names in this
// copy of the file; they must be restored from the upstream source.
#include
#include
#include
#include
#include "xapian/myhtmlparse.h"

class XapianIndexer;

/**
 * Article describing the full-text index produced by a XapianIndexer.
 *
 * Every accessor defined inline returns a fixed value; getData(), getSize()
 * and getFilename() are defined out of line.
 */
class XapianMetaArticle : public zim::writer::Article
{
  private:
    XapianIndexer* indexer; // not deleted by this class (defaulted destructor)
    mutable std::string data; // not used by any member defined in this header

  public:
    XapianMetaArticle(XapianIndexer* indexer) : indexer(indexer) {}
    virtual ~XapianMetaArticle() = default;
    virtual zim::Blob getData() const;
    virtual std::string getAid() const { return "/fulltextIndex/xapian"; }
    virtual char getNamespace() const { return 'Z';}
    virtual std::string getUrl() const { return "/fulltextIndex/xapian"; }
    virtual std::string getTitle() const { return "Xapian Fulltext Index"; }
    virtual std::string getMimeType() const { return "application/octet-stream+xapian"; }
    virtual bool isRedirect() const { return false; }
    virtual bool shouldIndex() const { return false; }
    virtual bool shouldCompress() const { return false; }
    virtual std::string getRedirectAid() const { return ""; }
    virtual zim::size_type getSize() const;
    virtual std::string getFilename() const;
};

/**
 * Builds a Xapian full-text index.
 *
 * The declared call sequence is indexingPrelude(), index() for each article
 * (with optional flush() calls), then indexingPostlude().
 */
class XapianIndexer
{
  public:
    XapianIndexer(const std::string& language, bool verbose);
    virtual ~XapianIndexer();
    /// Path given to indexingPrelude(); empty before that call.
    std::string getIndexPath() { return indexPath; }

    // NOTE(review): `string` is unqualified here; presumably a
    // using-declaration comes in through one of the includes - confirm.
    void indexingPrelude(const string indexPath);
    void index(const zim::writer::Article* article);
    void flush();
    void indexingPostlude();
    /// Returns a newly allocated article; see the definition for ownership.
    XapianMetaArticle* getMetaArticle();

  protected:
    // NOTE(review): no default initialiser is given for this member here.
    unsigned int keywordsBoostFactor;
    /// Title weight grows by one for every 500 units of content length.
    inline unsigned int getTitleBoostFactor(const unsigned int contentLength)
    {
      return contentLength / 500 + 1;
    }

    Xapian::WritableDatabase writableDatabase;
    Xapian::Stem stemmer;
    Xapian::SimpleStopper stopper;
    Xapian::TermGenerator indexer;
    std::string indexPath;
    std::string language;
    std::string stopwords;
};

#endif // LIBZIM_WRITER_XAPIANINDEXER_H
libzim-4.0.4/src/writer/zimcreator.cpp000066400000000000000000000512411334353060400177770ustar00rootroot00000000000000/* * Copyright (C) 2009 Tommi Maekitalo * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */

// NOTE(review): in this copy of the file every `#include` without a quoted
// name has lost its <header>, and several template argument lists are missing
// (marked below). The code is reproduced as found; restore from upstream.
#include "config.h"
#include "zimcreatordata.h"
#include "cluster.h"
#include
#include
#include "../endian_tools.h"
#include
#include

#if defined(ENABLE_XAPIAN)
#include "xapianIndexer.h"
#endif

#ifdef _WIN32
#include
#else
#include
#endif

#include
#include
#include
#include
#include
#include
#include "md5stream.h"
#include "tee.h"
#include "log.h"
#include "../fs.h"

log_define("zim.writer.creator")

// Log a message through log_info and echo it on std::cout.
#define INFO(e) \
    do { \
        log_info(e); \
        std::cout << e << std::endl; \
    } while(false)

namespace
{
  /// Orders article indexes by (namespace, title) of the dirent found at that
  /// position in `dirents`.
  class CompareTitle
  {
      zim::writer::ZimCreatorData::DirentsType& dirents;

    public:
      explicit CompareTitle(zim::writer::ZimCreatorData::DirentsType& dirents_)
        : dirents(dirents_)
      { }

      bool operator() (zim::article_index_t titleIdx1, zim::article_index_t titleIdx2) const
      {
        // NOTE(review): `auto` copies both dirents on every comparison.
        auto d1 = dirents[zim::article_index_type(titleIdx1)];
        auto d2 = dirents[zim::article_index_type(titleIdx2)];
        return d1.getNamespace() < d2.getNamespace()
            || (d1.getNamespace() == d2.getNamespace() && d1.getTitle() < d2.getTitle());
      }
  };
}

namespace zim
{
  namespace writer
  {
    /**
     * Worker loop: pops clusters from the write queue, dumps each one to the
     * temporary location and closes it.
     *
     * The loop has no exit condition; finishZimCreation() stops the workers
     * with pthread_cancel. While the queue is empty the sleep between polls
     * grows by 10 microseconds per iteration.
     *
     * @param arg  the ZimCreator that owns the queue.
     */
    void* ZimCreator::clusterWriter(void* arg)
    {
      // NOTE(review): the static_cast target type is missing in this copy.
      auto zimCreator = static_cast(arg);
      zim::writer::Cluster* clusterToWrite;
      unsigned int wait = 0;

      while(true) {
        std::this_thread::sleep_for(std::chrono::microseconds(wait));
        if (zimCreator->data->clustersToWrite.popFromQueue(clusterToWrite)) {
          wait = 0;
          clusterToWrite->dump_tmp(zimCreator->data->tmpfname);
          clusterToWrite->close();
          continue;
        }
        wait += 10;
      }
      return nullptr;
    }

    ZimCreator::ZimCreator(bool verbose)
      : verbose(verbose)
    {}

    ZimCreator::~ZimCreator() = default;

    /// Create the working data for the ZIM file `fname` and start the cluster
    /// writer threads.
    void ZimCreator::startZimCreation(const std::string& fname)
    {
      // NOTE(review): the unique_ptr template argument is missing in this copy.
      data = std::unique_ptr(new ZimCreatorData(fname, verbose, withIndex, indexingLanguage));
      data->setMinChunkSize(minChunkSize);

      // NOTE(review): the text between "i=0; i" and "runningWriters" (loop
      // bound, increment, thread creation) is missing in this copy.
      for(unsigned i=0; irunningWriters.push_back(thread);
      }
    }

    /// Add one article: create its dirent, store its content in a cluster,
    /// update the statistics and, when indexing is enabled, index it.
    void ZimCreator::addArticle(const Article& article)
    {
      Dirent dirent = data->createDirentFromArticle(&article);
      data->addDirent(dirent, &article);
      data->nbArticles++;
      if (article.shouldCompress())
        data->nbCompArticles++;
      else
        data->nbUnCompArticles++;
      if (!article.getFilename().empty())
        data->nbFileArticles++;
      if (article.shouldIndex())
        data->nbIndexArticles++;

      // Progress line every 1000 articles.
      if (verbose && data->nbArticles%1000 == 0){
        std::cout << "A:" << data->nbArticles
                  << "; CA:" << data->nbCompArticles
                  << "; UA:" << data->nbUnCompArticles
                  << "; FA:" << data->nbFileArticles
                  << "; IA:" << data->nbIndexArticles
                  << "; C:" << data->nbClusters
                  << "; CC:" << data->nbCompClusters
                  << "; UC:" << data->nbUnCompClusters
                  << std::endl;
      }

#if defined(ENABLE_XAPIAN)
      if(withIndex && article.shouldIndex()) {
        data->indexer->index(&article);
      }
#endif
    }

    /**
     * Finish the creation: add the index article (if any), flush the open
     * clusters, wait for the writer threads to drain the queue, resolve
     * redirects / indexes / mime types and write "<basename>.zim".
     */
    void ZimCreator::finishZimCreation()
    {
      if (verbose) {
        std::cout << "A:" << data->nbArticles
                  << "; CA:" << data->nbCompArticles
                  << "; UA:" << data->nbUnCompArticles
                  << "; FA:" << data->nbFileArticles
                  << "; IA:" << data->nbIndexArticles
                  << "; C:" << data->nbClusters
                  << "; CC:" << data->nbCompClusters
                  << "; UC:" << data->nbUnCompClusters
                  << std::endl;
      }

#if defined(ENABLE_XAPIAN)
      if (withIndex) {
        data->indexer->indexingPostlude();
        std::this_thread::sleep_for(std::chrono::microseconds(100));
        auto article = data->indexer->getMetaArticle();
        Dirent dirent = data->createDirentFromArticle(article);
        data->addDirent(dirent, article);
        // NOTE(review): addDirent() hands `article` to Cluster::addArticle();
        // confirm the cluster does not keep the pointer past this delete.
        delete article;
      }
#endif

      // When we've seen all articles, write any remaining clusters.
      if (data->compCluster->count())
        data->closeCluster(true);

      if (data->uncompCluster->count())
        data->closeCluster(false);

      // wait all cluster writing has been done
      unsigned int wait = 0;
      do {
        std::this_thread::sleep_for(std::chrono::microseconds(wait));
        wait += 10;
      } while(!data->clustersToWrite.isEmpty());

      // Be sure that all cluster are closed
      // (an empty queue only means each cluster was picked up by a worker)
      wait = 0;
      bool closed = true;
      do {
        closed = true;
        std::this_thread::sleep_for(std::chrono::microseconds(wait));
        wait += 10;
        for(auto cluster: data->clustersList) {
          if (!cluster->isClosed()) {
            closed = false;
            break;
          }
        }
      } while(!closed);

      // [FIXME] pthread_cancel is not defined in android NDK.
      // As we don't create zim on android platform,
      // let's simply skip this code to still allow
      // compilation of libzim on android.
#if !defined(__ANDROID__)
      for(auto& thread: data->runningWriters)
      {
        pthread_cancel(thread);
      }
#endif

      data->generateClustersOffsets();
      data->removeInvalidRedirects();
      data->setArticleIndexes();
      data->resolveRedirectIndexes();
      data->resolveMimeTypes();

      INFO("create title index");
      data->createTitleIndex();
      INFO(data->dirents.size() << " title index created");
      INFO(data->clusterOffsets.size() << " clusters created");

      INFO("fill header");
      Fileheader header;
      fillHeader(&header);

      // sort
      // NOTE(review): this message names `dirents`, not `data->dirents` as the
      // sort below does - confirm it compiles when debug logging is enabled.
      log_debug("sort " << dirents.size() << " directory entries (url)");
      std::sort(data->dirents.begin(), data->dirents.end(), compareUrl);

      INFO("write zimfile");
      // Write under a temporary name and rename once complete.
      write(header, data->basename + ".zim.tmp");
      zim::DEFAULTFS::rename(data->basename + ".zim.tmp", data->basename + ".zim");

      INFO("ready");
    }

    /**
     * Fill the file header: versions, main/layout page, uuid, counts and the
     * positions of the sections, laid out in this order: mime list, url
     * pointers, title index, dirents, cluster pointers, clusters, checksum.
     */
    void ZimCreator::fillHeader(Fileheader* header)
    {
      std::string mainAid = getMainPage();
      std::string layoutAid = getLayoutPage();

      log_debug("main aid=" << mainAid << " layout aid=" << layoutAid);

      if (data->isExtended) {
        header->setMajorVersion(Fileheader::zimExtendedMajorVersion);
      } else {
        header->setMajorVersion(Fileheader::zimClassicMajorVersion);
      }
      header->setMinorVersion(Fileheader::zimMinorVersion);
      // Default for both pages is the maximum value of the index type.
      // NOTE(review): the numeric_limits template arguments are missing in
      // this copy.
      header->setMainPage(std::numeric_limits::max());
      header->setLayoutPage(std::numeric_limits::max());

      if (!mainAid.empty() || !layoutAid.empty())
      {
        for (auto& dirent: data->dirents)
        {
          if (mainAid == dirent.getAid())
          {
            log_debug("main idx=" << dirent.getIdx());
            header->setMainPage(article_index_type(dirent.getIdx()));
          }

          if (layoutAid == dirent.getAid())
          {
            log_debug("layout idx=" << dirent.getIdx());
            header->setLayoutPage(article_index_type(dirent.getIdx()));
          }
        }
      }

      header->setUuid( getUuid() );
      header->setArticleCount( data->dirents.size() );

      offset_type offset(Fileheader::size);
      header->setMimeListPos( offset );

      offset += data->mimeListSize().v;
      header->setUrlPtrPos( offset );

      offset += data->urlPtrSize().v;
      header->setTitleIdxPos( offset );

      header->setClusterCount( data->clusterOffsets.size() );
      // The dirents (indexSize) sit between the title index and the cluster
      // pointer list.
      offset += data->titleIdxSize().v + data->indexSize().v;
      header->setClusterPtrPos( offset );

      offset += data->clusterPtrSize().v + data->clustersSize.v;
      header->setChecksumPos( offset );
    }

    /**
     * Write the whole ZIM file to `fname`.
     *
     * Everything except the final digest goes through a Tee feeding both the
     * file and the MD5 stream; the 16 digest bytes are then written to the
     * file only.
     *
     * @throws std::runtime_error if the output stream is in a failed state
     *         after the cluster data.
     */
    void ZimCreator::write(const Fileheader& header, const std::string& fname) const
    {
      // NOTE(review): opened without std::ios::binary - confirm this is
      // intended on platforms that translate line endings.
      std::ofstream zimfile(fname);
      Md5stream md5;
      Tee out(zimfile, md5);

      out << header;

      log_debug("after writing header - pos=" << zimfile.tellp());

      // write mime type list
      for(auto& mimeType: data->mimeTypesList)
      {
        out << mimeType << '\0';
      }

      out << '\0';

      // write url ptr list
      // `off` starts at the first dirent, which follows the title index.
      offset_t off(header.getTitleIdxPos() + data->titleIdxSize().v);
      for (auto& dirent: data->dirents)
      {
        char tmp_buff[sizeof(offset_type)];
        toLittleEndian(off.v, tmp_buff);
        out.write(tmp_buff, sizeof(offset_type));
        off += dirent.getDirentSize();
      }

      log_debug("after writing direntPtr - pos=" << out.tellp());

      // write title index
      for (auto titleid: data->titleIdx)
      {
        char tmp_buff[sizeof(article_index_type)];
        toLittleEndian(titleid.v, tmp_buff);
        out.write(tmp_buff, sizeof(article_index_type));
      }

      log_debug("after writing fileIdxList - pos=" << out.tellp());

      // write directory entries
      for (auto& dirent: data->dirents)
      {
        out << dirent;
        log_debug("write " << dirent.getTitle() << " dirent.size()=" << dirent.getDirentSize() << " pos=" << out.tellp());
      }

      log_debug("after writing dirents - pos=" << out.tellp());

      // write cluster offset list
      // `off` is now the cluster pointer list position; skipping the list
      // itself gives the position of the first cluster.
      off += data->clusterPtrSize();
      for (auto clusterOffset : data->clusterOffsets)
      {
        offset_t o(off + clusterOffset);
        char tmp_buff[sizeof(offset_type)];
        toLittleEndian(o.v, tmp_buff);
        out.write(tmp_buff, sizeof(offset_type));
      }

      log_debug("after writing clusterOffsets - pos=" << out.tellp());

      // write cluster data
      if (!data->isEmpty)
      {
        for(auto& cluster: data->clustersList)
        {
          ASSERT(cluster->isClosed(), ==, true);
          cluster->write_final(out);
        }
      }
      else
        log_warn("no data found");

      if (!out)
        throw std::runtime_error("failed to write zimfile");

      log_debug("after writing clusterData - pos=" << out.tellp());
      unsigned char digest[16];
      md5.getDigest(digest);
      // NOTE(review): the reinterpret_cast target type is missing in this copy.
      zimfile.write(reinterpret_cast(digest), 16);
    }

    /**
     * Prepare the working data for `fname`.
     *
     * A trailing ".zim" is removed from `fname` to obtain `basename`; the
     * temporary directory is "<basename>.tmp".
     *
     * @throws std::runtime_error if the temporary directory cannot be created.
     */
    ZimCreatorData::ZimCreatorData(const std::string& fname,
                                   bool verbose,
                                   bool withIndex,
                                   std::string language)
      : withIndex(withIndex),
        indexingLanguage(language),
        verbose(verbose)
    {
      basename = (fname.size() > 4 && fname.compare(fname.size() - 4, 4, ".zim") == 0)
                   ? fname.substr(0, fname.size() - 4)
                   : fname;
      tmpfname = basename + ".tmp";
      if(!DEFAULTFS::makeDirectory(tmpfname)) {
        throw std::runtime_error(
          std::string("failed to create temporary directory ")
          + tmpfname);
      }

      // We keep both a "compressed cluster" and an "uncompressed cluster"
      // because we don't know which one will fill up first. We also need
      // to track the dirents currently in each, so we can fix up the
      // cluster index if the other one ends up written first.
      compCluster = new Cluster(compression);
      uncompCluster = new Cluster(zimcompNone);

#if defined(ENABLE_XAPIAN)
      if (withIndex) {
        indexer = new XapianIndexer(indexingLanguage, true);
        indexer->indexingPrelude(tmpfname+".idx");
      }
#endif
    }

    /// Delete the open clusters, every closed cluster and the indexer, and
    /// remove the temporary directory (not on Windows).
    ZimCreatorData::~ZimCreatorData()
    {
      if (compCluster)
        delete compCluster;
      if (uncompCluster)
        delete uncompCluster;
      for(auto& cluster: clustersList) {
        delete cluster;
      }
#ifndef _WIN32
//[TODO] Implement remove for windows
      DEFAULTFS::remove(tmpfname);
#endif
#if defined(ENABLE_XAPIAN)
      if (indexer)
        delete indexer;
#endif
    }

    /// Record `dirent` and, unless it is a redirect, add the article content
    /// to the compressed or uncompressed cluster.
    void ZimCreatorData::addDirent(const Dirent& dirent, const Article* article)
    {
      dirents.push_back(dirent);

      // If this is a redirect, we're done: there's no blob to add.
      if (dirent.isRedirect())
      {
        return;
      }

      // Add blob data to compressed or uncompressed cluster.
      auto articleSize = article->getSize();
      if (articleSize > 0)
      {
        isEmpty = false;
      }

      Cluster *cluster;
      if (article->shouldCompress())
      {
        cluster = compCluster;
      }
      else
      {
        cluster = uncompCluster;
      }

      // If cluster will be too large, write it to disk, and open a new
      // one for the content.
      // (minChunkSize is multiplied by 1024 before the comparison)
      if ( cluster->count()
        && cluster->size().v+articleSize >= minChunkSize * 1024
         )
      {
        log_info("cluster with " << cluster->count() << " articles, " <<
                 cluster->size() << " bytes; current title \"" <<
                 dirent.getTitle() << '\"');
        cluster = closeCluster(article->shouldCompress());
      }

      dirents.back().setCluster(cluster);
      cluster->addArticle(article);
    }

    /// Build the dirent for `article`; an empty mime type is replaced by
    /// "application/octet-stream" with a warning on std::cerr.
    Dirent ZimCreatorData::createDirentFromArticle(const Article* article)
    {
      Dirent dirent;
      dirent.setAid(article->getAid());
      dirent.setUrl(article->getNamespace(), article->getUrl());
      dirent.setTitle(article->getTitle());
      dirent.setParameter(article->getParameter());

      log_debug("article " << dirent.getLongUrl() << " fetched");

      if (article->isRedirect())
      {
        // Placeholder index; the real target is set by resolveRedirectIndexes().
        dirent.setRedirect(article_index_t(0));
        dirent.setRedirectAid(article->getRedirectAid());
        log_debug("is redirect to " << dirent.getRedirectAid());
      }
      else if (article->isLinktarget())
      {
        dirent.setLinktarget();
      }
      else if (article->isDeleted())
      {
        dirent.setDeleted();
      }
      else
      {
        auto mimetype = article->getMimeType();
        if (mimetype.empty()) {
          std::cerr << "Warning, " << article->getUrl() << " have empty mimetype." << std::endl;
          mimetype = "application/octet-stream";
        }
        dirent.setMimeType(getMimeTypeIdx(mimetype));
        log_debug("is article; mimetype " << dirent.getMimeType());
      }
      return dirent;
    }

    /**
     * Close the current compressed or uncompressed cluster: give it its
     * index, append it to the cluster list, queue it for writing and start a
     * fresh cluster of the same kind.
     *
     * @return the newly created cluster.
     */
    Cluster* ZimCreatorData::closeCluster(bool compressed)
    {
      Cluster *cluster;
      nbClusters++;
      if (compressed )
      {
        cluster = compCluster;
        nbCompClusters++;
      } else {
        cluster = uncompCluster;
        nbUnCompClusters++;
      }
      cluster->setClusterIndex(cluster_index_t(clustersList.size()));
      clustersList.push_back(cluster);
      clustersToWrite.pushToQueue(cluster);
      log_debug("cluster written");
      if (cluster->is_extended() )
        isExtended = true;
      if (compressed)
      {
        cluster = compCluster = new Cluster(compression);
      } else {
        cluster = uncompCluster = new Cluster(zimcompNone);
      }
      return cluster;
    }

    /// Compute each cluster's offset relative to the first cluster, and the
    /// total size of all clusters.
    void ZimCreatorData::generateClustersOffsets()
    {
      clustersSize = zsize_t(0);
      for(auto& cluster: clustersList)
      {
        clusterOffsets.push_back(offset_t(clustersSize.v));
        clustersSize += cluster->getFinalSize();
      }
    }

    /// Sort the dirents by aid and drop every redirect whose target aid does
    /// not exist.
    void ZimCreatorData::removeInvalidRedirects()
    {
      // sort
      INFO("sort " << dirents.size() << " directory entries (aid)");
      std::sort(dirents.begin(), dirents.end(), compareAid);

      // remove invalid redirects
      INFO("remove invalid redirects from " << dirents.size() << " directory entries");
      ZimCreatorData::DirentsType::size_type di = 0;
      while (di < dirents.size())
      {
        if (di % 10000 == 0)
          INFO(di << "/" << dirents.size() << " directory entries checked for invalid redirects");

        if (dirents[di].isRedirect())
        {
          log_debug("check " << dirents[di].getTitle() << " redirect to " << dirents[di].getRedirectAid() << " (" << di << '/' << dirents.size() << ')');

          if (!std::binary_search(dirents.begin(), dirents.end(), Dirent(dirents[di].getRedirectAid()), compareAid))
          {
            INFO("remove invalid redirection " << dirents[di].getUrl() << " redirecting to (missing) " << dirents[di].getRedirectAid());
            // The next entry slides into position `di`: do not advance.
            dirents.erase(dirents.begin() + di);
            continue;
          }
        }

        ++di;
      }
    }

    /// Sort the dirents by url and number them in that order.
    void ZimCreatorData::setArticleIndexes()
    {
      // sort
      INFO("sort " << dirents.size() << " directory entries (url)");
      std::sort(dirents.begin(), dirents.end(), compareUrl);

      // set index
      INFO("set index");
      article_index_t idx(0);
      for (auto& dirent: dirents) {
        dirent.setIdx(idx);
        idx += 1;
      }
    }

    /**
     * Sort the dirents by aid and replace each redirect's target aid by the
     * target's article index.
     *
     * @throws std::runtime_error if a redirect target is not found.
     */
    void ZimCreatorData::resolveRedirectIndexes()
    {
      // sort
      log_debug("sort " << dirents.size() << " directory entries (aid)");
      std::sort(dirents.begin(), dirents.end(), compareAid);

      // translate redirect aid to index
      INFO("translate redirect aid to index");
      for (auto& di: dirents)
      {
        if (di.isRedirect())
        {
          auto ddi = std::lower_bound(dirents.begin(), dirents.end(), di.getRedirectAid(), compareAid);
          if (ddi != dirents.end() && ddi->getAid() == di.getRedirectAid())
          {
            log_debug("redirect aid=" << ddi->getAid() << " redirect index=" << ddi->getIdx());
            di.setRedirect(ddi->getIdx());
          }
          else
          {
            std::ostringstream msg;
            msg << "internal error: redirect aid " << di.getRedirectAid() << " not found";
            log_fatal(msg.str());
            throw std::runtime_error(msg.str());
          }
        }
      }
    }

    /// Build `titleIdx`: the article indexes ordered by (namespace, title).
    void ZimCreatorData::createTitleIndex()
    {
      // Sort works on dirents sorted by url.
      // (CompareTitle uses the article index as a position in `dirents`)
      std::sort(dirents.begin(), dirents.end(), compareUrl);
      titleIdx.resize(0);
      titleIdx.reserve(dirents.size());
      // NOTE(review): `auto dirent` copies each dirent just to read its index.
      for (auto dirent: dirents)
        titleIdx.push_back(dirent.getIdx());

      CompareTitle compareTitle(dirents);
      std::sort(titleIdx.begin(), titleIdx.end(), compareTitle);
    }

    /// Fill `mimeTypesList` with the sorted mime types and rewrite the mime
    /// type index of every article dirent through `mapping`.
    void ZimCreatorData::resolveMimeTypes()
    {
      // NOTE(review): the element types of both vectors are missing in this
      // copy.
      std::vector oldMImeList;
      std::vector mapping;

      for (auto& rmimeType: rmimeTypesMap)
      {
        oldMImeList.push_back(rmimeType.second);
        mimeTypesList.push_back(rmimeType.second);
      }

      mapping.resize(oldMImeList.size());
      std::sort(mimeTypesList.begin(), mimeTypesList.end());

      // NOTE(review): the body of this loop nest - everything between
      // "i=0; i" and "(j);" - is missing in this copy; it is what fills
      // `mapping`.
      for (unsigned i=0; i(j);
        }
      }

      for (auto& dirent: dirents)
      {
        if (dirent.isArticle())
          dirent.setMimeType(mapping[dirent.getMimeType()]);
      }
    }

    /**
     * Return the index of `mimeType`, registering it if it is new.
     *
     * @throws std::runtime_error when no further index is available.
     */
    uint16_t ZimCreatorData::getMimeTypeIdx(const std::string& mimeType)
    {
      auto it = mimeTypesMap.find(mimeType);
      if (it == mimeTypesMap.end())
      {
        // NOTE(review): the numeric_limits template argument is missing in
        // this copy.
        if (nextMimeIdx >= std::numeric_limits::max())
          throw std::runtime_error("too many distinct mime types");
        mimeTypesMap[mimeType] = nextMimeIdx;
        rmimeTypesMap[nextMimeIdx] = mimeType;
        return nextMimeIdx++;
      }

      return it->second;
    }

    /**
     * Return the mime type registered under `mimeTypeIdx`.
     *
     * @throws std::runtime_error if the index is unknown.
     */
    const std::string& ZimCreatorData::getMimeType(uint16_t mimeTypeIdx) const
    {
      auto it = rmimeTypesMap.find(mimeTypeIdx);
      if (it == rmimeTypesMap.end())
        throw std::runtime_error("mime type index not found");
      return it->second;
    }

    /// Size of the mime list: each entry plus its '\0', plus the final '\0'.
    zsize_t ZimCreatorData::mimeListSize() const
    {
      size_type ret = 1;
      for (auto& rmimeType: rmimeTypesMap)
        ret += (rmimeType.second.size() + 1);
      return zsize_t(ret);
    }

    /// Total size of all dirents.
    zsize_t ZimCreatorData::indexSize() const
    {
      size_type s = 0;

      for (auto& dirent: dirents)
        s += dirent.getDirentSize();

      return zsize_t(s);
    }

  }
}
libzim-4.0.4/src/writer/zimcreatordata.h000066400000000000000000000105761334353060400203040ustar00rootroot00000000000000/* * Copyright (C) 2009 Tommi Maekitalo * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your 
option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #ifndef ZIM_WRITER_ZIMCREATOR_DATA_H #define ZIM_WRITER_ZIMCREATOR_DATA_H #include #include #include "queue.h" #include "_dirent.h" #include "xapianIndexer.h" #include #include #include #include "config.h" #if defined(ENABLE_XAPIAN) class XapianIndexer; #endif namespace zim { namespace writer { class Cluster; class ZimCreatorData { public: typedef std::vector DirentsType; typedef std::vector ArticleIdxVectorType; typedef std::vector OffsetsType; typedef std::map MimeTypesMap; typedef std::map RMimeTypesMap; typedef std::vector MimeTypesList; typedef std::vector ClusterList; typedef Queue ClusterQueue; typedef std::vector ThreadList; ZimCreatorData(const std::string& fname, bool verbose, bool withIndex, std::string language); virtual ~ZimCreatorData(); void addDirent(const Dirent& dirent, const Article* article); Dirent createDirentFromArticle(const Article* article); Cluster* closeCluster(bool compressed); void generateClustersOffsets(); void removeInvalidRedirects(); void setArticleIndexes(); void resolveRedirectIndexes(); void createTitleIndex(); void resolveMimeTypes(); uint16_t getMimeTypeIdx(const std::string& mimeType); const std::string& getMimeType(uint16_t mimeTypeIdx) const; size_t minChunkSize = 1024-64; DirentsType dirents; ArticleIdxVectorType titleIdx; OffsetsType clusterOffsets; MimeTypesMap mimeTypesMap; RMimeTypesMap rmimeTypesMap; MimeTypesList mimeTypesList; uint16_t nextMimeIdx = 0; ClusterList clustersList; ClusterQueue 
clustersToWrite; ThreadList runningWriters; CompressionType compression = zimcompLzma; std::string basename; bool isEmpty = true; bool isExtended = false; zsize_t clustersSize; Cluster *compCluster = nullptr; Cluster *uncompCluster = nullptr; std::string tmpfname; bool withIndex; std::string indexingLanguage; #if defined(ENABLE_XAPIAN) XapianIndexer* indexer = nullptr; #endif // Some stats bool verbose; article_index_type nbArticles; article_index_type nbCompArticles; article_index_type nbUnCompArticles; article_index_type nbFileArticles; article_index_type nbIndexArticles; cluster_index_type nbClusters; cluster_index_type nbCompClusters; cluster_index_type nbUnCompClusters; cluster_index_t clusterCount() const { return cluster_index_t(clusterOffsets.size()); } article_index_t articleCount() const { return article_index_t(dirents.size()); } zsize_t mimeListSize() const; zsize_t urlPtrSize() const { return zsize_t(article_index_type(articleCount()) * sizeof(offset_type)); } zsize_t titleIdxSize() const { return zsize_t(article_index_type(articleCount()) * sizeof(article_index_type)); } zsize_t indexSize() const; zsize_t clusterPtrSize() const { return zsize_t(cluster_index_type(clusterCount()) * sizeof(offset_type)); } size_t getMinChunkSize() { return minChunkSize; } void setMinChunkSize(size_t s) { minChunkSize = s; } }; } } #endif // ZIM_WRITER_ZIMCREATOR_DATA_H libzim-4.0.4/src/xapian/000077500000000000000000000000001334353060400150555ustar00rootroot00000000000000libzim-4.0.4/src/xapian/htmlparse.cc000066400000000000000000000233211334353060400173640ustar00rootroot00000000000000/* htmlparse.cc: simple HTML parser for omega indexer * * Copyright 1999,2000,2001 BrightStation PLC * Copyright 2001 Ananova Ltd * Copyright 2002,2006,2007,2008 Olly Betts * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at 
your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 * USA */ // #include #include "htmlparse.h" #include // #include "utf8convert.h" #include #include #include #include #include using namespace std; inline void lowercase_string(string &str) { for (string::iterator i = str.begin(); i != str.end(); ++i) { *i = tolower(static_cast(*i)); } } map zim::HtmlParser::named_ents; inline static bool p_notdigit(char c) { return !isdigit(static_cast(c)); } inline static bool p_notxdigit(char c) { return !isxdigit(static_cast(c)); } inline static bool p_notalnum(char c) { return !isalnum(static_cast(c)); } inline static bool p_notwhitespace(char c) { return !isspace(static_cast(c)); } inline static bool p_nottag(char c) { return !isalnum(static_cast(c)) && c != '.' && c != '-' && c != ':'; // ':' for XML namespaces. 
} inline static bool p_whitespacegt(char c) { return isspace(static_cast(c)) || c == '>'; } inline static bool p_whitespaceeqgt(char c) { return isspace(static_cast(c)) || c == '=' || c == '>'; } bool zim::HtmlParser::get_parameter(const string & param, string & value) { map::const_iterator i = parameters.find(param); if (i == parameters.end()) return false; value = i->second; return true; } zim::HtmlParser::HtmlParser() { static const struct ent { const char *n; unsigned int v; } ents[] = { #include "namedentities.h" { NULL, 0 } }; if (named_ents.empty()) { const struct ent *i = ents; while (i->n) { named_ents[string(i->n)] = i->v; ++i; } } } void zim::HtmlParser::decode_entities(string &s) { // We need a const_iterator version of s.end() - otherwise the // find() and find_if() templates don't work... string::const_iterator amp = s.begin(), s_end = s.end(); while ((amp = find(amp, s_end, '&')) != s_end) { unsigned int val = 0; string::const_iterator end, p = amp + 1; if (p != s_end && *p == '#') { p++; if (p != s_end && (*p == 'x' || *p == 'X')) { // hex p++; end = find_if(p, s_end, p_notxdigit); sscanf(s.substr(p - s.begin(), end - p).c_str(), "%x", &val); } else { // number end = find_if(p, s_end, p_notdigit); val = atoi(s.substr(p - s.begin(), end - p).c_str()); } } else { end = find_if(p, s_end, p_notalnum); string code = s.substr(p - s.begin(), end - p); map::const_iterator i; i = named_ents.find(code); if (i != named_ents.end()) val = i->second; } if (end < s_end && *end == ';') end++; if (val) { string::size_type amp_pos = amp - s.begin(); if (val < 0x80) { s.replace(amp_pos, end - amp, 1u, char(val)); } else { // Convert unicode value val to UTF-8. char seq[4]; unsigned len = Xapian::Unicode::nonascii_to_utf8(val, seq); s.replace(amp_pos, end - amp, seq, len); } s_end = s.end(); // We've modified the string, so the iterators are no longer // valid... 
amp = s.begin() + amp_pos + 1; } else { amp = end; } } } void zim::HtmlParser::parse_html(const string &body) { in_script = false; parameters.clear(); string::const_iterator start = body.begin(); while (true) { // Skip through until we find an HTML tag, a comment, or the end of // document. Ignore isolated occurrences of `<' which don't start // a tag or comment. string::const_iterator p = start; while (true) { p = find(p, body.end(), '<'); if (p == body.end()) break; unsigned char ch = *(p + 1); // Tag, closing tag, or comment (or SGML declaration). if ((!in_script && isalpha(ch)) || ch == '/' || ch == '!') break; if (ch == '?') { // PHP code or XML declaration. // XML declaration is only valid at the start of the first line. // FIXME: need to deal with BOMs... if (p != body.begin() || body.size() < 20) break; // XML declaration looks something like this: // if (p[2] != 'x' || p[3] != 'm' || p[4] != 'l') break; if (strchr(" \t\r\n", p[5]) == NULL) break; string::const_iterator decl_end = find(p + 6, body.end(), '?'); if (decl_end == body.end()) break; // Default charset for XML is UTF-8. charset = "UTF-8"; string decl(p + 6, decl_end); size_t enc = decl.find("encoding"); if (enc == string::npos) break; enc = decl.find_first_not_of(" \t\r\n", enc + 8); if (enc == string::npos || enc == decl.size()) break; if (decl[enc] != '=') break; enc = decl.find_first_not_of(" \t\r\n", enc + 1); if (enc == string::npos || enc == decl.size()) break; if (decl[enc] != '"' && decl[enc] != '\'') break; char quote = decl[enc++]; size_t enc_end = decl.find(quote, enc); if (enc != string::npos) charset = decl.substr(enc, enc_end - enc); break; } p++; } // Process text up to start of tag. 
if (p > start) { string text = body.substr(start - body.begin(), p - start); // convert_to_utf8(text, charset); decode_entities(text); process_text(text); } if (p == body.end()) break; start = p + 1; if (start == body.end()) break; if (*start == '!') { if (++start == body.end()) break; if (++start == body.end()) break; // comment or SGML declaration if (*(start - 1) == '-' && *start == '-') { ++start; string::const_iterator close = find(start, body.end(), '>'); // An unterminated comment swallows rest of document // (like Netscape, but unlike MSIE IIRC) if (close == body.end()) break; p = close; // look for --> while (p != body.end() && (*(p - 1) != '-' || *(p - 2) != '-')) p = find(p + 1, body.end(), '>'); if (p != body.end()) { // Check for htdig's "ignore this bit" comments. if (p - start == 15 && string(start, p - 2) == "htdig_noindex") { string::size_type i; i = body.find("", p + 1 - body.begin()); if (i == string::npos) break; start = body.begin() + i + 21; continue; } // If we found --> skip to there. start = p; } else { // Otherwise skip to the first > we found (as Netscape does). 
start = close; } } else { // just an SGML declaration, perhaps giving the DTD - ignore it start = find(start - 1, body.end(), '>'); if (start == body.end()) break; } ++start; } else if (*start == '?') { if (++start == body.end()) break; // PHP - swallow until ?> or EOF start = find(start + 1, body.end(), '>'); // look for ?> while (start != body.end() && *(start - 1) != '?') start = find(start + 1, body.end(), '>'); // unterminated PHP swallows rest of document (rather arbitrarily // but it avoids polluting the database when things go wrong) if (start != body.end()) ++start; } else { // opening or closing tag int closing = 0; if (*start == '/') { closing = 1; start = find_if(start + 1, body.end(), p_notwhitespace); } p = start; start = find_if(start, body.end(), p_nottag); string tag = body.substr(p - body.begin(), start - p); // convert tagname to lowercase lowercase_string(tag); if (closing) { closing_tag(tag); if (in_script && tag == "script") in_script = false; /* ignore any bogus parameters on closing tags */ p = find(start, body.end(), '>'); if (p == body.end()) break; start = p + 1; } else { // FIXME: parse parameters lazily. 
while (start < body.end() && *start != '>') { string name, value; p = find_if(start, body.end(), p_whitespaceeqgt); name.assign(body, start - body.begin(), p - start); p = find_if(p, body.end(), p_notwhitespace); start = p; if (start != body.end() && *start == '=') { start = find_if(start + 1, body.end(), p_notwhitespace); p = body.end(); int quote = *start; if (quote == '"' || quote == '\'') { start++; p = find(start, body.end(), quote); } if (p == body.end()) { // unquoted or no closing quote p = find_if(start, body.end(), p_whitespacegt); } value.assign(body, start - body.begin(), p - start); start = find_if(p, body.end(), p_notwhitespace); if (!name.empty()) { // convert parameter name to lowercase lowercase_string(name); // in case of multiple entries, use the first // (as Netscape does) parameters.insert(make_pair(name, value)); } } } #if 0 cout << "<" << tag; map::const_iterator x; for (x = parameters.begin(); x != parameters.end(); x++) { cout << " " << x->first << "=\"" << x->second << "\""; } cout << ">\n"; #endif opening_tag(tag); parameters.clear(); // In