pax_global_header00006660000000000000000000000064121230236620014507gustar00rootroot0000000000000052 comment=637a2899b2b0b04c548f7d381b411d2fefce8627 libm4ri-20130416/000077500000000000000000000000001212302366200133135ustar00rootroot00000000000000libm4ri-20130416/.hgignore000066400000000000000000000005531212302366200151210ustar00rootroot00000000000000syntax: glob INSTALL Makefile.in aclocal.m4 autogen.sh autom4te.cache compile config.guess config.sub src/config.h.in configure depcomp install-sh missing ltmain.sh m4/libtool.m4 m4/ltoptions.m4 m4/ltsugar.m4 m4/ltversion.m4 m4/lt~obsolete.m4 .deps Makefile config.log config.status libtool m4ri.pc src/config.h src/m4ri_config.h src/stamp-h1 testsuite/Makefile libm4ri-20130416/AUTHORS000066400000000000000000000024001212302366200143570ustar00rootroot00000000000000 * Tim Abbott: Debian-isation & advice on correct libtool versioning; * Martin Albrecht: maintainer, release manager, performance tuning (M4RM, M4RI, Strassen, PLE), initial M4RM implementation, parallelisation, PLE factorisation (MMPF algorithm); * Gregory Bard: initial author, M4RI algorithm and initial implementation; * Marco Bodrato: new Strassen-like sequence for matrix multiplication and squaring which improves performance for squaring; * Michael Brickenstein: PolyBoRi author, standard conformity contributions for ANSIC, test data, discussion/suggestion of performance improvements, fast vector-matrix products; * Alexander Dreyer: PolyBoRi author, standard conformity contributions for ANSIC; * Jean-Guillaume Dumas: linear system resolution; * William Hart: many performance improvements for matrix multiplication and in general; * David Harvey: parallel parity function used in classical multiplication; * David Kirkby: portability issues (Solaris, HP Unix); * Clément Pernet: PLS factorisation, triangular system solving (TRSM); * Wael Said: test cases, feedback; * Carlo Wood: bit-level optimisation (transpose, column swaps), refactoring, benchmark(et)ing 
framework, test code, build system clean-up;libm4ri-20130416/COPYING000066400000000000000000000431301212302366200143470ustar00rootroot00000000000000 GNU GENERAL PUBLIC LICENSE Version 2, June 1991 Copyright (C) 1989, 1991 Free Software Foundation, Inc. 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. Preamble The licenses for most software are designed to take away your freedom to share and change it. By contrast, the GNU General Public License is intended to guarantee your freedom to share and change free software--to make sure the software is free for all its users. This General Public License applies to most of the Free Software Foundation's software and to any other program whose authors commit to using it. (Some other Free Software Foundation software is covered by the GNU Library General Public License instead.) You can apply it to your programs, too. When we speak of free software, we are referring to freedom, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for this service if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs; and that you know you can do these things. To protect your rights, we need to make restrictions that forbid anyone to deny you these rights or to ask you to surrender the rights. These restrictions translate to certain responsibilities for you if you distribute copies of the software, or if you modify it. For example, if you distribute copies of such a program, whether gratis or for a fee, you must give the recipients all the rights that you have. You must make sure that they, too, receive or can get the source code. And you must show them these terms so they know their rights. 
We protect your rights with two steps: (1) copyright the software, and (2) offer you this license which gives you legal permission to copy, distribute and/or modify the software. Also, for each author's protection and ours, we want to make certain that everyone understands that there is no warranty for this free software. If the software is modified by someone else and passed on, we want its recipients to know that what they have is not the original, so that any problems introduced by others will not reflect on the original authors' reputations. Finally, any free program is threatened constantly by software patents. We wish to avoid the danger that redistributors of a free program will individually obtain patent licenses, in effect making the program proprietary. To prevent this, we have made it clear that any patent must be licensed for everyone's free use or not licensed at all. The precise terms and conditions for copying, distribution and modification follow. GNU GENERAL PUBLIC LICENSE TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 0. This License applies to any program or other work which contains a notice placed by the copyright holder saying it may be distributed under the terms of this General Public License. The "Program", below, refers to any such program or work, and a "work based on the Program" means either the Program or any derivative work under copyright law: that is to say, a work containing the Program or a portion of it, either verbatim or with modifications and/or translated into another language. (Hereinafter, translation is included without limitation in the term "modification".) Each licensee is addressed as "you". Activities other than copying, distribution and modification are not covered by this License; they are outside its scope. 
The act of running the Program is not restricted, and the output from the Program is covered only if its contents constitute a work based on the Program (independent of having been made by running the Program). Whether that is true depends on what the Program does. 1. You may copy and distribute verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice and disclaimer of warranty; keep intact all the notices that refer to this License and to the absence of any warranty; and give any other recipients of the Program a copy of this License along with the Program. You may charge a fee for the physical act of transferring a copy, and you may at your option offer warranty protection in exchange for a fee. 2. You may modify your copy or copies of the Program or any portion of it, thus forming a work based on the Program, and copy and distribute such modifications or work under the terms of Section 1 above, provided that you also meet all of these conditions: a) You must cause the modified files to carry prominent notices stating that you changed the files and the date of any change. b) You must cause any work that you distribute or publish, that in whole or in part contains or is derived from the Program or any part thereof, to be licensed as a whole at no charge to all third parties under the terms of this License. c) If the modified program normally reads commands interactively when run, you must cause it, when started running for such interactive use in the most ordinary way, to print or display an announcement including an appropriate copyright notice and a notice that there is no warranty (or else, saying that you provide a warranty) and that users may redistribute the program under these conditions, and telling the user how to view a copy of this License. 
(Exception: if the Program itself is interactive but does not normally print such an announcement, your work based on the Program is not required to print an announcement.) These requirements apply to the modified work as a whole. If identifiable sections of that work are not derived from the Program, and can be reasonably considered independent and separate works in themselves, then this License, and its terms, do not apply to those sections when you distribute them as separate works. But when you distribute the same sections as part of a whole which is a work based on the Program, the distribution of the whole must be on the terms of this License, whose permissions for other licensees extend to the entire whole, and thus to each and every part regardless of who wrote it. Thus, it is not the intent of this section to claim rights or contest your rights to work written entirely by you; rather, the intent is to exercise the right to control the distribution of derivative or collective works based on the Program. In addition, mere aggregation of another work not based on the Program with the Program (or with a work based on the Program) on a volume of a storage or distribution medium does not bring the other work under the scope of this License. 3. 
You may copy and distribute the Program (or a work based on it, under Section 2) in object code or executable form under the terms of Sections 1 and 2 above provided that you also do one of the following: a) Accompany it with the complete corresponding machine-readable source code, which must be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange; or, b) Accompany it with a written offer, valid for at least three years, to give any third party, for a charge no more than your cost of physically performing source distribution, a complete machine-readable copy of the corresponding source code, to be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange; or, c) Accompany it with the information you received as to the offer to distribute corresponding source code. (This alternative is allowed only for noncommercial distribution and only if you received the program in object code or executable form with such an offer, in accord with Subsection b above.) The source code for a work means the preferred form of the work for making modifications to it. For an executable work, complete source code means all the source code for all modules it contains, plus any associated interface definition files, plus the scripts used to control compilation and installation of the executable. However, as a special exception, the source code distributed need not include anything that is normally distributed (in either source or binary form) with the major components (compiler, kernel, and so on) of the operating system on which the executable runs, unless that component itself accompanies the executable. 
If distribution of executable or object code is made by offering access to copy from a designated place, then offering equivalent access to copy the source code from the same place counts as distribution of the source code, even though third parties are not compelled to copy the source along with the object code. 4. You may not copy, modify, sublicense, or distribute the Program except as expressly provided under this License. Any attempt otherwise to copy, modify, sublicense or distribute the Program is void, and will automatically terminate your rights under this License. However, parties who have received copies, or rights, from you under this License will not have their licenses terminated so long as such parties remain in full compliance. 5. You are not required to accept this License, since you have not signed it. However, nothing else grants you permission to modify or distribute the Program or its derivative works. These actions are prohibited by law if you do not accept this License. Therefore, by modifying or distributing the Program (or any work based on the Program), you indicate your acceptance of this License to do so, and all its terms and conditions for copying, distributing or modifying the Program or works based on it. 6. Each time you redistribute the Program (or any work based on the Program), the recipient automatically receives a license from the original licensor to copy, distribute or modify the Program subject to these terms and conditions. You may not impose any further restrictions on the recipients' exercise of the rights granted herein. You are not responsible for enforcing compliance by third parties to this License. 7. If, as a consequence of a court judgment or allegation of patent infringement or for any other reason (not limited to patent issues), conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. 
If you cannot distribute so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not distribute the Program at all. For example, if a patent license would not permit royalty-free redistribution of the Program by all those who receive copies directly or indirectly through you, then the only way you could satisfy both it and this License would be to refrain entirely from distribution of the Program. If any portion of this section is held invalid or unenforceable under any particular circumstance, the balance of the section is intended to apply and the section as a whole is intended to apply in other circumstances. It is not the purpose of this section to induce you to infringe any patents or other property right claims or to contest validity of any such claims; this section has the sole purpose of protecting the integrity of the free software distribution system, which is implemented by public license practices. Many people have made generous contributions to the wide range of software distributed through that system in reliance on consistent application of that system; it is up to the author/donor to decide if he or she is willing to distribute software through any other system and a licensee cannot impose that choice. This section is intended to make thoroughly clear what is believed to be a consequence of the rest of this License. 8. If the distribution and/or use of the Program is restricted in certain countries either by patents or by copyrighted interfaces, the original copyright holder who places the Program under this License may add an explicit geographical distribution limitation excluding those countries, so that distribution is permitted only in or among countries not thus excluded. In such case, this License incorporates the limitation as if written in the body of this License. 9. 
The Free Software Foundation may publish revised and/or new versions of the General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Program specifies a version number of this License which applies to it and "any later version", you have the option of following the terms and conditions either of that version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of this License, you may choose any version ever published by the Free Software Foundation. 10. If you wish to incorporate parts of the Program into other free programs whose distribution conditions are different, write to the author to ask for permission. For software which is copyrighted by the Free Software Foundation, write to the Free Software Foundation; we sometimes make exceptions for this. Our decision will be guided by the two goals of preserving the free status of all derivatives of our free software and of promoting the sharing and reuse of software generally. NO WARRANTY 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 12. 
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. END OF TERMS AND CONDITIONS How to Apply These Terms to Your New Programs If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms. To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively convey the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. Copyright (C) 19yy This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA Also add information on how to contact you by electronic and paper mail. 
If the program is interactive, make it output a short notice like this when it starts in an interactive mode: Gnomovision version 69, Copyright (C) 19yy name of author Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. This is free software, and you are welcome to redistribute it under certain conditions; type `show c' for details. The hypothetical commands `show w' and `show c' should show the appropriate parts of the General Public License. Of course, the commands you use may be called something other than `show w' and `show c'; they could even be mouse-clicks or menu items--whatever suits your program. You should also get your employer (if you work as a programmer) or your school, if any, to sign a "copyright disclaimer" for the program, if necessary. Here is a sample; alter the names: Yoyodyne, Inc., hereby disclaims all copyright interest in the program `Gnomovision' (which makes passes at compilers) written by James Hacker. , 1 April 1989 Ty Coon, President of Vice This General Public License does not permit incorporating your program into proprietary programs. If your program is a subroutine library, you may consider it more useful to permit linking proprietary applications with the library. If this is what you want to do, use the GNU Library General Public License instead of this License. 
libm4ri-20130416/ChangeLog000066400000000000000000000000001212302366200150530ustar00rootroot00000000000000libm4ri-20130416/Makefile.am000066400000000000000000000053701212302366200153540ustar00rootroot00000000000000AUTOMAKE_OPTIONS = gnu ACLOCAL_AMFLAGS = -I m4 AM_CFLAGS=${SIMD_CFLAGS} ${OPENMP_CFLAGS} ${DEBUG_FLAGS} lib_LTLIBRARIES = libm4ri.la libm4ri_la_SOURCES = \ m4ri/brilliantrussian.c \ m4ri/misc.c \ m4ri/mzd.c \ m4ri/graycode.c \ m4ri/strassen.c \ m4ri/mzp.c \ m4ri/triangular.c \ m4ri/triangular_russian.c \ m4ri/ple.c \ m4ri/ple_russian.c \ m4ri/solve.c \ m4ri/echelonform.c \ m4ri/mmc.c \ m4ri/debug_dump.c \ m4ri/io.c BUILT_SOURCES = m4ri/m4ri_config.h pkgincludesubdir = $(includedir)/m4ri pkgincludesub_HEADERS = m4ri/m4ri.h \ m4ri/brilliantrussian.h \ m4ri/misc.h \ m4ri/mzd.h \ m4ri/graycode.h \ m4ri/strassen.h \ m4ri/parity.h \ m4ri/mzp.h \ m4ri/triangular.h \ m4ri/triangular_russian.h \ m4ri/ple.h \ m4ri/ple_russian.h \ m4ri/solve.h \ m4ri/echelonform.h \ m4ri/xor.h \ m4ri/mmc.h \ m4ri/debug_dump.h \ m4ri/io.h nodist_pkgincludesub_HEADERS = m4ri/m4ri_config.h EXTRA_DIST=m4ri/Doxyfile pkgconfigdir = $(libdir)/pkgconfig pkgconfig_DATA = m4ri.pc libm4ri_la_LDFLAGS = -release 0.0.$(RELEASE) -no-undefined libm4ri_la_LIBADD = $(LIBPNG_LIBADD) check_PROGRAMS=test_multiplication test_elimination test_trsm test_ple test_solve test_kernel test_random test_smallops test_transpose test_colswap test_invert test_misc test_multiplication_SOURCES=testsuite/test_multiplication.c test_multiplication_LDFLAGS=-lm4ri -lm test_multiplication_CFLAGS=$(AM_CFLAGS) test_elimination_SOURCES=testsuite/test_elimination.c test_elimination_LDFLAGS=-lm4ri -lm test_elimination_CFLAGS=$(AM_CFLAGS) test_trsm_SOURCES=testsuite/test_trsm.c test_trsm_LDFLAGS=-lm4ri -lm test_trsm_CFLAGS=$(AM_CFLAGS) test_ple_SOURCES=testsuite/test_ple.c test_ple_LDFLAGS=-lm4ri -lm test_ple_CFLAGS=$(AM_CFLAGS) test_solve_SOURCES=testsuite/test_solve.c test_solve_LDFLAGS=-lm4ri -lm test_solve_CFLAGS=$(AM_CFLAGS) 
test_kernel_SOURCES=testsuite/test_kernel.c test_kernel_LDFLAGS=-lm4ri -lm test_kernel_CFLAGS=$(AM_CFLAGS) test_random_SOURCES=testsuite/test_random.c test_random_LDFLAGS=-lm4ri -lm test_random_CFLAGS=$(AM_CFLAGS) test_smallops_SOURCES=testsuite/test_smallops.c testsuite/testing.c testsuite/testing.h test_smallops_LDFLAGS=-lm4ri -lm test_smallops_CFLAGS=$(AM_CFLAGS) test_transpose_SOURCES=testsuite/test_transpose.c test_transpose_LDFLAGS=-lm4ri -lm test_transpose_CFLAGS=$(AM_CFLAGS) test_colswap_SOURCES=testsuite/test_colswap.c test_colswap_LDFLAGS=-lm4ri -lm test_colswap_CFLAGS=$(AM_CFLAGS) test_invert_SOURCES=testsuite/test_invert.c test_invert_LDFLAGS=-lm4ri -lm test_invert_CFLAGS=$(AM_CFLAGS) test_misc_SOURCES=testsuite/test_misc.c test_misc_LDFLAGS=-lm4ri -lm test_misc_CFLAGS=$(AM_CFLAGS) TESTS = test_multiplication test_elimination test_trsm test_ple test_solve test_kernel test_random test_smallops test_transpose test_colswap test_invert test_misc libm4ri-20130416/NEWS000066400000000000000000000000001212302366200140000ustar00rootroot00000000000000libm4ri-20130416/README000066400000000000000000000050341212302366200141750ustar00rootroot00000000000000INTRODUCTION ============ M4RI is a library for fast arithmetic with dense matrices over F2. The name M4RI comes from the first implemented algorithm: The "Method of the Four Russians" inversion algorithm published by Gregory Bard. This algorithm in turn is named after the "Method of the Four Russians" multiplication algorithm which is probably better referred to as Kronrod's method. 
M4RI is available at http://m4ri.sagemath.org FEATURES ======== * basic arithmetic with dense matrices over F2 (addition, equality testing, stacking, augmenting, sub-matrices, randomisation); * asymptotically fast O(n^log_2(7)) matrix multiplication via the "Method of the Four Russians" (M4RM) & Strassen-Winograd algorithm; * asymptotically fast O(n^log_2(7)) PLE factorisation (Gaussian elimination, system solving, ...); * fast row echelon form computation and matrix inversion via the "Method of the Four Russians" (M4RI, O(n^3/log(n))); * support for the x86/x86_64 SSE2 instruction set where available; * preliminary support for parallelisation on shared memory systems via OpenMP; * and support for Linux, Solaris, and OS X (GCC) and limited support for Windows (Visual Studio 2010). OPENMP SUPPORT ============== OpenMP support for parallel multiplication and elimination is enabled with the --enable-openmp configure switch. If GCC is used to compile the library it is advised to use at least GCC 4.3 since earlier versions have problems with OpenMP in shared libraries. OpenMP support was introduced in GCC 4.2. Both MSVC and SunCC support OpenMP but we have no experience with these yet. Generally speaking better performance improvements can be expected on multi-core AMD CPUs than on multi-core Intel CPUs. This is because the latter have a shared L2 cache which is already almost fully utilised in the single-core implementation. INSTALL ======= If you downloaded M4RI as a compressed tarball from its website, installation instructions are contained in the file INSTALL. If you downloaded M4RI by cloning the mainline tree at https://bitbucket.org/malb/m4ri you need to first run the following command: $ autoreconf --install Then follow the instructions in the file INSTALL. DOCUMENTATION ============= To build the reference manual, ensure that you have Doxygen installed. 
The HTML version of the reference manual can be built as follows: $ cd m4ri/ $ doxygen The built documentation is contained under the doc subdirectory of m4ri/. Once the HTML version is built, you can build the PDF version as follows: $ cd doc/latex/ $ make libm4ri-20130416/configure.ac000066400000000000000000000167211212302366200156100ustar00rootroot00000000000000AC_INIT(m4ri,20130416) AC_CANONICAL_HOST AC_CONFIG_SRCDIR(m4ri/brilliantrussian.c) AM_INIT_AUTOMAKE dnl Include maintainer mode targets. AM_MAINTAINER_MODE dnl Needed when reconfiguring with 'autoreconf -i -s' AC_CONFIG_MACRO_DIR([m4]) dnl Compiling with per-target flags (test_elimination.c) requires AM_PROG_CC_C_O. AM_PROG_CC_C_O AC_PROG_LIBTOOL AC_PROG_INSTALL AC_CONFIG_HEADERS(m4ri/config.h) dnl Check if a C++ compiler was specified. If so, assume we want to wrap word in a C++ class. AC_EGREP_CPP(YES, [ #ifdef __cplusplus YES #endif ], [m4ri_wrapword="yes"], [m4ri_wrapword="no"]) if test "$m4ri_wrapword" = "yes"; then AC_DEFINE([M4RI_WRAPWORD], [/**/], [Defined when compiling with a C++ compiler and word should be a C++ class.]) else dnl We can only run this test when we're using a C compiler. 
AC_PROG_CC_C99() if test "$ac_cv_prog_cc_c99" = "no"; then AC_MSG_ERROR([C99 support is required but not found.]) fi fi # SSE2 support AC_ARG_ENABLE([sse2], AS_HELP_STRING([--disable-sse2], [don't use SSE2 instruction set.]), , [if test "$m4ri_wrapword" = "yes"; then enable_sse2="no"; else enable_sse2="yes"; fi]) AS_IF([test "x$enable_sse2" != "xno"], [ if test "$m4ri_wrapword" = "yes"; then AC_MSG_ERROR([SSE2 cannot be supported when wrapping word in a C++ class.]) fi case $host_cpu in i[[3456]]86*|x86_64*) AX_CPU_VENDOR() if test "x$ax_cv_cpu_vendor" = "xIntel"; then AX_EXT() # SSE2 is slower on the Opteron fi esac ]) if test x"$ax_cv_have_sse2_ext" = x"yes"; then M4RI_HAVE_SSE2=1 else M4RI_HAVE_SSE2=0 fi AC_SUBST(M4RI_HAVE_SSE2) AC_ARG_WITH(papi, AS_HELP_STRING([--with-papi@<:@=PATH@:>@], [The PAPI install prefix, if configure can't find it.]), [m4ri_config_papi=$withval]) AC_ARG_WITH(cachesize, AS_HELP_STRING([--with-cachesize@<:@=VALUE@:>@], [L1,L2 and L3 cache sizes in bytes, separated by a colon. 
Overrides cache tuning.]),[m4ri_config_cachesize=$withval]) AC_CHECK_HEADER([mm_malloc.h],AC_DEFINE(HAVE_MM_MALLOC,,[Support aligned allocations]),) if test "$ac_cv_header_mm_malloc_h" = "yes"; then M4RI_HAVE_MM_MALLOC=1 else M4RI_HAVE_MM_MALLOC=0 fi AC_SUBST(M4RI_HAVE_MM_MALLOC) # Correctly working posix_memalign AX_FUNC_POSIX_MEMALIGN if test "$ax_cv_func_posix_memalign_works" = "yes"; then M4RI_HAVE_POSIX_MEMALIGN=1 else M4RI_HAVE_POSIX_MEMALIGN=0 fi AC_SUBST(M4RI_HAVE_POSIX_MEMALIGN) # OpenMP support AC_ARG_ENABLE([openmp], AS_HELP_STRING( [--enable-openmp],[add support for OpenMP multicore support.])) AS_IF([test "x$enable_openmp" = "xyes"], [ AX_OPENMP() ]) AC_SUBST(OPENMP_CFLAGS) if test -n "$OPENMP_CFLAGS"; then M4RI_HAVE_OPENMP=1 else M4RI_HAVE_OPENMP=0 fi AC_SUBST(M4RI_HAVE_OPENMP) # Debugging support AC_ARG_ENABLE([debug], AS_HELP_STRING([--enable-debug], [Enable assert() statements for debugging.])) AC_ARG_ENABLE([debug-dump], AS_HELP_STRING([--enable-debug-dump], [Dump output at exit of every function.])) if test "x$enable_debug_dump" = "xyes"; then M4RI_DEBUG_DUMP=1 else M4RI_DEBUG_DUMP=0 fi AC_SUBST(M4RI_DEBUG_DUMP) AC_ARG_ENABLE([debug-mzd], AS_HELP_STRING([--enable-debug-mzd], [Add consistency checks on matrix structures.])) if test "x$enable_debug_mzd" = "xyes"; then M4RI_DEBUG_MZD=1 else M4RI_DEBUG_MZD=0 fi AC_SUBST(M4RI_DEBUG_MZD) if test "x$enable_debug" = x"yes"; then DEBUG_FLAGS="-g" AC_SUBST(DEBUG_FLAGS) else if test "x$enable_debug_mzd" != "xyes"; then AC_DEFINE(NDEBUG,1,[Define whether debugging is enabled]) fi fi # For the testsuite. Detect if PAPI is installed. See http://icl.cs.utk.edu/papi/ . if test -z "$m4ri_config_papi"; then AC_CHECK_LIB(papi, PAPI_start_counters, [ AX_GUESS_PATH_LIB(papi) AX_GUESS_PATH_HEADER(papi.h) if test -n "$LIBPAPI_PATH"; then PAPI_LDFLAGS="-Wl,-rpath,$LIBPAPI_PATH" PAPI_LIBS="-L$LIBPAPI_PATH -lpapi" else PAPI_LIBS="-lpapi" if ! test -e "/usr/lib/libpapi.so"; then AC_MSG_WARN([Could not find libpapi.so. 
Use --with-papi= or set LD_LIBRARY_PATH correctly before running benchmark applications.]) fi fi if test -n "$PAPI_H_PATH"; then PAPI_CFLAGS="-I$PAPI_H_PATH" AC_DEFINE_UNQUOTED([HAVE_LIBPAPI], 1, [Define when libpapi is available.]) else AC_MSG_WARN([Could not find papi.h; Use --with-papi= or add -I/include to either CPPFLAGS or CFLAGS, or turn off papi all together by configuring with --without-papi.]) fi ]) fi if test x"$m4ri_config_papi" != x"no" && test -n "$m4ri_config_papi"; then LIBPAPI_PATH="`realpath -s $m4ri_config_papi/lib`" PAPI_H_PATH="`realpath -s $m4ri_config_papi/include`" PAPI_CFLAGS="-I$PAPI_H_PATH" PAPI_LDFLAGS="-Wl,-rpath,$LIBPAPI_PATH" PAPI_LIBS="-L$LIBPAPI_PATH -lpapi" AC_DEFINE_UNQUOTED([HAVE_LIBPAPI], 1, [Define when libpapi is available.]) fi AC_SUBST(PAPI_LIBS) AC_SUBST(PAPI_LDFLAGS) AC_SUBST(PAPI_CFLAGS) AC_ARG_ENABLE([cachetune], AS_HELP_STRING([--enable-cachetune],[calculate cache size from timing information (deprecated).])) # Cache Sizes if test -z $m4ri_config_cachesize; then AX_CACHE_SIZE() AS_IF([test "x$enable_cachetune" = "xyes"], [AC_MSG_WARN(--enable-cachetune is deprecated since it usually does not provide optimal parameters.) 
AX_CACHE_SIZE_TUNE()]) else AS_IF([test "x$enable_cachetune" = "xyes"], [AC_MSG_WARN(Ignoring cache tuning since --with-cachesize was given.)]) ax_l1_size=`echo $m4ri_config_cachesize | cut -d ":" -f 1` ax_l2_size=`echo $m4ri_config_cachesize | cut -d ":" -f 2` ax_l3_size=`echo $m4ri_config_cachesize | cut -d ":" -f 3` M4RI_CPU_L1_CACHE=${ax_l1_size} M4RI_CPU_L2_CACHE=${ax_l2_size} M4RI_CPU_L3_CACHE=${ax_l3_size} AC_SUBST(M4RI_CPU_L1_CACHE) AC_SUBST(M4RI_CPU_L2_CACHE) AC_SUBST(M4RI_CPU_L3_CACHE) fi # PNG have_libpng="no" AC_ARG_ENABLE([png], [AC_HELP_STRING([--disable-png], [disable PNG support @<:@default=enabled@:>@])], [ if test "x${enableval}" = "xyes" ; then want_png="yes" else want_png="no" fi ], [want_png="yes"]) AC_MSG_CHECKING([whether to build with PNG support]) AC_MSG_RESULT([${want_png}]) if test "x${want_png}" = "xyes" ; then PKG_CHECK_MODULES([PNG], [libpng], [have_libpng="yes"; LIBPNG_LIBADD=`pkg-config --libs libpng`], [have_libpng="no"]) if ! test "x${have_libpng}" = "xyes" ; then AC_CHECK_LIB([png], [png_create_write_struct], [have_libpng="yes"; LIBPNG_LIBADD="-lpng"], [AC_CHECK_LIB([png14], [png_create_write_struct], [have_libpng="yes"; LIBPNG_LIBADD="-lpng14"], [AC_CHECK_LIB([png12], [png_create_write_struct], [have_libpng="yes"; LIBPNG_LIBADD="-lpng12"], [AC_CHECK_LIB([png10], [png_create_write_struct], [have_libpng="yes"; LIBPNG_LIBADD="-lpng10"], [have_libpng="no"]) ]) ]) ]) fi if test "x${have_libpng}" = "xno" ; then AC_MSG_WARN([Can not find a usuable PNG library. 
Make sure that CPPFLAGS and LDFLAGS are correctly set.]) fi fi if test "x${have_libpng}" = "xyes" ; then M4RI_HAVE_LIBPNG=1 AC_SUBST(M4RI_HAVE_LIBPNG) AC_SUBST(LIBPNG_LIBADD) else M4RI_HAVE_LIBPNG=0 AC_SUBST(M4RI_HAVE_LIBPNG) fi RELEASE="AC_PACKAGE_VERSION" AC_SUBST(RELEASE) AC_PROG_MAKE_SET AC_CONFIG_FILES([Makefile testsuite/Makefile m4ri/m4ri_config.h m4ri.pc]) AC_OUTPUT libm4ri-20130416/m4/000077500000000000000000000000001212302366200136335ustar00rootroot00000000000000libm4ri-20130416/m4/ax_cache_size.m4000066400000000000000000000107201212302366200166620ustar00rootroot00000000000000# =========================================================================== # http://autoconf-archive.cryp.to/ax_cache_size.html # =========================================================================== # # SYNOPSIS # # AX_CACHE_SIZE # # DESCRIPTION # # Find L1 and L2 caches size by reading the corresponding file on UNIX or # by requesting cpuid. The results are available in the substituted variables # M4RI_CPU_L1_CACHE and M4RI_CPU_L2_CACHE. # # This macro depends on AX_GCC_X86_CPUID, AC_PROG_SED, and AX_CPU_VENDOR. # # LAST MODIFICATION # # 2011-04-11 # # COPYLEFT # # Copyright (c) 2008 Christophe Tournayre # # Patched by: # # Copyright (c) 2008 Martin Albrecht # Copyright (c) 2008 Arnaud Bergeron # # Copying and distribution of this file, with or without modification, are # permitted in any medium without royalty provided the copyright notice # and this notice are preserved. 
AC_DEFUN([AX_CACHE_SIZE],
[
    AC_REQUIRE([AC_PROG_SED])
    AC_REQUIRE([AX_GCC_X86_CPUID])
    AC_REQUIRE([AX_CPU_VENDOR])

    AX_CPU_VENDOR

    # Cache sizes are gathered in KiB and converted to bytes at the end.
    # An empty value means that no probe has produced a result yet.
    ax_l1_size=
    ax_l2_size=
    ax_l3_size=

    #Check if the variable is present
    if test -e /sys/devices/system/cpu/cpu0/cache/index0/size; then
        for idx in 0 1 2 3; do
            if test -e /sys/devices/system/cpu/cpu0/cache/index$idx/size ; then
                level=`cat /sys/devices/system/cpu/cpu0/cache/index$idx/level`
                size=`cat /sys/devices/system/cpu/cpu0/cache/index$idx/size`
                eval CPU0\_L$level\_CACHE="$size"
            fi
        done

        ax_l1_size=$CPU0_L1_CACHE
        ax_l2_size=$CPU0_L2_CACHE
        ax_l3_size=$CPU0_L3_CACHE
    else
        if test "x$ax_cv_cpu_vendor" != "xUnknown"; then
            #Or use CPUID
            AX_GCC_X86_CPUID(0x80000000)
            cpu_exthigh=`echo $ax_cv_gcc_x86_cpuid_0x80000000 | cut -d ":" -f 1`

            # Only an eight digit value starting with 8 is accepted as the
            # highest extended leaf. Anything else, e.g. "unknown", disables
            # the extended probes so that no invalid hexadecimal text reaches
            # the arithmetic expansions below.
            case "$cpu_exthigh" in
                8???????) ;;
                *) cpu_exthigh=0 ;;
            esac

            # The values are equally long lower-case hex strings, so a string
            # comparison orders them like numbers. expr is used because a bare
            # ">" inside test would be taken as an output redirection.
            if expr "x$cpu_exthigh" ">" "x80000004" >/dev/null; then
                # For L1 cache
                AX_GCC_X86_CPUID(0x80000005)
                l1_hexval=`echo $ax_cv_gcc_x86_cpuid_0x80000005 | cut -d ":" -f 4`
                ax_l1_size=$((0x$l1_hexval >> 24))
            fi

            if expr "x$cpu_exthigh" ">" "x80000005" >/dev/null; then
                AX_GCC_X86_CPUID(0x80000006)

                # For L2 cache
                l2_hexval=`echo $ax_cv_gcc_x86_cpuid_0x80000006 | cut -d ":" -f 3`
                ax_l2_size=$((0x$l2_hexval >> 16))

                # For L3 cache: the field counts units of 512 KiB. A zero
                # count leaves ax_l3_size empty so that the fallback to the
                # L2 size further down still applies.
                l3_hexval=`echo $ax_cv_gcc_x86_cpuid_0x80000006 | cut -d ":" -f 4`
                l3_units=$((0x$l3_hexval >> 18))
                if test "$l3_units" -gt 0; then
                    ax_l3_size=$(($l3_units * 512))
                fi
            fi
        fi

        #Or use sysctl
        sysctl_exe=
        if test -x /usr/sbin/sysctl ; then
            sysctl_exe=/usr/sbin/sysctl
        elif test -x /sbin/sysctl ; then
            sysctl_exe=/sbin/sysctl
        fi
        if test -n "$sysctl_exe"; then
            # sysctl reports bytes, hence the division by 1024.
            if test -z "$ax_l2_size" -o "$ax_l2_size" = "0"; then
                sysctl_out=`$sysctl_exe -n hw.l2cachesize 2>/dev/null`;
                if test ! -z "$sysctl_out"; then
                    ax_l2_size=$(($sysctl_out / 1024))
                fi;
            fi
            if test -z "$ax_l1_size" -o "$ax_l1_size" = "0" ; then
                sysctl_out=`$sysctl_exe -n hw.l1dcachesize 2>/dev/null`;
                if test ! -z "$sysctl_out"; then
                    ax_l1_size=$(($sysctl_out / 1024))
                fi;
            fi
            if test -z "$ax_l1_size" -o "$ax_l1_size" = "0" ; then
                sysctl_out=`$sysctl_exe -n hw.l1cachesize 2>/dev/null`;
                if test ! -z "$sysctl_out"; then
                    ax_l1_size=$(($sysctl_out / 1024))
                fi;
            fi
        fi
    fi

    # Undetected L1 and L2 sizes become 0, an undetected L3 size takes the
    # L2 value.
    test -z "$ax_l1_size" && ax_l1_size=0
    test -z "$ax_l2_size" && ax_l2_size=0
    test -z "$ax_l3_size" && ax_l3_size=$ax_l2_size

    # Keep only digits if there is a unit (ie 1024K -> 1024) and convert in Bytes
    AC_MSG_CHECKING(the L1 cache size)
    ax_l1_size=`echo $ax_l1_size | $SED 's/\([[0-9]]\)[[A-Za-z]]$/\1/g'`
    ax_l1_size=$(($ax_l1_size*1024))
    AC_MSG_RESULT( $ax_l1_size Bytes)

    AC_MSG_CHECKING(the L2 cache size)
    ax_l2_size=`echo $ax_l2_size | $SED 's/\([[0-9]]\)[[A-Za-z]]$/\1/g'`
    ax_l2_size=$(($ax_l2_size*1024))
    AC_MSG_RESULT( $ax_l2_size Bytes)

    AC_MSG_CHECKING(the L3 cache size)
    ax_l3_size=`echo $ax_l3_size | $SED 's/\([[0-9]]\)[[A-Za-z]]$/\1/g'`
    ax_l3_size=$(($ax_l3_size*1024))
    AC_MSG_RESULT( $ax_l3_size Bytes)

    M4RI_CPU_L1_CACHE=${ax_l1_size}
    M4RI_CPU_L2_CACHE=${ax_l2_size}
    M4RI_CPU_L3_CACHE=${ax_l3_size}
    AC_SUBST(M4RI_CPU_L1_CACHE)
    AC_SUBST(M4RI_CPU_L2_CACHE)
    AC_SUBST(M4RI_CPU_L3_CACHE)
])
libm4ri-20130416/m4/ax_cache_size_tune.m4000066400000000000000000000121141212302366200177140ustar00rootroot00000000000000# SYNOPSIS
#
# AX_CACHE_SIZE_TUNE
#
# DESCRIPTION
#
# Find L1, L2, L3 caches size by running some timing experiments.
# The results are available in the defines __M4RI_CPU_L1_CACHE,
# __M4RI_CPU_L2_CACHE and __M4RI_CPU_L3_CACHE.
#
# This macro depends on AC_PROG_SED, AC_PROG_CC.
#
# LAST MODIFICATION
#
# 2011-04-11
#
# COPYLEFT
#
# Copyright (c) 2009,2010 Martin Albrecht
#
# Copying and distribution of this file, with or without modification, are
# permitted in any medium without royalty provided the copyright notice
# and this notice are preserved.
AC_DEFUN([AX_CACHE_SIZE_TUNE], [ AC_REQUIRE([AC_PROG_CC]) AC_REQUIRE([AC_PROG_SED]) AC_LANG_PUSH([C]) AC_CACHE_CHECK(for cache sizes, ax_cv_cache_sizes, [AC_RUN_IFELSE([AC_LANG_PROGRAM([[ #include #include #include #include double walltime(double t0) { double mic, time; double mega = 0.000001; struct timeval tp; static long base_sec = 0; static long base_usec = 0; (void) gettimeofday(&tp,NULL); if (base_sec == 0) { base_sec = tp.tv_sec; base_usec = tp.tv_usec; } time = (double) (tp.tv_sec - base_sec); mic = (double) (tp.tv_usec - base_usec); time = (time + mic * mega) - t0; return(time); } double run_experiment(size_t size, size_t trials) { size_t i,j; unsigned long *a = (unsigned long*)malloc(size/4); unsigned long *b = (unsigned long*)malloc(size/4); unsigned long *c = (unsigned long*)malloc(size/4); unsigned long *d = (unsigned long*)malloc(size/4); size_t n = size/4/(sizeof(unsigned long)); /* we setup a lookup table with a random-ish pattern */ a[0] = 1337; b[0] = 5345345; for(j=1; j 0.25) { _trials = _trials/2; mult = 2*mult; wt /= 2.0; result /= 2.0; } } printf("\n"); } for(i=0;i dtimes[0][max] ) { max = i; } } return candidates[max-1]; } ]], [[ const size_t c1[] = { 4, 8, 16, 32, 64, 128}; const size_t c2[] = { 128, 256, 512}; const size_t c3[] = {1024,1536,2048,3072,4096,6144,8192,16384,32768}; FILE *f; printf("\n"); size_t _l1 = cache_size(c1, 6, 1ULL<<15); size_t _l2 = cache_size(c2, 3, 1ULL<<12); size_t _l3 = cache_size(c3, 9, 1ULL<< 9); f = fopen("conftest_cache_sizes", "w"); if (!f) return 1; fprintf(f,"%lu:%lu:%lu\n",(unsigned long)(_l1*1024),(unsigned long)(_l2*1024),(unsigned long)(_l3*1024)); fclose(f); return 0; ]])], [ax_cv_cache_sizes=`cat conftest_cache_sizes`; rm -f conftest_cache_sizes], [ax_cv_cache_sizes=unknown; rm -f conftest_cache_sizes], [ax_cv_cache_sizes=unknown])]) AC_LANG_POP([C]) AC_MSG_CHECKING(the L1 cache size) ax_l1_size=`echo $ax_cv_cache_sizes | cut -d ':' -f 1` AC_MSG_RESULT( $ax_l1_size Bytes) AC_MSG_CHECKING(the L2 cache 
size) ax_l2_size=`echo $ax_cv_cache_sizes | cut -d ':' -f 2` AC_MSG_RESULT( $ax_l2_size Bytes) AC_MSG_CHECKING(the L3 cache size) ax_l3_size=`echo $ax_cv_cache_sizes | cut -d ':' -f 3` AC_MSG_RESULT( $ax_l3_size Bytes) M4RI_CPU_L1_CACHE=${ax_l1_size} M4RI_CPU_L2_CACHE=${ax_l2_size} M4RI_CPU_L3_CACHE=${ax_l3_size} AC_SUBST(M4RI_CPU_L1_CACHE) AC_SUBST(M4RI_CPU_L2_CACHE) AC_SUBST(M4RI_CPU_L3_CACHE) ]) libm4ri-20130416/m4/ax_check_compiler_flags.m4000066400000000000000000000063771212302366200207250ustar00rootroot00000000000000# =========================================================================== # http://autoconf-archive.cryp.to/ax_check_compiler_flags.html # =========================================================================== # # SYNOPSIS # # AX_CHECK_COMPILER_FLAGS(FLAGS, [ACTION-SUCCESS], [ACTION-FAILURE]) # # DESCRIPTION # # Check whether the given compiler FLAGS work with the current language's # compiler, or whether they give an error. (Warnings, however, are # ignored.) # # ACTION-SUCCESS/ACTION-FAILURE are shell commands to execute on # success/failure. # # LAST MODIFICATION # # 2008-04-12 # # COPYLEFT # # Copyright (c) 2008 Steven G. Johnson # Copyright (c) 2008 Matteo Frigo # # This program is free software: you can redistribute it and/or modify it # under the terms of the GNU General Public License as published by the # Free Software Foundation, either version 3 of the License, or (at your # option) any later version. # # This program is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General # Public License for more details. # # You should have received a copy of the GNU General Public License along # with this program. If not, see . 
#
# As a special exception, the respective Autoconf Macro's copyright owner
# gives unlimited permission to copy, distribute and modify the configure
# scripts that are the output of Autoconf when processing the Macro. You
# need not follow the terms of the GNU General Public License when using
# or distributing such scripts, even though portions of the text of the
# Macro appear in them. The GNU General Public License (GPL) does govern
# all other use of the material that constitutes the Autoconf Macro.
#
# This special exception to the GPL applies to versions of the Autoconf
# Macro released by the Autoconf Macro Archive. When you make and
# distribute a modified version of the Autoconf Macro, you may extend this
# special exception to the GPL to apply to your modified version as well.

dnl The verdict is kept in a shell variable named after the current language
dnl and the tested flags. While the empty test program is compiled, the flags
dnl variable of the current language is replaced by the flags under test and
dnl restored afterwards.
AC_DEFUN([AX_CHECK_COMPILER_FLAGS],
[AC_PREREQ(2.59) dnl for _AC_LANG_PREFIX
AC_MSG_CHECKING([whether _AC_LANG compiler accepts $1])
dnl Some hackery here since AC_CACHE_VAL can't handle a non-literal varname:
dnl the first branch handles literal flags through the cache macro, the
dnl second branch builds the variable name at run time with eval.
AS_LITERAL_IF([$1],
  [AC_CACHE_VAL(AS_TR_SH(ax_cv_[]_AC_LANG_ABBREV[]_flags_$1), [
      ax_save_FLAGS=$[]_AC_LANG_PREFIX[]FLAGS
      _AC_LANG_PREFIX[]FLAGS="$1"
      AC_COMPILE_IFELSE([AC_LANG_PROGRAM()],
        AS_TR_SH(ax_cv_[]_AC_LANG_ABBREV[]_flags_$1)=yes,
        AS_TR_SH(ax_cv_[]_AC_LANG_ABBREV[]_flags_$1)=no)
      _AC_LANG_PREFIX[]FLAGS=$ax_save_FLAGS])],
  [ax_save_FLAGS=$[]_AC_LANG_PREFIX[]FLAGS
   _AC_LANG_PREFIX[]FLAGS="$1"
   AC_COMPILE_IFELSE([AC_LANG_PROGRAM()],
     eval AS_TR_SH(ax_cv_[]_AC_LANG_ABBREV[]_flags_$1)=yes,
     eval AS_TR_SH(ax_cv_[]_AC_LANG_ABBREV[]_flags_$1)=no)
   _AC_LANG_PREFIX[]FLAGS=$ax_save_FLAGS])
dnl Copy the verdict into a fixed variable name, report it, then run the
dnl success or failure action supplied by the caller, defaulting to a no-op.
eval ax_check_compiler_flags=$AS_TR_SH(ax_cv_[]_AC_LANG_ABBREV[]_flags_$1)
AC_MSG_RESULT($ax_check_compiler_flags)
if test "x$ax_check_compiler_flags" = xyes; then
	m4_default([$2], :)
else
	m4_default([$3], :)
fi
])dnl AX_CHECK_COMPILER_FLAGS
libm4ri-20130416/m4/ax_cpu_vendor.m4000066400000000000000000000027121212302366200167330ustar00rootroot00000000000000#
===========================================================================
# http://autoconf-archive.cryp.to/ax_cpu_vendor.html
# ===========================================================================
#
# SYNOPSIS
#
# AX_CPU_VENDOR
#
# DESCRIPTION
#
# Find your CPU's vendor by requesting cpuid and define "ax_cv_cpu_vendor"
# accordingly. This macro depends on AX_GCC_X86_CPUID.
#
# LAST MODIFICATION
#
# 2008-04-12
#
# COPYLEFT
#
# Copyright (c) 2008 Christophe Tournayre
#
# Copying and distribution of this file, with or without modification, are
# permitted in any medium without royalty provided the copyright notice
# and this notice are preserved.

AC_DEFUN([AX_CPU_VENDOR],
[
 AC_REQUIRE([AX_GCC_X86_CPUID])
 AX_GCC_X86_CPUID(0x0)

 AC_CACHE_CHECK(for the processor vendor,
   ax_cv_cpu_vendor,
   [
    # The second field of the cpuid result is the ebx register in hex.
    # Each pattern below is matched against the start of that value;
    # any other value, including a failed probe, maps to Unknown.
    vendor=`echo $ax_cv_gcc_x86_cpuid_0x0 | cut -d ":" -f 2`
    case $vendor in
      756e6547*) ax_cv_cpu_vendor="Intel" ;;
      68747541*) ax_cv_cpu_vendor="AMD" ;;
      69727943*) ax_cv_cpu_vendor="Cyrix" ;;
      746e6543*) ax_cv_cpu_vendor="IDT" ;;
      646f6547*) ax_cv_cpu_vendor="Natsemi Geode" ;;
      52697365*) ax_cv_cpu_vendor="Rise" ;;
      65736952*) ax_cv_cpu_vendor="Rise" ;;
      20536953*) ax_cv_cpu_vendor="SiS" ;;
      *) ax_cv_cpu_vendor="Unknown" ;;
    esac
   ])
])
libm4ri-20130416/m4/ax_ext.m4000066400000000000000000000060111212302366200153630ustar00rootroot00000000000000# ===========================================================================
# http://autoconf-archive.cryp.to/ax_ext.html
# ===========================================================================
#
# SYNOPSIS
#
# AX_EXT
#
# DESCRIPTION
#
# Find supported SIMD extensions by requesting cpuid.
When an SIMD
# extension is found, the -m"simdextensionname" is added to SIMD_CFLAGS
# (only if compilator support it) (ie : if "sse2" is available "-msse2" is
# added to SIMD_CFLAGS)
#
# This macro calls:
#
# AC_SUBST(SIMD_CFLAGS)
#
# And defines:
#
# HAVE_MMX / HAVE_SSE / HAVE_SSE2 / HAVE_SSE3 / HAVE_SSSE3
#
# LAST MODIFICATION
#
# 2008-04-12
#
# COPYLEFT
#
# Copyright (c) 2008 Christophe Tournayre
#
# Copying and distribution of this file, with or without modification, are
# permitted in any medium without royalty provided the copyright notice
# and this notice are preserved.

AC_DEFUN([AX_EXT],
[
  #AC_REQUIRE([AX_GCC_X86_CPUID])

  AX_GCC_X86_CPUID(0x00000001)

  # The cpuid probe yields "unknown" when it cannot run, e.g. on a non-x86
  # host or when cross compiling. Treat that as "no feature bits set" so the
  # arithmetic expansions below always receive valid hexadecimal input.
  case "$ax_cv_gcc_x86_cpuid_0x00000001" in
    *:*:*:*)
      ecx=`echo $ax_cv_gcc_x86_cpuid_0x00000001 | cut -d ":" -f 3`
      edx=`echo $ax_cv_gcc_x86_cpuid_0x00000001 | cut -d ":" -f 4`
      ;;
    *)
      ecx=0
      edx=0
      ;;
  esac

  # Each check below tests a single bit of the edx or ecx value.
  AC_CACHE_CHECK([whether mmx is supported], [ax_cv_have_mmx_ext],
  [
    ax_cv_have_mmx_ext=no
    if test "$((0x$edx>>23&0x01))" = 1; then
      ax_cv_have_mmx_ext=yes
    fi
  ])

  AC_CACHE_CHECK([whether sse is supported], [ax_cv_have_sse_ext],
  [
    ax_cv_have_sse_ext=no
    if test "$((0x$edx>>25&0x01))" = 1; then
      ax_cv_have_sse_ext=yes
    fi
  ])

  AC_CACHE_CHECK([whether sse2 is supported], [ax_cv_have_sse2_ext],
  [
    ax_cv_have_sse2_ext=no
    if test "$((0x$edx>>26&0x01))" = 1; then
      ax_cv_have_sse2_ext=yes
    fi
  ])

  AC_CACHE_CHECK([whether sse3 is supported], [ax_cv_have_sse3_ext],
  [
    ax_cv_have_sse3_ext=no
    if test "$((0x$ecx&0x01))" = 1; then
      ax_cv_have_sse3_ext=yes
    fi
  ])

  AC_CACHE_CHECK([whether ssse3 is supported], [ax_cv_have_ssse3_ext],
  [
    ax_cv_have_ssse3_ext=no
    if test "$((0x$ecx>>9&0x01))" = 1; then
      ax_cv_have_ssse3_ext=yes
    fi
  ])

  # For every extension found, define the matching HAVE_ symbol and add the
  # compiler switch to SIMD_CFLAGS when the compiler accepts it. No switch is
  # added for ssse3.
  if test "$ax_cv_have_mmx_ext" = yes; then
    AC_DEFINE(HAVE_MMX,,[Support mmx instructions])
    AX_CHECK_COMPILER_FLAGS(-mmmx, SIMD_CFLAGS="$SIMD_CFLAGS -mmmx", [])
  fi

  if test "$ax_cv_have_sse_ext" = yes; then
    AC_DEFINE(HAVE_SSE,,[Support SSE (Streaming SIMD Extensions) instructions])
    AX_CHECK_COMPILER_FLAGS(-msse, SIMD_CFLAGS="$SIMD_CFLAGS -msse", [])
  fi

  if test "$ax_cv_have_sse2_ext" = yes; then
    AC_DEFINE(HAVE_SSE2,,[Support SSE2 (Streaming SIMD Extensions 2) instructions])
    AX_CHECK_COMPILER_FLAGS(-msse2, SIMD_CFLAGS="$SIMD_CFLAGS -msse2", [])
  fi

  if test "$ax_cv_have_sse3_ext" = yes; then
    AC_DEFINE(HAVE_SSE3,,[Support SSE3 (Streaming SIMD Extensions 3) instructions])
    AX_CHECK_COMPILER_FLAGS(-msse3, SIMD_CFLAGS="$SIMD_CFLAGS -msse3", [])
  fi

  if test "$ax_cv_have_ssse3_ext" = yes; then
    AC_DEFINE(HAVE_SSSE3,,[Support SSSE3 (Supplemental Streaming SIMD Extensions 3) instructions])
  fi

  AC_SUBST(SIMD_CFLAGS)
])
libm4ri-20130416/m4/ax_func_posix_memalign.m4000066400000000000000000000027721212302366200206230ustar00rootroot00000000000000# ===========================================================================
# http://www.gnu.org/software/autoconf-archive/ax_func_posix_memalign.html
# ===========================================================================
#
# SYNOPSIS
#
# AX_FUNC_POSIX_MEMALIGN
#
# DESCRIPTION
#
# Some versions of posix_memalign (notably glibc 2.2.5) incorrectly apply
# their power-of-two check to the size argument, not the alignment
# argument. AX_FUNC_POSIX_MEMALIGN defines HAVE_POSIX_MEMALIGN if the
# power-of-two check is correctly applied to the alignment argument.
#
# LICENSE
#
# Copyright (c) 2008 Scott Pakin
#
# Copying and distribution of this file, with or without modification, are
# permitted in any medium without royalty provided the copyright notice
# and this notice are preserved. This file is offered as-is, without any
# warranty.

#serial 7

dnl The test program requests 123 bytes, which is not a power of two, with a
dnl valid alignment. It exits with status 0 only when the allocation succeeds.
dnl When cross compiling the function is assumed not to work.
AC_DEFUN([AX_FUNC_POSIX_MEMALIGN],
[AC_CACHE_CHECK([for working posix_memalign],
  [ax_cv_func_posix_memalign_works],
  [AC_TRY_RUN([
#include <stdlib.h>

int
main ()
{
  void *buffer;

  /* Some versions of glibc incorrectly perform the alignment check on
   * the size word. */
  exit (posix_memalign (&buffer, sizeof(void *), 123) != 0);
}
    ],
    [ax_cv_func_posix_memalign_works=yes],
    [ax_cv_func_posix_memalign_works=no],
    [ax_cv_func_posix_memalign_works=no])])
if test "$ax_cv_func_posix_memalign_works" = "yes" ; then
  AC_DEFINE([HAVE_POSIX_MEMALIGN], [1],
    [Define to 1 if `posix_memalign' works.])
fi
])
libm4ri-20130416/m4/ax_gcc_x86_cpuid.m4000066400000000000000000000071251212302366200172170ustar00rootroot00000000000000# ===========================================================================
# http://autoconf-archive.cryp.to/ax_gcc_x86_cpuid.html
# ===========================================================================
#
# SYNOPSIS
#
# AX_GCC_X86_CPUID(OP)
#
# DESCRIPTION
#
# On Pentium and later x86 processors, with gcc or a compiler that has a
# compatible syntax for inline assembly instructions, run a small program
# that executes the cpuid instruction with input OP. This can be used to
# detect the CPU type.
#
# On output, the values of the eax, ebx, ecx, and edx registers are stored
# as hexadecimal strings as "eax:ebx:ecx:edx" in the cache variable
# ax_cv_gcc_x86_cpuid_OP.
#
# If the cpuid instruction fails (because you are running a
# cross-compiler, or because you are not using gcc, or because you are on
# a processor that doesn't have this instruction), ax_cv_gcc_x86_cpuid_OP
# is set to the string "unknown".
#
# This macro mainly exists to be used in AX_GCC_ARCHFLAG.
#
# LAST MODIFICATION
#
# 2008-04-12
#
# COPYLEFT
#
# Copyright (c) 2008 Steven G. Johnson
# Copyright (c) 2008 Matteo Frigo
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation, either version 3 of the License, or (at your
# option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU General
# Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program. If not, see .
#
# As a special exception, the respective Autoconf Macro's copyright owner
# gives unlimited permission to copy, distribute and modify the configure
# scripts that are the output of Autoconf when processing the Macro. You
# need not follow the terms of the GNU General Public License when using
# or distributing such scripts, even though portions of the text of the
# Macro appear in them. The GNU General Public License (GPL) does govern
# all other use of the material that constitutes the Autoconf Macro.
#
# This special exception to the GPL applies to versions of the Autoconf
# Macro released by the Autoconf Macro Archive. When you make and
# distribute a modified version of the Autoconf Macro, you may extend this
# special exception to the GPL to apply to your modified version as well.

dnl The test program writes the four registers to the file conftest_cpuid.
dnl The cache variable receives that text, or "unknown" when the program
dnl cannot be built or run, including when cross compiling.
AC_DEFUN([AX_GCC_X86_CPUID],
[AC_REQUIRE([AC_PROG_CC])
AC_LANG_PUSH([C])
AC_CACHE_CHECK(for x86 cpuid $1 output, ax_cv_gcc_x86_cpuid_$1,
 [AC_RUN_IFELSE([AC_LANG_PROGRAM([#include <stdio.h>], [
     int op = $1+0, eax, ebx, ecx, edx;
     FILE *f;
     /* 64-bit code is easy */
     if (sizeof(long) == 8) {
       __asm__("cpuid"
         : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
         : "a" (op));
     } else {
       /* ebx is saved and restored around cpuid; its result is copied
          into whichever register the compiler picked for operand 1 */
       __asm__("pushl %%ebx \n\t"
               "cpuid \n\t"
               "movl %%ebx, %1 \n\t"
               "popl %%ebx \n\t"
         : "=a" (eax), "=r" (ebx), "=c" (ecx), "=d" (edx)
         : "a" (op));
     }
     f = fopen("conftest_cpuid", "w"); if (!f) return 1;
     fprintf(f, "%x:%x:%x:%x\n", eax, ebx, ecx, edx);
     fclose(f);
     return 0;
])],
     [ax_cv_gcc_x86_cpuid_$1=`cat conftest_cpuid`; rm -f conftest_cpuid],
     [ax_cv_gcc_x86_cpuid_$1=unknown; rm -f conftest_cpuid],
     [ax_cv_gcc_x86_cpuid_$1=unknown])])
AC_LANG_POP([C])
])
libm4ri-20130416/m4/ax_guess_path_header.m4000066400000000000000000000035441212302366200202450ustar00rootroot00000000000000#
# SYNOPSIS
#
# AX_GUESS_PATH_HEADER([foo.h])
#
# DESCRIPTION
#
# Search for header foo.h in
-Ipath's found in CPPFLAGS and CFLAGS and set FOO_H_PATH to
# the full directory path where foo.h was found.
# If no header is found in the paths given in CPPFLAGS and CFLAGS, then lastly it looks in /usr/local/include.
#
# LAST MODIFICATION
#
# 2011-04-11
#
# COPYLEFT
#
# Copyright (c) 2011 Carlo Wood
#
# Copying and distribution of this file, with or without modification, are
# permitted in any medium without royalty provided the copyright notice
# and this notice are preserved.

AC_DEFUN([AX_GUESS_PATH_HEADER], [
# Print the first -I directory among the arguments that contains the header,
# or /usr/local/include when only that directory has it. The first argument
# is the header name and is skipped. The function uses the portable
# name-parentheses form and walks the arguments with a for loop, so any
# number of flags is handled.
cw_search_header_path () {
  test [$]# -gt 0 && shift
  for arg
  do
    case "$arg" in
      -I*)
        path="`echo "$arg" | sed -e 's/-I//'`"
        if test -e "$path/$1"; then
          echo "$path"
          return
        fi
        ;;
    esac
  done
  if test -e "/usr/local/include/$1"; then
    echo "/usr/local/include"
  fi
}
# Non-empty when a realpath program is available.
have_realpath=`which realpath 2>/dev/null`
cw_headername_uppercase=`echo "m4_toupper([$1])" | sed -e 's/[[^A-Z]]/_/g'`
AC_CACHE_CHECK([if we can find [$1]], [cw_cv_"$[]cw_headername_uppercase"_path], [
  cw_header_path=`eval cw_search_header_path [$1] $CPPFLAGS $CFLAGS`
  if test -n "$cw_header_path"; then
    if test "x$have_realpath" != "x"; then
      eval cw_cv_"$cw_headername_uppercase"_path=`realpath -s "$cw_header_path"`
    else
      eval cw_cv_"$cw_headername_uppercase"_path="$cw_header_path"
    fi
  else
    eval cw_cv_"$cw_headername_uppercase"_path="no"
  fi
])
# Publish the result: empty when the header was not found.
if eval test \"\$cw_cv_"$cw_headername_uppercase"_path\" = "no"; then
  eval "$cw_headername_uppercase"_PATH=""
else
  eval "$cw_headername_uppercase"_PATH=\"\$cw_cv_"$cw_headername_uppercase"_path\"
fi
])
libm4ri-20130416/m4/ax_guess_path_lib.m4000066400000000000000000000034431212302366200175610ustar00rootroot00000000000000#
# SYNOPSIS
#
# AX_GUESS_PATH_LIB([foo])
#
# DESCRIPTION
#
# Search for library foo in -Lpath's found in LDFLAGS and set LIBFOO_PATH to
# the full directory path where libfoo.so was found.
# If no library is found in paths given in LDFLAGS, then lastly it looks in /usr/local/lib.
#
# LAST MODIFICATION
#
# 2011-04-11
#
# COPYLEFT
#
# Copyright (c) 2011 Carlo Wood
#
# Copying and distribution of this file, with or without modification, are
# permitted in any medium without royalty provided the copyright notice
# and this notice are preserved.

AC_DEFUN([AX_GUESS_PATH_LIB], [
# Print the first -L directory among the arguments that contains the shared
# library, or /usr/local/lib when only that directory has it. The first
# argument is the library name and is skipped. The function uses the portable
# name-parentheses form and walks the arguments with a for loop, so any
# number of flags is handled.
cw_search_library_path () {
  test [$]# -gt 0 && shift
  for arg
  do
    case "$arg" in
      -L*)
        path="`echo "$arg" | sed -e 's/-L//'`"
        if test -e "$path/lib$1.so"; then
          echo "$path"
          return
        fi
        ;;
    esac
  done
  if test -e "/usr/local/lib/lib$1.so"; then
    echo "/usr/local/lib"
  fi
}
# Non-empty when a realpath program is available.
have_realpath=`which realpath 2>/dev/null`
cw_libname_uppercase="m4_toupper([$1])"
AC_CACHE_CHECK([if we can find lib[$1].so], [cw_cv_lib"$[]cw_libname_uppercase"_path], [
  cw_library_path=`eval cw_search_library_path [$1] $LDFLAGS`
  if test -n "$cw_library_path"; then
    if test "x$have_realpath" != "x"; then
      eval cw_cv_lib"$cw_libname_uppercase"_path=`realpath -s "$cw_library_path"`
    else
      eval cw_cv_lib"$cw_libname_uppercase"_path="$cw_library_path"
    fi
  else
    eval cw_cv_lib"$cw_libname_uppercase"_path="no"
  fi
])
# Publish the result: empty when the library was not found.
if eval test \"\$cw_cv_lib"$cw_libname_uppercase"_path\" = "no"; then
  eval LIB"$cw_libname_uppercase"_PATH=""
else
  eval LIB"$cw_libname_uppercase"_PATH=\"\$cw_cv_lib"$cw_libname_uppercase"_path\"
fi
])
libm4ri-20130416/m4/ax_openmp.m4000066400000000000000000000103361212302366200160660ustar00rootroot00000000000000# ===========================================================================
# http://autoconf-archive.cryp.to/ax_openmp.html
# ===========================================================================
#
# SYNOPSIS
#
# AX_OPENMP([ACTION-IF-FOUND[, ACTION-IF-NOT-FOUND]])
#
# DESCRIPTION
#
# This macro tries to find out how to compile programs that use OpenMP a
# standard API and set of compiler directives for parallel programming
# (see http://www-unix.mcs/)
#
# On success, it sets the OPENMP_CFLAGS/OPENMP_CXXFLAGS/OPENMP_F77FLAGS
# output variable to the flag (e.g.
-omp) used both to compile *and* link # OpenMP programs in the current language. # # NOTE: You are assumed to not only compile your program with these flags, # but also link it with them as well. # # If you want to compile everything with OpenMP, you should set: # # CFLAGS="$CFLAGS $OPENMP_CFLAGS" # #OR# CXXFLAGS="$CXXFLAGS $OPENMP_CXXFLAGS" # #OR# FFLAGS="$FFLAGS $OPENMP_FFLAGS" # # (depending on the selected language). # # The user can override the default choice by setting the corresponding # environment variable (e.g. OPENMP_CFLAGS). # # ACTION-IF-FOUND is a list of shell commands to run if an OpenMP flag is # found, and ACTION-IF-NOT-FOUND is a list of commands to run it if it is # not found. If ACTION-IF-FOUND is not specified, the default action will # define HAVE_OPENMP. # # LAST MODIFICATION # # 2008-04-12 # # COPYLEFT # # Copyright (c) 2008 Steven G. Johnson # # This program is free software: you can redistribute it and/or modify it # under the terms of the GNU General Public License as published by the # Free Software Foundation, either version 3 of the License, or (at your # option) any later version. # # This program is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General # Public License for more details. # # You should have received a copy of the GNU General Public License along # with this program. If not, see . # # As a special exception, the respective Autoconf Macro's copyright owner # gives unlimited permission to copy, distribute and modify the configure # scripts that are the output of Autoconf when processing the Macro. You # need not follow the terms of the GNU General Public License when using # or distributing such scripts, even though portions of the text of the # Macro appear in them. The GNU General Public License (GPL) does govern # all other use of the material that constitutes the Autoconf Macro. 
#
# This special exception to the GPL applies to versions of the Autoconf
# Macro released by the Autoconf Macro Archive. When you make and
# distribute a modified version of the Autoconf Macro, you may extend this
# special exception to the GPL to apply to your modified version as well.

AC_DEFUN([AX_OPENMP], [
AC_PREREQ(2.59) dnl for _AC_LANG_PREFIX

AC_CACHE_CHECK([for OpenMP flag of _AC_LANG compiler], ax_cv_[]_AC_LANG_ABBREV[]_openmp, [save[]_AC_LANG_PREFIX[]FLAGS=$[]_AC_LANG_PREFIX[]FLAGS
ax_cv_[]_AC_LANG_ABBREV[]_openmp=unknown
# Flags to try: -fopenmp (gcc), -openmp (icc), -mp (SGI & PGI),
# -xopenmp (Sun), -omp (Tru64), -qsmp=omp (AIX), none
ax_openmp_flags="-fopenmp -openmp -mp -xopenmp -omp -qsmp=omp none"
# A flag preset by the user is tried before the built-in candidates.
if test "x$OPENMP_[]_AC_LANG_PREFIX[]FLAGS" != x; then
  ax_openmp_flags="$OPENMP_[]_AC_LANG_PREFIX[]FLAGS $ax_openmp_flags"
fi
# Every candidate is appended to the flags saved above; the candidate none
# links with exactly the saved flags. The first candidate that links wins.
for ax_openmp_flag in $ax_openmp_flags; do
  case $ax_openmp_flag in
    none) []_AC_LANG_PREFIX[]FLAGS=$save[]_AC_LANG_PREFIX[]FLAGS ;;
    *) []_AC_LANG_PREFIX[]FLAGS="$save[]_AC_LANG_PREFIX[]FLAGS $ax_openmp_flag" ;;
  esac
  AC_TRY_LINK_FUNC(omp_set_num_threads,
	[ax_cv_[]_AC_LANG_ABBREV[]_openmp=$ax_openmp_flag; break])
done
[]_AC_LANG_PREFIX[]FLAGS=$save[]_AC_LANG_PREFIX[]FLAGS
])
# No usable flag runs the second macro argument. Otherwise the flag is stored
# unless it is none, and the first macro argument or its default runs.
if test "x$ax_cv_[]_AC_LANG_ABBREV[]_openmp" = "xunknown"; then
  m4_default([$2],:)
else
  if test "x$ax_cv_[]_AC_LANG_ABBREV[]_openmp" != "xnone"; then
    OPENMP_[]_AC_LANG_PREFIX[]FLAGS=$ax_cv_[]_AC_LANG_ABBREV[]_openmp
  fi
  m4_default([$1], [AC_DEFINE(HAVE_OPENMP,1,[Define if OpenMP is enabled])])
fi
])dnl AX_OPENMP
libm4ri-20130416/m4/pkg.m4000066400000000000000000000130231212302366200146550ustar00rootroot00000000000000# pkg.m4 - Macros to locate and utilise pkg-config. -*- Autoconf -*-
# serial 1 (pkg-config-0.24)
#
# Copyright © 2004 Scott James Remnant .
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#
# As a special exception to the GNU General Public License, if you
# distribute this file as part of a program that contains a
# configuration script generated by Autoconf, you may include it under
# the same distribution terms that you use for the rest of that program.

# PKG_PROG_PKG_CONFIG([MIN-VERSION])
# ----------------------------------
# Locate pkg-config unless PKG_CONFIG was preset in the environment, then
# clear PKG_CONFIG again when the program found is older than MIN-VERSION
# (default 0.9.0).
AC_DEFUN([PKG_PROG_PKG_CONFIG],
[m4_pattern_forbid([^_?PKG_[A-Z_]+$])
m4_pattern_allow([^PKG_CONFIG(_(PATH|LIBDIR|SYSROOT_DIR|ALLOW_SYSTEM_(CFLAGS|LIBS)))?$])
m4_pattern_allow([^PKG_CONFIG_(DISABLE_UNINSTALLED|TOP_BUILD_DIR|DEBUG_SPEW)$])
AC_ARG_VAR([PKG_CONFIG], [path to pkg-config utility])
AC_ARG_VAR([PKG_CONFIG_PATH], [directories to add to pkg-config's search path])
AC_ARG_VAR([PKG_CONFIG_LIBDIR], [path overriding pkg-config's built-in search path])

if test "x$ac_cv_env_PKG_CONFIG_set" != "xset"; then
	AC_PATH_TOOL([PKG_CONFIG], [pkg-config])
fi
if test -n "$PKG_CONFIG"; then
	_pkg_min_version=m4_default([$1], [0.9.0])
	AC_MSG_CHECKING([pkg-config is at least version $_pkg_min_version])
	if $PKG_CONFIG --atleast-pkgconfig-version $_pkg_min_version; then
		AC_MSG_RESULT([yes])
	else
		AC_MSG_RESULT([no])
		PKG_CONFIG=""
	fi
fi[]dnl
])# PKG_PROG_PKG_CONFIG

# PKG_CHECK_EXISTS(MODULES, [ACTION-IF-FOUND], [ACTION-IF-NOT-FOUND])
#
# Check to see whether a particular set of modules exists. Similar
# to PKG_CHECK_MODULES(), but does not set variables or print errors.
#
# Please remember that m4 expands AC_REQUIRE([PKG_PROG_PKG_CONFIG])
# only at the first occurrence in configure.ac, so if the first place
# it's called might be skipped (such as if it is within an "if"), you
# have to call PKG_CHECK_EXISTS manually
# --------------------------------------------------------------
AC_DEFUN([PKG_CHECK_EXISTS],
[AC_REQUIRE([PKG_PROG_PKG_CONFIG])dnl
if test -n "$PKG_CONFIG" && \
    AC_RUN_LOG([$PKG_CONFIG --exists --print-errors "$1"]); then
  m4_default([$2], [:])
m4_ifvaln([$3], [else $3])dnl
fi])

# _PKG_CONFIG([VARIABLE], [COMMAND], [MODULES])
# ---------------------------------------------
# Fill pkg_cv_VARIABLE from a preset VARIABLE if there is one, otherwise from
# the pkg-config COMMAND output for MODULES. pkg_failed becomes yes when the
# query fails and untried when no pkg-config program is available.
m4_define([_PKG_CONFIG],
[if test -n "$$1"; then
    pkg_cv_[]$1="$$1"
 elif test -n "$PKG_CONFIG"; then
    PKG_CHECK_EXISTS([$3],
                     [pkg_cv_[]$1=`$PKG_CONFIG --[]$2 "$3" 2>/dev/null`
		      test "x$?" != "x0" && pkg_failed=yes ],
		     [pkg_failed=yes])
 else
    pkg_failed=untried
fi[]dnl
])# _PKG_CONFIG

# _PKG_SHORT_ERRORS_SUPPORTED
# -----------------------------
# Set _pkg_short_errors_supported to yes when pkg-config is at least
# version 0.20, otherwise to no.
AC_DEFUN([_PKG_SHORT_ERRORS_SUPPORTED],
[AC_REQUIRE([PKG_PROG_PKG_CONFIG])
if $PKG_CONFIG --atleast-pkgconfig-version 0.20; then
        _pkg_short_errors_supported=yes
else
        _pkg_short_errors_supported=no
fi[]dnl
])# _PKG_SHORT_ERRORS_SUPPORTED

# PKG_CHECK_MODULES(VARIABLE-PREFIX, MODULES, [ACTION-IF-FOUND],
# [ACTION-IF-NOT-FOUND])
#
#
# Note that if there is a possibility the first call to
# PKG_CHECK_MODULES might not happen, you should be sure to include an
# explicit call to PKG_PROG_PKG_CONFIG in your configure.ac
#
#
# --------------------------------------------------------------
AC_DEFUN([PKG_CHECK_MODULES],
[AC_REQUIRE([PKG_PROG_PKG_CONFIG])dnl
AC_ARG_VAR([$1][_CFLAGS], [C compiler flags for $1, overriding pkg-config])dnl
AC_ARG_VAR([$1][_LIBS], [linker flags for $1, overriding pkg-config])dnl

pkg_failed=no
AC_MSG_CHECKING([for $1])

_PKG_CONFIG([$1][_CFLAGS], [cflags], [$2])
_PKG_CONFIG([$1][_LIBS], [libs], [$2])

m4_define([_PKG_TEXT], [Alternatively, you may set the environment variables $1[]_CFLAGS and $1[]_LIBS to avoid the need to call pkg-config. See the pkg-config man page for more details.])

if test $pkg_failed = yes; then
   	AC_MSG_RESULT([no])
        _PKG_SHORT_ERRORS_SUPPORTED
        if test $_pkg_short_errors_supported = yes; then
	        $1[]_PKG_ERRORS=`$PKG_CONFIG --short-errors --print-errors --cflags --libs "$2" 2>&1`
        else
	        $1[]_PKG_ERRORS=`$PKG_CONFIG --print-errors --cflags --libs "$2" 2>&1`
        fi
	# Put the nasty error message in config.log where it belongs
	echo "$$1[]_PKG_ERRORS" >&AS_MESSAGE_LOG_FD

	m4_default([$4], [AC_MSG_ERROR( [Package requirements ($2) were not met: $$1_PKG_ERRORS Consider adjusting the PKG_CONFIG_PATH environment variable if you installed software in a non-standard prefix. _PKG_TEXT])[]dnl
        ])
elif test $pkg_failed = untried; then
     	AC_MSG_RESULT([no])
	m4_default([$4], [AC_MSG_FAILURE( [The pkg-config script could not be found or is too old. Make sure it is in your PATH or set the PKG_CONFIG environment variable to the full path to pkg-config. _PKG_TEXT To get pkg-config, see .])[]dnl
        ])
else
	$1[]_CFLAGS=$pkg_cv_[]$1[]_CFLAGS
	$1[]_LIBS=$pkg_cv_[]$1[]_LIBS
        AC_MSG_RESULT([yes])
	$3
fi[]dnl
])# PKG_CHECK_MODULES
libm4ri-20130416/m4ri000077700000000000000000000000001212302366200146752src/ustar00rootroot00000000000000libm4ri-20130416/m4ri.pc.in000066400000000000000000000004101212302366200151120ustar00rootroot00000000000000prefix=@prefix@
exec_prefix=@exec_prefix@
libdir=@libdir@
includedir=@includedir@

Name: M4RI
Description: Dense linear algebra over GF(2).
Version: @PACKAGE_VERSION@ Libs: -L${libdir} -lm4ri -lm Cflags: -I${includedir}/m4ri @CFLAGS@ @SIMD_CFLAGS@ @OPENMP_CFLAGS@ libm4ri-20130416/m4ri.vcxproj000066400000000000000000000137601212302366200156120ustar00rootroot00000000000000 Debug Win32 Release Win32 {B7057015-3433-44CD-8D13-F5AA62A1AC9E} Win32Proj StaticLibrary StaticLibrary <_ProjectFileVersion>10.0.30319.1 Debug\ Debug\ true Release\ Release\ false .lib .lib Disabled WIN32;_DEBUG;_WINDOWS;_USRDLL;M4RI_EXPORTS;%(PreprocessorDefinitions) true EnableFastChecks MultiThreadedDebugDLL Level3 EditAndContinue CompileAsCpp true Windows MachineX86 WIN32;NDEBUG;_WINDOWS;_USRDLL;M4RI_EXPORTS;%(PreprocessorDefinitions) MultiThreadedDLL Level3 ProgramDatabase CompileAsCpp true Windows true true MachineX86 libm4ri-20130416/src/000077500000000000000000000000001212302366200141025ustar00rootroot00000000000000libm4ri-20130416/src/Doxyfile000066400000000000000000002046521212302366200156210ustar00rootroot00000000000000# Doxyfile 1.7.1 # This file describes the settings to be used by the documentation system # doxygen (www.doxygen.org) for a project # # All text after a hash (#) is considered a comment and will be ignored # The format is: # TAG = value [value, ...] # For lists items can also be appended using: # TAG += value [value, ...] # Values that contain spaces should be placed between quotes (" ") #--------------------------------------------------------------------------- # Project related configuration options #--------------------------------------------------------------------------- # This tag specifies the encoding used for all characters in the config file # that follow. The default is UTF-8 which is also the encoding used for all # text before the first occurrence of this tag. Doxygen uses libiconv (or the # iconv built into libc) for the transcoding. See # http://www.gnu.org/software/libiconv for the list of possible encodings. 
DOXYFILE_ENCODING = UTF-8 # The PROJECT_NAME tag is a single word (or a sequence of words surrounded # by quotes) that should identify the project. PROJECT_NAME = M4RI # The PROJECT_NUMBER tag can be used to enter a project or revision number. # This could be handy for archiving the generated documentation or # if some version control system is used. PROJECT_NUMBER = 1.0.1 # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) # base path where the generated documentation will be put. # If a relative path is entered, it will be relative to the location # where doxygen was started. If left blank the current directory will be used. OUTPUT_DIRECTORY = ../doc/ # If the CREATE_SUBDIRS tag is set to YES, then doxygen will create # 4096 sub-directories (in 2 levels) under the output directory of each output # format and will distribute the generated files over these directories. # Enabling this option can be useful when feeding doxygen a huge amount of # source files, where putting all generated files in the same directory would # otherwise cause performance problems for the file system. CREATE_SUBDIRS = NO # The OUTPUT_LANGUAGE tag is used to specify the language in which all # documentation generated by doxygen is written. Doxygen will use this # information to generate all constant output in the proper language. # The default language is English, other supported languages are: # Afrikaans, Arabic, Brazilian, Catalan, Chinese, Chinese-Traditional, # Croatian, Czech, Danish, Dutch, Esperanto, Farsi, Finnish, French, German, # Greek, Hungarian, Italian, Japanese, Japanese-en (Japanese with English # messages), Korean, Korean-en, Lithuanian, Norwegian, Macedonian, Persian, # Polish, Portuguese, Romanian, Russian, Serbian, Serbian-Cyrilic, Slovak, # Slovene, Spanish, Swedish, Ukrainian, and Vietnamese. 
OUTPUT_LANGUAGE = English # If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will # include brief member descriptions after the members that are listed in # the file and class documentation (similar to JavaDoc). # Set to NO to disable this. BRIEF_MEMBER_DESC = YES # If the REPEAT_BRIEF tag is set to YES (the default) Doxygen will prepend # the brief description of a member or function before the detailed description. # Note: if both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the # brief descriptions will be completely suppressed. REPEAT_BRIEF = YES # This tag implements a quasi-intelligent brief description abbreviator # that is used to form the text in various listings. Each string # in this list, if found as the leading text of the brief description, will be # stripped from the text and the result after processing the whole list, is # used as the annotated text. Otherwise, the brief description is used as-is. # If left blank, the following values are used ("$name" is automatically # replaced with the name of the entity): "The $name class" "The $name widget" # "The $name file" "is" "provides" "specifies" "contains" # "represents" "a" "an" "the" ABBREVIATE_BRIEF = # If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then # Doxygen will generate a detailed section even if there is only a brief # description. ALWAYS_DETAILED_SEC = NO # If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all # inherited members of a class in the documentation of that class as if those # members were ordinary class members. Constructors, destructors and assignment # operators of the base classes will not be shown. INLINE_INHERITED_MEMB = NO # If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full # path before files name in the file list and in the header files. If set # to NO the shortest path that makes the file name unique will be used. 
FULL_PATH_NAMES = YES # If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag # can be used to strip a user-defined part of the path. Stripping is # only done if one of the specified strings matches the left-hand part of # the path. The tag can be used to show relative paths in the file list. # If left blank the directory from which doxygen is run is used as the # path to strip. STRIP_FROM_PATH = # The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of # the path mentioned in the documentation of a class, which tells # the reader which header file to include in order to use a class. # If left blank only the name of the header file containing the class # definition is used. Otherwise one should specify the include paths that # are normally passed to the compiler using the -I flag. STRIP_FROM_INC_PATH = # If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter # (but less readable) file names. This can be useful is your file systems # doesn't support long names like on DOS, Mac, or CD-ROM. SHORT_NAMES = NO # If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen # will interpret the first line (until the first dot) of a JavaDoc-style # comment as the brief description. If set to NO, the JavaDoc # comments will behave just like regular Qt-style comments # (thus requiring an explicit @brief command for a brief description.) JAVADOC_AUTOBRIEF = NO # If the QT_AUTOBRIEF tag is set to YES then Doxygen will # interpret the first line (until the first dot) of a Qt-style # comment as the brief description. If set to NO, the comments # will behave just like regular Qt-style comments (thus requiring # an explicit \brief command for a brief description.) QT_AUTOBRIEF = NO # The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make Doxygen # treat a multi-line C++ special comment block (i.e. a block of //! or /// # comments) as a brief description. This used to be the default behaviour. 
# The new default is to treat a multi-line C++ comment block as a detailed # description. Set this tag to YES if you prefer the old behaviour instead. MULTILINE_CPP_IS_BRIEF = NO # If the INHERIT_DOCS tag is set to YES (the default) then an undocumented # member inherits the documentation from any documented member that it # re-implements. INHERIT_DOCS = YES # If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce # a new page for each member. If set to NO, the documentation of a member will # be part of the file/class/namespace that contains it. SEPARATE_MEMBER_PAGES = NO # The TAB_SIZE tag can be used to set the number of spaces in a tab. # Doxygen uses this value to replace tabs by spaces in code fragments. TAB_SIZE = 8 # This tag can be used to specify a number of aliases that acts # as commands in the documentation. An alias has the form "name=value". # For example adding "sideeffect=\par Side Effects:\n" will allow you to # put the command \sideeffect (or @sideeffect) in the documentation, which # will result in a user-defined paragraph with heading "Side Effects:". # You can put \n's in the value part of an alias to insert newlines. ALIASES = "wordoffset=\par Ignores offset atrtribute of packedmatrix.\n" # Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C # sources only. Doxygen will then generate output that is more tailored for C. # For instance, some of the names that are used will be different. The list # of all members will be omitted, etc. OPTIMIZE_OUTPUT_FOR_C = YES # Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java # sources only. Doxygen will then generate output that is more tailored for # Java. For instance, namespaces will be presented as packages, qualified # scopes will look different, etc. OPTIMIZE_OUTPUT_JAVA = NO # Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran # sources only. Doxygen will then generate output that is more tailored for # Fortran. 
OPTIMIZE_FOR_FORTRAN = NO # Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL # sources. Doxygen will then generate output that is tailored for # VHDL. OPTIMIZE_OUTPUT_VHDL = NO # Doxygen selects the parser to use depending on the extension of the files it # parses. With this tag you can assign which parser to use for a given extension. # Doxygen has a built-in mapping, but you can override or extend it using this # tag. The format is ext=language, where ext is a file extension, and language # is one of the parsers supported by doxygen: IDL, Java, Javascript, CSharp, C, # C++, D, PHP, Objective-C, Python, Fortran, VHDL, C, C++. For instance to make # doxygen treat .inc files as Fortran files (default is PHP), and .f files as C # (default is Fortran), use: inc=Fortran f=C. Note that for custom extensions # you also need to set FILE_PATTERNS otherwise the files are not read by doxygen. EXTENSION_MAPPING = # If you use STL classes (i.e. std::string, std::vector, etc.) but do not want # to include (a tag file for) the STL sources as input, then you should # set this tag to YES in order to let doxygen match functions declarations and # definitions whose arguments contain STL classes (e.g. func(std::string); v.s. # func(std::string) {}). This also make the inheritance and collaboration # diagrams that involve STL classes more complete and accurate. BUILTIN_STL_SUPPORT = NO # If you use Microsoft's C++/CLI language, you should set this option to YES to # enable parsing support. CPP_CLI_SUPPORT = NO # Set the SIP_SUPPORT tag to YES if your project consists of sip sources only. # Doxygen will parse them like normal C++ but will assume all classes use public # instead of private inheritance when no explicit protection keyword is present. SIP_SUPPORT = NO # For Microsoft's IDL there are propget and propput attributes to indicate getter # and setter methods for a property. 
Setting this option to YES (the default) # will make doxygen to replace the get and set methods by a property in the # documentation. This will only work if the methods are indeed getting or # setting a simple type. If this is not the case, or you want to show the # methods anyway, you should set this option to NO. IDL_PROPERTY_SUPPORT = YES # If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC # tag is set to YES, then doxygen will reuse the documentation of the first # member in the group (if any) for the other members of the group. By default # all members of a group must be documented explicitly. DISTRIBUTE_GROUP_DOC = NO # Set the SUBGROUPING tag to YES (the default) to allow class member groups of # the same type (for instance a group of public functions) to be put as a # subgroup of that type (e.g. under the Public Functions section). Set it to # NO to prevent subgrouping. Alternatively, this can be done per class using # the \nosubgrouping command. SUBGROUPING = YES # When TYPEDEF_HIDES_STRUCT is enabled, a typedef of a struct, union, or enum # is documented as struct, union, or enum with the name of the typedef. So # typedef struct TypeS {} TypeT, will appear in the documentation as a struct # with name TypeT. When disabled the typedef will appear as a member of a file, # namespace, or class. And the struct will be named TypeS. This can typically # be useful for C code in case the coding convention dictates that all compound # types are typedef'ed and only the typedef is referenced, never the tag name. TYPEDEF_HIDES_STRUCT = NO # The SYMBOL_CACHE_SIZE determines the size of the internal cache use to # determine which symbols to keep in memory and which to flush to disk. # When the cache is full, less often used symbols will be written to disk. # For small to medium size projects (<1000 input files) the default value is # probably good enough. 
For larger projects a too small cache size can cause # doxygen to be busy swapping symbols to and from disk most of the time # causing a significant performance penality. # If the system has enough physical memory increasing the cache will improve the # performance by keeping more symbols in memory. Note that the value works on # a logarithmic scale so increasing the size by one will rougly double the # memory usage. The cache size is given by this formula: # 2^(16+SYMBOL_CACHE_SIZE). The valid range is 0..9, the default is 0, # corresponding to a cache size of 2^16 = 65536 symbols SYMBOL_CACHE_SIZE = 0 #--------------------------------------------------------------------------- # Build related configuration options #--------------------------------------------------------------------------- # If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in # documentation are documented, even if no documentation was available. # Private class members and static file members will be hidden unless # the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES EXTRACT_ALL = NO # If the EXTRACT_PRIVATE tag is set to YES all private members of a class # will be included in the documentation. EXTRACT_PRIVATE = NO # If the EXTRACT_STATIC tag is set to YES all static members of a file # will be included in the documentation. EXTRACT_STATIC = YES # If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs) # defined locally in source files will be included in the documentation. # If set to NO only classes defined in header files are included. EXTRACT_LOCAL_CLASSES = YES # This flag is only useful for Objective-C code. When set to YES local # methods, which are defined in the implementation section but not in # the interface are included in the documentation. # If set to NO (the default) only methods in the interface are included. 
EXTRACT_LOCAL_METHODS = NO # If this flag is set to YES, the members of anonymous namespaces will be # extracted and appear in the documentation as a namespace called # 'anonymous_namespace{file}', where file will be replaced with the base # name of the file that contains the anonymous namespace. By default # anonymous namespace are hidden. EXTRACT_ANON_NSPACES = NO # If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all # undocumented members of documented classes, files or namespaces. # If set to NO (the default) these members will be included in the # various overviews, but no documentation section is generated. # This option has no effect if EXTRACT_ALL is enabled. HIDE_UNDOC_MEMBERS = NO # If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all # undocumented classes that are normally visible in the class hierarchy. # If set to NO (the default) these classes will be included in the various # overviews. This option has no effect if EXTRACT_ALL is enabled. HIDE_UNDOC_CLASSES = NO # If the HIDE_FRIEND_COMPOUNDS tag is set to YES, Doxygen will hide all # friend (class|struct|union) declarations. # If set to NO (the default) these declarations will be included in the # documentation. HIDE_FRIEND_COMPOUNDS = NO # If the HIDE_IN_BODY_DOCS tag is set to YES, Doxygen will hide any # documentation blocks found inside the body of a function. # If set to NO (the default) these blocks will be appended to the # function's detailed documentation block. HIDE_IN_BODY_DOCS = NO # The INTERNAL_DOCS tag determines if documentation # that is typed after a \internal command is included. If the tag is set # to NO (the default) then the documentation will be excluded. # Set it to YES to include the internal documentation. INTERNAL_DOCS = NO # If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate # file names in lower-case letters. If set to YES upper-case letters are also # allowed. 
This is useful if you have classes or files whose names only differ # in case and if your file system supports case sensitive file names. Windows # and Mac users are advised to set this option to NO. CASE_SENSE_NAMES = YES # If the HIDE_SCOPE_NAMES tag is set to NO (the default) then Doxygen # will show members with their full class and namespace scopes in the # documentation. If set to YES the scope will be hidden. HIDE_SCOPE_NAMES = NO # If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen # will put a list of the files that are included by a file in the documentation # of that file. SHOW_INCLUDE_FILES = YES # If the FORCE_LOCAL_INCLUDES tag is set to YES then Doxygen # will list include files with double quotes in the documentation # rather than with sharp brackets. FORCE_LOCAL_INCLUDES = NO # If the INLINE_INFO tag is set to YES (the default) then a tag [inline] # is inserted in the documentation for inline members. INLINE_INFO = YES # If the SORT_MEMBER_DOCS tag is set to YES (the default) then doxygen # will sort the (detailed) documentation of file and class members # alphabetically by member name. If set to NO the members will appear in # declaration order. SORT_MEMBER_DOCS = YES # If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the # brief documentation of file, namespace and class members alphabetically # by member name. If set to NO (the default) the members will appear in # declaration order. SORT_BRIEF_DOCS = NO # If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen # will sort the (brief and detailed) documentation of class members so that # constructors and destructors are listed first. If set to NO (the default) # the constructors will appear in the respective orders defined by # SORT_MEMBER_DOCS and SORT_BRIEF_DOCS. # This tag will be ignored for brief docs if SORT_BRIEF_DOCS is set to NO # and ignored for detailed docs if SORT_MEMBER_DOCS is set to NO. 
SORT_MEMBERS_CTORS_1ST = NO # If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the # hierarchy of group names into alphabetical order. If set to NO (the default) # the group names will appear in their defined order. SORT_GROUP_NAMES = NO # If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be # sorted by fully-qualified names, including namespaces. If set to # NO (the default), the class list will be sorted only by class name, # not including the namespace part. # Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. # Note: This option applies only to the class list, not to the # alphabetical list. SORT_BY_SCOPE_NAME = NO # The GENERATE_TODOLIST tag can be used to enable (YES) or # disable (NO) the todo list. This list is created by putting \todo # commands in the documentation. GENERATE_TODOLIST = YES # The GENERATE_TESTLIST tag can be used to enable (YES) or # disable (NO) the test list. This list is created by putting \test # commands in the documentation. GENERATE_TESTLIST = YES # The GENERATE_BUGLIST tag can be used to enable (YES) or # disable (NO) the bug list. This list is created by putting \bug # commands in the documentation. GENERATE_BUGLIST = YES # The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or # disable (NO) the deprecated list. This list is created by putting # \deprecated commands in the documentation. GENERATE_DEPRECATEDLIST= YES # The ENABLED_SECTIONS tag can be used to enable conditional # documentation sections, marked by \if sectionname ... \endif. ENABLED_SECTIONS = # The MAX_INITIALIZER_LINES tag determines the maximum number of lines # the initial value of a variable or define consists of for it to appear in # the documentation. If the initializer consists of more lines than specified # here it will be hidden. Use a value of 0 to hide initializers completely. 
# The appearance of the initializer of individual variables and defines in the # documentation can be controlled using \showinitializer or \hideinitializer # command in the documentation regardless of this setting. MAX_INITIALIZER_LINES = 30 # Set the SHOW_USED_FILES tag to NO to disable the list of files generated # at the bottom of the documentation of classes and structs. If set to YES the # list will mention the files that were used to generate the documentation. SHOW_USED_FILES = YES # If the sources in your project are distributed over multiple directories # then setting the SHOW_DIRECTORIES tag to YES will show the directory hierarchy # in the documentation. The default is NO. SHOW_DIRECTORIES = NO # Set the SHOW_FILES tag to NO to disable the generation of the Files page. # This will remove the Files entry from the Quick Index and from the # Folder Tree View (if specified). The default is YES. SHOW_FILES = YES # Set the SHOW_NAMESPACES tag to NO to disable the generation of the # Namespaces page. # This will remove the Namespaces entry from the Quick Index # and from the Folder Tree View (if specified). The default is YES. SHOW_NAMESPACES = YES # The FILE_VERSION_FILTER tag can be used to specify a program or script that # doxygen should invoke to get the current version for each file (typically from # the version control system). Doxygen will invoke the program by executing (via # popen()) the command , where is the value of # the FILE_VERSION_FILTER tag, and is the name of an input file # provided by doxygen. Whatever the program writes to standard output # is used as the file version. See the manual for examples. FILE_VERSION_FILTER = # The LAYOUT_FILE tag can be used to specify a layout file which will be parsed # by doxygen. The layout file controls the global structure of the generated # output files in an output format independent way. The create the layout file # that represents doxygen's defaults, run doxygen with the -l option. 
# You can optionally specify a file name after the option, if omitted # DoxygenLayout.xml will be used as the name of the layout file. LAYOUT_FILE = #--------------------------------------------------------------------------- # configuration options related to warning and progress messages #--------------------------------------------------------------------------- # The QUIET tag can be used to turn on/off the messages that are generated # by doxygen. Possible values are YES and NO. If left blank NO is used. QUIET = NO # The WARNINGS tag can be used to turn on/off the warning messages that are # generated by doxygen. Possible values are YES and NO. If left blank # NO is used. WARNINGS = YES # If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings # for undocumented members. If EXTRACT_ALL is set to YES then this flag will # automatically be disabled. WARN_IF_UNDOCUMENTED = YES # If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for # potential errors in the documentation, such as not documenting some # parameters in a documented function, or documenting parameters that # don't exist or using markup commands wrongly. WARN_IF_DOC_ERROR = YES # This WARN_NO_PARAMDOC option can be abled to get warnings for # functions that are documented, but have no documentation for their parameters # or return value. If set to NO (the default) doxygen will only warn about # wrong or incomplete parameter documentation, but not about the absence of # documentation. WARN_NO_PARAMDOC = NO # The WARN_FORMAT tag determines the format of the warning messages that # doxygen can produce. The string should contain the $file, $line, and $text # tags, which will be replaced by the file and line number from which the # warning originated and the warning text. 
Optionally the format may contain # $version, which will be replaced by the version of the file (if it could # be obtained via FILE_VERSION_FILTER) WARN_FORMAT = "$file:$line: $text" # The WARN_LOGFILE tag can be used to specify a file to which warning # and error messages should be written. If left blank the output is written # to stderr. WARN_LOGFILE = #--------------------------------------------------------------------------- # configuration options related to the input files #--------------------------------------------------------------------------- # The INPUT tag can be used to specify the files and/or directories that contain # documented source files. You may enter file names like "myfile.cpp" or # directories like "/usr/src/myproject". Separate the files or directories # with spaces. INPUT = # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding, which is # also the default input encoding. Doxygen uses libiconv (or the iconv built # into libc) for the transcoding. See http://www.gnu.org/software/libiconv for # the list of possible encodings. INPUT_ENCODING = UTF-8 # If the value of the INPUT tag contains directories, you can use the # FILE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp # and *.h) to filter out the source-files in the directories. If left # blank the following patterns are tested: # *.c *.cc *.cxx *.cpp *.c++ *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh *.hxx # *.hpp *.h++ *.idl *.odl *.cs *.php *.php3 *.inc *.m *.mm *.py *.f90 FILE_PATTERNS = # The RECURSIVE tag can be used to turn specify whether or not subdirectories # should be searched for input files as well. Possible values are YES and NO. # If left blank NO is used. RECURSIVE = NO # The EXCLUDE tag can be used to specify files and/or directories that should # excluded from the INPUT source files. 
This way you can easily exclude a # subdirectory from a directory tree whose root is specified with the INPUT tag. EXCLUDE = # The EXCLUDE_SYMLINKS tag can be used select whether or not files or # directories that are symbolic links (a Unix filesystem feature) are excluded # from the input. EXCLUDE_SYMLINKS = NO # If the value of the INPUT tag contains directories, you can use the # EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude # certain files from those directories. Note that the wildcards are matched # against the file with absolute path, so to exclude all test directories # for example use the pattern */test/* EXCLUDE_PATTERNS = # The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names # (namespaces, classes, functions, etc.) that should be excluded from the # output. The symbol name can be a fully qualified name, a word, or if the # wildcard * is used, a substring. Examples: ANamespace, AClass, # AClass::ANamespace, ANamespace::*Test EXCLUDE_SYMBOLS = # The EXAMPLE_PATH tag can be used to specify one or more files or # directories that contain example code fragments that are included (see # the \include command). EXAMPLE_PATH = ../testsuite # If the value of the EXAMPLE_PATH tag contains directories, you can use the # EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp # and *.h) to filter out the source-files in the directories. If left # blank all files are included. EXAMPLE_PATTERNS = # If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be # searched for input files to be used with the \include or \dontinclude # commands irrespective of the value of the RECURSIVE tag. # Possible values are YES and NO. If left blank NO is used. EXAMPLE_RECURSIVE = NO # The IMAGE_PATH tag can be used to specify one or more files or # directories that contain image that are included in the documentation (see # the \image command). 
IMAGE_PATH = # The INPUT_FILTER tag can be used to specify a program that doxygen should # invoke to filter for each input file. Doxygen will invoke the filter program # by executing (via popen()) the command , where # is the value of the INPUT_FILTER tag, and is the name of an # input file. Doxygen will then use the output that the filter program writes # to standard output. # If FILTER_PATTERNS is specified, this tag will be # ignored. INPUT_FILTER = # The FILTER_PATTERNS tag can be used to specify filters on a per file pattern # basis. # Doxygen will compare the file name with each pattern and apply the # filter if there is a match. # The filters are a list of the form: # pattern=filter (like *.cpp=my_cpp_filter). See INPUT_FILTER for further # info on how filters are used. If FILTER_PATTERNS is empty, INPUT_FILTER # is applied to all files. FILTER_PATTERNS = # If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using # INPUT_FILTER) will be used to filter the input files when producing source # files to browse (i.e. when SOURCE_BROWSER is set to YES). FILTER_SOURCE_FILES = NO #--------------------------------------------------------------------------- # configuration options related to source browsing #--------------------------------------------------------------------------- # If the SOURCE_BROWSER tag is set to YES then a list of source files will # be generated. Documented entities will be cross-referenced with these sources. # Note: To get rid of all source code in the generated output, make sure also # VERBATIM_HEADERS is set to NO. SOURCE_BROWSER = NO # Setting the INLINE_SOURCES tag to YES will include the body # of functions and classes directly in the documentation. INLINE_SOURCES = NO # Setting the STRIP_CODE_COMMENTS tag to YES (the default) will instruct # doxygen to hide any special comment blocks from generated source code # fragments. Normal C and C++ comments will always remain visible. 
STRIP_CODE_COMMENTS = YES # If the REFERENCED_BY_RELATION tag is set to YES # then for each documented function all documented # functions referencing it will be listed. REFERENCED_BY_RELATION = NO # If the REFERENCES_RELATION tag is set to YES # then for each documented function all documented entities # called/used by that function will be listed. REFERENCES_RELATION = NO # If the REFERENCES_LINK_SOURCE tag is set to YES (the default) # and SOURCE_BROWSER tag is set to YES, then the hyperlinks from # functions in REFERENCES_RELATION and REFERENCED_BY_RELATION lists will # link to the source code. # Otherwise they will link to the documentation. REFERENCES_LINK_SOURCE = YES # If the USE_HTAGS tag is set to YES then the references to source code # will point to the HTML generated by the htags(1) tool instead of doxygen # built-in source browser. The htags tool is part of GNU's global source # tagging system (see http://www.gnu.org/software/global/global.html). You # will need version 4.8.6 or higher. USE_HTAGS = NO # If the VERBATIM_HEADERS tag is set to YES (the default) then Doxygen # will generate a verbatim copy of the header file for each class for # which an include is specified. Set to NO to disable this. VERBATIM_HEADERS = YES #--------------------------------------------------------------------------- # configuration options related to the alphabetical class index #--------------------------------------------------------------------------- # If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index # of all compounds will be generated. Enable this if the project # contains a lot of classes, structs, unions or interfaces. 
ALPHABETICAL_INDEX = NO # If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then # the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns # in which this list will be split (can be a number in the range [1..20]) COLS_IN_ALPHA_INDEX = 5 # In case all classes in a project start with a common prefix, all # classes will be put under the same header in the alphabetical index. # The IGNORE_PREFIX tag can be used to specify one or more prefixes that # should be ignored while generating the index headers. IGNORE_PREFIX = #--------------------------------------------------------------------------- # configuration options related to the HTML output #--------------------------------------------------------------------------- # If the GENERATE_HTML tag is set to YES (the default) Doxygen will # generate HTML output. GENERATE_HTML = YES # The HTML_OUTPUT tag is used to specify where the HTML docs will be put. # If a relative path is entered the value of OUTPUT_DIRECTORY will be # put in front of it. If left blank `html' will be used as the default path. HTML_OUTPUT = html # The HTML_FILE_EXTENSION tag can be used to specify the file extension for # each generated HTML page (for example: .htm,.php,.asp). If it is left blank # doxygen will generate files with .html extension. HTML_FILE_EXTENSION = .html # The HTML_HEADER tag can be used to specify a personal HTML header for # each generated HTML page. If it is left blank doxygen will generate a # standard header. HTML_HEADER = # The HTML_FOOTER tag can be used to specify a personal HTML footer for # each generated HTML page. If it is left blank doxygen will generate a # standard footer. HTML_FOOTER = # The HTML_STYLESHEET tag can be used to specify a user-defined cascading # style sheet that is used by each HTML page. It can be used to # fine-tune the look of the HTML output. If the tag is left blank doxygen # will generate a default style sheet. 
Note that doxygen will try to copy # the style sheet file to the HTML output directory, so don't put your own # stylesheet in the HTML output directory as well, or it will be erased! HTML_STYLESHEET = # The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. # Doxygen will adjust the colors in the stylesheet and background images # according to this color. Hue is specified as an angle on a colorwheel, # see http://en.wikipedia.org/wiki/Hue for more information. # For instance the value 0 represents red, 60 is yellow, 120 is green, # 180 is cyan, 240 is blue, 300 purple, and 360 is red again. # The allowed range is 0 to 359. HTML_COLORSTYLE_HUE = 220 # The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of # the colors in the HTML output. For a value of 0 the output will use # grayscales only. A value of 255 will produce the most vivid colors. HTML_COLORSTYLE_SAT = 100 # The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to # the luminance component of the colors in the HTML output. Values below # 100 gradually make the output lighter, whereas values above 100 make # the output darker. The value divided by 100 is the actual gamma applied, # so 80 represents a gamma of 0.8, The value 220 represents a gamma of 2.2, # and 100 does not change the gamma. HTML_COLORSTYLE_GAMMA = 80 # If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML # page will contain the date and time when the page was generated. Setting # this to NO can help when comparing the output of multiple runs. HTML_TIMESTAMP = YES # If the HTML_ALIGN_MEMBERS tag is set to YES, the members of classes, # files or namespaces will be aligned in HTML using tables. If set to # NO a bullet list will be used. HTML_ALIGN_MEMBERS = YES # If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML # documentation will contain sections that can be hidden and shown after the # page has loaded. 
For this to work a browser that supports # JavaScript and DHTML is required (for instance Mozilla 1.0+, Firefox, # Netscape 6.0+, Internet explorer 5.0+, Konqueror, or Safari). HTML_DYNAMIC_SECTIONS = NO # If the GENERATE_DOCSET tag is set to YES, additional index files # will be generated that can be used as input for Apple's Xcode 3 # integrated development environment, introduced with OSX 10.5 (Leopard). # To create a documentation set, doxygen will generate a Makefile in the # HTML output directory. Running make will produce the docset in that # directory and running "make install" will install the docset in # ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find # it at startup. # See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html # for more information. GENERATE_DOCSET = NO # When GENERATE_DOCSET tag is set to YES, this tag determines the name of the # feed. A documentation feed provides an umbrella under which multiple # documentation sets from a single provider (such as a company or product suite) # can be grouped. DOCSET_FEEDNAME = "Doxygen generated docs" # When GENERATE_DOCSET tag is set to YES, this tag specifies a string that # should uniquely identify the documentation set bundle. This should be a # reverse domain-name style string, e.g. com.mycompany.MyDocSet. Doxygen # will append .docset to the name. DOCSET_BUNDLE_ID = org.doxygen.Project # The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify # the documentation publisher. This should be a reverse domain-name style # string, e.g. com.mycompany.MyDocSet.documentation. DOCSET_PUBLISHER_ID = org.doxygen.Publisher # The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher.
DOCSET_PUBLISHER_NAME = Publisher # If the GENERATE_HTMLHELP tag is set to YES, additional index files # will be generated that can be used as input for tools like the # Microsoft HTML help workshop to generate a compiled HTML help file (.chm) # of the generated HTML documentation. GENERATE_HTMLHELP = NO # If the GENERATE_HTMLHELP tag is set to YES, the CHM_FILE tag can # be used to specify the file name of the resulting .chm file. You # can add a path in front of the file if the result should not be # written to the html output directory. CHM_FILE = # If the GENERATE_HTMLHELP tag is set to YES, the HHC_LOCATION tag can # be used to specify the location (absolute path including file name) of # the HTML help compiler (hhc.exe). If non-empty doxygen will try to run # the HTML help compiler on the generated index.hhp. HHC_LOCATION = # If the GENERATE_HTMLHELP tag is set to YES, the GENERATE_CHI flag # controls if a separate .chi index file is generated (YES) or that # it should be included in the master .chm file (NO). GENERATE_CHI = NO # If the GENERATE_HTMLHELP tag is set to YES, the CHM_INDEX_ENCODING # is used to encode HtmlHelp index (hhk), content (hhc) and project file # content. CHM_INDEX_ENCODING = # If the GENERATE_HTMLHELP tag is set to YES, the BINARY_TOC flag # controls whether a binary table of contents is generated (YES) or a # normal table of contents (NO) in the .chm file. BINARY_TOC = NO # The TOC_EXPAND flag can be set to YES to add extra items for group members # to the contents of the HTML help documentation and to the tree view. TOC_EXPAND = NO # If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and # QHP_VIRTUAL_FOLDER are set, an additional index file will be generated # that can be used as input for Qt's qhelpgenerator to generate a # Qt Compressed Help (.qch) of the generated HTML documentation. 
GENERATE_QHP = NO # If the QHG_LOCATION tag is specified, the QCH_FILE tag can # be used to specify the file name of the resulting .qch file. # The path specified is relative to the HTML output folder. QCH_FILE = # The QHP_NAMESPACE tag specifies the namespace to use when generating # Qt Help Project output. For more information please see # http://doc.trolltech.com/qthelpproject.html#namespace QHP_NAMESPACE = org.doxygen.Project # The QHP_VIRTUAL_FOLDER tag specifies the virtual folder to use when generating # Qt Help Project output. For more information please see # http://doc.trolltech.com/qthelpproject.html#virtual-folders QHP_VIRTUAL_FOLDER = doc # If QHP_CUST_FILTER_NAME is set, it specifies the name of a custom filter to # add. For more information please see # http://doc.trolltech.com/qthelpproject.html#custom-filters QHP_CUST_FILTER_NAME = # The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the # custom filter to add. For more information please see # http://doc.trolltech.com/qthelpproject.html#custom-filters # (Qt Help Project / Custom Filters). QHP_CUST_FILTER_ATTRS = # The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this # project's # filter section matches. For more information please see # http://doc.trolltech.com/qthelpproject.html#filter-attributes # (Qt Help Project / Filter Attributes). QHP_SECT_FILTER_ATTRS = # If the GENERATE_QHP tag is set to YES, the QHG_LOCATION tag can # be used to specify the location of Qt's qhelpgenerator. # If non-empty doxygen will try to run qhelpgenerator on the generated # .qhp file. QHG_LOCATION = # If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files # will be generated, which together with the HTML files, form an Eclipse help # plugin. To install this plugin and make it available under the help contents # menu in Eclipse, the contents of the directory containing the HTML and XML # files needs to be copied into the plugins directory of eclipse. The name of # the directory within the plugins directory should be the same as # the ECLIPSE_DOC_ID value. After copying Eclipse needs to be restarted before # the help appears.
GENERATE_ECLIPSEHELP = NO # A unique identifier for the eclipse help plugin. When installing the plugin # the directory name containing the HTML and XML files should also have # this name. ECLIPSE_DOC_ID = org.doxygen.Project # The DISABLE_INDEX tag can be used to turn on/off the condensed index at # top of each HTML page. The value NO (the default) enables the index and # the value YES disables it. DISABLE_INDEX = NO # This tag can be used to set the number of enum values (range [1..20]) # that doxygen will group on one line in the generated HTML documentation. ENUM_VALUES_PER_LINE = 4 # The GENERATE_TREEVIEW tag is used to specify whether a tree-like index # structure should be generated to display hierarchical information. # If the tag value is set to YES, a side panel will be generated # containing a tree-like index structure (just like the one that # is generated for HTML Help). For this to work a browser that supports # JavaScript, DHTML, CSS and frames is required (i.e. any modern browser). # Windows users are probably better off using the HTML help feature. GENERATE_TREEVIEW = NO # By enabling USE_INLINE_TREES, doxygen will generate the Groups, Directories, # and Class Hierarchy pages using a tree view instead of an ordered list. USE_INLINE_TREES = NO # If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be # used to set the initial width (in pixels) of the frame in which the tree # is shown. TREEVIEW_WIDTH = 250 # When the EXT_LINKS_IN_WINDOW option is set to YES doxygen will open # links to external symbols imported via tag files in a separate window. EXT_LINKS_IN_WINDOW = NO # Use this tag to change the font size of Latex formulas included # as images in the HTML documentation. The default is 10. Note that # when you change the font size after a successful doxygen run you need # to manually remove any form_*.png images from the HTML output directory # to force them to be regenerated. 
FORMULA_FONTSIZE = 10 # Use the FORMULA_TRANSPARENT tag to determine whether or not the images # generated for formulas are transparent PNGs. Transparent PNGs are # not supported properly for IE 6.0, but are supported on all modern browsers. # Note that when changing this option you need to delete any form_*.png files # in the HTML output before the changes have effect. FORMULA_TRANSPARENT = YES # When the SEARCHENGINE tag is enabled doxygen will generate a search box # for the HTML output. The underlying search engine uses javascript # and DHTML and should work on any modern browser. Note that when using # HTML help (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets # (GENERATE_DOCSET) there is already a search function so this one should # typically be disabled. For large projects the javascript based search engine # can be slow, then enabling SERVER_BASED_SEARCH may provide a better solution. SEARCHENGINE = NO # When the SERVER_BASED_SEARCH tag is enabled the search engine will be # implemented using a PHP enabled web server instead of at the web client # using Javascript. Doxygen will generate the search PHP script and index # file to put on the web server. The advantage of the server # based approach is that it scales better to large projects and allows # full text search. The disadvantage is that it is more difficult to set up # and does not have live searching capabilities. SERVER_BASED_SEARCH = NO #--------------------------------------------------------------------------- # configuration options related to the LaTeX output #--------------------------------------------------------------------------- # If the GENERATE_LATEX tag is set to YES (the default) Doxygen will # generate Latex output. GENERATE_LATEX = YES # The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put. # If a relative path is entered the value of OUTPUT_DIRECTORY will be # put in front of it. If left blank `latex' will be used as the default path.
LATEX_OUTPUT = latex # The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be # invoked. If left blank `latex' will be used as the default command name. # Note that when enabling USE_PDFLATEX this option is only used for # generating bitmaps for formulas in the HTML output, but not in the # Makefile that is written to the output directory. LATEX_CMD_NAME = latex # The MAKEINDEX_CMD_NAME tag can be used to specify the command name to # generate index for LaTeX. If left blank `makeindex' will be used as the # default command name. MAKEINDEX_CMD_NAME = makeindex # If the COMPACT_LATEX tag is set to YES Doxygen generates more compact # LaTeX documents. This may be useful for small projects and may help to # save some trees in general. COMPACT_LATEX = YES # The PAPER_TYPE tag can be used to set the paper type that is used # by the printer. Possible values are: a4, a4wide, letter, legal and # executive. If left blank a4wide will be used. PAPER_TYPE = a4wide # The EXTRA_PACKAGES tag can be used to specify one or more names of LaTeX # packages that should be included in the LaTeX output. EXTRA_PACKAGES = # The LATEX_HEADER tag can be used to specify a personal LaTeX header for # the generated latex document. The header should contain everything until # the first chapter. If it is left blank doxygen will generate a # standard header. Notice: only use this tag if you know what you are doing! LATEX_HEADER = # If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated # is prepared for conversion to pdf (using ps2pdf). The pdf file will # contain links (just like the HTML output) instead of page references. # This makes the output suitable for online browsing using a pdf viewer. PDF_HYPERLINKS = YES # If the USE_PDFLATEX tag is set to YES, pdflatex will be used instead of # plain latex in the generated Makefile. Set this option to YES to get a # higher quality PDF documentation.
USE_PDFLATEX = YES # If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \\batchmode. # command to the generated LaTeX files. This will instruct LaTeX to keep # running if errors occur, instead of asking the user for help. # This option is also used when generating formulas in HTML. LATEX_BATCHMODE = NO # If LATEX_HIDE_INDICES is set to YES then doxygen will not # include the index chapters (such as File Index, Compound Index, etc.) # in the output. LATEX_HIDE_INDICES = NO # If LATEX_SOURCE_CODE is set to YES then doxygen will include # source code with syntax highlighting in the LaTeX output. # Note that which sources are shown also depends on other settings # such as SOURCE_BROWSER. LATEX_SOURCE_CODE = NO #--------------------------------------------------------------------------- # configuration options related to the RTF output #--------------------------------------------------------------------------- # If the GENERATE_RTF tag is set to YES Doxygen will generate RTF output # The RTF output is optimized for Word 97 and may not look very pretty with # other RTF readers or editors. GENERATE_RTF = NO # The RTF_OUTPUT tag is used to specify where the RTF docs will be put. # If a relative path is entered the value of OUTPUT_DIRECTORY will be # put in front of it. If left blank `rtf' will be used as the default path. RTF_OUTPUT = rtf # If the COMPACT_RTF tag is set to YES Doxygen generates more compact # RTF documents. This may be useful for small projects and may help to # save some trees in general. COMPACT_RTF = NO # If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated # will contain hyperlink fields. The RTF file will # contain links (just like the HTML output) instead of page references. # This makes the output suitable for online browsing using WORD or other # programs which support those fields. # Note: wordpad (write) and others do not support links. RTF_HYPERLINKS = NO # Load stylesheet definitions from file. 
Syntax is similar to doxygen's # config file, i.e. a series of assignments. You only have to provide # replacements, missing definitions are set to their default value. RTF_STYLESHEET_FILE = # Set optional variables used in the generation of an rtf document. # Syntax is similar to doxygen's config file. RTF_EXTENSIONS_FILE = #--------------------------------------------------------------------------- # configuration options related to the man page output #--------------------------------------------------------------------------- # If the GENERATE_MAN tag is set to YES (the default) Doxygen will # generate man pages GENERATE_MAN = NO # The MAN_OUTPUT tag is used to specify where the man pages will be put. # If a relative path is entered the value of OUTPUT_DIRECTORY will be # put in front of it. If left blank `man' will be used as the default path. MAN_OUTPUT = man # The MAN_EXTENSION tag determines the extension that is added to # the generated man pages (default is the subroutine's section .3) MAN_EXTENSION = .3 # If the MAN_LINKS tag is set to YES and Doxygen generates man output, # then it will generate one additional man file for each entity # documented in the real man page(s). These additional files # only source the real man page, but without them the man command # would be unable to find the correct page. The default is NO. MAN_LINKS = NO #--------------------------------------------------------------------------- # configuration options related to the XML output #--------------------------------------------------------------------------- # If the GENERATE_XML tag is set to YES Doxygen will # generate an XML file that captures the structure of # the code including all documentation. GENERATE_XML = NO # The XML_OUTPUT tag is used to specify where the XML pages will be put. # If a relative path is entered the value of OUTPUT_DIRECTORY will be # put in front of it. If left blank `xml' will be used as the default path. 
XML_OUTPUT = xml # The XML_SCHEMA tag can be used to specify an XML schema, # which can be used by a validating XML parser to check the # syntax of the XML files. XML_SCHEMA = # The XML_DTD tag can be used to specify an XML DTD, # which can be used by a validating XML parser to check the # syntax of the XML files. XML_DTD = # If the XML_PROGRAMLISTING tag is set to YES Doxygen will # dump the program listings (including syntax highlighting # and cross-referencing information) to the XML output. Note that # enabling this will significantly increase the size of the XML output. XML_PROGRAMLISTING = YES #--------------------------------------------------------------------------- # configuration options for the AutoGen Definitions output #--------------------------------------------------------------------------- # If the GENERATE_AUTOGEN_DEF tag is set to YES Doxygen will # generate an AutoGen Definitions (see autogen.sf.net) file # that captures the structure of the code including all # documentation. Note that this feature is still experimental # and incomplete at the moment. GENERATE_AUTOGEN_DEF = NO #--------------------------------------------------------------------------- # configuration options related to the Perl module output #--------------------------------------------------------------------------- # If the GENERATE_PERLMOD tag is set to YES Doxygen will # generate a Perl module file that captures the structure of # the code including all documentation. Note that this # feature is still experimental and incomplete at the # moment. GENERATE_PERLMOD = NO # If the PERLMOD_LATEX tag is set to YES Doxygen will generate # the necessary Makefile rules, Perl scripts and LaTeX code to be able # to generate PDF and DVI output from the Perl module output. PERLMOD_LATEX = NO # If the PERLMOD_PRETTY tag is set to YES the Perl module output will be # nicely formatted so it can be parsed by a human reader. # This is useful # if you want to understand what is going on. 
# On the other hand, if this # tag is set to NO the size of the Perl module output will be much smaller # and Perl will parse it just the same. PERLMOD_PRETTY = YES # The names of the make variables in the generated doxyrules.make file # are prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX. # This is useful so different doxyrules.make files included by the same # Makefile don't overwrite each other's variables. PERLMOD_MAKEVAR_PREFIX = #--------------------------------------------------------------------------- # Configuration options related to the preprocessor #--------------------------------------------------------------------------- # If the ENABLE_PREPROCESSING tag is set to YES (the default) Doxygen will # evaluate all C-preprocessor directives found in the sources and include # files. ENABLE_PREPROCESSING = YES # If the MACRO_EXPANSION tag is set to YES Doxygen will expand all macro # names in the source code. If set to NO (the default) only conditional # compilation will be performed. Macro expansion can be done in a controlled # way by setting EXPAND_ONLY_PREDEF to YES. MACRO_EXPANSION = NO # If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES # then the macro expansion is limited to the macros specified with the # PREDEFINED and EXPAND_AS_DEFINED tags. EXPAND_ONLY_PREDEF = NO # If the SEARCH_INCLUDES tag is set to YES (the default) the include files # in the INCLUDE_PATH (see below) will be searched if a #include is found. SEARCH_INCLUDES = YES # The INCLUDE_PATH tag can be used to specify one or more directories that # contain include files that are not input files but should be processed by # the preprocessor. INCLUDE_PATH = # You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard # patterns (like *.h and *.hpp) to filter out the header-files in the # directories. If left blank, the patterns specified with FILE_PATTERNS will # be used.
INCLUDE_FILE_PATTERNS = # The PREDEFINED tag can be used to specify one or more macro names that # are defined before the preprocessor is started (similar to the -D option of # gcc). The argument of the tag is a list of macros of the form: name # or name=definition (no spaces). If the definition and the = are # omitted =1 is assumed. To prevent a macro definition from being # undefined via #undef or recursively expanded use the := operator # instead of the = operator. PREDEFINED = M4RI_DOXYGEN # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then # this tag can be used to specify a list of macro names that should be expanded. # The macro definition that is found in the sources will be used. # Use the PREDEFINED tag if you want to use a different macro definition. EXPAND_AS_DEFINED = # If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then # doxygen's preprocessor will remove all function-like macros that are alone # on a line, have an all uppercase name, and do not end with a semicolon. Such # function macros are typically used for boiler-plate code, and will confuse # the parser if not removed. SKIP_FUNCTION_MACROS = YES #--------------------------------------------------------------------------- # Configuration::additions related to external references #--------------------------------------------------------------------------- # The TAGFILES option can be used to specify one or more tagfiles. # Optionally an initial location of the external documentation # can be added for each tagfile. The format of a tag file without # this location is as follows: # # TAGFILES = file1 file2 ... # Adding location for the tag files is done as follows: # # TAGFILES = file1=loc1 "file2 = loc2" ... # where "loc1" and "loc2" can be relative or absolute paths or # URLs. If a location is present for each tag, the installdox tool # does not have to be run to correct the links. 
# Note that each tag file must have a unique name # (where the name does NOT include the path) # If a tag file is not located in the directory in which doxygen # is run, you must also specify the path to the tagfile here. TAGFILES = # When a file name is specified after GENERATE_TAGFILE, doxygen will create # a tag file that is based on the input files it reads. GENERATE_TAGFILE = # If the ALLEXTERNALS tag is set to YES all external classes will be listed # in the class index. If set to NO only the inherited external classes # will be listed. ALLEXTERNALS = NO # If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed # in the modules index. If set to NO, only the current project's groups will # be listed. EXTERNAL_GROUPS = YES # The PERL_PATH should be the absolute path and name of the perl script # interpreter (i.e. the result of `which perl'). PERL_PATH = /usr/bin/perl #--------------------------------------------------------------------------- # Configuration options related to the dot tool #--------------------------------------------------------------------------- # If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will # generate a inheritance diagram (in HTML, RTF and LaTeX) for classes with base # or super classes. Setting the tag to NO turns the diagrams off. Note that # this option is superseded by the HAVE_DOT option below. This is only a # fallback. It is recommended to install and use dot, since it yields more # powerful graphs. CLASS_DIAGRAMS = YES # You can define message sequence charts within doxygen comments using the \msc # command. Doxygen will then run the mscgen tool (see # http://www.mcternan.me.uk/mscgen/) to produce the chart and insert it in the # documentation. The MSCGEN_PATH tag allows you to specify the directory where # the mscgen tool resides. If left empty the tool is assumed to be found in the # default search path. 
MSCGEN_PATH = # If set to YES, the inheritance and collaboration graphs will hide # inheritance and usage relations if the target is undocumented # or is not a class. HIDE_UNDOC_RELATIONS = YES # If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is # available from the path. This tool is part of Graphviz, a graph visualization # toolkit from AT&T and Lucent Bell Labs. The other options in this section # have no effect if this option is set to NO (the default) HAVE_DOT = NO # The DOT_NUM_THREADS specifies the number of dot invocations doxygen is # allowed to run in parallel. When set to 0 (the default) doxygen will # base this on the number of processors available in the system. You can set it # explicitly to a value larger than 0 to get control over the balance # between CPU load and processing speed. DOT_NUM_THREADS = 0 # By default doxygen will write a font called FreeSans.ttf to the output # directory and reference it in all dot files that doxygen generates. This # font does not include all possible unicode characters however, so when you need # these (or just want a differently looking font) you can specify the font name # using DOT_FONTNAME. You need to make sure dot is able to find the font, # which can be done by putting it in a standard location or by setting the # DOTFONTPATH environment variable or by setting DOT_FONTPATH to the directory # containing the font. DOT_FONTNAME = FreeSans.ttf # The DOT_FONTSIZE tag can be used to set the size of the font of dot graphs. # The default size is 10pt. DOT_FONTSIZE = 10 # By default doxygen will tell dot to use the output directory to look for the # FreeSans.ttf font (which doxygen will put there itself). If you specify a # different font using DOT_FONTNAME you can set the path where dot # can find it using this tag.
DOT_FONTPATH = # If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen # will generate a graph for each documented class showing the direct and # indirect inheritance relations. Setting this tag to YES will force the # CLASS_DIAGRAMS tag to NO. CLASS_GRAPH = YES # If the COLLABORATION_GRAPH and HAVE_DOT tags are set to YES then doxygen # will generate a graph for each documented class showing the direct and # indirect implementation dependencies (inheritance, containment, and # class references variables) of the class with other documented classes. COLLABORATION_GRAPH = YES # If the GROUP_GRAPHS and HAVE_DOT tags are set to YES then doxygen # will generate a graph for groups, showing the direct groups dependencies GROUP_GRAPHS = YES # If the UML_LOOK tag is set to YES doxygen will generate inheritance and # collaboration diagrams in a style similar to the OMG's Unified Modeling # Language. UML_LOOK = NO # If set to YES, the inheritance and collaboration graphs will show the # relations between templates and their instances. TEMPLATE_RELATIONS = NO # If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDE_GRAPH, and HAVE_DOT # tags are set to YES then doxygen will generate a graph for each documented # file showing the direct and indirect include dependencies of the file with # other documented files. INCLUDE_GRAPH = YES # If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDED_BY_GRAPH, and # HAVE_DOT tags are set to YES then doxygen will generate a graph for each # documented header file showing the documented files that directly or # indirectly include this file. INCLUDED_BY_GRAPH = YES # If the CALL_GRAPH and HAVE_DOT options are set to YES then # doxygen will generate a call dependency graph for every global function # or class method. Note that enabling this option will significantly increase # the time of a run. So in most cases it will be better to enable call graphs # for selected functions only using the \callgraph command.
CALL_GRAPH = NO # If the CALLER_GRAPH and HAVE_DOT tags are set to YES then # doxygen will generate a caller dependency graph for every global function # or class method. Note that enabling this option will significantly increase # the time of a run. So in most cases it will be better to enable caller # graphs for selected functions only using the \callergraph command. CALLER_GRAPH = NO # If the GRAPHICAL_HIERARCHY and HAVE_DOT tags are set to YES then doxygen # will generate a graphical hierarchy of all classes instead of a textual one. GRAPHICAL_HIERARCHY = YES # If the DIRECTORY_GRAPH, SHOW_DIRECTORIES and HAVE_DOT tags are set to YES # then doxygen will show the dependencies a directory has on other directories # in a graphical way. The dependency relations are determined by the #include # relations between the files in the directories. DIRECTORY_GRAPH = YES # The DOT_IMAGE_FORMAT tag can be used to set the image format of the images # generated by dot. Possible values are png, jpg, or gif. # If left blank png will be used. DOT_IMAGE_FORMAT = png # The tag DOT_PATH can be used to specify the path where the dot tool can be # found. If left blank, it is assumed the dot tool can be found in the path. DOT_PATH = # The DOTFILE_DIRS tag can be used to specify one or more directories that # contain dot files that are included in the documentation (see the # \dotfile command). DOTFILE_DIRS = # The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of # nodes that will be shown in the graph. If the number of nodes in a graph # becomes larger than this value, doxygen will truncate the graph, which is # visualized by representing a node as a red box. Note that if the # number of direct children of the root node in a graph is already larger than # DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note # that the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH.
DOT_GRAPH_MAX_NODES = 50

# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the
# graphs generated by dot. A depth value of 3 means that only nodes reachable
# from the root by following a path via at most 3 edges will be shown. Nodes
# that lie further from the root node will be omitted. Note that setting this
# option to 1 or 2 may greatly reduce the computation time needed for large
# code bases. Also note that the size of a graph can be further restricted by
# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction.

MAX_DOT_GRAPH_DEPTH = 0

# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent
# background. This is disabled by default, because dot on Windows does not
# seem to support this out of the box. Warning: Depending on the platform used,
# enabling this option may lead to badly anti-aliased labels on the edges of
# a graph (i.e. they become hard to read).

DOT_TRANSPARENT = YES

# Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output
# files in one run (i.e. multiple -o and -T options on the command line). This
# makes dot run faster, but since only newer versions of dot (>1.8.10)
# support this, this feature is disabled by default.

DOT_MULTI_TARGETS = NO

# If the GENERATE_LEGEND tag is set to YES (the default) Doxygen will
# generate a legend page explaining the meaning of the various boxes and
# arrows in the dot generated graphs.

GENERATE_LEGEND = YES

# If the DOT_CLEANUP tag is set to YES (the default) Doxygen will
# remove the intermediate dot files that are used to generate
# the various graphs.

DOT_CLEANUP = YES USE_MATHJAX = YESlibm4ri-20130416/src/brilliantrussian.c000066400000000000000000001164461212302366200176470ustar00rootroot00000000000000/******************************************************************* * * M4RI: Linear Algebra over GF(2) * * Copyright (C) 2007, 2008 Gregory Bard * Copyright (C) 2008-2010 Martin Albrecht * * Distributed under the terms of the GNU General Public License (GPL) * version 2 or higher. * * This code is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * The full text of the GPL is available at: * * http://www.gnu.org/licenses/ * ********************************************************************/ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include "brilliantrussian.h" #include "xor.h" #include "graycode.h" #include "echelonform.h" #include "ple_russian.h" /** * \brief Perform Gaussian reduction to reduced row echelon form on a * submatrix. * * The submatrix has dimension at most k starting at r x c of A. Checks * for pivot rows up to row endrow (exclusive). Terminates as soon as * finding a pivot column fails. * * \param A Matrix. * \param r First row. * \param c First column. * \param k Maximal dimension of identity matrix to produce. * \param end_row Maximal row index (exclusive) for rows to consider * for inclusion. */ static inline int _mzd_gauss_submatrix_full(mzd_t *A, rci_t r, rci_t c, rci_t end_row, int k) { assert(k <= m4ri_radix); rci_t start_row = r; rci_t j; for (j = c; j < c + k; ++j) { int found = 0; for (rci_t i = start_row; i < end_row; ++i) { /* first we need to clear the first columns */ word const tmp = mzd_read_bits(A, i, c, j - c + 1); if(tmp) { for (int l = 0; l < j - c; ++l) if (__M4RI_GET_BIT(tmp, l)) mzd_row_add_offset(A, i, r+l, c+l); /* pivot? 
*/ if (mzd_read_bit(A, i, j)) { mzd_row_swap(A, i, start_row); /* clear above */ for (rci_t l = r; l < start_row; ++l) { if (mzd_read_bit(A, l, j)) { mzd_row_add_offset(A, l, start_row, j); } } ++start_row; found = 1; break; } } } if (found == 0) { break; } } __M4RI_DD_MZD(A); __M4RI_DD_INT(j - c); return j - c; } /** * \brief Perform Gaussian reduction to upper triangular form on a * submatrix. * * The submatrix spans at most k columns starting at r x c of A. Checks * for pivot rows up to row end_row (exclusive). Terminates as soon as * finding a pivot for a column fails. * * \param A Matrix. * \param r First row. * \param c First column. * \param end_row Maximal row index (exclusive) for rows to consider * for inclusion. * \param k Maximal number of columns to reduce. * * \return Number of consecutive columns, starting at c, for which a * pivot row was found. */ static inline int _mzd_gauss_submatrix(mzd_t *A, rci_t r, rci_t c, rci_t end_row, int k) { rci_t start_row = r; int found; rci_t j; for (j = c; j < c+k; ++j) { found = 0; for (rci_t i = start_row; i < end_row; ++i) { /* first we need to clear the first columns */ for (int l = 0; l < j - c; ++l) if (mzd_read_bit(A, i, c+l)) mzd_row_add_offset(A, i, r+l, c+l); /* pivot? */ if (mzd_read_bit(A, i, j)) { mzd_row_swap(A, i, start_row); start_row++; found = 1; break; } } if (found == 0) { break; } } __M4RI_DD_MZD(A); __M4RI_DD_INT(j - c); return j - c; } /** * \brief Given a submatrix in upper triangular form compute the * reduced row echelon form. * * The submatrix has dimension k starting at r x c of A. For every * column c + t (0 <= t < k) the bit in each of the rows r ... r + t - 1 * is cleared by adding row r + t to that row. No pivot search is * performed and rows outside r ... r + k - 1 are not touched. * * NOTE(review): presumably the k x k block is expected to have a unit * diagonal (as left behind by _mzd_gauss_submatrix); this is not checked. * * \param A Matrix. * \param r First row. * \param c First column. * \param k Dimension of the submatrix to reduce. * * \return k. 
*/ static inline int _mzd_gauss_submatrix_top(mzd_t *A, rci_t r, rci_t c, int k) { rci_t start_row = r; /* row r + (j - c) acts as pivot row for column j */ for (rci_t j = c; j < c + k; ++j) { /* clear column j in all rows above the current pivot row */ for (rci_t l = r; l < start_row; ++l) { if (mzd_read_bit(A, l, j)) { mzd_row_add_offset(A, l, start_row, j); } } ++start_row; } __M4RI_DD_MZD(A); __M4RI_DD_INT(k); return k; } /** * \brief Copy the first k rows of U over rows r ... r + k - 1 of A. * * Only the words from index c / m4ri_radix up to the end of the row * (A->width words in total) are overwritten; words to the left of * that index keep their value in A. * * \param A Destination matrix. * \param U Source matrix; rows 0 ... k - 1 are read. * \param r First destination row in A. * \param c Column from which on the rows are copied (rounded down to a word boundary). * \param k Number of rows to copy. */ static inline void _mzd_copy_back_rows(mzd_t *A, mzd_t const *U, rci_t r, rci_t c, int k) { wi_t const startblock = c / m4ri_radix; wi_t const width = A->width - startblock; for (int i = 0; i < k; ++i) { word const *const src = U->rows[i] + startblock; word *const dst = A->rows[r+i] + startblock; for (wi_t j = 0; j < width; ++j) { dst[j] = src[j]; } } __M4RI_DD_MZD(A); } void mzd_make_table(mzd_t const *M, rci_t r, rci_t c, int k, mzd_t *T, rci_t *L) { wi_t const homeblock = (c + M->offset) / m4ri_radix; word const mask_end = __M4RI_LEFT_BITMASK((M->ncols + M->offset) % m4ri_radix); word const pure_mask_begin = __M4RI_RIGHT_BITMASK(m4ri_radix - ((c + M->offset) % m4ri_radix)); word const mask_begin = (M->width - homeblock != 1) ? 
pure_mask_begin : pure_mask_begin & mask_end; wi_t const wide = M->width - homeblock; int const twokay = __M4RI_TWOPOW(k); L[0] = 0; for (rci_t i = 1; i < twokay; ++i) { word *ti = T->rows[i] + homeblock; word *ti1 = T->rows[i-1] + homeblock; rci_t const rowneeded = r + m4ri_codebook[k]->inc[i - 1]; int const id = m4ri_codebook[k]->ord[i]; L[id] = i; if (rowneeded >= M->nrows) continue; word *m = M->rows[rowneeded] + homeblock; *ti++ = (*m++ ^ *ti1++) & mask_begin; wi_t j; for(j = 1; j + 8 <= wide - 1; j += 8) { *ti++ = *m++ ^ *ti1++; *ti++ = *m++ ^ *ti1++; *ti++ = *m++ ^ *ti1++; *ti++ = *m++ ^ *ti1++; *ti++ = *m++ ^ *ti1++; *ti++ = *m++ ^ *ti1++; *ti++ = *m++ ^ *ti1++; *ti++ = *m++ ^ *ti1++; } switch(wide - j) { case 8: *ti++ = *m++ ^ *ti1++; case 7: *ti++ = *m++ ^ *ti1++; case 6: *ti++ = *m++ ^ *ti1++; case 5: *ti++ = *m++ ^ *ti1++; case 4: *ti++ = *m++ ^ *ti1++; case 3: *ti++ = *m++ ^ *ti1++; case 2: *ti++ = *m++ ^ *ti1++; case 1: *ti++ = (*m++ ^ *ti1++) & mask_end; } } __M4RI_DD_MZD(T); __M4RI_DD_RCI_ARRAY(L, twokay); } void mzd_process_rows(mzd_t *M, rci_t startrow, rci_t stoprow, rci_t startcol, int k, mzd_t const *T, rci_t const *L) { wi_t const block = startcol / m4ri_radix; wi_t const wide = M->width - block; wi_t const count = (wide + 7) / 8; /* Unrolled loop count */ int const entry_point = wide % 8; /* Unrolled loop entry point */ if(k == 1) { word const bm = m4ri_one << (startcol % m4ri_radix); rci_t r; for (r = startrow; r + 2 <= stoprow; r += 2) { word const b0 = M->rows[r+0][block] & bm; word const b1 = M->rows[r+1][block] & bm; word *m0 = M->rows[r+0] + block; word *m1 = M->rows[r+1] + block; word *t = T->rows[1] + block; wi_t n = count; if((b0 & b1)) { switch (entry_point) { case 0: do { *m0++ ^= *t; *m1++ ^= *t++; case 7: *m0++ ^= *t; *m1++ ^= *t++; case 6: *m0++ ^= *t; *m1++ ^= *t++; case 5: *m0++ ^= *t; *m1++ ^= *t++; case 4: *m0++ ^= *t; *m1++ ^= *t++; case 3: *m0++ ^= *t; *m1++ ^= *t++; case 2: *m0++ ^= *t; *m1++ ^= *t++; case 1: *m0++ ^= *t; 
*m1++ ^= *t++; } while (--n > 0); } } else if(b0) { switch (entry_point) { case 0: do { *m0++ ^= *t++; case 7: *m0++ ^= *t++; case 6: *m0++ ^= *t++; case 5: *m0++ ^= *t++; case 4: *m0++ ^= *t++; case 3: *m0++ ^= *t++; case 2: *m0++ ^= *t++; case 1: *m0++ ^= *t++; } while (--n > 0); } } else if(b1) { switch (entry_point) { case 0: do { *m1++ ^= *t++; case 7: *m1++ ^= *t++; case 6: *m1++ ^= *t++; case 5: *m1++ ^= *t++; case 4: *m1++ ^= *t++; case 3: *m1++ ^= *t++; case 2: *m1++ ^= *t++; case 1: *m1++ ^= *t++; } while (--n > 0); } } } /* TODO: this code is a bit silly/overkill, it just takes care of the last row */ for( ; r < stoprow; ++r) { rci_t const x0 = L[ mzd_read_bits_int(M, r, startcol, k) ]; word *m0 = M->rows[r] + block; word *t0 = T->rows[x0] + block; wi_t n = count; switch (entry_point) { case 0: do { *m0++ ^= *t0++; case 7: *m0++ ^= *t0++; case 6: *m0++ ^= *t0++; case 5: *m0++ ^= *t0++; case 4: *m0++ ^= *t0++; case 3: *m0++ ^= *t0++; case 2: *m0++ ^= *t0++; case 1: *m0++ ^= *t0++; } while (--n > 0); } } __M4RI_DD_MZD(M); return; } rci_t r; for (r = startrow; r + 2 <= stoprow; r += 2) { rci_t const x0 = L[ mzd_read_bits_int(M, r+0, startcol, k) ]; rci_t const x1 = L[ mzd_read_bits_int(M, r+1, startcol, k) ]; word *m0 = M->rows[r+0] + block; word *t0 = T->rows[x0] + block; word *m1 = M->rows[r+1] + block; word *t1 = T->rows[x1] + block; wi_t n = count; switch (entry_point) { case 0: do { *m0++ ^= *t0++; *m1++ ^= *t1++; case 7: *m0++ ^= *t0++; *m1++ ^= *t1++; case 6: *m0++ ^= *t0++; *m1++ ^= *t1++; case 5: *m0++ ^= *t0++; *m1++ ^= *t1++; case 4: *m0++ ^= *t0++; *m1++ ^= *t1++; case 3: *m0++ ^= *t0++; *m1++ ^= *t1++; case 2: *m0++ ^= *t0++; *m1++ ^= *t1++; case 1: *m0++ ^= *t0++; *m1++ ^= *t1++; } while (--n > 0); } } for( ; r < stoprow; ++r) { rci_t const x0 = L[ mzd_read_bits_int(M, r, startcol, k) ]; word *m0 = M->rows[r] + block; word *t0 = T->rows[x0] + block; wi_t n = count; switch (entry_point) { case 0: do { *m0++ ^= *t0++; case 7: *m0++ ^= *t0++; 
case 6: *m0++ ^= *t0++; case 5: *m0++ ^= *t0++; case 4: *m0++ ^= *t0++; case 3: *m0++ ^= *t0++; case 2: *m0++ ^= *t0++; case 1: *m0++ ^= *t0++; } while (--n > 0); } } __M4RI_DD_MZD(M); } /** * \brief Add rows of the two lookup tables T0 and T1 to the rows * startrow ... stoprow - 1 of M. * * For every row the k bits starting at column startcol are read and * split into a low part of ka = k / 2 bits and a high part of * kb = k - k / 2 bits. L0 maps the low part and L1 the high part to a * row index of T0 and T1 respectively; the XOR of both table rows is * added to the row of M, word by word, from word index * startcol / m4ri_radix up to the last word of the row. Rows for which * both indices are 0 are skipped. * * The switch jumps into the body of the do/while loop (Duff's device): * entry_point = wide % 8 selects the entry and count = (wide + 7) / 8 * the number of passes, so that exactly `wide` words are processed by * the 8-fold unrolled loop. * * \param M Matrix whose rows are updated in place. * \param startrow First row to process. * \param stoprow Last row to process (exclusive). * \param startcol First of the k columns that select the table rows. * \param k Number of selector bits, at most m4ri_radix. * \param T0 Table addressed via the low ka bits. * \param L0 Index array for T0. * \param T1 Table addressed via the high kb bits. * \param L1 Index array for T1. */ void mzd_process_rows2(mzd_t *M, rci_t startrow, rci_t stoprow, rci_t startcol, int k, mzd_t const *T0, rci_t const *L0, mzd_t const *T1, rci_t const *L1) { assert(k <= m4ri_radix); wi_t const blocknum = startcol / m4ri_radix; wi_t const wide = M->width - blocknum; wi_t const count = (wide + 7) / 8; /* Unrolled loop count */ int const entry_point = wide % 8; /* Unrolled loop entry point */ int const ka = k / 2; int const kb = k - k / 2; rci_t r; word const ka_bm = __M4RI_LEFT_BITMASK(ka); word const kb_bm = __M4RI_LEFT_BITMASK(kb); #if __M4RI_HAVE_OPENMP #pragma omp parallel for private(r) shared(startrow, stoprow) schedule(static,512) // MAX((__M4RI_CPU_L1_CACHE >> 3) / wide, #endif for(r = startrow; r < stoprow; ++r) { word bits = mzd_read_bits(M, r, startcol, k); rci_t const x0 = L0[ bits & ka_bm ]; bits>>=ka; rci_t const x1 = L1[ bits & kb_bm ]; if((x0 | x1) == 0) // x0 == 0 && x1 == 0 continue; word *m0 = M->rows[r] + blocknum; word const *t0 = T0->rows[x0] + blocknum; word const *t1 = T1->rows[x1] + blocknum; wi_t n = count; switch (entry_point) { case 0: do { *m0++ ^= *t0++ ^ *t1++; case 7: *m0++ ^= *t0++ ^ *t1++; case 6: *m0++ ^= *t0++ ^ *t1++; case 5: *m0++ ^= *t0++ ^ *t1++; case 4: *m0++ ^= *t0++ ^ *t1++; case 3: *m0++ ^= *t0++ ^ *t1++; case 2: *m0++ ^= *t0++ ^ *t1++; case 1: *m0++ ^= *t0++ ^ *t1++; } while (--n > 0); } } __M4RI_DD_MZD(M); } void mzd_process_rows3(mzd_t *M, rci_t startrow, rci_t stoprow, rci_t startcol, int k, mzd_t const *T0, rci_t const *L0, mzd_t const *T1, rci_t const *L1, mzd_t const *T2, rci_t const *L2) { assert(k <= m4ri_radix); wi_t const blocknum = startcol / m4ri_radix; wi_t const wide = M->width - blocknum; wi_t const count = (wide + 7) / 8; /* Unrolled loop count */ int const entry_point = wide % 8; /* Unrolled loop entry point */ int rem = k 
% 3; int const ka = k / 3 + ((rem >= 2) ? 1 : 0); int const kb = k / 3 + ((rem >= 1) ? 1 : 0); int const kc = k / 3; rci_t r; word const ka_bm = __M4RI_LEFT_BITMASK(ka); word const kb_bm = __M4RI_LEFT_BITMASK(kb); word const kc_bm = __M4RI_LEFT_BITMASK(kc); #if __M4RI_HAVE_OPENMP #pragma omp parallel for private(r) shared(startrow, stoprow) schedule(static,512) //if(stoprow-startrow > 128) #endif for(r= startrow; r < stoprow; ++r) { word bits = mzd_read_bits(M, r, startcol, k); rci_t const x0 = L0[ bits & ka_bm ]; bits>>=ka; rci_t const x1 = L1[ bits & kb_bm ]; bits>>=kb; rci_t const x2 = L2[ bits & kc_bm ]; if((x0 | x1 | x2) == 0) // x0 == 0 && x1 == 0 && x2 == 0 continue; word *m0 = M->rows[r] + blocknum; word const *t0 = T0->rows[x0] + blocknum; word const *t1 = T1->rows[x1] + blocknum; word const *t2 = T2->rows[x2] + blocknum; wi_t n = count; switch (entry_point) { case 0: do { *m0++ ^= *t0++ ^ *t1++ ^ *t2++; case 7: *m0++ ^= *t0++ ^ *t1++ ^ *t2++; case 6: *m0++ ^= *t0++ ^ *t1++ ^ *t2++; case 5: *m0++ ^= *t0++ ^ *t1++ ^ *t2++; case 4: *m0++ ^= *t0++ ^ *t1++ ^ *t2++; case 3: *m0++ ^= *t0++ ^ *t1++ ^ *t2++; case 2: *m0++ ^= *t0++ ^ *t1++ ^ *t2++; case 1: *m0++ ^= *t0++ ^ *t1++ ^ *t2++; } while (--n > 0); } } __M4RI_DD_MZD(M); } void mzd_process_rows4(mzd_t *M, rci_t startrow, rci_t stoprow, rci_t startcol, int k, mzd_t const *T0, rci_t const *L0, mzd_t const *T1, rci_t const *L1, mzd_t const *T2, rci_t const *L2, mzd_t const *T3, rci_t const *L3) { assert(k <= m4ri_radix); wi_t const blocknum = startcol / m4ri_radix; wi_t const wide = M->width - blocknum; wi_t const count = (wide + 7) / 8; /* Unrolled loop count */ int const entry_point = wide % 8; /* Unrolled loop entry point */ int const rem = k % 4; int const ka = k / 4 + ((rem >= 3) ? 1 : 0); int const kb = k / 4 + ((rem >= 2) ? 1 : 0); int const kc = k / 4 + ((rem >= 1) ? 
1 : 0); int const kd = k / 4; rci_t r; word const ka_bm = __M4RI_LEFT_BITMASK(ka); word const kb_bm = __M4RI_LEFT_BITMASK(kb); word const kc_bm = __M4RI_LEFT_BITMASK(kc); word const kd_bm = __M4RI_LEFT_BITMASK(kd); #if __M4RI_HAVE_OPENMP #pragma omp parallel for private(r) shared(startrow, stoprow) schedule(static,512) //if(stoprow-startrow > 128) #endif for(r = startrow; r < stoprow; ++r) { word bits = mzd_read_bits(M, r, startcol, k); rci_t const x0 = L0[ bits & ka_bm ]; bits>>=ka; rci_t const x1 = L1[ bits & kb_bm ]; bits>>=kb; rci_t const x2 = L2[ bits & kc_bm ]; bits>>=kc; rci_t const x3 = L3[ bits & kd_bm ]; if(((x0 | x1) | (x2 | x3)) == 0) // x0 == 0 && x1 == 0 && x2 == 0 && x3 == 0 continue; word *m0 = M->rows[r] + blocknum; word const *t0 = T0->rows[x0] + blocknum; word const *t1 = T1->rows[x1] + blocknum; word const *t2 = T2->rows[x2] + blocknum; word const *t3 = T3->rows[x3] + blocknum; wi_t n = count; switch (entry_point) { case 0: do { *m0++ ^= *t0++ ^ *t1++ ^ *t2++ ^ *t3++; case 7: *m0++ ^= *t0++ ^ *t1++ ^ *t2++ ^ *t3++; case 6: *m0++ ^= *t0++ ^ *t1++ ^ *t2++ ^ *t3++; case 5: *m0++ ^= *t0++ ^ *t1++ ^ *t2++ ^ *t3++; case 4: *m0++ ^= *t0++ ^ *t1++ ^ *t2++ ^ *t3++; case 3: *m0++ ^= *t0++ ^ *t1++ ^ *t2++ ^ *t3++; case 2: *m0++ ^= *t0++ ^ *t1++ ^ *t2++ ^ *t3++; case 1: *m0++ ^= *t0++ ^ *t1++ ^ *t2++ ^ *t3++; } while (--n > 0); } } __M4RI_DD_MZD(M); } void mzd_process_rows5(mzd_t *M, rci_t startrow, rci_t stoprow, rci_t startcol, int k, mzd_t const *T0, rci_t const *L0, mzd_t const *T1, rci_t const *L1, mzd_t const *T2, rci_t const *L2, mzd_t const *T3, rci_t const *L3, mzd_t const *T4, rci_t const *L4) { assert(k <= m4ri_radix); wi_t const blocknum = startcol / m4ri_radix; wi_t const wide = M->width - blocknum; wi_t const count = (wide + 7) / 8; /* Unrolled loop count */ int const entry_point = wide % 8; /* Unrolled loop entry point */ int rem = k % 5; int const ka = k / 5 + ((rem >= 4) ? 1 : 0); int const kb = k / 5 + ((rem >= 3) ? 
1 : 0); int const kc = k / 5 + ((rem >= 2) ? 1 : 0); int const kd = k / 5 + ((rem >= 1) ? 1 : 0); int const ke = k / 5; rci_t r; word const ka_bm = __M4RI_LEFT_BITMASK(ka); word const kb_bm = __M4RI_LEFT_BITMASK(kb); word const kc_bm = __M4RI_LEFT_BITMASK(kc); word const kd_bm = __M4RI_LEFT_BITMASK(kd); word const ke_bm = __M4RI_LEFT_BITMASK(ke); #if __M4RI_HAVE_OPENMP #pragma omp parallel for private(r) shared(startrow, stoprow) schedule(static,512) //if(stoprow-startrow > 128) #endif for(r = startrow; r < stoprow; ++r) { word bits = mzd_read_bits(M, r, startcol, k); rci_t const x0 = L0[ bits & ka_bm ]; bits>>=ka; rci_t const x1 = L1[ bits & kb_bm ]; bits>>=kb; rci_t const x2 = L2[ bits & kc_bm ]; bits>>=kc; rci_t const x3 = L3[ bits & kd_bm ]; bits>>=kd; rci_t const x4 = L4[ bits & ke_bm ]; if(((x0 | x1 | x2) | (x3 | x4)) == 0) // x0 == 0 && x1 == 0 && x2 == 0 && x3 == 0 && x4 == 0 continue; word *m0 = M->rows[r] + blocknum; word const *t0 = T0->rows[x0] + blocknum; word const *t1 = T1->rows[x1] + blocknum; word const *t2 = T2->rows[x2] + blocknum; word const *t3 = T3->rows[x3] + blocknum; word const *t4 = T4->rows[x4] + blocknum; wi_t n = count; switch (entry_point) { case 0: do { *m0++ ^= *t0++ ^ *t1++ ^ *t2++ ^ *t3++ ^ *t4++; case 7: *m0++ ^= *t0++ ^ *t1++ ^ *t2++ ^ *t3++ ^ *t4++; case 6: *m0++ ^= *t0++ ^ *t1++ ^ *t2++ ^ *t3++ ^ *t4++; case 5: *m0++ ^= *t0++ ^ *t1++ ^ *t2++ ^ *t3++ ^ *t4++; case 4: *m0++ ^= *t0++ ^ *t1++ ^ *t2++ ^ *t3++ ^ *t4++; case 3: *m0++ ^= *t0++ ^ *t1++ ^ *t2++ ^ *t3++ ^ *t4++; case 2: *m0++ ^= *t0++ ^ *t1++ ^ *t2++ ^ *t3++ ^ *t4++; case 1: *m0++ ^= *t0++ ^ *t1++ ^ *t2++ ^ *t3++ ^ *t4++; } while (--n > 0); } } __M4RI_DD_MZD(M); } void mzd_process_rows6(mzd_t *M, rci_t startrow, rci_t stoprow, rci_t startcol, int k, mzd_t const *T0, rci_t const *L0, mzd_t const *T1, rci_t const *L1, mzd_t const *T2, rci_t const *L2, mzd_t const *T3, rci_t const *L3, mzd_t const *T4, rci_t const *L4, mzd_t const *T5, rci_t const *L5) { assert(k <= 
m4ri_radix); wi_t const blocknum = startcol / m4ri_radix; wi_t const wide = M->width - blocknum; wi_t const count = (wide + 7) / 8; /* Unrolled loop count */ int const entry_point = wide % 8; /* Unrolled loop entry point */ int const rem = k % 6; int const ka = k / 6 + ((rem >= 5) ? 1 : 0); int const kb = k / 6 + ((rem >= 4) ? 1 : 0); int const kc = k / 6 + ((rem >= 3) ? 1 : 0); int const kd = k / 6 + ((rem >= 2) ? 1 : 0); int const ke = k / 6 + ((rem >= 1) ? 1 : 0);; int const kf = k / 6; rci_t r; word const ka_bm = __M4RI_LEFT_BITMASK(ka); word const kb_bm = __M4RI_LEFT_BITMASK(kb); word const kc_bm = __M4RI_LEFT_BITMASK(kc); word const kd_bm = __M4RI_LEFT_BITMASK(kd); word const ke_bm = __M4RI_LEFT_BITMASK(ke); word const kf_bm = __M4RI_LEFT_BITMASK(kf); #if __M4RI_HAVE_OPENMP #pragma omp parallel for private(r) shared(startrow, stoprow) schedule(static,512) //if(stoprow-startrow > 128) #endif for(r = startrow; r < stoprow; ++r) { word bits = mzd_read_bits(M, r, startcol, k); rci_t const x0 = L0[ bits & ka_bm ]; bits>>=ka; rci_t const x1 = L1[ bits & kb_bm ]; bits>>=kb; rci_t const x2 = L2[ bits & kc_bm ]; bits>>=kc; rci_t const x3 = L3[ bits & kd_bm ]; bits>>=kd; rci_t const x4 = L4[ bits & ke_bm ]; bits>>=ke; rci_t const x5 = L5[ bits & kf_bm ]; /* Waste three clocks on OR-ing (modern CPU can do three in * parallel) to avoid possible multiple conditional jumps. 
*/ if(((x0 | x1) | (x2 | x3) | (x4 | x5)) == 0) // x0 == 0 && x1 == 0 && x2 == 0 && x3 == 0 && x4 == 0 && x5 == 0 continue; word *m0 = M->rows[r] + blocknum; word const *t0 = T0->rows[x0] + blocknum; word const *t1 = T1->rows[x1] + blocknum; word const *t2 = T2->rows[x2] + blocknum; word const *t3 = T3->rows[x3] + blocknum; word const *t4 = T4->rows[x4] + blocknum; word const *t5 = T5->rows[x5] + blocknum; wi_t n = count; switch (entry_point) { case 0: do { *m0++ ^= *t0++ ^ *t1++ ^ *t2++ ^ *t3++ ^ *t4++ ^ *t5++; case 7: *m0++ ^= *t0++ ^ *t1++ ^ *t2++ ^ *t3++ ^ *t4++ ^ *t5++; case 6: *m0++ ^= *t0++ ^ *t1++ ^ *t2++ ^ *t3++ ^ *t4++ ^ *t5++; case 5: *m0++ ^= *t0++ ^ *t1++ ^ *t2++ ^ *t3++ ^ *t4++ ^ *t5++; case 4: *m0++ ^= *t0++ ^ *t1++ ^ *t2++ ^ *t3++ ^ *t4++ ^ *t5++; case 3: *m0++ ^= *t0++ ^ *t1++ ^ *t2++ ^ *t3++ ^ *t4++ ^ *t5++; case 2: *m0++ ^= *t0++ ^ *t1++ ^ *t2++ ^ *t3++ ^ *t4++ ^ *t5++; case 1: *m0++ ^= *t0++ ^ *t1++ ^ *t2++ ^ *t3++ ^ *t4++ ^ *t5++; } while (--n > 0); } } __M4RI_DD_MZD(M); } rci_t _mzd_echelonize_m4ri(mzd_t *A, int const full, int k, int heuristic, double const threshold) { /** * \par General algorithm * \li Step 1.Denote the first column to be processed in a given * iteration as \f$a_i\f$. Then, perform Gaussian elimination on the * first \f$3k\f$ rows after and including the \f$i\f$-th row to * produce an identity matrix in \f$a_{i,i} ... a_{i+k-1,i+k-1},\f$ * and zeroes in \f$a_{i+k,i} ... a_{i+3k-1,i+k-1}\f$. * * \li Step 2. Construct a table consisting of the \f$2^k\f$ binary strings of * length k in a Gray code. Thus with only \f$2^k\f$ vector * additions, all possible linear combinations of these k rows * have been precomputed. * * \li Step 3. One can rapidly process the remaining rows from \f$i + * 3k\f$ until row \f$m\f$ (the last row) by using the table. For * example, suppose the \f$j\f$-th row has entries \f$a_{j,i} * ... a_{j,i+k-1}\f$ in the columns being processed. 
Selecting the * row of the table associated with this k-bit string, and adding it * to row j will force the k columns to zero, and adjust the * remaining columns from \f$ i + k\f$ to n in the appropriate way, * as if Gaussian elimination had been performed. * * \li Step 4. While the above form of the algorithm will reduce a * system of boolean linear equations to unit upper triangular form, * and thus permit a system to be solved with back substitution, the * M4RI algorithm can also be used to invert a matrix, or put the * system into reduced row echelon form (RREF). Simply run Step 3 * on rows \f$0 ... i-1\f$ as well as on rows \f$i + 3k * ... m\f$. This only affects the complexity slightly, changing the * 2.5 coeffcient to 3. * * \attention This function implements a variant of the algorithm * described above. If heuristic is true, then this algorithm, will * switch to PLUQ based echelon form computation once the density * reaches the threshold. */ rci_t const ncols = A->ncols; if (k == 0) { k = m4ri_opt_k(A->nrows, ncols, 0); if (k >= 7) k = 7; if (0.75 * __M4RI_TWOPOW(k) * ncols > __M4RI_CPU_L3_CACHE / 2.0) k -= 1; } int kk = 6 * k; mzd_t *U = mzd_init(kk, ncols); mzd_t *T0 = mzd_init(__M4RI_TWOPOW(k), ncols); mzd_t *T1 = mzd_init(__M4RI_TWOPOW(k), ncols); mzd_t *T2 = mzd_init(__M4RI_TWOPOW(k), ncols); mzd_t *T3 = mzd_init(__M4RI_TWOPOW(k), ncols); mzd_t *T4 = mzd_init(__M4RI_TWOPOW(k), ncols); mzd_t *T5 = mzd_init(__M4RI_TWOPOW(k), ncols); rci_t *L0 = (rci_t*)m4ri_mm_calloc(__M4RI_TWOPOW(k), sizeof(rci_t)); rci_t *L1 = (rci_t*)m4ri_mm_calloc(__M4RI_TWOPOW(k), sizeof(rci_t)); rci_t *L2 = (rci_t*)m4ri_mm_calloc(__M4RI_TWOPOW(k), sizeof(rci_t)); rci_t *L3 = (rci_t*)m4ri_mm_calloc(__M4RI_TWOPOW(k), sizeof(rci_t)); rci_t *L4 = (rci_t*)m4ri_mm_calloc(__M4RI_TWOPOW(k), sizeof(rci_t)); rci_t *L5 = (rci_t*)m4ri_mm_calloc(__M4RI_TWOPOW(k), sizeof(rci_t)); rci_t last_check = 0; rci_t r = 0; rci_t c = 0; if (heuristic) { if (c < ncols && r < A->nrows && _mzd_density(A, 
32, 0, 0) >= threshold) { wi_t const tmp = c / m4ri_radix; rci_t const tmp2 = tmp * m4ri_radix; mzd_t *Abar = mzd_init_window(A, r, tmp2, A->nrows, ncols); r += mzd_echelonize_pluq(Abar, full); mzd_free(Abar); c = ncols; } } while(c < ncols) { if (heuristic && c > (last_check + 256)) { last_check = c; if (c < ncols && r < A->nrows && _mzd_density(A, 32, r, c) >= threshold) { mzd_t *Abar = mzd_init_window(A, r, (c / m4ri_radix) * m4ri_radix, A->nrows, ncols); if (!full) { r += mzd_echelonize_pluq(Abar, full); } else { rci_t r2 = mzd_echelonize_pluq(Abar, full); if (r > 0) _mzd_top_echelonize_m4ri(A, 0, r, c, r); r += r2; } mzd_free(Abar); break; } } if(c + kk > ncols) { kk = ncols - c; } int kbar; if (full) { kbar = _mzd_gauss_submatrix_full(A, r, c, A->nrows, kk); } else { kbar = _mzd_gauss_submatrix(A, r, c, A->nrows, kk); /* this isn't necessary, adapt make_table */ U = mzd_submatrix(U, A, r, 0, r + kbar, ncols); _mzd_gauss_submatrix_top(A, r, c, kbar); } if (kbar > 5 * k) { int const rem = kbar % 6; int const ka = kbar / 6 + ((rem >= 5) ? 1 : 0); int const kb = kbar / 6 + ((rem >= 4) ? 1 : 0); int const kc = kbar / 6 + ((rem >= 3) ? 1 : 0); int const kd = kbar / 6 + ((rem >= 2) ? 1 : 0); int const ke = kbar / 6 + ((rem >= 1) ? 1 : 0);; int const kf = kbar / 6; if(full || kbar == kk) { mzd_make_table(A, r, c, ka, T0, L0); mzd_make_table(A, r+ka, c, kb, T1, L1); mzd_make_table(A, r+ka+kb, c, kc, T2, L2); mzd_make_table(A, r+ka+kb+kc, c, kd, T3, L3); mzd_make_table(A, r+ka+kb+kc+kd, c, ke, T4, L4); mzd_make_table(A, r+ka+kb+kc+kd+ke, c, kf, T5, L5); } if(kbar == kk) mzd_process_rows6(A, r+kbar, A->nrows, c, kbar, T0, L0, T1, L1, T2, L2, T3, L3, T4, L4, T5, L5); if(full) mzd_process_rows6(A, 0, r, c, kbar, T0, L0, T1, L1, T2, L2, T3, L3, T4, L4, T5, L5); } else if (kbar > 4 * k) { int const rem = kbar % 5; int const ka = kbar / 5 + ((rem >= 4) ? 1 : 0); int const kb = kbar / 5 + ((rem >= 3) ? 1 : 0); int const kc = kbar / 5 + ((rem >= 2) ? 
1 : 0); int const kd = kbar / 5 + ((rem >= 1) ? 1 : 0); int const ke = kbar / 5; if(full || kbar == kk) { mzd_make_table(A, r, c, ka, T0, L0); mzd_make_table(A, r+ka, c, kb, T1, L1); mzd_make_table(A, r+ka+kb, c, kc, T2, L2); mzd_make_table(A, r+ka+kb+kc, c, kd, T3, L3); mzd_make_table(A, r+ka+kb+kc+kd, c, ke, T4, L4); } if(kbar == kk) mzd_process_rows5(A, r+kbar, A->nrows, c, kbar, T0, L0, T1, L1, T2, L2, T3, L3, T4, L4); if(full) mzd_process_rows5(A, 0, r, c, kbar, T0, L0, T1, L1, T2, L2, T3, L3, T4, L4); } else if (kbar > 3 * k) { int const rem = kbar % 4; int const ka = kbar / 4 + ((rem >= 3) ? 1 : 0); int const kb = kbar / 4 + ((rem >= 2) ? 1 : 0); int const kc = kbar / 4 + ((rem >= 1) ? 1 : 0); int const kd = kbar / 4; if(full || kbar == kk) { mzd_make_table(A, r, c, ka, T0, L0); mzd_make_table(A, r+ka, c, kb, T1, L1); mzd_make_table(A, r+ka+kb, c, kc, T2, L2); mzd_make_table(A, r+ka+kb+kc, c, kd, T3, L3); } if(kbar == kk) mzd_process_rows4(A, r+kbar, A->nrows, c, kbar, T0, L0, T1, L1, T2, L2, T3, L3); if(full) mzd_process_rows4(A, 0, r, c, kbar, T0, L0, T1, L1, T2, L2, T3, L3); } else if (kbar > 2 * k) { int const rem = kbar % 3; int const ka = kbar / 3 + ((rem >= 2) ? 1 : 0); int const kb = kbar / 3 + ((rem >= 1) ? 
1 : 0); int const kc = kbar / 3; if(full || kbar == kk) { mzd_make_table(A, r, c, ka, T0, L0); mzd_make_table(A, r+ka, c, kb, T1, L1); mzd_make_table(A, r+ka+kb, c, kc, T2, L2); } if(kbar == kk) mzd_process_rows3(A, r+kbar, A->nrows, c, kbar, T0, L0, T1, L1, T2, L2); if(full) mzd_process_rows3(A, 0, r, c, kbar, T0, L0, T1, L1, T2, L2); } else if (kbar > k) { int const ka = kbar / 2; int const kb = kbar - ka; if(full || kbar == kk) { mzd_make_table(A, r, c, ka, T0, L0); mzd_make_table(A, r+ka, c, kb, T1, L1); } if(kbar == kk) mzd_process_rows2(A, r+kbar, A->nrows, c, kbar, T0, L0, T1, L1); if(full) mzd_process_rows2(A, 0, r, c, kbar, T0, L0, T1, L1); } else if(kbar > 0) { if(full || kbar == kk) { mzd_make_table(A, r, c, kbar, T0, L0); } if(kbar == kk) mzd_process_rows(A, r+kbar, A->nrows, c, kbar, T0, L0); if(full) mzd_process_rows(A, 0, r, c, kbar, T0, L0); } if (!full) { _mzd_copy_back_rows(A, U, r, c, kbar); } r += kbar; c += kbar; if(kk != kbar) { rci_t cbar; rci_t rbar; if (mzd_find_pivot(A, r, c, &rbar, &cbar)) { c = cbar; mzd_row_swap(A, r, rbar); } else { break; } //c++; } } mzd_free(T0); m4ri_mm_free(L0); mzd_free(T1); m4ri_mm_free(L1); mzd_free(T2); m4ri_mm_free(L2); mzd_free(T3); m4ri_mm_free(L3); mzd_free(T4); m4ri_mm_free(L4); mzd_free(T5); m4ri_mm_free(L5); mzd_free(U); __M4RI_DD_MZD(A); __M4RI_DD_RCI(r); return r; } rci_t _mzd_top_echelonize_m4ri(mzd_t *A, int k, rci_t r, rci_t c, rci_t max_r) { rci_t const ncols = A->ncols; int kbar = 0; if (k == 0) { k = m4ri_opt_k(max_r, A->ncols, 0); if (k >= 7) k = 7; if (0.75 * __M4RI_TWOPOW(k) *A->ncols > __M4RI_CPU_L3_CACHE / 2.0) k -= 1; } int kk = 6 * k; mzd_t *U = mzd_init(kk, A->ncols); mzd_t *T0 = mzd_init(__M4RI_TWOPOW(k), A->ncols); mzd_t *T1 = mzd_init(__M4RI_TWOPOW(k), A->ncols); mzd_t *T2 = mzd_init(__M4RI_TWOPOW(k), A->ncols); mzd_t *T3 = mzd_init(__M4RI_TWOPOW(k), A->ncols); mzd_t *T4 = mzd_init(__M4RI_TWOPOW(k), A->ncols); mzd_t *T5 = mzd_init(__M4RI_TWOPOW(k), A->ncols); rci_t *L0 = 
(rci_t*)m4ri_mm_calloc(__M4RI_TWOPOW(k), sizeof(rci_t)); rci_t *L1 = (rci_t*)m4ri_mm_calloc(__M4RI_TWOPOW(k), sizeof(rci_t)); rci_t *L2 = (rci_t*)m4ri_mm_calloc(__M4RI_TWOPOW(k), sizeof(rci_t)); rci_t *L3 = (rci_t*)m4ri_mm_calloc(__M4RI_TWOPOW(k), sizeof(rci_t)); rci_t *L4 = (rci_t*)m4ri_mm_calloc(__M4RI_TWOPOW(k), sizeof(rci_t)); rci_t *L5 = (rci_t*)m4ri_mm_calloc(__M4RI_TWOPOW(k), sizeof(rci_t)); while(c < ncols) { if(c+kk > A->ncols) { kk = ncols - c; } kbar = _mzd_gauss_submatrix_full(A, r, c, MIN(A->nrows,r+kk), kk); if (kbar > 5 * k) { int const rem = kbar % 6; int const ka = kbar / 6 + ((rem >= 5) ? 1 : 0); int const kb = kbar / 6 + ((rem >= 4) ? 1 : 0); int const kc = kbar / 6 + ((rem >= 3) ? 1 : 0); int const kd = kbar / 6 + ((rem >= 2) ? 1 : 0); int const ke = kbar / 6 + ((rem >= 1) ? 1 : 0);; int const kf = kbar / 6; mzd_make_table(A, r, c, ka, T0, L0); mzd_make_table(A, r+ka, c, kb, T1, L1); mzd_make_table(A, r+ka+kb, c, kc, T2, L2); mzd_make_table(A, r+ka+kb+kc, c, kd, T3, L3); mzd_make_table(A, r+ka+kb+kc+kd, c, ke, T4, L4); mzd_make_table(A, r+ka+kb+kc+kd+ke, c, kf, T5, L5); mzd_process_rows6(A, 0, MIN(r, max_r), c, kbar, T0, L0, T1, L1, T2, L2, T3, L3, T4, L4, T5, L5); } else if (kbar > 4 * k) { int const rem = kbar % 5; int const ka = kbar / 5 + ((rem >= 4) ? 1 : 0); int const kb = kbar / 5 + ((rem >= 3) ? 1 : 0); int const kc = kbar / 5 + ((rem >= 2) ? 1 : 0); int const kd = kbar / 5 + ((rem >= 1) ? 1 : 0); int const ke = kbar / 5; mzd_make_table(A, r, c, ka, T0, L0); mzd_make_table(A, r+ka, c, kb, T1, L1); mzd_make_table(A, r+ka+kb, c, kc, T2, L2); mzd_make_table(A, r+ka+kb+kc, c, kd, T3, L3); mzd_make_table(A, r+ka+kb+kc+kd, c, ke, T4, L4); mzd_process_rows5(A, 0, MIN(r, max_r), c, kbar, T0, L0, T1, L1, T2, L2, T3, L3, T4, L4); } else if (kbar > 3 * k) { const int rem = kbar%4; const int ka = kbar/4 + ((rem >= 3) ? 1 : 0); const int kb = kbar/4 + ((rem >= 2) ? 1 : 0); const int kc = kbar/4 + ((rem >= 1) ? 
1 : 0); const int kd = kbar/4; mzd_make_table(A, r, c, ka, T0, L0); mzd_make_table(A, r+ka, c, kb, T1, L1); mzd_make_table(A, r+ka+kb, c, kc, T2, L2); mzd_make_table(A, r+ka+kb+kc, c, kd, T3, L3); mzd_process_rows4(A, 0, MIN(r, max_r), c, kbar, T0, L0, T1, L1, T2, L2, T3, L3); } else if (kbar > 2 * k) { const int rem = kbar%3; const int ka = kbar/3 + ((rem >= 2) ? 1 : 0); const int kb = kbar/3 + ((rem >= 1) ? 1 : 0); const int kc = kbar/3; mzd_make_table(A, r, c, ka, T0, L0); mzd_make_table(A, r+ka, c, kb, T1, L1); mzd_make_table(A, r+ka+kb, c, kc, T2, L2); mzd_process_rows3(A, 0, MIN(r, max_r), c, kbar, T0, L0, T1, L1, T2, L2); } else if (kbar > k) { const int ka = kbar/2; const int kb = kbar - ka; mzd_make_table(A, r, c, ka, T0, L0); mzd_make_table(A, r+ka, c, kb, T1, L1); mzd_process_rows2(A, 0, MIN(r, max_r), c, kbar, T0, L0, T1, L1); } else if(kbar > 0) { mzd_make_table(A, r, c, kbar, T0, L0); mzd_process_rows(A, 0, MIN(r, max_r), c, kbar, T0, L0); } r += kbar; c += kbar; if(kk != kbar) { c++; } } mzd_free(T0); m4ri_mm_free(L0); mzd_free(T1); m4ri_mm_free(L1); mzd_free(T2); m4ri_mm_free(L2); mzd_free(T3); m4ri_mm_free(L3); mzd_free(T4); m4ri_mm_free(L4); mzd_free(T5); m4ri_mm_free(L5); mzd_free(U); __M4RI_DD_MZD(A); __M4RI_DD_RCI(r); return r; } void mzd_top_echelonize_m4ri(mzd_t *M, int k) { _mzd_top_echelonize_m4ri(M,k,0,0,M->nrows); } mzd_t *mzd_inv_m4ri(mzd_t *dst, mzd_t const* src, int k) { assert(src->nrows == src->ncols && src->offset == 0); if(dst == NULL) { dst = mzd_init(src->nrows, src->ncols); } else { assert(dst->ncols == src->ncols && dst->nrows && src->ncols && dst->offset == 0); } mzd_set_ui(dst, 1); mzd_t *A = mzd_concat(NULL, src, dst); mzd_echelonize_m4ri(A, TRUE, 0); dst = mzd_submatrix(dst, A, 0, src->ncols, A->nrows, A->ncols); mzd_free(A); __M4RI_DD_MZD(dst); return dst; } mzd_t *mzd_mul_m4rm(mzd_t *C, mzd_t const *A, mzd_t const *B, int k) { rci_t a = A->nrows; rci_t c = B->ncols; if(A->ncols != B->nrows) m4ri_die("mzd_mul_m4rm: A ncols 
(%d) need to match B nrows (%d).\n", A->ncols, B->nrows); if (C == NULL) { C = mzd_init(a, c); } else { if (C->nrows != a || C->ncols != c) m4ri_die("mzd_mul_m4rm: C (%d x %d) has wrong dimensions.\n", C->nrows, C->ncols); } return _mzd_mul_m4rm(C, A, B, k, TRUE); } mzd_t *mzd_addmul_m4rm(mzd_t *C, mzd_t const *A, mzd_t const *B, int k) { rci_t a = A->nrows; rci_t c = B->ncols; if(C->ncols == 0 || C->nrows == 0) return C; if(A->ncols != B->nrows) m4ri_die("mzd_mul_m4rm A ncols (%d) need to match B nrows (%d) .\n", A->ncols, B->nrows); if (C == NULL) { C = mzd_init(a, c); } else { if (C->nrows != a || C->ncols != c) m4ri_die("mzd_mul_m4rm: C has wrong dimensions.\n"); } return _mzd_mul_m4rm(C, A, B, k, FALSE); } #define __M4RI_M4RM_NTABLES 8 mzd_t *_mzd_mul_m4rm(mzd_t *C, mzd_t const *A, mzd_t const *B, int k, int clear) { /** * The algorithm proceeds as follows: * * Step 1. Make a Gray code table of all the \f$2^k\f$ linear combinations * of the \f$k\f$ rows of \f$B_i\f$. Call the \f$x\f$-th row * \f$T_x\f$. * * Step 2. Read the entries * \f$a_{j,(i-1)k+1}, a_{j,(i-1)k+2} , ... , a_{j,(i-1)k+k}.\f$ * * Let \f$x\f$ be the \f$k\f$ bit binary number formed by the * concatenation of \f$a_{j,(i-1)k+1}, ... , a_{j,ik}\f$. * * Step 3. for \f$h = 1,2, ... , c\f$ do * calculate \f$C_{jh} = C_{jh} + T_{xh}\f$. 
*/ assert(A->offset == 0); assert(B->offset == 0); assert(C->offset == 0); rci_t x[__M4RI_M4RM_NTABLES]; rci_t *L[__M4RI_M4RM_NTABLES]; word *t[__M4RI_M4RM_NTABLES]; mzd_t *T[__M4RI_M4RM_NTABLES]; word *c; rci_t const a_nr = A->nrows; rci_t const a_nc = A->ncols; rci_t const b_nc = B->ncols; if (b_nc < m4ri_radix-10 || a_nr < 16) { if(clear) return mzd_mul_naive(C, A, B); else return mzd_addmul_naive(C, A, B); } /* clear first */ if (clear) { mzd_set_ui(C, 0); } const int blocksize = __M4RI_MUL_BLOCKSIZE; if(k==0) { /* __M4RI_CPU_L2_CACHE == 2^k * B->width * 8 * 8 */ k = (int)log2((__M4RI_CPU_L2_CACHE/64)/(double)B->width); if ((__M4RI_CPU_L2_CACHE - 64*__M4RI_TWOPOW(k)*B->width) > (64*__M4RI_TWOPOW(k+1)*B->width - __M4RI_CPU_L2_CACHE)) k++; rci_t const klog = round(0.75 * log2_floor(MIN(MIN(a_nr,a_nc),b_nc))); if(klog < k) k = klog; if (k<2) k=2; else if(k>6) k=6; } const wi_t wide = C->width; const word bm = __M4RI_TWOPOW(k)-1; rci_t *buffer = (rci_t*)m4ri_mm_malloc(__M4RI_M4RM_NTABLES * __M4RI_TWOPOW(k) * sizeof(rci_t)); for(int z=0; z<__M4RI_M4RM_NTABLES; z++) { L[z] = buffer + z*__M4RI_TWOPOW(k); T[z] = mzd_init(__M4RI_TWOPOW(k), b_nc); } /* process stuff that fits into multiple of k first, but blockwise (babystep-giantstep)*/ int const kk = __M4RI_M4RM_NTABLES * k; assert(kk <= m4ri_radix); rci_t const end = a_nc / kk; for (rci_t giantstep = 0; giantstep < a_nr; giantstep += blocksize) { for(rci_t i = 0; i < end; ++i) { for(int z=0; z<__M4RI_M4RM_NTABLES; z++) { mzd_make_table( B, kk*i + k*z, 0, k, T[z], L[z]); } const rci_t blockend = MIN(giantstep+blocksize, a_nr); #if __M4RI_HAVE_OPENMP #pragma omp parallel for schedule(static,512) private(x,t) #endif for(rci_t j = giantstep; j < blockend; j++) { const word a = mzd_read_bits(A, j, kk*i, kk); x[ 0] = L[ 0][ (a >> 0*k) & bm ]; x[ 1] = L[ 1][ (a >> 1*k) & bm ]; x[ 2] = L[ 2][ (a >> 2*k) & bm ]; x[ 3] = L[ 3][ (a >> 3*k) & bm ]; x[ 4] = L[ 4][ (a >> 4*k) & bm ]; x[ 5] = L[ 5][ (a >> 5*k) & bm ]; x[ 6] = L[ 6][ 
(a >> 6*k) & bm ]; x[ 7] = L[ 7][ (a >> 7*k) & bm ]; c = C->rows[j]; t[ 0] = T[ 0]->rows[x[ 0]]; t[ 1] = T[ 1]->rows[x[ 1]]; t[ 2] = T[ 2]->rows[x[ 2]]; t[ 3] = T[ 3]->rows[x[ 3]]; t[ 4] = T[ 4]->rows[x[ 4]]; t[ 5] = T[ 5]->rows[x[ 5]]; t[ 6] = T[ 6]->rows[x[ 6]]; t[ 7] = T[ 7]->rows[x[ 7]]; _mzd_combine8(c, t[ 0], t[ 1], t[ 2], t[ 3], t[ 4], t[ 5], t[ 6], t[ 7], wide); } } } /* handle stuff that doesn't fit into multiple of kk */ if (a_nc%kk) { rci_t i; for (i = kk / k * end; i < a_nc / k; ++i) { mzd_make_table( B, k*i, 0, k, T[0], L[0]); for(rci_t j = 0; j < a_nr; ++j) { x[0] = L[0][ mzd_read_bits_int(A, j, k*i, k) ]; c = C->rows[j]; t[0] = T[0]->rows[x[0]]; for(wi_t ii = 0; ii < wide; ++ii) { c[ii] ^= t[0][ii]; } } } /* handle stuff that doesn't fit into multiple of k */ if (a_nc%k) { mzd_make_table( B, k*(a_nc/k), 0, a_nc%k, T[0], L[0]); for(rci_t j = 0; j < a_nr; ++j) { x[0] = L[0][ mzd_read_bits_int(A, j, k*i, a_nc%k) ]; c = C->rows[j]; t[0] = T[0]->rows[x[0]]; for(wi_t ii = 0; ii < wide; ++ii) { c[ii] ^= t[0][ii]; } } } } for(int j=0; j<__M4RI_M4RM_NTABLES; j++) mzd_free(T[j]); m4ri_mm_free(buffer); __M4RI_DD_MZD(C); return C; } libm4ri-20130416/src/brilliantrussian.h000066400000000000000000000254471212302366200176540ustar00rootroot00000000000000/** * \file brilliantrussian.h * \brief M4RI and M4RM. * * \author Gregory Bard * \author Martin Albrecht * * \note For reference see Gregory Bard; Accelerating Cryptanalysis with * the Method of Four Russians; 2006; * http://eprint.iacr.org/2006/251.pdf */ #ifndef M4RI_BRILLIANTRUSSIAN_H #define M4RI_BRILLIANTRUSSIAN_H /******************************************************************* * * M4RI: Linear Algebra over GF(2) * * Copyright (C) 2007, 2008 Gregory Bard * Copyright (C) 2008-2010 Martin Albrecht * * Distributed under the terms of the GNU General Public License (GPL) * version 2 or higher. 
* * This code is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * The full text of the GPL is available at: * * http://www.gnu.org/licenses/ * ********************************************************************/ #include #include #include #include #include /** * \brief Constructs all possible \f$2^k\f$ row combinations using the gray * code table. * * \param M matrix to generate the tables from * \param r the starting row * \param c the starting column (only exact up to block) * \param k * \param T prealloced matrix of dimension \f$2^k\f$ x m->ncols * \param L prealloced table of length \f$2^k\f$ * * \wordoffset */ void mzd_make_table(mzd_t const *M, rci_t r, rci_t c, int k, mzd_t *T, rci_t *L); /** * \brief The function looks up k bits from position i,startcol in * each row and adds the appropriate row from T to the row i. * * This process is iterated for i from startrow to stoprow * (exclusive). * * \param M Matrix to operate on * \param startrow top row which is operated on * \param endrow bottom row which is operated on * \param startcol Starting column for addition * \param k M4RI parameter * \param T contains the correct row to be added * \param L Contains row number to be added * * \wordoffset */ void mzd_process_rows(mzd_t *M, rci_t startrow, rci_t endrow, rci_t startcol, int k, mzd_t const *T, rci_t const *L); /** * \brief Same as mzd_process_rows but works with two Gray code tables * in parallel. 
* * \param M Matrix to operate on * \param startrow top row which is operated on * \param endrow bottom row which is operated on * \param startcol Starting column for addition * \param k M4RI parameter * \param T0 contains the correct row to be added * \param L0 Contains row number to be added * \param T1 contains the correct row to be added * \param L1 Contains row number to be added * * \wordoffset */ void mzd_process_rows2(mzd_t *M, rci_t startrow, rci_t endrow, rci_t startcol, int k, mzd_t const *T0, rci_t const *L0, mzd_t const *T1, rci_t const *L1); /** * \brief Same as mzd_process_rows but works with three Gray code tables * in parallel. * * \param M Matrix to operate on * \param startrow top row which is operated on * \param endrow bottom row which is operated on * \param startcol Starting column for addition * \param k M4RI parameter * \param T0 contains the correct row to be added * \param L0 Contains row number to be added * \param T1 contains the correct row to be added * \param L1 Contains row number to be added * \param T2 contains the correct row to be added * \param L2 Contains row number to be added * * \wordoffset */ void mzd_process_rows3(mzd_t *M, rci_t startrow, rci_t endrow, rci_t startcol, int k, mzd_t const *T0, rci_t const *L0, mzd_t const *T1, rci_t const *L1, mzd_t const *T2, rci_t const *L2); /** * \brief Same as mzd_process_rows but works with four Gray code tables * in parallel. 
* * \param M Matrix to operate on * \param startrow top row which is operated on * \param endrow bottom row which is operated on * \param startcol Starting column for addition * \param k M4RI parameter * \param T0 contains the correct row to be added * \param L0 Contains row number to be added * \param T1 contains the correct row to be added * \param L1 Contains row number to be added * \param T2 contains the correct row to be added * \param L2 Contains row number to be added * \param T3 contains the correct row to be added * \param L3 Contains row number to be added * * \wordoffset */ void mzd_process_rows4(mzd_t *M, rci_t startrow, rci_t endrow, rci_t startcol, int k, mzd_t const *T0, rci_t const *L0, mzd_t const *T1, rci_t const *L1, mzd_t const *T2, rci_t const *L2, mzd_t const *T3, rci_t const *L3); /** * \brief Same as mzd_process_rows but works with five Gray code tables * in parallel. * * \param M Matrix to operate on * \param startrow top row which is operated on * \param endrow bottom row which is operated on * \param startcol Starting column for addition * \param k M4RI parameter * \param T0 contains the correct row to be added * \param L0 Contains row number to be added * \param T1 contains the correct row to be added * \param L1 Contains row number to be added * \param T2 contains the correct row to be added * \param L2 Contains row number to be added * \param T3 contains the correct row to be added * \param L3 Contains row number to be added * \param T4 contains the correct row to be added * \param L4 Contains row number to be added * * \wordoffset */ void mzd_process_rows5(mzd_t *M, rci_t startrow, rci_t endrow, rci_t startcol, int k, mzd_t const *T0, rci_t const *L0, mzd_t const *T1, rci_t const *L1, mzd_t const *T2, rci_t const *L2, mzd_t const *T3, rci_t const *L3, mzd_t const *T4, rci_t const *L4); /** * \brief Same as mzd_process_rows but works with six Gray code tables * in parallel. 
* * \param M Matrix to operate on * \param startrow top row which is operated on * \param endrow bottom row which is operated on * \param startcol Starting column for addition * \param k M4RI parameter * \param T0 contains the correct row to be added * \param L0 Contains row number to be added * \param T1 contains the correct row to be added * \param L1 Contains row number to be added * \param T2 contains the correct row to be added * \param L2 Contains row number to be added * \param T3 contains the correct row to be added * \param L3 Contains row number to be added * \param T4 contains the correct row to be added * \param L4 Contains row number to be added * \param T5 contains the correct row to be added * \param L5 Contains row number to be added * * \wordoffset */ void mzd_process_rows6(mzd_t *M, rci_t startrow, rci_t endrow, rci_t startcol, int k, mzd_t const *T0, rci_t const *L0, mzd_t const *T1, rci_t const *L1, mzd_t const *T2, rci_t const *L2, mzd_t const *T3, rci_t const *L3, mzd_t const *T4, rci_t const *L4, mzd_t const *T5, rci_t const *L5); /** * \brief Matrix elimination using the 'Method of the Four Russians' * (M4RI). * * The M4RI algorithm was proposed in Gregory Bard; Accelerating * Cryptanalysis with the Method of Four Russians; 2006; * http://eprint.iacr.org/2006/251 * * Our implementation is discussed in Martin Albrecht and Clément * Pernet; Efficient Decomposition of Dense Matrices over GF(2); * http://arxiv.org/abs/1006.1744 * * \param A Matrix to be reduced. * \param full Return the reduced row echelon form, not only upper triangular form. * \param k M4RI parameter, may be 0 for auto-choose. * * \example testsuite/test_elimination.c * \example testsuite/bench_elimination.c * * \wordoffset * * \return Rank of A. */ rci_t _mzd_echelonize_m4ri(mzd_t *A, const int full, int k, int heuristic, const double threshold); /** * \brief Given a matrix in upper triangular form compute the reduced row * echelon form of that matrix. 
* * \param M Matrix to be reduced. * \param k M4RI parameter, may be 0 for auto-choose. * * \wordoffset * */ void mzd_top_echelonize_m4ri(mzd_t *M, int k); /** * \brief Given a matrix in upper triangular form compute the reduced * row echelon form of that matrix but only start to do anything for * the pivot at (r,c). * * \param A Matrix to be reduced. * \param k M4RI parameter, may be 0 for auto-choose. * \param r Row index. * \param c Column index. * \param max_r Only clear top max_r rows. * * \wordoffset * */ rci_t _mzd_top_echelonize_m4ri(mzd_t *A, int k, rci_t r, rci_t c, rci_t max_r); /** * \brief Invert the matrix src using Konrod's method. * * \param dst Matrix to hold the inverse (may be NULL) * \param src Matrix to be inverted. * \param k Table size parameter, may be 0 for automatic choice. * * \wordoffset * * \return Inverse of src if src has full rank */ mzd_t *mzd_inv_m4ri(mzd_t *dst, const mzd_t* src, int k); /** * \brief Matrix multiplication using Konrod's method, i.e. compute C * such that C == AB. * * This is the convenient wrapper function, please see _mzd_mul_m4rm * for authors and implementation details. * * \param C Preallocated product matrix, may be NULL for automatic creation. * \param A Input matrix A * \param B Input matrix B * \param k M4RI parameter, may be 0 for auto-choose. * * \wordoffset * * \return Pointer to C. */ mzd_t *mzd_mul_m4rm(mzd_t *C, mzd_t const *A, mzd_t const *B, int k); /** * Set C to C + AB using Konrod's method. * * This is the convenient wrapper function, please see _mzd_mul_m4rm * for authors and implementation details. * * \param C Preallocated product matrix, may be NULL for zero matrix. * \param A Input matrix A * \param B Input matrix B * \param k M4RI parameter, may be 0 for auto-choose. * * \wordoffset * * \return Pointer to C. */ mzd_t *mzd_addmul_m4rm(mzd_t *C, mzd_t const *A, mzd_t const *B, int k); /** * \brief Matrix multiplication using Konrod's method, i.e. compute C such * that C == AB. 
* * This is the actual implementation. * * This function is described in Martin Albrecht, Gregory Bard and * William Hart; Efficient Multiplication of Dense Matrices over * GF(2); pre-print available at http://arxiv.org/abs/0811.1714 * * \param C Preallocated product matrix. * \param A Input matrix A * \param B Input matrix B * \param k M4RI parameter, may be 0 for auto-choose. * \param clear clear the matrix C first * * \author Martin Albrecht -- initial implementation * \author William Hart -- block matrix implementation, use of several * Gray code tables, general speed-ups * * \wordoffset * * \return Pointer to C. */ mzd_t *_mzd_mul_m4rm(mzd_t *C, mzd_t const *A, mzd_t const *B, int k, int clear); #endif // M4RI_BRILLIANTRUSSIAN_H libm4ri-20130416/src/config.h.windows000066400000000000000000000012141212302366200172070ustar00rootroot00000000000000#ifndef M4RI_M4RI_CONFIG_H #define M4RI_M4RI_CONFIG_H // Defines determined during configuration of m4ri. #define __M4RI_HAVE_MM_MALLOC 0 #define __M4RI_HAVE_POSIX_MEMALIGN 0 #define __M4RI_HAVE_SSE2 0 #define __M4RI_HAVE_OPENMP 0 #define __M4RI_CPU_L1_CACHE 32768 #define __M4RI_CPU_L2_CACHE 262144 #define __M4RI_CPU_L3_CACHE 2147483648 #define __M4RI_DEBUG_DUMP (0 || 0) #define __M4RI_DEBUG_MZD 0 // Helper macros. #define __M4RI_USE_MM_MALLOC (__M4RI_HAVE_MM_MALLOC && __M4RI_HAVE_SSE2) #define __M4RI_USE_POSIX_MEMALIGN (__M4RI_HAVE_POSIX_MEMALIGN && __M4RI_HAVE_SSE2) #define __M4RI_DD_QUIET (0 && !0) #endif // M4RI_M4RI_CONFIG_H libm4ri-20130416/src/debug_dump.c000066400000000000000000000123511212302366200163630ustar00rootroot00000000000000/****************************************************************************** * * M4RI: Linear Algebra over GF(2) * * Copyright (C) 2011 Carlo Wood * * Distributed under the terms of the GNU General Public License (GPL) * version 2 or higher. 
* * This code is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * The full text of the GPL is available at: * * http://www.gnu.org/licenses/ ******************************************************************************/ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include "mzd.h" #include "mzp.h" #if __M4RI_DEBUG_DUMP static unsigned long dd_sequence_number = 0; static void entry(char const* function, char const* file, int line) { #if !__M4RI_DD_QUIET printf("Sequence#: %ld; %s @ %s:%d; ", dd_sequence_number, function, file, line); #endif ++dd_sequence_number; } static inline void consistency_check_row(mzd_t const *M, rci_t row) { assert(row >= 0 && row < M->nrows); assert(M->rows[row] == mzd_row(M, row)); if (mzd_is_windowed(M)) return; // Check that the excess bits are zero. assert((M->rows[row][M->width - 1] & ~M->high_bitmask) == 0); // Check that the padding bits are zero, if any. 
assert(M->width == M->rowstride || M->rows[row][M->width] == 0);
}

/*
 * Assert the structural invariants of M: dimensions and offset ranges, that
 * width is the smallest word count covering ncols + offset bits, block-size
 * bounds, the cached bitmasks, the flag bits, and that M->rows[] matches a
 * walk over the storage blocks. For non-windowed matrices every row is also
 * passed to consistency_check_row().
 *
 * Everything here is an assert(); NOTE(review): the row counter is advanced
 * inside an assert expression, so the function does nothing when NDEBUG is
 * defined.
 */
static void consistency_check(mzd_t const *M)
{
  assert(M->nrows >= 0 && M->ncols >= 0);
  assert(M->offset >= 0 && M->offset < m4ri_radix);
  /* width words are enough for ncols + offset bits, and width - 1 are not. */
  assert(M->width * m4ri_radix >= M->ncols + M->offset);
  assert((M->width - 1) * m4ri_radix < M->ncols + M->offset);
  /* From mzd_paddingwidth words on, rowstride must be even. */
  assert(M->width < mzd_paddingwidth || (M->rowstride & 1) == 0);
  //assert((M->blockrows_mask + 1) == (1 << M->blockrows_log));
  /* A block of 2^blockrows_log rows uses more than half, and at most all, of
   * __M4RI_MAX_MZD_BLOCKSIZE. */
  assert((1 << M->blockrows_log) * M->rowstride <= __M4RI_MAX_MZD_BLOCKSIZE);
  assert((1 << M->blockrows_log) * M->rowstride > __M4RI_MAX_MZD_BLOCKSIZE / 2);
  /* Cached masks: separate formulas for multi-word and single-word rows. */
  assert((M->width > 1 && M->low_bitmask == __M4RI_RIGHT_BITMASK(m4ri_radix - M->offset)) ||
         (M->width < 2 && M->low_bitmask == __M4RI_MIDDLE_BITMASK(M->ncols, M->offset)));
  assert((M->width > 1 && M->high_bitmask == __M4RI_LEFT_BITMASK((M->ncols + M->offset) % m4ri_radix)) ||
         (M->width < 2 && M->high_bitmask == __M4RI_MIDDLE_BITMASK(M->ncols, M->offset)));
  /* Flag bits must agree with the geometry they describe. */
  assert(((M->flags & mzd_flag_nonzero_offset) == 0) == (M->offset == 0));
  assert(((M->flags & mzd_flag_nonzero_excess) == 0) == ((M->ncols + M->offset) % m4ri_radix == 0));
  assert((M->flags & mzd_flag_windowed_zerooffset) == 0 || M->offset == 0);
  assert((M->flags & mzd_flag_windowed_zeroexcess) == 0 || ((M->ncols + M->offset) % m4ri_radix == 0));
  assert((((M->flags & mzd_flag_multiple_blocks) == 0) == (mzd_row_to_block(M, M->nrows - 1) == 0)));
  /* Walk the blocks row by row and compare each address with M->rows[]. */
  int n = 0;
  rci_t counted = 0;
  word* ptr = mzd_first_row(M);
  int row_count = mzd_rows_in_block(M, 0);
  while(1) {
    while (row_count--) {
      assert(ptr == M->rows[counted++]);
      ptr += M->rowstride;
    }
    ++n;
    row_count = mzd_rows_in_block(M, n);
    if (row_count <= 0)
      break;
    ptr = mzd_first_row_next_block(M, n);
  }
  assert(M->ncols == 0 || counted == M->nrows);
  /* The remaining checks are skipped for windows. */
  if (mzd_is_windowed(M))
    return;
  assert(M->rowstride == M->width || (M->rowstride == M->width + 1 && M->width >= mzd_paddingwidth));
  for (rci_t r = 0; r < M->nrows; ++r) {
    consistency_check_row(M, r);
  }
}

void m4ri_dd_int(char const* function,
char const* file, int line, int i) { entry(function, file, line); #if !__M4RI_DD_QUIET printf("int: %d\n", i); #endif } void m4ri_dd_rci(char const* function, char const* file, int line, rci_t rci) { entry(function, file, line); #if !__M4RI_DD_QUIET printf("rci: %d\n", rci); #endif } void m4ri_dd_rci_array(char const* function, char const* file, int line, rci_t *rciptr, int len) { entry(function, file, line); #if !__M4RI_DD_QUIET word hash = 0; for (int i = 0; i < len; ++i) hash ^= rotate_word(rciptr[i], i % m4ri_radix); printf("rci array (size %d) hash: %llx\n", len, hash); #endif } void m4ri_dd_rawrow(char const* function, char const* file, int line, word const* rowptr, wi_t wide) { entry(function, file, line); #if !__M4RI_DD_QUIET word hash = calculate_hash(rowptr, wide); printf("raw row (%d words) hash: %llx\n", wide, hash); #endif } void m4ri_dd_row(char const* function, char const* file, int line, mzd_t const* M, rci_t row) { entry(function, file, line); consistency_check_row(M, row); #if !__M4RI_DD_QUIET word hash = calculate_hash(M->rows[row], M->width); printf("row %d hash: %llx\n", row, hash); #endif } void m4ri_dd_mzd(char const* function, char const* file, int line, mzd_t const* M) { entry(function, file, line); consistency_check(M); #if !__M4RI_DD_QUIET word hash = 0; for (rci_t r = 0; r < M->nrows; ++r) hash ^= rotate_word(calculate_hash(M->rows[r], M->width), r % m4ri_radix); printf("mzd hash: %llx\n", hash); #endif } void m4ri_dd_mzp(char const* function, char const* file, int line, mzp_t const* P) { entry(function, file, line); #if !__M4RI_DD_QUIET word hash = 0; for (rci_t i = 0; i < P->length; ++i) hash ^= rotate_word(P->values[i], i % m4ri_radix); printf("mzp hash: %llx\n", hash); #endif } #endif libm4ri-20130416/src/debug_dump.h000066400000000000000000000054751212302366200164010ustar00rootroot00000000000000/** * \file debug_dump.h * * \brief Debug utility * * \author Carlo Wood * * To enable dumping of output per function, configure the library 
with --enable-debug-dump. */ /****************************************************************************** * * M4RI: Linear Algebra over GF(2) * * Copyright (C) 2011 Carlo Wood * * Distributed under the terms of the GNU General Public License (GPL) * version 2 or higher. * * This code is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * The full text of the GPL is available at: * * http://www.gnu.org/licenses/ ******************************************************************************/ #ifndef M4RI_DEBUG_DUMP #define M4RI_DEBUG_DUMP static inline word calculate_hash(word const* rowptr, wi_t wide) { word hash = 0; for (word const* ptr = rowptr; ptr < rowptr + wide; ++ptr) hash ^= *ptr; return hash; } static inline word rotate_word(word w, int shift) { return (w << shift) | (w >> (m4ri_radix - w)); } #if __M4RI_DEBUG_DUMP struct mzd_t; struct mzp_t; extern void m4ri_dd_int(char const* function, char const* file, int line, int i); extern void m4ri_dd_rci(char const* function, char const* file, int line, rci_t rci); extern void m4ri_dd_rci_array(char const* function, char const* file, int line, rci_t *rciptr, int len); extern void m4ri_dd_rawrow(char const* function, char const* file, int line, word const* rowptr, wi_t wide); extern void m4ri_dd_row(char const* function, char const* file, int line, struct mzd_t const* M, rci_t row); extern void m4ri_dd_mzd(char const* function, char const* file, int line, struct mzd_t const* M); extern void m4ri_dd_mzp(char const* function, char const* file, int line, struct mzp_t const* P); #define __M4RI_DD_INT(i) m4ri_dd_int(__FUNCTION__, __FILE__, __LINE__, i) #define __M4RI_DD_RCI(rci) m4ri_dd_rci(__FUNCTION__, __FILE__, __LINE__, rci) #define __M4RI_DD_RCI_ARRAY(rciptr, len) m4ri_dd_rci_array(__FUNCTION__, __FILE__, __LINE__, rciptr, len) #define 
__M4RI_DD_RAWROW(rowptr, wide) m4ri_dd_rawrow(__FUNCTION__, __FILE__, __LINE__, rowptr, wide) #define __M4RI_DD_ROW(M, row) m4ri_dd_row(__FUNCTION__, __FILE__, __LINE__, M, row) #define __M4RI_DD_MZD(M) m4ri_dd_mzd(__FUNCTION__, __FILE__, __LINE__, M) #define __M4RI_DD_MZP(P) m4ri_dd_mzp(__FUNCTION__, __FILE__, __LINE__, P) #else // __M4RI_DEBUG_DUMP #define __M4RI_DD_INT(i) #define __M4RI_DD_RCI(rci) #define __M4RI_DD_RCI_ARRAY(rciptr, len) #define __M4RI_DD_RAWROW(rowptr, wide) #define __M4RI_DD_ROW(M, row) #define __M4RI_DD_MZD(M) #define __M4RI_DD_MZP(P) #endif // __M4RI_DEBUG_DUMP #endif // M4RI_DEBUG_DUMP libm4ri-20130416/src/echelonform.c000066400000000000000000000043161212302366200165530ustar00rootroot00000000000000/******************************************************************* * * M4RI: Linear Algebra over GF(2) * * Copyright (C) 2010 Martin Albrecht * * Distributed under the terms of the GNU General Public License (GPL) * version 2 or higher. * * This code is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. 
* * The full text of the GPL is available at: * * http://www.gnu.org/licenses/ * ********************************************************************/ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include "echelonform.h" #include "brilliantrussian.h" #include "ple.h" #include "triangular.h" rci_t mzd_echelonize(mzd_t *A, int full) { return _mzd_echelonize_m4ri(A, full, 0, 1, __M4RI_ECHELONFORM_CROSSOVER_DENSITY); } rci_t mzd_echelonize_m4ri(mzd_t *A, int full, int k) { return _mzd_echelonize_m4ri(A, full, k, 0, 1.0); } rci_t mzd_echelonize_pluq(mzd_t *A, int full) { mzp_t *P = mzp_init(A->nrows); mzp_t *Q = mzp_init(A->ncols); rci_t r; if(full) { r = mzd_pluq(A, P, Q, 0); mzd_t *U = mzd_init_window(A, 0, 0, r, r); mzd_t *B = mzd_init_window(A, 0, r, r, A->ncols); if(r!=A->ncols) mzd_trsm_upper_left(U, B, 0); if(r) mzd_set_ui(U, 0); for(rci_t i = 0; i < r; ++i) mzd_write_bit(A, i, i, 1); mzd_free_window(U); mzd_free_window(B); if(r) { mzd_t *A0 = mzd_init_window(A, 0, 0, r, A->ncols); mzd_apply_p_right(A0, Q); mzd_free_window(A0); } else { mzd_apply_p_right(A, Q); } } else { r = mzd_ple(A, P, Q, 0); for(rci_t i = 0; i < r; ++i) { for(rci_t j = 0; j <= i; j += m4ri_radix) { int const length = MIN(m4ri_radix, i - j + 1); mzd_clear_bits(A, i, j, length); } mzd_write_bit(A, i, Q->values[i], 1); } } if(r != A->nrows) { mzd_t *R = mzd_init_window(A, r, 0, A->nrows, A->ncols); mzd_set_ui(R, 0); mzd_free_window(R); } mzp_free(P); mzp_free(Q); __M4RI_DD_MZD(A); __M4RI_DD_RCI(r); return r; } libm4ri-20130416/src/echelonform.h000066400000000000000000000040061212302366200165540ustar00rootroot00000000000000/** * \file echelonform.h * \brief Row echelon forms * * \author Martin Albrecht */ #ifndef M4RI_ECHELONFORM_H #define M4RI_ECHELONFORM_H /******************************************************************* * * M4RI: Linear Algebra over GF(2) * * Copyright (C) 2010 Martin Albrecht * * Distributed under the terms of the GNU General Public License (GPL) * version 2 or higher. 
* * This code is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * The full text of the GPL is available at: * * http://www.gnu.org/licenses/ * ********************************************************************/ #include /** * Density at which we switch to PLE decomposition. */ #define __M4RI_ECHELONFORM_CROSSOVER_DENSITY 0.15 /** * \brief (Reduced) row echelon form. * * This function will * * \param A Matrix. * \param full Return the reduced row echelon form, not only upper triangular form. * * \wordoffset * * \return Rank of A. */ rci_t mzd_echelonize(mzd_t *A, int full); /** * \brief (Reduced) row echelon form using PLUQ factorisation. * * \param A Matrix. * \param full Return the reduced row echelon form, not only upper triangular form. * * \wordoffset * * \sa mzd_pluq() * * \return Rank of A. */ rci_t mzd_echelonize_pluq(mzd_t *A, int full); /** * \brief Matrix elimination using the 'Method of the Four Russians' (M4RI). * * This is a wrapper function for _mzd_echelonize_m4ri() * * \param A Matrix to be reduced. * \param full Return the reduced row echelon form, not only upper triangular form. * \param k M4RI parameter, may be 0 for auto-choose. * * \wordoffset * * \sa _mzd_echelonize_m4ri() * * \return Rank of A. */ rci_t mzd_echelonize_m4ri(mzd_t *A, int full, int k); #endif // M4RI_ECHELONFORM_H libm4ri-20130416/src/graycode.c000066400000000000000000000045361212302366200160530ustar00rootroot00000000000000/****************************************************************************** * * M4RI: Linear Algebra over GF(2) * * Copyright (C) 2007 Gregory Bard * Copyright (C) 2007 Martin Albrecht * * Distributed under the terms of the GNU General Public License (GPL) * version 2 or higher. 
* * This code is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * The full text of the GPL is available at: * * http://www.gnu.org/licenses/ ******************************************************************************/ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include "misc.h" #include "graycode.h" code **m4ri_codebook = NULL; int m4ri_gray_code(int number, int length) { int lastbit = 0; int res = 0; for(int i = length - 1; i >= 0; --i) { int bit = number & (1 << i); res |= (lastbit >> 1) ^ bit; lastbit = bit; } return res; } void m4ri_build_code(int *ord, int *inc, int l) { for(int i = 0 ; i < (int)__M4RI_TWOPOW(l); ++i) { ord[i] = m4ri_gray_code(i, l); } for(int i = l; i > 0; --i) { for(int j = 1; j < (int)__M4RI_TWOPOW(i) + 1; ++j) { inc[j * __M4RI_TWOPOW(l - i) - 1] = l - i; } } } void m4ri_build_all_codes() { if (m4ri_codebook) { return; } m4ri_codebook=(code**)m4ri_mm_calloc(__M4RI_MAXKAY + 1, sizeof(code*)); for(int k = 1; k < __M4RI_MAXKAY + 1; ++k) { m4ri_codebook[k] = (code*)m4ri_mm_calloc(1, sizeof(code)); m4ri_codebook[k]->ord =(int*)m4ri_mm_calloc(__M4RI_TWOPOW(k), sizeof(int)); m4ri_codebook[k]->inc =(int*)m4ri_mm_calloc(__M4RI_TWOPOW(k), sizeof(int)); m4ri_build_code(m4ri_codebook[k]->ord, m4ri_codebook[k]->inc, k); } } void m4ri_destroy_all_codes() { if (!m4ri_codebook) { return; } for(int i = 1; i < __M4RI_MAXKAY + 1; ++i) { m4ri_mm_free(m4ri_codebook[i]->inc); m4ri_mm_free(m4ri_codebook[i]->ord); m4ri_mm_free(m4ri_codebook[i]); } m4ri_mm_free(m4ri_codebook); m4ri_codebook = NULL; } int m4ri_opt_k(int a, int b, int c) { int n = MIN(a, b); int res = MIN(__M4RI_MAXKAY, MAX(1, (int)(0.75 * (1 + log2_floor(n)))) ); return res; } libm4ri-20130416/src/graycode.h000066400000000000000000000073611212302366200160570ustar00rootroot00000000000000/** * \file grayflex.h 
* \brief Gray code implementation. * * The Gray code is a binary numeral system where two successive * values differ in only one digit. * * \author Gregory Bard * \author Martin Albrecht */ #ifndef M4RI_GRAYFLEX_H #define M4RI_GRAYFLEX_H /****************************************************************************** * * M4RI: Linear Algebra over GF(2) * * Copyright (C) 2007 Gregory Bard * Copyright (C) 2007 Martin Albrecht * * Distributed under the terms of the GNU General Public License (GPL) * version 2 or higher. * * This code is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * The full text of the GPL is available at: * * http://www.gnu.org/licenses/ ******************************************************************************/ /** * Maximum allowed value for k. */ #define __M4RI_MAXKAY 16 /** * \brief Gray codes. * * A codestruct represents one entry in the code book, i.e. it * represents a Gray code of a given length. * * For example the Gray code table of length \f$2^3\f$ is: * * \verbatim ------------------- | i | ord | inc | ------------------- | 0 | 0 | 0 | | 1 | 4 | 1 | | 2 | 6 | 0 | | 3 | 2 | 2 | | 4 | 3 | 0 | | 5 | 7 | 1 | | 6 | 5 | 0 | | 7 | 1 | 2 | ------------------- * \endverbatim */ typedef struct { /** * array of of Gray code entries */ int *ord; /** * increment */ int *inc; } code; /** * Global m4ri_codebook. * * \warning Not thread safe! */ extern code **m4ri_codebook; /** * Returns the i-th gray code entry for a gray code of length \f$2^l\f$. * * \param i The index in the Gray code table. * \param l Length of the Gray code. * * \return i-th Gray code entry. */ int m4ri_gray_code(int i, int l); /** * Fills var ord and var inc with Gray code data for a Gray code of * length \f$2^l\f$. 
* * \param ord Will hold gray code data, must be preallocated with correct size * \param inc Will hold some increment data, must be preallocated with correct size * \param l Logarithm of length of Gray code. * * \note Robert Miller had the idea for a non-recursive * implementation. */ void m4ri_build_code(int *ord, int *inc, int l); /** * \brief Generates global code book. * * This function is called automatically when the shared library is * loaded. * * \warning Not thread safe! */ void m4ri_build_all_codes(void); /** * Frees memory from the global code book. * * This function is called automatically when the shared library is * unloaded. * * \warning Not thread safe! */ void m4ri_destroy_all_codes(void); /** * floor(log_2(v)) */ static inline int log2_floor(int v) { static unsigned const int b[] = { 0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000 }; static unsigned const int S[] = { 1, 2, 4, 8, 16 }; unsigned int r = 0; for (int i = 4; i >= 0; --i) { if ((v & b[i])) { v >>= S[i]; r |= S[i]; } } return r; } /** * \brief Return the optimal var k for the given parameters. * * If var c != 0 then var k for multiplication is returned, else * var k for inversion. The optimal var k here means \f$0.75 log_2(n)\f$ * where \f$n\f$ is \f$min(a,b)\f$ for inversion and * \f$b\f$ for multiplication. * * \param a Number of rows of (first) matrix * \param b Number of columns of (first) matrix * \param c Number of columns of second matrix (may be 0) * * \return k */ int m4ri_opt_k(int a,int b,int c); #endif // M4RI_GRAYFLEX_H libm4ri-20130416/src/io.c000066400000000000000000000241121212302366200146550ustar00rootroot00000000000000/******************************************************************* * * M4RI: Linear Algebra over GF(2) * * Copyright (C) 2011 Martin Albrecht * * Distributed under the terms of the GNU General Public License (GPL) * version 2 or higher. 
* * This code is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * The full text of the GPL is available at: * * http://www.gnu.org/licenses/ * ********************************************************************/ #include "m4ri_config.h" #if __M4RI_HAVE_LIBPNG #include #endif //__M4RI_HAVE_LIBPNG #include "io.h" #include "echelonform.h" void mzd_info(const mzd_t *A, int do_rank) { printf("nrows: %6zu, ncols: %6zu, density: %6.5f, hash: 0x%016zx",(size_t)A->nrows,(size_t)A->ncols,mzd_density(A,1),mzd_hash(A)); if(do_rank) { mzd_t *AA = mzd_copy(NULL, A); printf(", rank: %6zu\n",(size_t)mzd_echelonize(AA,0)); mzd_free(AA); } else { printf("\n"); } } #define SAFECHAR (m4ri_radix + m4ri_radix / 4 + 1) void mzd_print( mzd_t const *M ) { char temp[SAFECHAR]; for (rci_t i = 0; i < M->nrows; ++i) { printf("["); word *row = M->rows[i]; if(M->offset == 0) { for (wi_t j = 0; j < M->width - 1; ++j) { m4ri_word_to_str(temp, row[j], 1); printf("%s|", temp); } row = row + M->width - 1; int const wide = (M->ncols % m4ri_radix) ? M->ncols % m4ri_radix : m4ri_radix; for (int j = 0; j < wide; ++j) { if(j != 0 && (j % 4) == 0) printf(":"); if (__M4RI_GET_BIT(*row, j)) printf("1"); else printf(" "); } } else { for (rci_t j = 0; j < M->ncols; ++j) { if(j != 0 && (j % 4) == 0) printf(((j % m4ri_radix) == 0) ? 
"|" : ":"); if(mzd_read_bit(M, i, j)) printf("1"); else printf(" "); } } printf("]\n"); } } #if __M4RI_HAVE_LIBPNG #define PNGSIGSIZE 8 mzd_t * mzd_from_png(const char *fn, int verbose) { int retval = 0; mzd_t *A = NULL; png_byte pngsig[PNGSIGSIZE]; FILE *fh = fopen(fn,"rb"); if (!fh) { if (verbose) printf("Could not open file '%s' for reading\n",fn); return NULL; }; if (fread((char*)pngsig, PNGSIGSIZE, 1, fh) != 1) { if (verbose) printf("Could not read file '%s'\n",fn); retval = 1; goto from_png_close_fh; } if (png_sig_cmp(pngsig, 0, PNGSIGSIZE) != 0) { if (verbose) printf("'%s' is not a PNG file.\n",fn); retval = 2; goto from_png_close_fh; } png_structp png_ptr = png_create_read_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL); if (!png_ptr) { if (verbose) printf("failed to initialise PNG read struct.\n"); retval = 3; goto from_png_close_fh; } png_set_user_limits(png_ptr, 0x7fffffffL, 0x7fffffffL); png_infop info_ptr = png_create_info_struct(png_ptr); if (!info_ptr) { if (verbose) printf("failed to initialise PNG info struct\n"); retval = 3; goto from_png_destroy_read_struct; } png_init_io(png_ptr, fh); png_set_sig_bytes(png_ptr, PNGSIGSIZE); png_read_info(png_ptr, info_ptr); const png_uint_32 m = png_get_image_height(png_ptr, info_ptr); const png_uint_32 n = png_get_image_width(png_ptr, info_ptr); const png_uint_32 bit_depth = png_get_bit_depth(png_ptr, info_ptr); const png_uint_32 channels = png_get_channels(png_ptr, info_ptr); const png_uint_32 color_type = png_get_color_type(png_ptr, info_ptr); const png_uint_32 compression_type = png_get_compression_type(png_ptr, info_ptr); const png_uint_32 interlace_type = png_get_interlace_type(png_ptr, info_ptr); if (interlace_type != PNG_INTERLACE_NONE) { if (verbose) printf("interlaced images not supported\n"); goto from_png_destroy_read_struct; }; if (verbose) printf("reading %u x %u matrix (bit depth: %u, channels: %u, color type: %u, compression type: %u)\n",(unsigned int)m, (unsigned int)n, (unsigned int)bit_depth, 
(unsigned int)channels, (unsigned int)color_type, (unsigned int)compression_type); if(color_type != 0 && color_type != 3) { if (verbose) printf("only graycscale and palette colors are supported.\n"); goto from_png_destroy_read_struct; } A = mzd_init(m, n); const word bitmask_end = A->high_bitmask; png_bytep row = m4ri_mm_calloc(sizeof(char),n/8+1); word tmp; wi_t j; png_set_packswap(png_ptr); //png_set_invert_mono(png_ptr); for(rci_t i=0; iwidth-1; j++) { tmp = ((word)row[8*j+7])<<56 | ((word)row[8*j+6])<<48 \ | ((word)row[8*j+5])<<40 | ((word)row[8*j+4])<<32 \ | ((word)row[8*j+3])<<24 | ((word)row[8*j+2])<<16 \ | ((word)row[8*j+1])<< 8 | ((word)row[8*j+0])<< 0; A->rows[i][j] = ~tmp; } tmp = 0; switch((n/8 + ((n%8) ? 1 : 0))%8) { case 0: tmp |= ((word)row[8*j+7])<<56; case 7: tmp |= ((word)row[8*j+6])<<48; case 6: tmp |= ((word)row[8*j+5])<<40; case 5: tmp |= ((word)row[8*j+4])<<32; case 4: tmp |= ((word)row[8*j+3])<<24; case 3: tmp |= ((word)row[8*j+2])<<16; case 2: tmp |= ((word)row[8*j+1])<< 8; case 1: tmp |= ((word)row[8*j+0])<< 0; }; A->rows[i][j] |= (~tmp & bitmask_end); } m4ri_mm_free(row); png_read_end(png_ptr, NULL); from_png_destroy_read_struct: png_destroy_read_struct(&png_ptr, &info_ptr,(png_infopp)0); from_png_close_fh: fclose(fh); if (retval != 0 && A) { mzd_free(A); return NULL; } else { return A; } } int mzd_to_png(const mzd_t *A, const char *fn, int compression_level, const char *comment, int verbose) { FILE *fh = fopen(fn, "wb"); if (!fh) { if(verbose) printf("Could not open file '%s' for writing\n",fn); return 1; } png_structp png_ptr = png_create_write_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL); if (!png_ptr) { if(verbose) printf("failed to initialise PNG write struct.\n"); fclose(fh); return 3; } png_set_user_limits(png_ptr, 0x7fffffffL, 0x7fffffffL); png_infop info_ptr = png_create_info_struct(png_ptr); if (!info_ptr) { if (verbose) printf("failed to initialise PNG info struct\n"); png_destroy_write_struct(&png_ptr, &info_ptr); 
fclose(fh); return 3; } if (setjmp(png_jmpbuf(png_ptr))) { if (verbose) printf("error writing PNG file\n"); png_destroy_write_struct(&png_ptr, &info_ptr); fclose(fh); return 1; } png_init_io(png_ptr, fh); png_set_compression_level(png_ptr, compression_level); png_set_IHDR(png_ptr, info_ptr, A->ncols, A->nrows, 1, \ PNG_COLOR_TYPE_GRAY, \ PNG_INTERLACE_NONE, \ PNG_COMPRESSION_TYPE_DEFAULT, \ PNG_FILTER_TYPE_DEFAULT); png_text txt_ptr[3]; char pdate[21]; time_t ptime=time(NULL); struct tm *ltime=localtime(&ptime); sprintf(pdate,"%04d/%02d/%02d %02d:%02d:%02d",ltime->tm_year+1900,ltime->tm_mon+1,ltime->tm_mday,ltime->tm_hour,ltime->tm_min,ltime->tm_sec); txt_ptr[0].key="Software"; txt_ptr[0].text="M4RI"; txt_ptr[0].compression=PNG_TEXT_COMPRESSION_NONE; txt_ptr[1].key="Date"; txt_ptr[1].text=pdate; txt_ptr[1].compression=PNG_TEXT_COMPRESSION_NONE; txt_ptr[2].key="Comment"; txt_ptr[2].text=(char*)comment; txt_ptr[2].compression=PNG_TEXT_COMPRESSION_NONE; png_set_text(png_ptr, info_ptr, txt_ptr, 3); png_write_info(png_ptr, info_ptr); png_set_packswap(png_ptr); png_set_invert_mono(png_ptr); png_bytep row = m4ri_mm_calloc(sizeof(char),A->ncols/8+8); wi_t j=0; word tmp = 0; for(rci_t i=0; inrows; i++) { word *rowptr = A->rows[i]; for(j=0; jwidth-1; j++) { tmp = rowptr[j]; row[8*j+0] = (png_byte)((tmp>> 0) & 0xff); row[8*j+1] = (png_byte)((tmp>> 8) & 0xff); row[8*j+2] = (png_byte)((tmp>>16) & 0xff); row[8*j+3] = (png_byte)((tmp>>24) & 0xff); row[8*j+4] = (png_byte)((tmp>>32) & 0xff); row[8*j+5] = (png_byte)((tmp>>40) & 0xff); row[8*j+6] = (png_byte)((tmp>>48) & 0xff); row[8*j+7] = (png_byte)((tmp>>56) & 0xff); } tmp = rowptr[j]; switch( (A->ncols/8 + ((A->ncols%8) ? 
1 : 0)) %8 ) { case 0: row[8*j+7] = (png_byte)((tmp>>56) & 0xff); case 7: row[8*j+6] = (png_byte)((tmp>>48) & 0xff); case 6: row[8*j+5] = (png_byte)((tmp>>40) & 0xff); case 5: row[8*j+4] = (png_byte)((tmp>>32) & 0xff); case 4: row[8*j+3] = (png_byte)((tmp>>24) & 0xff); case 3: row[8*j+2] = (png_byte)((tmp>>16) & 0xff); case 2: row[8*j+1] = (png_byte)((tmp>> 8) & 0xff); case 1: row[8*j+0] = (png_byte)((tmp>> 0) & 0xff); }; png_write_row(png_ptr, row); } m4ri_mm_free(row); png_write_end(png_ptr, info_ptr); png_destroy_write_struct(&png_ptr, &info_ptr); fclose(fh); return 0; } #endif //__M4RI_HAVE_LIBPNG mzd_t *mzd_from_jcf(const char *fn, int verbose) { int retval = 0; mzd_t *A = NULL; FILE *fh = fopen(fn,"r"); rci_t m,n; long p = 0; long nonzero = 0; if (!fh) { if (verbose) printf("Could not open file '%s' for reading\n",fn); return NULL; } if (fscanf(fh, "%d %d %ld\n%ld\n\n",&m,&n,&p,&nonzero) != 4) { if (verbose) printf("File '%s' does not seem to be in JCF format.",fn); retval = 1; goto from_jcf_close_fh; } if(p != 2) { if (verbose) printf("Expected p==2 but found p==%ld\n",p); retval = 1; goto from_jcf_close_fh; } if (verbose) printf("reading %lu x %lu matrix with at most %ld non-zero entries (density at most: %6.5f)\n", (unsigned long)m, (unsigned long)n, (unsigned long)nonzero, ((double)nonzero)/((double)m*n)); A = mzd_init(m,n); long i = -1; long j = 0; while(fscanf(fh,"%ld\n",&j) == 1) { if (j<0) { i++, j = -j; } mzd_write_bit(A, i, j-1, 1); }; from_jcf_close_fh: fclose(fh); if(retval != 0 && A) { mzd_free(A); return NULL; } else { return A; } } libm4ri-20130416/src/io.h000066400000000000000000000063131212302366200146650ustar00rootroot00000000000000/** * \file io.h * \brief Input/output routines for matrices * * \author Martin Albrecht */ #ifndef M4RI_IO_H #define M4RI_IO_H /******************************************************************* * * M4RI: Linear Algebra over GF(2) * * Copyright (C) 2011 Martin Albrecht * * Distributed under the terms of the GNU 
General Public License (GPL) * version 2 or higher. * * This code is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * The full text of the GPL is available at: * * http://www.gnu.org/licenses/ * ********************************************************************/ #include #include /** * \brief Print a matrix to stdout. * * The output will contain colons between every 4-th column. * * \param M Matrix */ void mzd_print(mzd_t const *M); /** * \brief Print compact information about the matrix to stdout. * * Prints number of rows, number of columns, density (and rank). * * \param A Matrix * \param do_rank Also display the rank (expensive) */ void mzd_info(const mzd_t *A, int do_rank); #if __M4RI_HAVE_LIBPNG /** * \brief Read matrix from 1-bit PNG image. * * This function returns a matrix on success and NULL otherwise. 1-bit * Grayscale and 1-bit Palette images are supported. * * \param fn Filename * \param verbose Print error message to stdout if != 0 */ mzd_t * mzd_from_png(const char *fn, int verbose); /** * \brief Write matrix to 1-bit PNG image. * * This function returns zero on success and some value != 0 * otherwise. The parameter compression_level takes a zlib compression * level, i.e., an integer betweeen -1 and 9 (inclusive) such that * \verbatim #define Z_NO_COMPRESSION 0 #define Z_BEST_SPEED 1 #define Z_BEST_COMPRESSION 9 #define Z_DEFAULT_COMPRESSION (-1) \endverbatim * * The optional comment string is written as a PNG comment. 
* * * \param A Matrix * \param fn Filename (must have write permission) * \param compression_level Zlib compression level (see above) * \param comment Optional comment (may be NULL) * \param verbose Print error message to stdout if != 0 */ int mzd_to_png(const mzd_t *A, const char *fn, int compression_level, const char *comment, int verbose); #endif //__M4RI_HAVE_LIBPNG /** * \brief Read matrix from ASCII file in JCF format. * * The format is as follows: \verbatim nrows ncols modulus nonzero_entries_upper_bound column_index \endverbatim * * where a negative column_index indicates a row_index increase by one and a non-zero entry at index * -column_index. * * \note the JCF format is one-based in contrast to everything else in this library which is * zero-based. * * For example, a valid input is: \verbatim 2 3 2 3 -2 -1 -2 \endverbatim * * which produces the matrix \verbatim [0 1] [1 1] \endverbatim * * * \param fn Filename * \param verbose Print error message to stdout if != 0 */ mzd_t *mzd_from_jcf(const char *fn, int verbose); #endif //M4RI_IO_H libm4ri-20130416/src/m4ri.h000066400000000000000000000042371212302366200151340ustar00rootroot00000000000000/** * \file m4ri.h * \brief Main include file for the M4RI library. * * \author Gregory Bard * \author Martin Albrecht */ /****************************************************************************** * * M4RI: Linear Algebra over GF(2) * * Copyright (C) 2007 Gregory Bard * Copyright (C) 2007,2008 Martin Albrecht * * Distributed under the terms of the GNU General Public License (GPL) * version 2 or higher. * * This code is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. 
* * The full text of the GPL is available at: * * http://www.gnu.org/licenses/ ******************************************************************************/ #ifndef M4RI_M4RI_H #define M4RI_M4RI_H /** * \mainpage * * M4RI is a library to do fast arithmetic with dense matrices over * \f$F_2\f$. M4RI is available under the GPLv2+ and used by the Sage * mathematics software and the PolyBoRi library. See * http://m4ri.sagemath.org for details. * * \example testsuite/test_multiplication.c */ #include #include #include #if defined(__M4RI_HAVE_SSE2) && __M4RI_HAVE_SSE2 # if !defined(__SSE2__) || !__SSE2__ # error "Your current compiler and / or CFLAGS setting doesn't allow SSE2 code. Please change that or these to the setting(s) you used when compiling M4RI." # endif #endif #if defined(__cplusplus) && !defined(M4RI_WRAPWORD) && !defined (_MSC_VER) extern "C" { #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(__cplusplus) && !defined(M4RI_WRAPWORD) && !defined (_MSC_VER) } #endif #endif // M4RI_M4RI_H libm4ri-20130416/src/m4ri_config.h.in000066400000000000000000000021701212302366200170600ustar00rootroot00000000000000#ifndef M4RI_M4RI_CONFIG_H #define M4RI_M4RI_CONFIG_H // Defines determined during configuration of m4ri. 
#define __M4RI_HAVE_MM_MALLOC @M4RI_HAVE_MM_MALLOC@ #define __M4RI_HAVE_POSIX_MEMALIGN @M4RI_HAVE_POSIX_MEMALIGN@ #define __M4RI_HAVE_SSE2 @M4RI_HAVE_SSE2@ #define __M4RI_HAVE_OPENMP @M4RI_HAVE_OPENMP@ #define __M4RI_CPU_L1_CACHE @M4RI_CPU_L1_CACHE@ #define __M4RI_CPU_L2_CACHE @M4RI_CPU_L2_CACHE@ #define __M4RI_CPU_L3_CACHE @M4RI_CPU_L3_CACHE@ #define __M4RI_DEBUG_DUMP (@M4RI_DEBUG_DUMP@ || @M4RI_DEBUG_MZD@) #define __M4RI_DEBUG_MZD @M4RI_DEBUG_MZD@ #define __M4RI_HAVE_LIBPNG @M4RI_HAVE_LIBPNG@ #define __M4RI_CC "@CC@" #define __M4RI_CFLAGS "@SIMD_CFLAGS@ @OPENMP_CFLAGS@ @CFLAGS@" #define __M4RI_SIMD_CFLAGS "@SIMD_CFLAGS@" #define __M4RI_OPENMP_CFLAGS "@OPENMP_CFLAGS@" // Helper macros. #define __M4RI_USE_MM_MALLOC (__M4RI_HAVE_MM_MALLOC && __M4RI_HAVE_SSE2) #define __M4RI_USE_POSIX_MEMALIGN (__M4RI_HAVE_POSIX_MEMALIGN && __M4RI_HAVE_SSE2) #define __M4RI_DD_QUIET (@M4RI_DEBUG_MZD@ && !@M4RI_DEBUG_DUMP@) #endif // M4RI_M4RI_CONFIG_H libm4ri-20130416/src/misc.c000066400000000000000000000055511212302366200152070ustar00rootroot00000000000000/****************************************************************************** * * M4RI: Linear Algebra over GF(2) * * Copyright (C) 2007 Gregory Bard * Copyright (C) 2011 Carlo Wood * * Distributed under the terms of the GNU General Public License (GPL) * version 2 or higher. * * This code is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * The full text of the GPL is available at: * * http://www.gnu.org/licenses/ ******************************************************************************/ #ifdef HAVE_CONFIG_H #include "config.h" #endif #ifdef _MSC_VER #include #endif #include #include #include #include "graycode.h" #include "misc.h" #include "mmc.h" void m4ri_die(const char *errormessage, ...) 
{ va_list lst; va_start(lst, errormessage); vfprintf(stderr, errormessage, lst); va_end(lst); abort(); } /* Warning: If colon, destination must have m4ri_radix + (m4ri_radix - 1) / 4 + 1 bytes available. */ void m4ri_word_to_str(char *destination, word data, int colon) { int j = 0; for (int i = 0; i < m4ri_radix; ++i) { if (colon && (i % 4) == 0 && i != 0) destination[j++] = ':'; if (__M4RI_GET_BIT(data, i)) destination[j++] = '1'; else destination[j++] = ' '; } destination[j] = '\0'; } word m4ri_random_word() { #ifdef _MSC_VER word a = 0; int i; for(i=0; i< m4ri_radix; i+=8) { a ^= (((word)rand())< * \author Martin Albrecht * \author Carlo Wood */ #ifndef M4RI_MISC_H #define M4RI_MISC_H /******************************************************************* * * M4RI: Linear Algebra over GF(2) * * Copyright (C) 2007, 2008 Gregory Bard * Copyright (C) 2008 Martin Albrecht * Copyright (C) 2011 Carlo Wood * * Distributed under the terms of the GNU General Public License (GPL) * version 2 or higher. * * This code is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * The full text of the GPL is available at: * * http://www.gnu.org/licenses/ * ********************************************************************/ #include #if __M4RI_USE_MM_MALLOC #include #endif #include #include #include #define __STDC_LIMIT_MACROS #include /* * These define entirely the word width used in the library. */ /** * \brief Pretty for a boolean int. * * The value of a BIT is either 0 or 1. */ typedef int BIT; /** * \brief Type of row and column indexes. * * This type is used for integer values that hold row/colum sized values. */ typedef int rci_t; /** * \brief Type of word indexes. * * This type is used for the array of words that make up a row. 
*/ typedef int wi_t; #ifdef M4RI_WRAPWORD // C++ wrapper class around an uint64_t, exclusively interesting for the developer(s) of M4RI. #include #else /** * \brief A word is the typical packed data structure to represent packed bits. */ typedef uint64_t word; /** * \brief Explicit conversion macro. * * Explicit conversion of a word, representing 64 columns, to an integer * to be used as index into an array. This is used for Gray codes. * No error checking is done that the most significant bits in w are zero. * * \note This is a no-op. It's purpose it to track intention. */ #define __M4RI_CONVERT_TO_INT(w) ((int)(w)) /** * \brief Explicit conversion macro. * * Explicit conversion of a word, representing 64 columns, to a BIT * to be used as boolean: this is an int with value 0 (false) or 1 (true). * No error checking is done that only the least significant bit is set (if any). * * \note This is a no-op. It's purpose it to track intention. */ #define __M4RI_CONVERT_TO_BIT(w) ((BIT)(w)) /** * \brief Explicit conversion macro. * * Explicit conversion of a word, representing 64 columns, to an uint64_t. * * The returned value is the underlaying integer representation of these 64 columns, * meaning in particular that if val is an uint64_t then * __M4RI_CONVERT_TO_UINT64_T(__M4RI_CONVERT_TO_WORD(val)) == val. * * \note This is a no-op. It's purpose it to track intention. */ #define __M4RI_CONVERT_TO_UINT64_T(w) (w) /** * \brief Explicit conversion macro. * * Explicit conversion of an integer to a word. * * \note This is a no-op. It's purpose it to track intention. */ #define __M4RI_CONVERT_TO_WORD(i) ((word)(i)) #endif /** * \brief The number of bits in a word. */ static int const m4ri_radix = 64; /** * \brief The number one as a word. */ static word const m4ri_one = __M4RI_CONVERT_TO_WORD(1); /** * \brief A word with all bits set. 
*/ static word const m4ri_ffff = __M4RI_CONVERT_TO_WORD(-1); /** * \brief Return the maximal element of x and y * * \param x Word * \param y Word */ #ifndef MAX #define MAX(x,y) (((x) > (y))?(x):(y)) #endif /** * \brief Return the minimal element of x and y * * \param x Word * \param y Word */ #ifndef MIN #define MIN(x,y) (((x) < (y))?(x):(y)) #endif /** *\brief Pretty for 1. */ #ifndef TRUE #define TRUE 1 #endif /** *\brief Pretty for 0. */ #ifndef FALSE #define FALSE 0 #endif /** * \brief $2^i$ * * \param i Integer. */ #define __M4RI_TWOPOW(i) ((uint64_t)1 << (i)) /** * \brief Clear the bit spot (counting from the left) in the word w * * \param w Word * \param spot Integer with 0 <= spot < m4ri_radix */ #define __M4RI_CLR_BIT(w, spot) ((w) &= ~(m4ri_one << (spot)) /** * \brief Set the bit spot (counting from the left) in the word w * * \param w Word * \param spot Integer with 0 <= spot < m4ri_radix */ #define __M4RI_SET_BIT(w, spot) ((w) |= (m4ri_one << (spot))) /** * \brief Get the bit spot (counting from the left) in the word w * * \param w Word * \param spot Integer with 0 <= spot < m4ri_radix */ #define __M4RI_GET_BIT(w, spot) __M4RI_CONVERT_TO_BIT(((w) >> (spot)) & m4ri_one) /** * \brief Write the value to the bit spot in the word w * * \param w Word. * \param spot Integer with 0 <= spot < m4ri_radix. * \param value Either 0 or 1. */ #define __M4RI_WRITE_BIT(w, spot, value) ((w) = (((w) & ~(m4ri_one << (spot))) | (-__M4RI_CONVERT_TO_WORD(value) & (m4ri_one << (spot))))) /** * \brief Flip the spot in the word w * * \param w Word. * \param spot Integer with 0 <= spot < m4ri_radix. */ #define __M4RI_FLIP_BIT(w, spot) ((w) ^= (m4ri_one << (spot))) /** * \brief create a bit mask to zero out all but the (n - 1) % m4ri_radix + 1 leftmost bits. * * This function returns 1..64 bits, never zero bits. * This mask is mainly used to mask the valid bits in the most significant word, * by using __M4RI_LEFT_BITMASK((M->ncols + M->offset) % m4ri_radix). 
* In other words, the set bits represent the columns with the lowest index in the word. * * Thus, * * n Output * 0=64 1111111111111111111111111111111111111111111111111111111111111111 * 1 0000000000000000000000000000000000000000000000000000000000000001 * 2 0000000000000000000000000000000000000000000000000000000000000011 * . ... * 62 0011111111111111111111111111111111111111111111111111111111111111 * 63 0111111111111111111111111111111111111111111111111111111111111111 * * Note that n == 64 is only passed from __M4RI_MIDDLE_BITMASK, and still works * (behaves the same as n == 0): the input is modulo 64. * * \param n Integer with 0 <= n <= m4ri_radix */ #define __M4RI_LEFT_BITMASK(n) (m4ri_ffff >> (m4ri_radix - (n)) % m4ri_radix) /** * \brief create a bit mask to zero out all but the n rightmost bits. * * This function returns 1..64 bits, never zero bits. * This mask is mainly used to mask the n valid bits in the least significant word * with valid bits by using __M4RI_RIGHT_BITMASK(m4ri_radix - M->offset). * In other words, the set bits represent the columns with the highest index in the word. * * Thus, * * n Output * 1 1000000000000000000000000000000000000000000000000000000000000000 * 2 1100000000000000000000000000000000000000000000000000000000000000 * 3 1110000000000000000000000000000000000000000000000000000000000000 * . ... * 63 1111111111111111111111111111111111111111111111111111111111111110 * 64 1111111111111111111111111111111111111111111111111111111111111111 * * Note that n == 0 is never passed and would fail. * * \param n Integer with 0 < n <= m4ri_radix */ #define __M4RI_RIGHT_BITMASK(n) (m4ri_ffff << (m4ri_radix - (n))) /** * \brief create a bit mask that is the combination of __M4RI_LEFT_BITMASK and __M4RI_RIGHT_BITMASK. * * This function returns 1..64 bits, never zero bits. * This mask is mainly used to mask the n valid bits in the only word with valid bits, * when M->ncols + M->offset <= m4ri_radix), by using __M4RI_MIDDLE_BITMASK(M->ncols, M->offset). 
* It is equivalent to __M4RI_LEFT_BITMASK(n + offset) & __M4RI_RIGHT_BITMASK(m4ri_radix - offset).
 * In other words, the set bits represent the valid columns in the word.
 *
 * Note that when n == m4ri_radix (and thus offset == 0) then __M4RI_LEFT_BITMASK is called with n == 64.
 *
 * \param n Integer with 0 < n <= m4ri_radix - offset
 * \param offset Column offset, with 0 <= offset < m4ri_radix
 */

#define __M4RI_MIDDLE_BITMASK(n, offset) (__M4RI_LEFT_BITMASK(n) << (offset))

/**
 * \brief swap bits in the word v
 *
 * Each statement exchanges neighbouring groups of 1, 2, 4, 8 and 16 bits;
 * the last one exchanges the two 32-bit halves. Together they reverse the
 * bit order of the 64-bit word.
 *
 * \param v The word whose bits need to be reversed.
 */

static inline word m4ri_swap_bits(word v) {
  v = ((v >> 1) & 0x5555555555555555ULL) | ((v & 0x5555555555555555ULL) << 1);
  v = ((v >> 2) & 0x3333333333333333ULL) | ((v & 0x3333333333333333ULL) << 2);
  v = ((v >> 4) & 0x0F0F0F0F0F0F0F0FULL) | ((v & 0x0F0F0F0F0F0F0F0FULL) << 4);
  v = ((v >> 8) & 0x00FF00FF00FF00FFULL) | ((v & 0x00FF00FF00FF00FFULL) << 8);
  v = ((v >> 16) & 0x0000FFFF0000FFFFULL) | ((v & 0x0000FFFF0000FFFFULL) << 16);
  v = (v >> 32) | (v << 32);
  return v;
}

/**
 * \brief pack bits (inverse of m4ri_spread_bits)
 *
 * For every i in 0..length-1 the bit of 'from' at position Q[i] - base is
 * moved to position i of the result.
 *
 * \param from bitstring
 * \param Q array with bit positions
 * \param length bitsize of the output; values outside 1..16 call abort()
 * \param base subtracted from every value in Q
 *
 * \output inverse of m4ri_spread_bits)
 *
 * \note The right-shift count is Q[i] - i - base; presumably callers
 * guarantee it is non-negative -- TODO confirm against callers.
 *
 * \seealso m4ri_spread_bits
 */

static inline word m4ri_shrink_bits(word const from, rci_t* const Q, int const length, int const base) {
  word to = 0;
  /* Entry point is selected by length; every case deliberately falls
   * through to the ones below it, so exactly 'length' bits are processed. */
  switch(length-1) {
  case 15: to |= (from & (m4ri_one << (Q[15] - base))) >> (Q[15] - 15 - base);
  case 14: to |= (from & (m4ri_one << (Q[14] - base))) >> (Q[14] - 14 - base);
  case 13: to |= (from & (m4ri_one << (Q[13] - base))) >> (Q[13] - 13 - base);
  case 12: to |= (from & (m4ri_one << (Q[12] - base))) >> (Q[12] - 12 - base);
  case 11: to |= (from & (m4ri_one << (Q[11] - base))) >> (Q[11] - 11 - base);
  case 10: to |= (from & (m4ri_one << (Q[10] - base))) >> (Q[10] - 10 - base);
  case 9: to |= (from & (m4ri_one << (Q[ 9] - base))) >> (Q[ 9] - 9 - base);
  case 8: to |= (from & (m4ri_one << (Q[ 8] - base))) >> (Q[ 8] - 8 - base);
  case 7: to |= (from & (m4ri_one << (Q[ 7] - base))) >> (Q[ 7] - 7 - base);
  case 6: to |= (from & (m4ri_one << (Q[ 6] - base))) >> (Q[ 6] - 6 - base);
  case 5: to |= (from & (m4ri_one << (Q[ 5] - base))) >> (Q[ 5] - 5 - base);
  case 4: to |= (from & (m4ri_one << (Q[ 4] - base))) >> (Q[ 4] - 4 - base);
  case 3: to |= (from & (m4ri_one << (Q[ 3] - base))) >> (Q[ 3] - 3 - base);
  case 2: to |= (from & (m4ri_one << (Q[ 2] - base))) >> (Q[ 2] - 2 - base);
  case 1: to |= (from & (m4ri_one << (Q[ 1] - base))) >> (Q[ 1] - 1 - base);
  case 0: to |= (from & (m4ri_one << (Q[ 0] - base))) >> (Q[ 0] - 0 - base);
    break;
  default:
    abort();
  }
  return to;
}

/**
 * \brief spread bits
 *
 * Given a bitstring 'from' and a spreading table Q, return a
 * bitstring where the bits of 'from' are in the positions indicated
 * by Q.
 *
 * For every i in 0..length-1 bit i of 'from' is moved to position
 * Q[i] - base of the result.
 *
 * \param from bitstring of length 'length' stored in a word
 * \param Q table with new bit positions
 * \param length bitsize of input; values outside 1..16 call abort()
 * \param base subtracted from every value in Q
 *
 * \output bitstring having the same bits as from but spread using Q
 *
 * \note The left-shift count is Q[i] - i - base; presumably callers
 * guarantee it is non-negative -- TODO confirm against callers.
 *
 * \seealso m4ri_shrink_bits
 */

static inline word m4ri_spread_bits(word const from, rci_t* const Q, int const length, int const base) {
  word to = 0;
  /* Same deliberate fall-through layout as in m4ri_shrink_bits. */
  switch(length-1) {
  case 15: to |= (from & (m4ri_one << (15))) << (Q[15]-15-base);
  case 14: to |= (from & (m4ri_one << (14))) << (Q[14]-14-base);
  case 13: to |= (from & (m4ri_one << (13))) << (Q[13]-13-base);
  case 12: to |= (from & (m4ri_one << (12))) << (Q[12]-12-base);
  case 11: to |= (from & (m4ri_one << (11))) << (Q[11]-11-base);
  case 10: to |= (from & (m4ri_one << (10))) << (Q[10]-10-base);
  case 9: to |= (from & (m4ri_one << ( 9))) << (Q[ 9]- 9-base);
  case 8: to |= (from & (m4ri_one << ( 8))) << (Q[ 8]- 8-base);
  case 7: to |= (from & (m4ri_one << ( 7))) << (Q[ 7]- 7-base);
  case 6: to |= (from & (m4ri_one << ( 6))) << (Q[ 6]- 6-base);
  case 5: to |= (from & (m4ri_one << ( 5))) << (Q[ 5]- 5-base);
  case 4: to |= (from & (m4ri_one << ( 4))) << (Q[ 4]- 4-base);
  case 3: to |= (from & (m4ri_one << ( 3))) << (Q[ 3]- 3-base);
  case 2: to |= (from & (m4ri_one << ( 2))) << (Q[ 2]- 2-base);
  case 1: to |= (from & (m4ri_one << ( 1))) << (Q[ 1]- 1-base);
  case 0: to |= (from & (m4ri_one << ( 0))) << (Q[ 0]- 0-base);
    break;
  default:
    abort();
  }
  return to;
}

/**
 * \brief Return alignment of addr w.r.t. n. For example the address
 * 17 would be 1 aligned w.r.t. 16.
 *
 * \param addr
 * \param n
 */

#define __M4RI_ALIGNMENT(addr, n) (((unsigned long)(addr))%(n))

/**
 * \brief Test for gcc >= maj.min, as per __GNUC_PREREQ in glibc
 *
 * \param maj The major version.
 * \param min The minor version.
 * \return TRUE iff we are using a GNU compile of at least version maj.min.
 */
#if defined(__GNUC__) && defined(__GNUC_MINOR__)
#define __M4RI_GNUC_PREREQ(maj, min) ((__GNUC__ << 16) + __GNUC_MINOR__ >= ((maj) << 16) + (min))
#else
#define __M4RI_GNUC_PREREQ(maj, min) FALSE
#endif

/* __builtin_expect is in gcc 3.0, and not in 2.95. */
#if __M4RI_GNUC_PREREQ(3,0) || defined(M4RI_DOXYGEN)

/**
 * \brief Macro to help with branch prediction.
 */

#define __M4RI_LIKELY(cond) __builtin_expect ((cond) != 0, 1)

/**
 * \brief Macro to help with branch prediction.
 */

#define __M4RI_UNLIKELY(cond) __builtin_expect ((cond) != 0, 0)

#else
#define __M4RI_LIKELY(cond) (cond)
#define __M4RI_UNLIKELY(cond) (cond)
#endif

/**
 * Return true if a's least significant bit is smaller than b's least significant bit.
 *
 * return true if LSBI(a) < LSBI(b),
 * where LSBI(w) is the index of the least significant bit that is set in w, or 64 if w is zero.
 *
 * \param a Word
 * \param b Word
 */

static inline int m4ri_lesser_LSB(word a, word b)
{
  uint64_t const ia = __M4RI_CONVERT_TO_UINT64_T(a);
  uint64_t const ib = __M4RI_CONVERT_TO_UINT64_T(b);
  /*
   * If a is zero then we should always return false, otherwise
   * if b is zero we should return true iff a has at least one bit set.
   *
   * (ia - 1) ^ ia sets every bit up to and including a's lowest set bit, so
   * AND-ing it with ib is non-zero exactly when b has a set bit at or below
   * that position.
   */
  return !(ib ? ((ia - 1) ^ ia) & ib : !ia);
}


/**** Error Handling *****/

/**
 * \brief Print error message and abort().
 *
 * The function accepts additional
 * parameters like printf, so e.g. m4ri_die("foo %d bar %f\n",1 ,2.0)
 * is valid and will print the string "foo 1 bar 2.0" before dying.
 *
 * \param errormessage a string to be printed.
 *
 * \todo Allow user to register callback which is called on
 * m4ri_die().
 *
 * \warning The provided string is not free'd.
 */

void m4ri_die(const char *errormessage, ...);

/**** IO *****/

/**
 * \brief Write a string representing the word data to destination.
 *
 * \param destination Address of buffer of length at least m4ri_radix*1.3
 * \param data Source word
 * \param colon Insert a Colon after every 4-th bit.
 * \warning Assumes destination has m4ri_radix*1.3 bytes available
 */
void m4ri_word_to_str( char *destination, word data, int colon);

/**
 * \brief Return 1 or 0 uniformly randomly distributed.
 *
 * Draws one value from rand() and compares it against RAND_MAX/2.
 *
 * \todo Allow user to provide her own random() function.
 */

static inline BIT m4ri_coin_flip() {
  if (rand() < RAND_MAX/2) {
    return 0;
  } else {
    return 1;
  }
}

/**
 * \brief Return uniformly randomly distributed random word.
 *
 * \todo Allow user to provide her own random() function.
 */

word m4ri_random_word();

/***** Initialization *****/

/**
 * \brief Initialize global data structures for the M4RI library.
 *
 * On Linux/Solaris this is called automatically when the shared
 * library is loaded, but it doesn't harm if it is called twice.
 */

#if defined(__GNUC__)
void __attribute__ ((constructor)) m4ri_init(void);
#else
void m4ri_init(void);
#endif

#ifdef __SUNPRO_C
#pragma init(m4ri_init)
#endif

/**
 * \brief De-initialize global data structures from the M4RI library.
 *
 * On Linux/Solaris this is called automatically when the shared
 * library is unloaded, but it doesn't harm if it is called twice.
*/ #if defined(__GNUC__) void __attribute__ ((destructor)) m4ri_fini(void); #else void m4ri_fini(void); #endif #ifdef __SUNPRO_C #pragma fini(m4ri_fini) #endif /***** Memory Management *****/ #if __M4RI_CPU_L3_CACHE == 0 /* * Fix some standard value for L3 cache size if it couldn't be * determined by configure. */ #if __M4RI_CPU_L2_CACHE #define __M4RI_CPU_L3_CACHE __M4RI_CPU_L2_CACHE #else #define __M4RI_CPU_L3_CACHE 4194304 #endif // __M4RI_CPU_L2_CACHE #endif // __M4RI_CPU_L3_CACHE #if __M4RI_CPU_L2_CACHE == 0 /* * Fix some standard value for L2 cache size if it couldn't be * determined by configure. */ #define __M4RI_CPU_L2_CACHE 262144 #endif // __M4RI_CPU_L2_CACHE #if __M4RI_CPU_L1_CACHE == 0 /* * Fix some standard value for L1 cache size if it couldn't be * determined by configure. */ #define __M4RI_CPU_L1_CACHE 16384 #endif // __M4RI_CPU_L1_CACHE /** * \brief Calloc wrapper. * * \param count Number of elements. * \param size Size of each element. * * \return pointer to allocated memory block. * * \todo Allow user to register calloc function. */ static inline void *m4ri_mm_calloc(size_t count, size_t size) { void *newthing; #if __M4RI_USE_MM_MALLOC newthing = _mm_malloc(count * size, 64); #elif __M4RI_USE_POSIX_MEMALIGN int error = posix_memalign(&newthing, 64, count * size); if (error) newthing = NULL; #else newthing = calloc(count, size); #endif if (newthing == NULL) { m4ri_die("m4ri_mm_calloc: calloc returned NULL\n"); return NULL; /* unreachable. */ } #if __M4RI_USE_MM_MALLOC || __M4RI_USE_POSIX_MEMALIGN char *b = (char*)newthing; memset(b, 0, count * size); #endif return newthing; } /** * \brief Aligned malloc wrapper. * * This function will attempt to align memory, but does not guarantee * success in case neither _mm_malloc nor posix_memalign are available. * * \param size Size in bytes. * \param alignment Alignment (16,64,...). * * \return pointer to allocated memory block. * * \todo Allow user to register malloc function. 
*/ static inline void *m4ri_mm_malloc_aligned(size_t size, size_t alignment) { void *newthing; #if __M4RI_USE_MM_MALLOC newthing = _mm_malloc(size, alignment); #elif __M4RI_USE_POSIX_MEMALIGN int error = posix_memalign(&newthing, alignment, size); if (error) newthing = NULL; #else newthing = malloc(size); #endif if (newthing==NULL && (size>0)) { m4ri_die("m4ri_mm_malloc: malloc returned NULL\n"); return NULL; /* unreachable */ } else return newthing; } /** * \brief Malloc wrapper. * * \param size Size in bytes. * * \return pointer to allocated memory block. * * \todo Allow user to register malloc function. */ static inline void *m4ri_mm_malloc(size_t size) { void *newthing; #if __M4RI_USE_MM_MALLOC newthing = _mm_malloc(size, 64); #elif __M4RI_USE_POSIX_MEMALIGN int error = posix_memalign(&newthing, 64, size); if (error) newthing = NULL; #else newthing = malloc(size); #endif //__M4RI_USE_MM_MALLOC if (newthing==NULL && (size>0)) { m4ri_die("m4ri_mm_malloc: malloc returned NULL\n"); return NULL; /* unreachable */ } else return newthing; } /** * \brief Free wrapper. * * \param condemned Pointer. * * \todo Allow user to register free function. */ /* void m4ri_mm_free(void *condemned, ...); */ static inline void m4ri_mm_free(void *condemned, ...) { #if __M4RI_USE_MM_MALLOC _mm_free(condemned); #else free(condemned); #endif } /** * MSVC does not understand the restrict keyword */ #if defined (__GNUC__) #define RESTRICT __restrict__ #else #define RESTRICT #endif #endif // M4RI_MISC_H libm4ri-20130416/src/mmc.c000066400000000000000000000061071212302366200150260ustar00rootroot00000000000000/******************************************************************* * * M4RI: Linear Algebra over GF(2) * * Copyright (C) 2007, 2008 Gregory Bard * Copyright (C) 2008 Martin Albrecht * * Distributed under the terms of the GNU General Public License (GPL) * version 2 or higher. 
* * This code is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * The full text of the GPL is available at: * * http://www.gnu.org/licenses/ * ********************************************************************/ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include "mmc.h" #ifdef __M4RI_ENABLE_MMC /** * The actual memory block cache. */ mmb_t m4ri_mmc_cache[__M4RI_MMC_NBLOCKS]; #endif // __M4RI_ENABLE_MMC /** * \brief Allocate size bytes. * * \param size Number of bytes. * * \return pointer to allocated memory block. */ void *m4ri_mmc_malloc(size_t size) { #ifdef __M4RI_ENABLE_MMC void *ret = NULL; #if __M4RI_HAVE_OPENMP #pragma omp critical (mmc) { #endif mmb_t *mm = m4ri_mmc_cache; if (size <= __M4RI_MMC_THRESHOLD) { for (int i = 0; i < __M4RI_MMC_NBLOCKS; ++i) { if(mm[i].size == size) { ret = mm[i].data; mm[i].data = NULL; mm[i].size = 0; break; } } } #if __M4RI_HAVE_OPENMP } #endif if (ret) return ret; else return m4ri_mm_malloc(size); #else // __M4RI_ENABLE_MMC return m4ri_mm_malloc(size); #endif // __M4RI_ENABLE_MMC } /** * \brief Free the data pointed to by condemned of the given size. * * \param condemned Pointer to memory. * \param size Number of bytes. 
*/ void m4ri_mmc_free(void *condemned, size_t size) { #ifdef __M4RI_ENABLE_MMC #if __M4RI_HAVE_OPENMP #pragma omp critical (mmc) { #endif static int j = 0; mmb_t *mm = m4ri_mmc_cache; if (size < __M4RI_MMC_THRESHOLD) { for(int i = 0; i < __M4RI_MMC_NBLOCKS; ++i) { if(mm[i].size == 0) { mm[i].size = size; mm[i].data = condemned; goto done; } } m4ri_mm_free(mm[j].data); mm[j].size = size; mm[j].data = condemned; j = (j+1) % __M4RI_MMC_NBLOCKS; } else { m4ri_mm_free(condemned); } done: ; #if __M4RI_HAVE_OPENMP } #endif // __M4RI_HAVE_OPENMP #else // __M4RI_ENABLE_MMC m4ri_mm_free(condemned); #endif // __M4RI_ENABLE_MMC } /** * \brief Cleans up memory block cache. * * This function is called automatically when the shared library is unloaded. * * \warning Not thread safe. */ void m4ri_mmc_cleanup(void) { #ifdef __M4RI_ENABLE_MMC #if __M4RI_HAVE_OPENMP #pragma omp critical (mmc) { #endif mmb_t *mm = m4ri_mmc_cache; for(int i = 0; i < __M4RI_MMC_NBLOCKS; ++i) { if (mm[i].size) m4ri_mm_free(mm[i].data); mm[i].size = 0; } #if __M4RI_HAVE_OPENMP } #endif // __M4RI_HAVE_OPENMP #endif // __M4RI_ENABLE_MMC } libm4ri-20130416/src/mmc.h000066400000000000000000000041211212302366200150250ustar00rootroot00000000000000/** * \file mmc.h * \brief The mmc memory management functions check a cache for re-usable unused memory before asking the system for it. * * \author Gregory Bard * \author Martin Albrecht */ #ifndef M4RI_MMC_H #define M4RI_MMC_H /******************************************************************* * * M4RI: Linear Algebra over GF(2) * * Copyright (C) 2007, 2008 Gregory Bard * Copyright (C) 2008 Martin Albrecht * * Distributed under the terms of the GNU General Public License (GPL) * version 2 or higher. * * This code is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. 
* * The full text of the GPL is available at: * * http://www.gnu.org/licenses/ * ********************************************************************/ #include void *m4ri_mmc_malloc(size_t size); void m4ri_mmc_free(void *condemned, size_t size); void m4ri_mmc_cleanup(void); /** * \brief Enable memory block cache (default: enabled). */ #define __M4RI_ENABLE_MMC /** * \brief Number of blocks that are cached. */ #define __M4RI_MMC_NBLOCKS 16 /** * \brief Maximal size of blocks stored in cache. */ #define __M4RI_MMC_THRESHOLD __M4RI_CPU_L3_CACHE /** * \brief Tuple of pointer to allocated memory block and it's size. */ typedef struct _mm_block { /** * Size in bytes of the data. */ size_t size; /** * Pointer to buffer of data. */ void *data; } mmb_t; /** * \brief Allocate an array of count times size zeroed bytes. * * \param count Number of elements. * \param size Number of bytes per element. * * \return Pointer to allocated memory block. */ static inline void *m4ri_mmc_calloc(size_t count, size_t size) { size_t total_size = count * size; void *ret = m4ri_mmc_malloc(total_size); memset((char*)ret, 0, total_size); return ret; } #endif // M4RI_MMC_H libm4ri-20130416/src/mzd.c000066400000000000000000002267131212302366200150530ustar00rootroot00000000000000/****************************************************************************** * * M4RI: Linear Algebra over GF(2) * * Copyright (C) 2007 Gregory Bard * Copyright (C) 2009,2010 Martin Albrecht * Copyright (C) 2011 Carlo Wood * * Distributed under the terms of the GNU General Public License (GPL) * version 2 or higher. * * This code is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. 
* * The full text of the GPL is available at: * * http://www.gnu.org/licenses/ ******************************************************************************/ #ifdef HAVE_CONFIG_H #include "config.h" #endif #ifdef __M4RI_HAVE_LIBPNG #include #endif #include #include #include "mzd.h" #include "parity.h" #include "mmc.h" typedef struct mzd_t_cache { mzd_t mzd[64]; struct mzd_t_cache *prev; struct mzd_t_cache *next; uint64_t used; unsigned char padding[sizeof(mzd_t) - 2 * sizeof(struct mzd_t_cache*) - sizeof(uint64_t)]; #ifdef __GNUC__ } mzd_t_cache_t __attribute__ ((__aligned__ (64))); #else } mzd_t_cache_t; #endif #define __M4RI_MZD_T_CACHE_MAX 16 static mzd_t_cache_t mzd_cache; static mzd_t_cache_t* current_cache = &mzd_cache; static int log2_floor(uint64_t v) { static uint64_t const b[] = { 0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000, 0xFFFFFFFF00000000 }; static unsigned int const S[] = { 1, 2, 4, 8, 16, 32 }; unsigned int r = 0; for (int i = 5; i >= 0; --i) { if ((v & b[i])) { v >>= S[i]; r |= S[i]; } } return r; } /* * Return a pointer to a new mzd_t structure. * The structure will be 64 byte aligned. * Call mzd_t_free to free the structure for next use. 
*/ static mzd_t* mzd_t_malloc() { #if __M4RI_HAVE_OPENMP return (mzd_t*)m4ri_mm_malloc(sizeof(mzd_t)); #else mzd_t *ret = NULL; int i=0; if (current_cache->used == (uint64_t)-1) { mzd_t_cache_t *cache = &mzd_cache; while (cache && cache->used == (uint64_t)-1) { current_cache = cache; cache = cache->next; i++; } if (!cache && i< __M4RI_MZD_T_CACHE_MAX) { cache = (mzd_t_cache_t*)m4ri_mm_malloc_aligned(sizeof(mzd_t_cache_t), 64); memset((char*)cache, 0, sizeof(mzd_t_cache_t)); cache->prev = current_cache; current_cache->next = cache; current_cache = cache; } else if (!cache && i>= __M4RI_MZD_T_CACHE_MAX) { /* We have reached the upper limit on the number of caches */ ret = (mzd_t*)m4ri_mm_malloc(sizeof(mzd_t)); } else { current_cache = cache; } } if (ret == NULL) { int free_entry =log2_floor(~current_cache->used); current_cache->used |= ((uint64_t)1 << free_entry); ret = ¤t_cache->mzd[free_entry]; } return ret; #endif //__M4RI_HAVE_OPENMP } static void mzd_t_free(mzd_t *M) { #if __M4RI_HAVE_OPENMP m4ri_mm_free(M); #else int foundit = 0; mzd_t_cache_t *cache = &mzd_cache; while(cache) { size_t entry = M - cache->mzd; if (entry < 64) { cache->used &= ~((uint64_t)1 << entry); if (cache->used == 0) { if (cache == &mzd_cache) { current_cache = cache; } else { if (cache == current_cache) { current_cache = cache->prev; } cache->prev->next = cache->next; if (cache->next) cache->next->prev = cache->prev; m4ri_mm_free(cache); } } foundit = 1; break; } cache = cache->next; } if(!foundit) { m4ri_mm_free(M); } #endif } mzd_t *mzd_init(rci_t r, rci_t c) { mzd_t *A = mzd_t_malloc(); A->nrows = r; A->ncols = c; A->width = (c + m4ri_radix - 1) / m4ri_radix; A->rowstride = (A->width < mzd_paddingwidth || (A->width & 1) == 0) ? A->width : A->width + 1; if (A->width == 1) { A->high_bitmask = A->low_bitmask = __M4RI_MIDDLE_BITMASK(c, 0); } else { A->high_bitmask = __M4RI_LEFT_BITMASK(c % m4ri_radix); A->low_bitmask = m4ri_ffff; } A->flags = (A->high_bitmask != m4ri_ffff) ? 
mzd_flag_nonzero_excess : 0; A->offset = 0; A->offset_vector = 0; A->row_offset = 0; A->rows = (word**)m4ri_mmc_calloc(r + 1, sizeof(word*)); // We're overcomitting here. if (r && c) { int blockrows = __M4RI_MAX_MZD_BLOCKSIZE / A->rowstride; A->blockrows_log = 0; while(blockrows >>= 1) A->blockrows_log++; blockrows = 1 << A->blockrows_log; //A->blockrows_mask = blockrows - 1; int const blockrows_mask = blockrows - 1; int const nblocks = (r + blockrows - 1) / blockrows; A->flags |= (nblocks > 1) ? mzd_flag_multiple_blocks : 0; A->blocks = (mzd_block_t*)m4ri_mmc_calloc(nblocks + 1, sizeof(mzd_block_t)); size_t block_words = (r - (nblocks - 1) * blockrows) * A->rowstride; for(int i = nblocks - 1; i >= 0; --i) { A->blocks[i].size = block_words * sizeof(word); A->blocks[i].begin = (word*)m4ri_mmc_calloc(1, A->blocks[i].size); A->blocks[i].end = A->blocks[i].begin + block_words; block_words = blockrows * A->rowstride; } for(rci_t i = 0; i < A->nrows; ++i) { A->rows[i] = A->blocks[i >> A->blockrows_log].begin + (i & blockrows_mask) * A->rowstride; #ifdef M4RI_WRAPWORD word::init_array(A->rows[i], A->width); #endif } } else { A->blocks = NULL; } return A; } /* Explanation of offset_vector (in words), and row_offset. <------------------------------- row_stride (in words)---------------------> .---------------------------------------------------------------------------. <-- m->blocks[0].begin ^ | ^ /| | | m->row_offset| m->offset_vector_/ | | | v / | | | .--------------------------------------------------------------------v<--|---- m->rows[0] |_ skipped_blocks (in blocks) | |m (also a window) ^ | | | | | | | | | `---------------------------------|-----------------------------------------' v .---------------------------------|----------------------------------------_. <-- m->blocks[1].begin <-- windows.blocks[0].begin | | ^ lowr| |_^ | | | window->row_offset| | window->offset_vector _-^| | | | v v _-^ | | | | .----------------------------------------------------------v<--. 
|<--|---- m->rows[lowr] | | |window | `-|---|---- window->rows[0] | | | | | | `---------------------------------------------------------------------------' .---------------------------------------------------------------------------. <-- m->blocks[2].begin <-- windows.blocks[1].begin | | | | | | | | | | lowc | | | | | |<---->| | | | | | \__|___|__ also wrd_offset (in words) | | `----------------------------------------------------------' | | | `--------------------------------------------------------------------' | `---------------------------------------------------------------------------' .---------------------------------------------------------------------------. | | */ mzd_t *mzd_init_window (mzd_t *m, rci_t lowr, rci_t lowc, rci_t highr, rci_t highc) { rci_t nrows, ncols; mzd_t *window; window = mzd_t_malloc(); nrows = MIN(highr - lowr, m->nrows - lowr); ncols = highc - lowc; window->nrows = nrows; window->ncols = ncols; window->rowstride = m->rowstride; window->offset = (lowc + m->offset) % m4ri_radix; window->width = (ncols + window->offset + m4ri_radix - 1) / m4ri_radix; if (window->width == 1) { window->high_bitmask = window->low_bitmask = __M4RI_MIDDLE_BITMASK(ncols, window->offset); } else { window->high_bitmask = __M4RI_LEFT_BITMASK((ncols + window->offset) % m4ri_radix); window->low_bitmask = __M4RI_RIGHT_BITMASK(m4ri_radix - window->offset); } window->flags = (window->offset == 0) ? mzd_flag_windowed_zerooffset : mzd_flag_nonzero_offset; window->flags |= ((ncols + window->offset) % m4ri_radix == 0) ? 
mzd_flag_windowed_zeroexcess : mzd_flag_nonzero_excess; window->blockrows_log = m->blockrows_log; //window->blockrows_mask = m->blockrows_mask; wi_t const blockrows_mask = (1 << window->blockrows_log) - 1; int const skipped_blocks = (m->row_offset + lowr) >> window->blockrows_log; assert(skipped_blocks == 0 || ((m->flags & mzd_flag_multiple_blocks))); window->row_offset = (m->row_offset + lowr) & blockrows_mask; window->blocks = &m->blocks[skipped_blocks]; wi_t const wrd_offset = (lowc + m->offset) / m4ri_radix; window->offset_vector = (m->offset_vector + wrd_offset) + (window->row_offset - m->row_offset) * window->rowstride; if(nrows) window->rows = (word**)m4ri_mmc_calloc(nrows + 1, sizeof(word*)); else window->rows = NULL; for(rci_t i = 0; i < nrows; ++i) { window->rows[i] = m->rows[lowr + i] + wrd_offset; } if (mzd_row_to_block(window, nrows - 1) > 0) window->flags |= m->flags & mzd_flag_multiple_blocks; /* offset_vector is the distance from the start of the first block to the first word of the first row. */ assert(nrows == 0 || window->blocks[0].begin + window->offset_vector == window->rows[0]); __M4RI_DD_MZD(window); return window; } void mzd_free(mzd_t *A) { if(A->rows) m4ri_mmc_free(A->rows, (A->nrows + 1) * sizeof(word*)); if(mzd_owns_blocks(A)) { int i; for(i = 0; A->blocks[i].size; ++i) { m4ri_mmc_free(A->blocks[i].begin, A->blocks[i].size); } m4ri_mmc_free(A->blocks, (i + 1) * sizeof(mzd_block_t)); } mzd_t_free(A); } void mzd_row_add(mzd_t *M, rci_t sourcerow, rci_t destrow) { mzd_row_add_offset(M, destrow, sourcerow, 0); } rci_t mzd_gauss_delayed(mzd_t *M, rci_t startcol, int full) { assert(M->offset == 0); rci_t startrow = startcol; rci_t pivots = 0; for (rci_t i = startcol; i < M->ncols ; ++i) { for(rci_t j = startrow ; j < M->nrows; ++j) { if (mzd_read_bit(M, j, i)) { mzd_row_swap(M, startrow, j); ++pivots; for(rci_t ii = full ? 
0 : startrow + 1; ii < M->nrows; ++ii) { if (ii != startrow) { if (mzd_read_bit(M, ii, i)) { mzd_row_add_offset(M, ii, startrow, i); } } } startrow = startrow + 1; break; } } } __M4RI_DD_MZD(M); __M4RI_DD_RCI(pivots); return pivots; } rci_t mzd_echelonize_naive(mzd_t *M, int full) { return mzd_gauss_delayed(M, 0, full); } /** * Transpose a 64 x 64 matrix with width 1. * * \param dst First word of destination matrix. * \param src First word of source matrix. * \param rowstride_dst Rowstride of matrix dst. * \param rowstride_src Rowstride of matrix src. * * Rows of both matrices are expected to fit exactly in a word (offset == 0) * and lay entirely inside a single block. * * \note This function also works when dst == src. */ static inline void _mzd_copy_transpose_64x64(word* RESTRICT dst, word const* RESTRICT src, wi_t rowstride_dst, wi_t rowstride_src) { /* * m runs over the values: * 0x00000000FFFFFFFF * 0x0000FFFF0000FFFF * 0x00FF00FF00FF00FF * 0x0F0F0F0F0F0F0F0F * 0x3333333333333333 * 0x5555555555555555, * alternating j zeroes with j ones. * * Assume we have a matrix existing of four jxj matrices ((0,0) is in the top-right corner, * this is the memory-model view, see the layout on http://m4ri.sagemath.org/doxygen/structmzd__t.html): * ...[A1][B1][A0][B0] * ...[C1][D1][C0][D0] * . [A2][B2] * . [C2][B2] * . . * . * The following calulates the XOR between A and D, * and subsequently applies that to A and D respectively, * swapping A and D as a result. * Therefore wk starts at the first row and then has rowstride * added j times, running over the rows of A, then skips C * by adding j * rowstride to continue with the next A below C. */ word m = __M4RI_CONVERT_TO_WORD(0xFFFFFFFF); wi_t j_rowstride_dst = rowstride_dst * 64; wi_t j_rowstride_src = rowstride_src * 32; word* const end = dst + j_rowstride_dst; // We start with j = 32, and a one-time unrolled loop, where // we copy from src and write the result to dst, swapping // the two 32x32 corner matrices. 
int j = 32; j_rowstride_dst >>= 1; word* RESTRICT wk = dst; for (word const* RESTRICT wks = src; wk < end; wk += j_rowstride_dst, wks += j_rowstride_src) { for (int k = 0; k < j; ++k, wk += rowstride_dst, wks += rowstride_src) { word xor = ((*wks >> j) ^ *(wks + j_rowstride_src)) & m; *wk = *wks ^ (xor << j); *(wk + j_rowstride_dst) = *(wks + j_rowstride_src) ^ xor; } } // Next we work in-place in dst and swap the corners of // each of the last matrices, all in parallel, for all // remaining values of j. m ^= m << 16; for (j = 16; j != 0; j = j >> 1, m ^= m << j) { j_rowstride_dst >>= 1; for (wk = dst; wk < end; wk += j_rowstride_dst) { for (int k = 0; k < j; ++k, wk += rowstride_dst) { word xor = ((*wk >> j) ^ *(wk + j_rowstride_dst)) & m; *wk ^= xor << j; *(wk + j_rowstride_dst) ^= xor; } } } } /** * Transpose two 64 x 64 matrix with width 1. * * \param dst1 First word of destination matrix 1. * \param dst2 First word of destination matrix 2. * \param src1 First word of source matrix 1. * \param src2 First word of source matrix 2. * \param rowstride_dst Rowstride of destination matrices. * \param rowstride_src Rowstride of source matrices. * * Rows of all matrices are expected to fit exactly in a word (offset == 0) * and lay entirely inside a single block. * * \note This function also works to transpose in-place. 
*/ static inline void _mzd_copy_transpose_64x64_2(word* RESTRICT dst1, word* RESTRICT dst2, word const* RESTRICT src1, word const* RESTRICT src2, wi_t rowstride_dst, wi_t rowstride_src) { word m = __M4RI_CONVERT_TO_WORD(0xFFFFFFFF); wi_t j_rowstride_dst = rowstride_dst * 64; wi_t j_rowstride_src = rowstride_src * 32; word* const end = dst1 + j_rowstride_dst; int j = 32; word* RESTRICT wk[2]; word const* RESTRICT wks[2]; word xor[2]; j_rowstride_dst >>= 1; wk[0] = dst1; wk[1] = dst2; wks[0] = src1; wks[1] = src2; do { for (int k = 0; k < j; ++k) { xor[0] = ((*wks[0] >> j) ^ *(wks[0] + j_rowstride_src)) & m; xor[1] = ((*wks[1] >> j) ^ *(wks[1] + j_rowstride_src)) & m; *wk[0] = *wks[0] ^ (xor[0] << j); *wk[1] = *wks[1] ^ (xor[1] << j); *(wk[0] + j_rowstride_dst) = *(wks[0] + j_rowstride_src) ^ xor[0]; *(wk[1] + j_rowstride_dst) = *(wks[1] + j_rowstride_src) ^ xor[1]; wk[0] += rowstride_dst; wk[1] += rowstride_dst; wks[0] += rowstride_src; wks[1] += rowstride_src; } wk[0] += j_rowstride_dst; wk[1] += j_rowstride_dst; wks[0] += j_rowstride_src; wks[1] += j_rowstride_src; } while(wk[0] < end); m ^= m << 16; for (j = 16; j != 0; j = j >> 1, m ^= m << j) { j_rowstride_dst >>= 1; wk[0] = dst1; wk[1] = dst2; do { for (int k = 0; k < j; ++k) { xor[0] = ((*wk[0] >> j) ^ *(wk[0] + j_rowstride_dst)) & m; xor[1] = ((*wk[1] >> j) ^ *(wk[1] + j_rowstride_dst)) & m; *wk[0] ^= xor[0] << j; *wk[1] ^= xor[1] << j; *(wk[0] + j_rowstride_dst) ^= xor[0]; *(wk[1] + j_rowstride_dst) ^= xor[1]; wk[0] += rowstride_dst; wk[1] += rowstride_dst; } wk[0] += j_rowstride_dst; wk[1] += j_rowstride_dst; } while(wk[0] < end); } } static unsigned char log2_ceil_table[64] = { 0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6 }; static inline int log2_ceil(int n) { return log2_ceil_table[n - 1]; } static word const transpose_mask[6] = { 0x5555555555555555ULL, 
0x3333333333333333ULL, 0x0F0F0F0F0F0F0F0FULL, 0x00FF00FF00FF00FFULL, 0x0000FFFF0000FFFFULL, 0x00000000FFFFFFFFULL, }; /** * Transpose 64/j matrices of size jxj in parallel. * * Where j equals n rounded up to the nearest power of 2. * The input array t must be of size j (containing the rows i of all matrices in t[i]). * * t[0..{j-1}] = [Al]...[A1][A0] * * \param t An array of j words. * \param n The number of rows in each matrix. * * \return log2(j) */ static inline int _mzd_transpose_Nxjx64(word* RESTRICT t, int n) { int j = 1; int mi = 0; // Index into the transpose_mask array. while (j < n) // Don't swap with entirely undefined data (where [D] exists entirely of non-existant rows). { // Swap 64/j matrices of size jxj in 2j rows. Thus, // <---- one word ---> // [Al][Bl]...[A0][B0] // [Cl][Dl]...[C0][D0], where l = 64/j - 1 and each matrix [A], [B] etc is jxj. // Then swap [A] and [D] in-place. // m runs over the values in transpose_mask, so that at all // times m exists of j zeroes followed by j ones, repeated. word const m = transpose_mask[mi]; int k = 0; // Index into t[]. do { // Run over all rows of [A] and [D]. for (int i = 0; i < j; ++i, ++k) { // t[k] contains row i of all [A], and t[k + j] contains row i of all [D]. Swap them. word xor = ((t[k] >> j) ^ t[k + j]) & m; t[k] ^= xor << j; t[k + j] ^= xor; } k += j; // Skip [C]. } while (k < n); // Stop if we passed all valid input. // Double the size of j and repeat this for the next 2j rows until all // n rows have been swapped (possibly with non-existant rows). j <<= 1; ++mi; } return mi; } /** * Transpose a n x 64 matrix with width 1. * * \param dst First word of destination matrix. * \param src First word of source matrix. * \param rowstride_dst Rowstride of destination matrix. * \param rowstride_src Rowstride of source matrix. * \param n Number of rows in source matrix, must be less than 64. * * Rows of all matrices are expected have offset zero * and lay entirely inside a single block. 
*
 * \note This function also works to transpose in-place.
 */
static inline void _mzd_copy_transpose_lt64x64(word* RESTRICT dst, word const* RESTRICT src, wi_t rowstride_dst, wi_t rowstride_src, int n) {
  // Preload the n input rows into level 1, using a minimum of cache lines (compact storage).
  word t[64];
  word const* RESTRICT wks = src;
  int k;
  for (k = 0; k < n; ++k) {
    t[k] = *wks;
    wks += rowstride_src;
  }
  // For more than 32 rows, pad with zero rows and use the full 64x64 transpose.
  if (n > 32) {
    while (k < 64)
      t[k++] = 0;
    _mzd_copy_transpose_64x64(dst, t, rowstride_dst, 1);
    return;
  }
  int log2j = _mzd_transpose_Nxjx64(t, n);
  // All output bits are now transposed, but still might need to be shifted in place.
  // What we have now is 64/j matrices of size jxj. Thus,
  // [Al]...[A1][A0], where l = 64/j - 1.
  // while the actual output is:
  // [A0]
  // [A1]
  // ...
  // [Al]
  word const m = __M4RI_LEFT_BITMASK(n);
  word* RESTRICT wk = dst;
  // One case per block size j = 2^log2j: each word t[k] is cut into 64/j
  // fields of j bits and field i goes to output row k + i * j.
  switch (log2j) {
    case 5:
    {
      wi_t const j_rowstride_dst = 32 * rowstride_dst;
      for (int k = 0; k < 32; ++k) {
        wk[0] = t[k] & m;
        wk[j_rowstride_dst] = (t[k] >> 32) & m;
        wk += rowstride_dst;
      }
      break;
    }
    case 4:
    {
      wi_t const j_rowstride_dst = 16 * rowstride_dst;
      for (int k = 0; k < 16; ++k) {
        wk[0] = t[k] & m;
        wk[j_rowstride_dst] = (t[k] >> 16) & m;
        wk[2 * j_rowstride_dst] = (t[k] >> 32) & m;
        wk[3 * j_rowstride_dst] = (t[k] >> 48) & m;
        wk += rowstride_dst;
      }
      break;
    }
    case 3:
    {
      wi_t const j_rowstride_dst = 8 * rowstride_dst;
      for (int k = 0; k < 8; ++k) {
        wk[0] = t[k] & m;
        wk[j_rowstride_dst] = (t[k] >> 8) & m;
        wk[2 * j_rowstride_dst] = (t[k] >> 16) & m;
        wk[3 * j_rowstride_dst] = (t[k] >> 24) & m;
        wk[4 * j_rowstride_dst] = (t[k] >> 32) & m;
        wk[5 * j_rowstride_dst] = (t[k] >> 40) & m;
        wk[6 * j_rowstride_dst] = (t[k] >> 48) & m;
        wk[7 * j_rowstride_dst] = (t[k] >> 56) & m;
        wk += rowstride_dst;
      }
      break;
    }
    case 2:
    {
      wi_t const j_rowstride_dst = 4 * rowstride_dst;
      for (int k = 0; k < 4; ++k) {
        word* RESTRICT wk2 = wk;
        word tk = t[k];
        // Two rounds of eight 4-bit fields each.
        for (int i = 0; i < 2; ++i) {
          wk2[0] = tk & m;
          wk2[j_rowstride_dst] = (tk >> 4) & m;
          wk2[2 * j_rowstride_dst] = (tk >> 8) & m;
          wk2[3 * j_rowstride_dst] = (tk >> 12) & m;
          wk2[4 * j_rowstride_dst] = (tk >> 16) & m;
          wk2[5 * j_rowstride_dst] = (tk >> 20) & m;
          wk2[6 * j_rowstride_dst] = (tk >> 24) & m;
          wk2[7 * j_rowstride_dst] = (tk >> 28) & m;
          wk2 += 8 * j_rowstride_dst;
          tk >>= 32;
        }
        wk += rowstride_dst;
      }
      break;
    }
    case 1:
    {
      wi_t const j_rowstride_dst = 2 * rowstride_dst;
      for (int k = 0; k < 2; ++k) {
        word* RESTRICT wk2 = wk;
        word tk = t[k];
        // Eight rounds of four 2-bit fields each.
        for (int i = 0; i < 8; ++i) {
          wk2[0] = tk & m;
          wk2[j_rowstride_dst] = (tk >> 2) & m;
          wk2[2 * j_rowstride_dst] = (tk >> 4) & m;
          wk2[3 * j_rowstride_dst] = (tk >> 6) & m;
          wk2 += 4 * j_rowstride_dst;
          tk >>= 8;
        }
        wk += rowstride_dst;
      }
      break;
    }
    case 0:
    {
      // Single input row: every bit of t[0] becomes its own output row.
      word* RESTRICT wk2 = wk;
      word tk = t[0];
      for (int i = 0; i < 16; ++i) {
        wk2[0] = tk & m;
        wk2[rowstride_dst] = (tk >> 1) & m;
        wk2[2 * rowstride_dst] = (tk >> 2) & m;
        wk2[3 * rowstride_dst] = (tk >> 3) & m;
        wk2 += 4 * rowstride_dst;
        tk >>= 4;
      }
      break;
    }
  }
}

/**
 * Transpose a 64 x n matrix with width 1.
 *
 * \param dst First word of destination matrix.
 * \param src First word of source matrix.
 * \param rowstride_dst Rowstride of destination matrix.
 * \param rowstride_src Rowstride of source matrix.
 * \param n Number of columns in source matrix, must be less than 64.
 *
 * Rows of all matrices are expected to have offset zero
 * and lay entirely inside a single block.
 *
 * \note This function also works to transpose in-place.
*/
static inline void _mzd_copy_transpose_64xlt64(word* RESTRICT dst, word const* RESTRICT src, wi_t rowstride_dst, wi_t rowstride_src, int n) {
  word t[64];
  int log2j = log2_ceil(n);
  word const* RESTRICT wks = src;
  // One case per block size j = 2^log2j: the 64 input rows are packed
  // into j words, source row k + i * j landing in field i of t[k].
  switch (log2j) {
    case 6:
    {
      // More than 32 columns: do a full 64x64 transpose into t and copy n rows out.
      _mzd_copy_transpose_64x64(t, src, 1, rowstride_src);
      word* RESTRICT wk = dst;
      for (int k = 0; k < n; ++k) {
        *wk = t[k];
        wk += rowstride_dst;
      }
      return;
    }
    case 5:
    {
      wi_t const j_rowstride_src = 32 * rowstride_src;
      for (int k = 0; k < 32; ++k) {
        t[k] = wks[0] | (wks[j_rowstride_src] << 32);
        wks += rowstride_src;
      }
      break;
    }
    case 4:
    {
      wi_t const j_rowstride_src = 16 * rowstride_src;
      for (int k = 0; k < 16; ++k) {
        t[k] = wks[0] | (wks[j_rowstride_src] << 16);
        t[k] |= (wks[2 * j_rowstride_src] << 32) | (wks[3 * j_rowstride_src] << 48);
        wks += rowstride_src;
      }
      break;
    }
    case 3:
    {
      wi_t const j_rowstride_src = 8 * rowstride_src;
      word tt;
      for (int k = 0; k < 8; ++k) {
        // The eight fields are accumulated in two halves (tt and t[k]) and merged at the end.
        tt = wks[0] | (wks[j_rowstride_src] << 8);
        t[k] = (wks[2 * j_rowstride_src] << 16) | (wks[3 * j_rowstride_src] << 24);
        tt |= (wks[4 * j_rowstride_src] << 32) | (wks[5 * j_rowstride_src] << 40);
        t[k] |= (wks[6 * j_rowstride_src] << 48) | (wks[7 * j_rowstride_src] << 56);
        wks += rowstride_src;
        t[k] |= tt;
      }
      break;
    }
    case 2:
    {
      // Start at the last group of four rows and walk backwards, shifting
      // the accumulated fields up by 4 bits each step.
      word const* RESTRICT wks2 = wks + 60 * rowstride_src;
      t[0] = wks2[0];
      t[1] = wks2[rowstride_src];
      t[2] = wks2[2 * rowstride_src];
      t[3] = wks2[3 * rowstride_src];
      for (int i = 0; i < 15; ++i) {
        wks2 -= 4 * rowstride_src;
        t[0] <<= 4;
        t[1] <<= 4;
        t[2] <<= 4;
        t[3] <<= 4;
        t[0] |= wks2[0];
        t[1] |= wks2[rowstride_src];
        t[2] |= wks2[2 * rowstride_src];
        t[3] |= wks2[3 * rowstride_src];
      }
      break;
    }
    case 1:
    {
      // Same backwards walk, two rows at a time with 2-bit fields.
      wks += 62 * rowstride_src;
      t[0] = wks[0];
      t[1] = wks[rowstride_src];
      for (int i = 0; i < 31; ++i) {
        wks -= 2 * rowstride_src;
        t[0] <<= 2;
        t[1] <<= 2;
        t[0] |= wks[0];
        t[1] |= wks[rowstride_src];
      }
      break;
    }
    case 0:
    {
      // Single column: gather the bit of every source row into one
      // destination word (even rows in tt[0], odd rows in tt[1]).
      word tt[2];
      tt[0] = wks[0];
      tt[1] = wks[rowstride_src];
      for (int i = 2; i < 64; i += 2) {
        wks += 2 * rowstride_src;
        tt[0] |= wks[0] << i;
        tt[1] |= wks[rowstride_src] << i;
      }
      *dst = tt[0] | (tt[1] << 1);
      return;
    }
  }
  int j = 1 << log2j;
  _mzd_transpose_Nxjx64(t, j);
  // Only the first n words of t are rows of the result.
  word* RESTRICT wk = dst;
  for (int k = 0; k < n; ++k) {
    *wk = t[k];
    wk += rowstride_dst;
  }
}

/**
 * Transpose a n x m matrix with width 1, offset 0 and m and n less than or equal 8.
 *
 * \param dst First word of destination matrix.
 * \param src First word of source matrix.
 * \param rowstride_dst Rowstride of destination matrix.
 * \param rowstride_src Rowstride of source matrix.
 * \param n Number of rows in source matrix, must be less than or equal 8.
 * \param m Number of columns in source matrix, must be less than or equal 8.
 * \param maxsize Bounds the number of swap rounds (the loop runs while
 *        shift < 7 * maxsize); presumably the larger of n and m -- TODO confirm
 *        against the caller.
 *
 * Rows of all matrices are expected to have offset zero
 * and lay entirely inside a single block.
 *
 * \note This function also works to transpose in-place.
 */
static inline void _mzd_copy_transpose_le8xle8(word* RESTRICT dst, word const* RESTRICT src, wi_t rowstride_dst, wi_t rowstride_src, int n, int m, int maxsize) {
  int end = maxsize * 7;
  // Pack the n source rows into a single word, one byte per row.
  word const* RESTRICT wks = src;
  word w = *wks;
  int shift = 0;
  for (int i = 1; i < n; ++i) {
    wks += rowstride_src;
    shift += 8;
    w |= (*wks << shift);
  }
  // Exchange bits that lie a multiple of 7 positions apart, using a mask
  // that loses its top byte each round.
  word mask = 0x80402010080402ULL;
  word w7 = w >> 7;
  shift = 7;
  --m;
  do {
    word xor = (w ^ w7) & mask;
    mask >>= 8;
    w ^= (xor << shift);
    shift += 7;
    w7 >>= 7;
    w ^= xor;
  } while(shift < end);
  // Unpack: byte i of w is output row i; written from the last row backwards.
  word* RESTRICT wk = dst + m * rowstride_dst;
  for (int shift = 8 * m; shift > 0; shift -= 8) {
    *wk = (unsigned char)(w >> shift);
    wk -= rowstride_dst;
  }
  *wk = (unsigned char)w;
}

/**
 * Transpose a n x m matrix with width 1, offset 0 and m and n less than or equal 16.
 *
 * \param dst First word of destination matrix.
 * \param src First word of source matrix.
 * \param rowstride_dst Rowstride of destination matrix.
 * \param rowstride_src Rowstride of source matrix.
 * \param n Number of rows in source matrix, must be less than or equal 16.
 * \param m Number of columns in source matrix, must be less than or equal 16.
* * Rows of all matrices are expected to have offset zero * and lay entirely inside a single block. * * \note This function also works to transpose in-place. */ static inline void _mzd_copy_transpose_le16xle16(word* RESTRICT dst, word const* RESTRICT src, wi_t rowstride_dst, wi_t rowstride_src, int n, int m, int maxsize) { int end = maxsize * 3; word const* RESTRICT wks = src; word t[4]; int i = n; do { t[0] = wks[0]; if (--i == 0) { t[1] = 0; t[2] = 0; t[3] = 0; break; } t[1] = wks[rowstride_src]; if (--i == 0) { t[2] = 0; t[3] = 0; break; } t[2] = wks[2 * rowstride_src]; if (--i == 0) { t[3] = 0; break; } t[3] = wks[3 * rowstride_src]; if (--i == 0) break; wks += 4 * rowstride_src; for(int shift = 16;; shift += 16) { t[0] |= (*wks << shift); if (--i == 0) break; t[1] |= (wks[rowstride_src] << shift); if (--i == 0) break; t[2] |= (wks[2 * rowstride_src] << shift); if (--i == 0) break; t[3] |= (wks[3 * rowstride_src] << shift); if (--i == 0) break; wks += 4 * rowstride_src; } } while(0); word mask = 0xF0000F0000F0ULL; int shift = 12; word xor[4]; do { xor[0] = (t[0] ^ (t[0] >> shift)) & mask; xor[1] = (t[1] ^ (t[1] >> shift)) & mask; xor[2] = (t[2] ^ (t[2] >> shift)) & mask; xor[3] = (t[3] ^ (t[3] >> shift)) & mask; mask >>= 16; t[0] ^= (xor[0] << shift); t[1] ^= (xor[1] << shift); t[2] ^= (xor[2] << shift); t[3] ^= (xor[3] << shift); shift += 12; t[0] ^= xor[0]; t[1] ^= xor[1]; t[2] ^= xor[2]; t[3] ^= xor[3]; } while(shift < end); _mzd_transpose_Nxjx64(t, 4); i = m; word* RESTRICT wk = dst; do { wk[0] = (uint16_t)t[0]; if (--i == 0) break; wk[rowstride_dst] = (uint16_t)t[1]; if (--i == 0) break; wk[2 * rowstride_dst] = (uint16_t)t[2]; if (--i == 0) break; wk[3 * rowstride_dst] = (uint16_t)t[3]; if (--i == 0) break; wk += 4 * rowstride_dst; for(int shift = 16;; shift += 16) { wk[0] = (uint16_t)(t[0] >> shift); if (--i == 0) break; wk[rowstride_dst] = (uint16_t)(t[1] >> shift); if (--i == 0) break; wk[2 * rowstride_dst] = (uint16_t)(t[2] >> shift); if (--i == 0) 
break; wk[3 * rowstride_dst] = (uint16_t)(t[3] >> shift); if (--i == 0) break; wk += 4 * rowstride_dst; } } while(0); } /** * Transpose a n x m matrix with width 1, offset 0 and m and n less than or equal 32. * * \param dst First word of destination matrix. * \param src First word of source matrix. * \param rowstride_dst Rowstride of destination matrix. * \param rowstride_src Rowstride of source matrix. * \param n Number of rows in source matrix, must be less than or equal 32. * \param m Number of columns in source matrix, must be less than or equal 32. * * Rows of all matrices are expected to have offset zero * and lay entirely inside a single block. * * \note This function also works to transpose in-place. */ static inline void _mzd_copy_transpose_le32xle32(word* RESTRICT dst, word const* RESTRICT src, wi_t rowstride_dst, wi_t rowstride_src, int n, int m) { word const* RESTRICT wks = src; word t[16]; int i = n; if (n > 16) { i -= 16; for (int j = 0; j < 16; ++j) { t[j] = *wks; wks += rowstride_src; } int j = 0; do { t[j++] |= (*wks << 32); wks += rowstride_src; } while(--i); } else { int j; for (j = 0; j < n; ++j) { t[j] = *wks; wks += rowstride_src; } for (; j < 16; ++j) t[j] = 0; } _mzd_transpose_Nxjx64(t, 16); int one_more = (m & 1); word* RESTRICT wk = dst; if (m > 16) { m -= 16; for (int j = 0; j < 16; j += 2) { *wk = (t[j] & 0xFFFF) | ((t[j] >> 16) & 0xFFFF0000); wk[rowstride_dst] = (t[j + 1] & 0xFFFF) | ((t[j + 1] >> 16) & 0xFFFF0000); wk += 2 * rowstride_dst; } for (int j = 1; j < m; j += 2) { *wk = ((t[j - 1] >> 16) & 0xFFFF) | ((t[j - 1] >> 32) & 0xFFFF0000); wk[rowstride_dst] = ((t[j] >> 16) & 0xFFFF) | ((t[j] >> 32) & 0xFFFF0000); wk += 2 * rowstride_dst; } if (one_more) { *wk = ((t[m - 1] >> 16) & 0xFFFF) | ((t[m - 1] >> 32) & 0xFFFF0000); } } else { for (int j = 1; j < m; j += 2) { *wk = (t[j - 1] & 0xFFFF) | ((t[j - 1] >> 16) & 0xFFFF0000); wk[rowstride_dst] = (t[j] & 0xFFFF) | ((t[j] >> 16) & 0xFFFF0000); wk += 2 * rowstride_dst; } if (one_more) { 
*wk = (t[m - 1] & 0xFFFF) | ((t[m - 1] >> 16) & 0xFFFF0000); } } } static inline void _mzd_copy_transpose_le64xle64(word* RESTRICT dst, word const* RESTRICT src, wi_t rowstride_dst, wi_t rowstride_src, int n, int m) { word const* RESTRICT wks = src; word t[64]; int k; for (k = 0; k < n; ++k) { t[k] = *wks; wks += rowstride_src; } while(k < 64) t[k++] = 0; _mzd_copy_transpose_64x64(t, t, 1, 1); word* RESTRICT wk = dst; for (int k = 0; k < m; ++k) { *wk = t[k]; wk += rowstride_dst; } return; } void _mzd_transpose_multiblock(mzd_t *DST, mzd_t const *A, word* RESTRICT* fwdp, word const* RESTRICT* fwsp, rci_t* nrowsp, rci_t* ncolsp); mzd_t *_mzd_transpose(mzd_t *DST, mzd_t const *A) { assert(!mzd_is_windowed(DST) && !mzd_is_windowed(A)); // We assume that there fit at least 64 rows in a block, if // that is the case then each block will contain a multiple // of 64 rows, since blockrows is a power of 2. assert(A->blockrows_log >= 6 && DST->blockrows_log >= 6); rci_t nrows = A->nrows; rci_t ncols = A->ncols; rci_t maxsize = MAX(nrows, ncols); word* RESTRICT fwd = mzd_first_row(DST); word const* RESTRICT fws = mzd_first_row(A); if (maxsize >= 64) { // This is the most non-intrusive way to deal with the case of multiple blocks. // Note that this code is VERY sensitive. ANY change to _mzd_transpose can easily // reduce the speed for small matrices (up to 64x64) by 5 to 10%. int const multiple_blocks = (A->flags | DST->flags) & mzd_flag_multiple_blocks; if (__M4RI_UNLIKELY(multiple_blocks)) { word* RESTRICT non_register_fwd; word const* RESTRICT non_register_fws; rci_t non_register_nrows; rci_t non_register_ncols; _mzd_transpose_multiblock(DST, A, &non_register_fwd, &non_register_fws, &non_register_nrows, &non_register_ncols); fwd = non_register_fwd; fws = non_register_fws; nrows = non_register_nrows; ncols = non_register_ncols; } if (nrows >= 64) { /* * This is an interesting #if ... 
* I recommend to investigate the number of instructions, and the clocks per instruction, * as function of various sizes of the matrix (most likely especially the number of columns * (the size of a row) will have influence; also always use multiples of 64 or even 128), * for both cases below. * * To measure this run for example: * * ./bench_mzd -m 10 -x 10 -p PAPI_TOT_INS,PAPI_L1_TCM,PAPI_L2_TCM mzd_transpose 32000 32000 * ./bench_mzd -m 10 -x 100 -p PAPI_TOT_INS,PAPI_L1_TCM,PAPI_L2_TCM mzd_transpose 128 10240 * etc (increase -x for smaller sizes to get better accuracy). * * --Carlo Wood */ #if 1 int js = ncols & nrows & 64; // True if the total number of whole 64x64 matrices is odd. wi_t const rowstride_64_dst = 64 * DST->rowstride; word* RESTRICT fwd_current = fwd; word const* RESTRICT fws_current = fws; if (js) { js = 1; _mzd_copy_transpose_64x64(fwd, fws, DST->rowstride, A->rowstride); if ((nrows | ncols) == 64) { __M4RI_DD_MZD(DST); return DST; } fwd_current += rowstride_64_dst; ++fws_current; } rci_t const whole_64cols = ncols / 64; // The use of delayed and even, is to avoid calling _mzd_copy_transpose_64x64_2 twice. // This way it can be inlined without duplicating the amount of code that has to be loaded. word* RESTRICT fwd_delayed = NULL; word const* RESTRICT fws_delayed = NULL; int even = 0; while (1) { for (int j = js; j < whole_64cols; ++j) { if (!even) { fwd_delayed = fwd_current; fws_delayed = fws_current; } else { _mzd_copy_transpose_64x64_2(fwd_delayed, fwd_current, fws_delayed, fws_current, DST->rowstride, A->rowstride); } fwd_current += rowstride_64_dst; ++fws_current; even = !even; } nrows -= 64; if (ncols % 64) { _mzd_copy_transpose_64xlt64(fwd + whole_64cols * rowstride_64_dst, fws + whole_64cols, DST->rowstride, A->rowstride, ncols % 64); } fwd += 1; fws += 64 * A->rowstride; if (nrows < 64) break; js = 0; fws_current = fws; fwd_current = fwd; } #else // The same as the above, but without using _mzd_copy_transpose_64x64_2. 
wi_t const rowstride_64_dst = 64 * DST->rowstride; rci_t const whole_64cols = ncols / 64; assert(nrows >= 64); do { for (int j = 0; j < whole_64cols; ++j) { _mzd_copy_transpose_64x64(fwd + j * rowstride_64_dst, fws + j, DST->rowstride, A->rowstride); } nrows -= 64; if (ncols % 64) { _mzd_copy_transpose_64xlt64(fwd + whole_64cols * rowstride_64_dst, fws + whole_64cols, DST->rowstride, A->rowstride, ncols % 64); } fwd += 1; fws += 64 * A->rowstride; } while(nrows >= 64); #endif } if (nrows == 0) { __M4RI_DD_MZD(DST); return DST; } // Transpose the remaining top rows. Now 0 < nrows < 64. while (ncols >= 64) { _mzd_copy_transpose_lt64x64(fwd, fws, DST->rowstride, A->rowstride, nrows); ncols -= 64; fwd += 64 * DST->rowstride; fws += 1; } if (ncols == 0) { __M4RI_DD_MZD(DST); return DST; } maxsize = MAX(nrows, ncols); } // Transpose the remaining corner. Now both 0 < nrows < 64 and 0 < ncols < 64. if (maxsize <= 8) { _mzd_copy_transpose_le8xle8(fwd, fws, DST->rowstride, A->rowstride, nrows, ncols, maxsize); } else if (maxsize <= 16) { _mzd_copy_transpose_le16xle16(fwd, fws, DST->rowstride, A->rowstride, nrows, ncols, maxsize); } else if (maxsize <= 32) { _mzd_copy_transpose_le32xle32(fwd, fws, DST->rowstride, A->rowstride, nrows, ncols); } else { _mzd_copy_transpose_le64xle64(fwd, fws, DST->rowstride, A->rowstride, nrows, ncols); } __M4RI_DD_MZD(DST); return DST; } void _mzd_transpose_multiblock(mzd_t *DST, mzd_t const *A, word* RESTRICT* fwdp, word const* RESTRICT* fwsp, rci_t* nrowsp, rci_t* ncolsp) { rci_t nrows = A->nrows; rci_t ncols = A->ncols; rci_t blockrows_dst = 1 << DST->blockrows_log; // The maximum number of rows in a block of DST. rci_t blockrows_src = 1 << A->blockrows_log; // The maximum number of rows in a block of A. /* We're deviding the source matrix into blocks of multiples of 64x64, such that each * block fits entirely inside a single memory allocation block, both in the source * as well as the corresponding destination. 
* * <-------------------ncols-----------------> * <---------blockrows_dst-------> * .---------------------------------------------------------------. * |P ^ Matrix A:| . |Q . . . |<-^---- A->blocks[0].begin * | | | . | . . . | | * | | | . | . . . | | * | | |- - - - - -|- - - - - - - - - - - - - - - -| | * | | | . | . ^ . . | | * | | | . | .<64x64>. . | | * | | | . | . v . . | | * | | |- - - - - -|- - - - - - - - - - - - - - - -| |- blockrows_src * | | | . | . . . | | * | | | . | . . . | | * | | | . | . . . | | * | |nrows |- - - - - -|- - - - - - - - - - - - - - - -| | * | | | . | . . . | | * | | | . | . . . | | * | | | . | . . . | v * |===================+===========================================| * |R | | . |S . . . |<------ A->blocks[1].begin * | | | . | . . . | * | | | . | . . . | * | | |- - - - - -|- - - - - - - - - - - - - - - -| * | | | . | . . . | * | | | . | . . . | * | | | . | . . . | * | | |- - - - - -|- - - - - - - - - - - - - - - -| * | v | . | . . . | * | `-------------------------------------------| * | | | * | | | * | | | * | | | * | | | * `---------------------------------------------------------------' * * Imagine this also to be the memory map of DST, which then would be * mirrored in the diagonal line from the top/right to the bottom/left. * Then each of the squares P, Q, R and S lay entirely inside one * memory block in both the source as well as the destination matrix. * P and Q are really the same block for matrix A (as are R and S), * while P and R (and Q and S) are really the same block for DST. * * We're going to run over the top/right corners of each of these * memory "blocks" and then transpose it, one by one, running * from right to left and top to bottom. The last one (R) will be * done by the calling function, so we just return when we get there. 
*/ rci_t R_top = (nrows >> A->blockrows_log) << A->blockrows_log; rci_t R_right = (ncols >> DST->blockrows_log) << DST->blockrows_log; for (rci_t col = 0; col < ncols; col += blockrows_dst) { rci_t end = (col == R_right) ? R_top : nrows; for (rci_t row = 0; row < end; row += blockrows_src) { rci_t nrowsb = (row < R_top) ? blockrows_src : (nrows - R_top); rci_t ncolsb = (col < R_right) ? blockrows_dst : (ncols - R_right); word const* RESTRICT fws = mzd_row(A, row) + col / m4ri_radix; word* RESTRICT fwd = mzd_row(DST, col) + row / m4ri_radix; // The following code is (almost) duplicated from _mzd_transpose. if (nrowsb >= 64) { int js = ncolsb & nrowsb & 64; // True if the total number of whole 64x64 matrices is odd. wi_t const rowstride_64_dst = 64 * DST->rowstride; word* RESTRICT fwd_current = fwd; word const* RESTRICT fws_current = fws; if (js) { js = 1; _mzd_copy_transpose_64x64(fwd, fws, DST->rowstride, A->rowstride); fwd_current += rowstride_64_dst; ++fws_current; } rci_t const whole_64cols = ncolsb / 64; // The use of delayed and even, is to avoid calling _mzd_copy_transpose_64x64_2 twice. // This way it can be inlined without duplicating the amount of code that has to be loaded. word* RESTRICT fwd_delayed = NULL; word const* RESTRICT fws_delayed = NULL; int even = 0; while (1) { for (int j = js; j < whole_64cols; ++j) { if (!even) { fwd_delayed = fwd_current; fws_delayed = fws_current; } else { _mzd_copy_transpose_64x64_2(fwd_delayed, fwd_current, fws_delayed, fws_current, DST->rowstride, A->rowstride); } fwd_current += rowstride_64_dst; ++fws_current; even = !even; } nrowsb -= 64; if (ncolsb % 64) { _mzd_copy_transpose_64xlt64(fwd + whole_64cols * rowstride_64_dst, fws + whole_64cols, DST->rowstride, A->rowstride, ncolsb % 64); } fwd += 1; fws += 64 * A->rowstride; if (nrowsb < 64) break; js = 0; fws_current = fws; fwd_current = fwd; } } if (nrowsb == 0) continue; // Transpose the remaining top rows. Now 0 < nrowsb < 64. 
while (ncolsb >= 64) { _mzd_copy_transpose_lt64x64(fwd, fws, DST->rowstride, A->rowstride, nrowsb); ncolsb -= 64; fwd += 64 * DST->rowstride; fws += 1; } // This is true because if it wasn't then nrowsb has to be 0 and we continued before already. assert(ncolsb == 0); } } *nrowsp = nrows - R_top; *ncolsp = ncols - R_right; if (R_top < nrows) *fwsp = mzd_row(A, R_top) + R_right / m4ri_radix; if (R_right < ncols) *fwdp = mzd_row(DST, R_right) + R_top / m4ri_radix; } mzd_t *mzd_transpose(mzd_t *DST, mzd_t const *A) { if (DST == NULL) { DST = mzd_init( A->ncols, A->nrows ); } else if (__M4RI_UNLIKELY(DST->nrows != A->ncols || DST->ncols != A->nrows)) { m4ri_die("mzd_transpose: Wrong size for return matrix.\n"); } else { /** it seems this is taken care of in the subroutines, re-enable if running into problems **/ //mzd_set_ui(DST,0); } if(A->nrows == 0 || A->ncols == 0) return mzd_copy(DST, A); if (__M4RI_LIKELY(!mzd_is_windowed(DST) && !mzd_is_windowed(A))) return _mzd_transpose(DST, A); int A_windowed = mzd_is_windowed(A); if (A_windowed) A = mzd_copy(NULL, A); if (__M4RI_LIKELY(!mzd_is_windowed(DST))) _mzd_transpose(DST, A); else { mzd_t *D = mzd_init(DST->nrows, DST->ncols); _mzd_transpose(D, A); mzd_copy(DST, D); mzd_free(D); } if (A_windowed) mzd_free((mzd_t*)A); return DST; } mzd_t *mzd_mul_naive(mzd_t *C, mzd_t const *A, mzd_t const *B) { if (C == NULL) { C = mzd_init(A->nrows, B->ncols); } else { if (C->nrows != A->nrows || C->ncols != B->ncols) { m4ri_die("mzd_mul_naive: Provided return matrix has wrong dimensions.\n"); } } if(B->ncols < m4ri_radix-10) { /* this cutoff is rather arbitrary */ mzd_t *BT = mzd_transpose(NULL, B); _mzd_mul_naive(C, A, BT, 1); mzd_free (BT); } else { _mzd_mul_va(C, A, B, 1); } return C; } mzd_t *mzd_addmul_naive(mzd_t *C, mzd_t const *A, mzd_t const *B) { if (C->nrows != A->nrows || C->ncols != B->ncols) { m4ri_die("mzd_mul_naive: Provided return matrix has wrong dimensions.\n"); } if(B->ncols < m4ri_radix-10) { /* this cutoff is 
rather arbitrary */ mzd_t *BT = mzd_transpose(NULL, B); _mzd_mul_naive(C, A, BT, 0); mzd_free (BT); } else { _mzd_mul_va(C, A, B, 0); } return C; } mzd_t *_mzd_mul_naive(mzd_t *C, mzd_t const *A, mzd_t const *B, const int clear) { assert(A->offset == 0); assert(B->offset == 0); assert(C->offset == 0); wi_t eol; word *a, *b, *c; if (clear) { word const mask_end = __M4RI_LEFT_BITMASK(C->ncols % m4ri_radix); /* improves performance on x86_64 but is not cross plattform */ /* asm __volatile__ (".p2align 4\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop"); */ for (rci_t i = 0; i < C->nrows; ++i) { wi_t j = 0; for (; j < C->width - 1; ++j) { C->rows[i][j] = 0; } C->rows[i][j] &= ~mask_end; } } if(C->ncols % m4ri_radix) { eol = (C->width - 1); } else { eol = (C->width); } word parity[64]; for (int i = 0; i < 64; ++i) { parity[i] = 0; } wi_t const wide = A->width; int const blocksize = __M4RI_MUL_BLOCKSIZE; for (rci_t start = 0; start + blocksize <= C->nrows; start += blocksize) { for (rci_t i = start; i < start + blocksize; ++i) { a = A->rows[i]; c = C->rows[i]; for (rci_t j = 0; j < m4ri_radix * eol; j += m4ri_radix) { for (int k = 0; k < m4ri_radix; ++k) { b = B->rows[j + k]; parity[k] = a[0] & b[0]; for (wi_t ii = wide - 1; ii >= 1; --ii) parity[k] ^= a[ii] & b[ii]; } c[j / m4ri_radix] ^= m4ri_parity64(parity); } if (eol != C->width) { word const mask_end = __M4RI_LEFT_BITMASK(C->ncols % m4ri_radix); /* improves performance on x86_64 but is not cross plattform */ /* asm __volatile__ (".p2align 4\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop"); */ for (int k = 0; k < (C->ncols % m4ri_radix); ++k) { b = B->rows[m4ri_radix * eol + k]; parity[k] = a[0] & b[0]; for (wi_t ii = 1; ii < A->width; ++ii) parity[k] ^= a[ii] & b[ii]; } c[eol] ^= m4ri_parity64(parity) & mask_end; } } } for (rci_t i = C->nrows - (C->nrows % blocksize); i < C->nrows; ++i) { a = A->rows[i]; c = C->rows[i]; for (rci_t j = 0; j < m4ri_radix * eol; j += m4ri_radix) { for (int k = 0; k < 
m4ri_radix; ++k) { b = B->rows[j+k]; parity[k] = a[0] & b[0]; for (wi_t ii = wide - 1; ii >= 1; --ii) parity[k] ^= a[ii] & b[ii]; } c[j/m4ri_radix] ^= m4ri_parity64(parity); } if (eol != C->width) { word const mask_end = __M4RI_LEFT_BITMASK(C->ncols % m4ri_radix); /* improves performance on x86_64 but is not cross plattform */ /* asm __volatile__ (".p2align 4\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop"); */ for (int k = 0; k < (C->ncols % m4ri_radix); ++k) { b = B->rows[m4ri_radix * eol + k]; parity[k] = a[0] & b[0]; for (wi_t ii = 1; ii < A->width; ++ii) parity[k] ^= a[ii] & b[ii]; } c[eol] ^= m4ri_parity64(parity) & mask_end; } } __M4RI_DD_MZD(C); return C; } mzd_t *_mzd_mul_va(mzd_t *C, mzd_t const *v, mzd_t const *A, int const clear) { assert(C->offset == 0); assert(A->offset == 0); assert(v->offset == 0); if(clear) mzd_set_ui(C, 0); rci_t const m = v->nrows; rci_t const n = v->ncols; for(rci_t i = 0; i < m; ++i) for(rci_t j = 0; j < n; ++j) if (mzd_read_bit(v,i,j)) mzd_combine(C,i,0, C,i,0, A,j,0); __M4RI_DD_MZD(C); return C; } void mzd_randomize(mzd_t *A) { wi_t const width = A->width - 1; int const offset = A->offset; if(offset) { if(width == 0) { word const mask = __M4RI_MIDDLE_BITMASK(A->ncols, offset); for(rci_t i = 0; i < A->nrows; ++i) A->rows[i][0] ^= (A->rows[i][0] ^ (m4ri_random_word() << offset)) & mask; } else { word const mask_begin = __M4RI_RIGHT_BITMASK(m4ri_radix - offset); word const mask_end = __M4RI_LEFT_BITMASK((A->ncols + offset) % m4ri_radix); int const need_last_bits = ((m4ri_one << offset) & mask_end) != 0; for(rci_t i = 0; i < A->nrows; ++i) { word prev_random_word; word random_word = m4ri_random_word(); A->rows[i][0] ^= (A->rows[i][0] ^ (random_word << offset)) & mask_begin; for(wi_t j = 1; j < width; ++j) { prev_random_word = random_word; random_word = m4ri_random_word(); A->rows[i][j] = (random_word << offset) | (prev_random_word >> (m4ri_radix - offset)); } prev_random_word = random_word; random_word = 0; if 
(need_last_bits) random_word = m4ri_random_word(); A->rows[i][width] ^= (A->rows[i][width] ^ ((random_word << offset) | (prev_random_word >> (m4ri_radix - offset)))) & mask_end; } } } else { word const mask_end = __M4RI_LEFT_BITMASK(A->ncols % m4ri_radix); for(rci_t i = 0; i < A->nrows; ++i) { for(wi_t j = 0; j < width; ++j) A->rows[i][j] = m4ri_random_word(); A->rows[i][width] ^= (A->rows[i][width] ^ m4ri_random_word()) & mask_end; } } __M4RI_DD_MZD(A); } void mzd_set_ui( mzd_t *A, unsigned int value) { word const mask_begin = __M4RI_RIGHT_BITMASK(m4ri_radix - A->offset); word const mask_end = __M4RI_LEFT_BITMASK((A->ncols + A->offset) % m4ri_radix); if(A->width == 1) { for(rci_t i = 0; i < A->nrows; ++i) { for(rci_t j = 0 ; j < A->ncols; ++j) mzd_write_bit(A,i,j, 0); } } else { for (rci_t i = 0; i < A->nrows; ++i) { word *row = A->rows[i]; row[0] &= ~mask_begin; for(wi_t j = 1; j < A->width - 1; ++j) row[j] = 0; row[A->width - 1] &= ~mask_end; } } if(value % 2 == 0) { __M4RI_DD_MZD(A); return; } rci_t const stop = MIN(A->nrows, A->ncols); for (rci_t i = 0; i < stop; ++i) { mzd_write_bit(A, i, i, 1); } __M4RI_DD_MZD(A); } int mzd_equal(mzd_t const *A, mzd_t const *B) { if (A->nrows != B->nrows) return FALSE; if (A->ncols != B->ncols) return FALSE; if (A == B) return TRUE; wi_t Awidth = A->width - 1; if (A->offset == B->offset) { int const non_zero_offset = (A->offset != 0); if (non_zero_offset < Awidth) { for (rci_t i = 0; i < A->nrows; ++i) { for (wi_t j = non_zero_offset; j < Awidth; ++j) { if (A->rows[i][j] != B->rows[i][j]) return FALSE; } } } if (non_zero_offset) { word mask_begin = A->low_bitmask; for (rci_t i = 0; i < A->nrows; ++i) { if (((A->rows[i][0] ^ B->rows[i][0]) & mask_begin)) return FALSE; } if (!Awidth) return TRUE; } word const mask_end = A->high_bitmask; for (rci_t i = 0; i < A->nrows; ++i) { if (((A->rows[i][Awidth] ^ B->rows[i][Awidth]) & mask_end)) return FALSE; } } else { int shift = B->offset - A->offset; if (shift < 0) { mzd_t const *tmp 
= A; A = B; B = tmp; shift = -shift; Awidth = A->width - 1; } int const non_zero_offset = (A->offset != 0); if (non_zero_offset < Awidth) { for (rci_t i = 0; i < A->nrows; ++i) { for (wi_t j = non_zero_offset; j < Awidth; ++j) { word Bval = (B->rows[i][j] >> shift) | (B->rows[i][j + 1] << (m4ri_radix - shift)); if (A->rows[i][j] != Bval) return FALSE; } } } if (non_zero_offset) { word mask_begin = A->low_bitmask; if (1 < B->width) { for (rci_t i = 0; i < A->nrows; ++i) { word Bval = (B->rows[i][0] >> shift) | (B->rows[i][1] << (m4ri_radix - shift)); if (((A->rows[i][0] ^ Bval) & mask_begin)) return FALSE; } } else { for (rci_t i = 0; i < A->nrows; ++i) { word Bval = B->rows[i][0] >> shift; if (((A->rows[i][0] ^ Bval) & mask_begin)) return FALSE; } } if (!Awidth) return TRUE; } word const mask_end = A->high_bitmask; if (Awidth + 1 < B->width) { for (rci_t i = 0; i < A->nrows; ++i) { word Bval = (B->rows[i][Awidth] >> shift) | (B->rows[i][Awidth + 1] << (m4ri_radix - shift)); if (((A->rows[i][Awidth] ^ Bval) & mask_end)) return FALSE; } } else { for (rci_t i = 0; i < A->nrows; ++i) { word Bval = B->rows[i][Awidth] >> shift; if (((A->rows[i][Awidth] ^ Bval) & mask_end)) return FALSE; } } } return TRUE; } int mzd_cmp(mzd_t const *A, mzd_t const *B) { assert(A->offset == 0); assert(B->offset == 0); if(A->nrows < B->nrows) return -1; if(B->nrows < A->nrows) return 1; if(A->ncols < B->ncols) return -1; if(B->ncols < A->ncols) return 1; const word mask_begin = __M4RI_RIGHT_BITMASK(m4ri_radix - A->offset); const word mask_end = __M4RI_LEFT_BITMASK((A->offset + A->ncols)%m4ri_radix); const wi_t n = A->width-1; /* Columns with large index are "larger", but rows with small index are more important than with large index. 
*/ if(A->width > 1) { for(rci_t i=0; inrows; i++) { if ((A->rows[i][n]&mask_end) < (B->rows[i][n]&mask_end)) return -1; else if ((A->rows[i][n]&mask_end) > (B->rows[i][n]&mask_end)) return 1; for(wi_t j=n-1; j>=1; j--) { if (A->rows[i][j] < B->rows[i][j]) return -1; else if (A->rows[i][j] > B->rows[i][j]) return 1; } if ((A->rows[i][0]&mask_begin) < (B->rows[i][0]&mask_begin)) return -1; else if ((A->rows[i][0]&mask_begin) > (B->rows[i][0]&mask_begin)) return 1; } } else { for(rci_t i=0; inrows; i++) { if ( (A->rows[i][0] & mask_begin & mask_end) < (B->rows[i][0] & mask_begin & mask_end) ) return -1; else if ( (A->rows[i][0] & mask_begin & mask_end) > (B->rows[i][0] & mask_begin & mask_end) ) return 1; } } return 0; } void mzd_copy_row_weird_to_even(mzd_t *B, rci_t i, mzd_t const *A, rci_t j); mzd_t *mzd_copy(mzd_t *N, mzd_t const *P) { if (N == P) return N; if (!P->offset){ if (N == NULL) { N = mzd_init(P->nrows, P->ncols); } else { if (N->nrows < P->nrows || N->ncols < P->ncols) m4ri_die("mzd_copy: Target matrix is too small."); } word *p_truerow, *n_truerow; wi_t const wide = P->width - 1; word mask = __M4RI_LEFT_BITMASK(P->ncols % m4ri_radix); for (rci_t i = 0; i < P->nrows; ++i) { p_truerow = P->rows[i]; n_truerow = N->rows[i]; for (wi_t j = 0; j < wide; ++j) n_truerow[j] = p_truerow[j]; n_truerow[wide] = (n_truerow[wide] & ~mask) | (p_truerow[wide] & mask); } } else { // P->offset > 0 if (N == NULL) { N = mzd_init(P->nrows, P->ncols+ P->offset); N->ncols -= P->offset; N->offset = P->offset; N->width = P->width; N->flags |= (mzd_flag_nonzero_offset | mzd_flag_windowed_ownsblocks); N->low_bitmask &= m4ri_ffff << N->offset; if (N->width == 1) N->high_bitmask = N->low_bitmask; N->flags |= ((N->high_bitmask & ((word)1 << (m4ri_radix - 1)))) ? 
mzd_flag_windowed_zeroexcess : mzd_flag_nonzero_excess; } else { if (N->nrows < P->nrows || N->ncols < P->ncols) m4ri_die("mzd_copy: Target matrix is too small."); } if(N->offset == P->offset) { for(rci_t i = 0; i < P->nrows; ++i) { mzd_copy_row(N, i, P, i); } } else if(N->offset == 0) { for(rci_t i = 0; i < P->nrows; ++i) { mzd_copy_row_weird_to_even(N, i, P, i); } } else { m4ri_die("mzd_copy: completely unaligned copy not implemented yet."); } } __M4RI_DD_MZD(N); return N; } /* This is sometimes called augment */ mzd_t *mzd_concat(mzd_t *C, mzd_t const *A, mzd_t const *B) { assert(A->offset == 0); assert(B->offset == 0); if (A->nrows != B->nrows) { m4ri_die("mzd_concat: Bad arguments to concat!\n"); } if (C == NULL) { C = mzd_init(A->nrows, A->ncols + B->ncols); } else if (C->nrows != A->nrows || C->ncols != (A->ncols + B->ncols)) { m4ri_die("mzd_concat: C has wrong dimension!\n"); } for (rci_t i = 0; i < A->nrows; ++i) { word *dst_truerow = C->rows[i]; word *src_truerow = A->rows[i]; for (wi_t j = 0; j < A->width; ++j) { dst_truerow[j] = src_truerow[j]; } } for (rci_t i = 0; i < B->nrows; ++i) { for (rci_t j = 0; j < B->ncols; ++j) { mzd_write_bit(C, i, j + A->ncols, mzd_read_bit(B, i, j)); } } __M4RI_DD_MZD(C); return C; } mzd_t *mzd_stack(mzd_t *C, mzd_t const *A, mzd_t const *B) { assert(A->offset == 0); assert(B->offset == 0); if (A->ncols != B->ncols) { m4ri_die("mzd_stack: A->ncols (%d) != B->ncols (%d)!\n", A->ncols, B->ncols); } if (C == NULL) { C = mzd_init(A->nrows + B->nrows, A->ncols); } else if (C->nrows != (A->nrows + B->nrows) || C->ncols != A->ncols) { m4ri_die("mzd_stack: C has wrong dimension!\n"); } for(rci_t i = 0; i < A->nrows; ++i) { word *src_truerow = A->rows[i]; word *dst_truerow = C->rows[i]; for (wi_t j = 0; j < A->width; ++j) { dst_truerow[j] = src_truerow[j]; } } for(rci_t i = 0; i < B->nrows; ++i) { word *dst_truerow = C->rows[A->nrows + i]; word *src_truerow = B->rows[i]; for (wi_t j = 0; j < B->width; ++j) { dst_truerow[j] = 
src_truerow[j]; } } __M4RI_DD_MZD(C); return C; } mzd_t *mzd_invert_naive(mzd_t *INV, mzd_t const *A, mzd_t const *I) { assert(A->offset == 0); mzd_t *H; H = mzd_concat(NULL, A, I); rci_t x = mzd_echelonize_naive(H, TRUE); if (x == 0) { mzd_free(H); return NULL; } INV = mzd_submatrix(INV, H, 0, A->ncols, A->nrows, 2 * A->ncols); mzd_free(H); __M4RI_DD_MZD(INV); return INV; } mzd_t *mzd_add(mzd_t *ret, mzd_t const *left, mzd_t const *right) { if (left->nrows != right->nrows || left->ncols != right->ncols) { m4ri_die("mzd_add: rows and columns must match.\n"); } if (ret == NULL) { ret = mzd_init(left->nrows, left->ncols); } else if (ret != left) { if (ret->nrows != left->nrows || ret->ncols != left->ncols) { m4ri_die("mzd_add: rows and columns of returned matrix must match.\n"); } } return _mzd_add(ret, left, right); } mzd_t *_mzd_add(mzd_t *C, mzd_t const *A, mzd_t const *B) { rci_t const nrows = MIN(MIN(A->nrows, B->nrows), C->nrows); if (C == B) { //swap mzd_t const *tmp = A; A = B; B = tmp; } if (C->offset | A->offset | B->offset) { for(rci_t i = 0; i < nrows; ++i) { mzd_combine_weird(C,i, A,i, B,i); } __M4RI_DD_MZD(C); return C; } word const mask_end = __M4RI_LEFT_BITMASK(C->ncols%m4ri_radix); switch(A->width) { case 0: return C; case 1: for(rci_t i = 0; i < nrows; ++i) { C->rows[i][0] ^= ((A->rows[i][0] ^ B->rows[i][0] ^ C->rows[i][0]) & mask_end); } break; case 2: for(rci_t i = 0; i < nrows; ++i) { C->rows[i][0] = A->rows[i][0] ^ B->rows[i][0]; C->rows[i][1] ^= ((A->rows[i][1] ^ B->rows[i][1] ^ C->rows[i][1]) & mask_end); } break; case 3: for(rci_t i = 0; i < nrows; ++i) { C->rows[i][0] = A->rows[i][0] ^ B->rows[i][0]; C->rows[i][1] = A->rows[i][1] ^ B->rows[i][1]; C->rows[i][2] ^= ((A->rows[i][2] ^ B->rows[i][2] ^ C->rows[i][2]) & mask_end); } break; case 4: for(rci_t i = 0; i < nrows; ++i) { C->rows[i][0] = A->rows[i][0] ^ B->rows[i][0]; C->rows[i][1] = A->rows[i][1] ^ B->rows[i][1]; C->rows[i][2] = A->rows[i][2] ^ B->rows[i][2]; C->rows[i][3] ^= 
((A->rows[i][3] ^ B->rows[i][3] ^ C->rows[i][3]) & mask_end); } break; case 5: for(rci_t i = 0; i < nrows; ++i) { C->rows[i][0] = A->rows[i][0] ^ B->rows[i][0]; C->rows[i][1] = A->rows[i][1] ^ B->rows[i][1]; C->rows[i][2] = A->rows[i][2] ^ B->rows[i][2]; C->rows[i][3] = A->rows[i][3] ^ B->rows[i][3]; C->rows[i][4] ^= ((A->rows[i][4] ^ B->rows[i][4] ^ C->rows[i][4]) & mask_end); } break; case 6: for(rci_t i = 0; i < nrows; ++i) { C->rows[i][0] = A->rows[i][0] ^ B->rows[i][0]; C->rows[i][1] = A->rows[i][1] ^ B->rows[i][1]; C->rows[i][2] = A->rows[i][2] ^ B->rows[i][2]; C->rows[i][3] = A->rows[i][3] ^ B->rows[i][3]; C->rows[i][4] = A->rows[i][4] ^ B->rows[i][4]; C->rows[i][5] ^= ((A->rows[i][5] ^ B->rows[i][5] ^ C->rows[i][5]) & mask_end); } break; case 7: for(rci_t i = 0; i < nrows; ++i) { C->rows[i][0] = A->rows[i][0] ^ B->rows[i][0]; C->rows[i][1] = A->rows[i][1] ^ B->rows[i][1]; C->rows[i][2] = A->rows[i][2] ^ B->rows[i][2]; C->rows[i][3] = A->rows[i][3] ^ B->rows[i][3]; C->rows[i][4] = A->rows[i][4] ^ B->rows[i][4]; C->rows[i][5] = A->rows[i][5] ^ B->rows[i][5]; C->rows[i][6] ^= ((A->rows[i][6] ^ B->rows[i][6] ^ C->rows[i][6]) & mask_end); } break; case 8: for(rci_t i = 0; i < nrows; ++i) { C->rows[i][0] = A->rows[i][0] ^ B->rows[i][0]; C->rows[i][1] = A->rows[i][1] ^ B->rows[i][1]; C->rows[i][2] = A->rows[i][2] ^ B->rows[i][2]; C->rows[i][3] = A->rows[i][3] ^ B->rows[i][3]; C->rows[i][4] = A->rows[i][4] ^ B->rows[i][4]; C->rows[i][5] = A->rows[i][5] ^ B->rows[i][5]; C->rows[i][6] = A->rows[i][6] ^ B->rows[i][6]; C->rows[i][7] ^= ((A->rows[i][7] ^ B->rows[i][7] ^ C->rows[i][7]) & mask_end); } break; default: for(rci_t i = 0; i < nrows; ++i) { mzd_combine_even(C,i,0, A,i,0, B,i,0); } } __M4RI_DD_MZD(C); return C; } mzd_t *mzd_submatrix(mzd_t *S, mzd_t const *M, rci_t const startrow, rci_t const startcol, rci_t const endrow, rci_t const endcol) { rci_t const nrows = endrow - startrow; rci_t const ncols = endcol - startcol; if (S == NULL) { S = mzd_init(nrows, 
ncols); } else if(S->nrows < nrows || S->ncols < ncols) { m4ri_die("mzd_submatrix: got S with dimension %d x %d but expected %d x %d\n", S->nrows, S->ncols, nrows, ncols); } assert(M->offset == S->offset); wi_t const startword = (startcol + M->offset) / m4ri_radix; /* we start at the beginning of a word */ if ((startcol + M->offset) % m4ri_radix == 0) { if(ncols / m4ri_radix != 0) { for(rci_t x = startrow, i = 0; i < nrows; ++i, ++x) { memcpy(S->rows[i], M->rows[x] + startword, sizeof(word) * (ncols / m4ri_radix)); } } if (ncols % m4ri_radix) { word const mask_end = __M4RI_LEFT_BITMASK(ncols % m4ri_radix); for(rci_t x = startrow, i = 0; i < nrows; ++i, ++x) { /* process remaining bits */ word temp = M->rows[x][startword + ncols / m4ri_radix] & mask_end; S->rows[i][ncols / m4ri_radix] = temp; } } /* startcol is not the beginning of a word */ } else { int const spot = (startcol + M->offset) % m4ri_radix; for(rci_t x = startrow, i = 0; i < nrows; ++i, ++x) { word *truerow = M->rows[x]; /* process full words first */ for(wi_t colword = 0; colword < (ncols / m4ri_radix); ++colword) { wi_t block = colword + startword; word temp = (truerow[block] >> spot) | (truerow[block + 1] << (m4ri_radix - spot)); S->rows[i][colword] = temp; } /* process remaining bits (lazy) */ wi_t colword = ncols / m4ri_radix; for (int y = 0; y < ncols % m4ri_radix; ++y) { BIT bit = mzd_read_bit(M, x, startcol + colword * m4ri_radix + y); mzd_write_bit(S, i, colword * m4ri_radix + y, bit); } } } __M4RI_DD_MZD(S); return S; } void mzd_combine(mzd_t *C, rci_t const c_row, wi_t const c_startblock, mzd_t const *A, rci_t const a_row, wi_t const a_startblock, mzd_t const *B, rci_t const b_row, wi_t const b_startblock) { /** * \todo respect ncols at the end */ if(C->offset | A->offset | B->offset) { mzd_combine_weird(C, c_row, A, a_row, B, b_row); return; } if( C == A && a_row == c_row && a_startblock == c_startblock) { mzd_combine_even_in_place(C, c_row, c_startblock, B, b_row, b_startblock); return; } 
mzd_combine_even(C, c_row, c_startblock, A, a_row, a_startblock, B, b_row, b_startblock); return; } void mzd_col_swap(mzd_t *M, rci_t const cola, rci_t const colb) { if (cola == colb) return; rci_t const _cola = cola + M->offset; rci_t const _colb = colb + M->offset; wi_t const a_word = _cola / m4ri_radix; wi_t const b_word = _colb / m4ri_radix; int const a_bit = _cola % m4ri_radix; int const b_bit = _colb % m4ri_radix; word* RESTRICT ptr = mzd_first_row(M); int max_bit = MAX(a_bit, b_bit); int count = mzd_rows_in_block(M, 0); assert(count > 0); int min_bit = a_bit + b_bit - max_bit; int block = 0; int offset = max_bit - min_bit; word mask = m4ri_one << min_bit; if (a_word == b_word) { while(1) { ptr += a_word; int fast_count = count / 4; int rest_count = count - 4 * fast_count; word xor[4]; wi_t const rowstride = M->rowstride; while (fast_count--) { xor[0] = ptr[0]; xor[1] = ptr[rowstride]; xor[2] = ptr[2 * rowstride]; xor[3] = ptr[3 * rowstride]; xor[0] ^= xor[0] >> offset; xor[1] ^= xor[1] >> offset; xor[2] ^= xor[2] >> offset; xor[3] ^= xor[3] >> offset; xor[0] &= mask; xor[1] &= mask; xor[2] &= mask; xor[3] &= mask; xor[0] |= xor[0] << offset; xor[1] |= xor[1] << offset; xor[2] |= xor[2] << offset; xor[3] |= xor[3] << offset; ptr[0] ^= xor[0]; ptr[rowstride] ^= xor[1]; ptr[2 * rowstride] ^= xor[2]; ptr[3 * rowstride] ^= xor[3]; ptr += 4 * rowstride; } while (rest_count--) { word xor = *ptr; xor ^= xor >> offset; xor &= mask; *ptr ^= xor | (xor << offset); ptr += rowstride; } if ((count = mzd_rows_in_block(M, ++block)) <= 0) break; ptr = mzd_first_row_next_block(M, block); } } else { word* RESTRICT min_ptr; wi_t max_offset; if (min_bit == a_bit) { min_ptr = ptr + a_word; max_offset = b_word - a_word; } else { min_ptr = ptr + b_word; max_offset = a_word - b_word; } while(1) { wi_t const rowstride = M->rowstride; while(count--) { word xor = (min_ptr[0] ^ (min_ptr[max_offset] >> offset)) & mask; min_ptr[0] ^= xor; min_ptr[max_offset] ^= xor << offset; min_ptr += 
rowstride; } if ((count = mzd_rows_in_block(M, ++block)) <= 0) break; ptr = mzd_first_row_next_block(M, block); if (min_bit == a_bit) min_ptr = ptr + a_word; else min_ptr = ptr + b_word; } } __M4RI_DD_MZD(M); } int mzd_is_zero(mzd_t const *A) { /* Could be improved: stopping as the first non zero value is found (status!=0) */ rci_t const mb = A->nrows; rci_t const nb = A->ncols; int const Aoffset = A->offset; int const nbrest = (nb + Aoffset) % m4ri_radix; word status = 0; if (nb + Aoffset >= m4ri_radix) { // Large A word mask_begin = __M4RI_RIGHT_BITMASK(m4ri_radix-Aoffset); word mask_end = __M4RI_LEFT_BITMASK(nbrest); for (rci_t i = 0; i < mb; ++i) { status |= A->rows[i][0] & mask_begin; for (wi_t j = 1; j < A->width - 1; ++j) status |= A->rows[i][j]; status |= A->rows[i][A->width - 1] & mask_end; if(status) return 0; } } else { // Small A word mask = __M4RI_MIDDLE_BITMASK(nb,Aoffset); for (rci_t i = 0; i < mb; ++i) { status |= A->rows[i][0] & mask; } } return !status; } void mzd_copy_row_weird_to_even(mzd_t *B, rci_t i, mzd_t const *A, rci_t j) { assert(B->offset == 0); assert(B->ncols >= A->ncols); word *b = B->rows[j]; int const rest = A->ncols % m4ri_radix; rci_t c; for(c = 0; c + m4ri_radix <= A->ncols; c += m4ri_radix) { b[c / m4ri_radix] = mzd_read_bits(A, i, c, m4ri_radix); } if (rest) { word const temp = mzd_read_bits(A, i, c, rest); b[c / m4ri_radix] &= __M4RI_LEFT_BITMASK(m4ri_radix - rest); b[c / m4ri_radix] |= temp; } __M4RI_DD_MZD(B); } void mzd_copy_row(mzd_t *B, rci_t i, mzd_t const *A, rci_t j) { assert(B->offset == A->offset); assert(B->ncols >= A->ncols); wi_t const width = MIN(B->width, A->width) - 1; word const *a = A->rows[j]; word *b = B->rows[i]; word const mask_begin = __M4RI_RIGHT_BITMASK(m4ri_radix - A->offset); word const mask_end = __M4RI_LEFT_BITMASK((A->ncols + A->offset) % m4ri_radix); if (width != 0) { b[0] = (b[0] & ~mask_begin) | (a[0] & mask_begin); for(wi_t k = 1; k < width; ++k) b[k] = a[k]; b[width] = (b[width] & ~mask_end) 
| (a[width] & mask_end); } else { b[0] = (b[0] & ~mask_begin) | (a[0] & mask_begin & mask_end) | (b[0] & ~mask_end); } __M4RI_DD_ROW(B, i); } void mzd_row_clear_offset(mzd_t *M, rci_t row, rci_t coloffset) { coloffset += M->offset; wi_t const startblock = coloffset / m4ri_radix; word temp; /* make sure to start clearing at coloffset */ if (coloffset%m4ri_radix) { temp = M->rows[row][startblock]; temp &= __M4RI_RIGHT_BITMASK(m4ri_radix - coloffset); } else { temp = 0; } M->rows[row][startblock] = temp; for (wi_t i = startblock + 1; i < M->width; ++i) { M->rows[row][i] = 0; } __M4RI_DD_ROW(M, row); } int mzd_find_pivot(mzd_t const *A, rci_t start_row, rci_t start_col, rci_t *r, rci_t *c) { assert(A->offset == 0); rci_t const nrows = A->nrows; rci_t const ncols = A->ncols; word data = 0; rci_t row_candidate = 0; if(A->ncols - start_col < m4ri_radix) { for(rci_t j = start_col; j < A->ncols; j += m4ri_radix) { int const length = MIN(m4ri_radix, ncols - j); for(rci_t i = start_row; i < nrows; ++i) { word const curr_data = mzd_read_bits(A, i, j, length); if (m4ri_lesser_LSB(curr_data, data)) { row_candidate = i; data = curr_data; } } if(data) { *r = row_candidate; for(int l = 0; l < length; ++l) { if(__M4RI_GET_BIT(data, l)) { *c = j + l; break; } } __M4RI_DD_RCI(*r); __M4RI_DD_RCI(*c); __M4RI_DD_INT(1); return 1; } } } else { /* we definitely have more than one word */ /* handle first word */ int const bit_offset = (start_col % m4ri_radix); wi_t const word_offset = start_col / m4ri_radix; word const mask_begin = __M4RI_RIGHT_BITMASK(m4ri_radix-bit_offset); for(rci_t i = start_row; i < nrows; ++i) { word const curr_data = A->rows[i][word_offset] & mask_begin; if (m4ri_lesser_LSB(curr_data, data)) { row_candidate = i; data = curr_data; if(__M4RI_GET_BIT(data,bit_offset)) { break; } } } if(data) { *r = row_candidate; data >>= bit_offset; assert(data); for(int l = 0; l < (m4ri_radix - bit_offset); ++l) { if(__M4RI_GET_BIT(data, l)) { *c = start_col + l; break; } } 
__M4RI_DD_RCI(*r); __M4RI_DD_RCI(*c); __M4RI_DD_INT(1); return 1; } /* handle complete words */ for(wi_t wi = word_offset + 1; wi < A->width - 1; ++wi) { for(rci_t i = start_row; i < nrows; ++i) { word const curr_data = A->rows[i][wi]; if (m4ri_lesser_LSB(curr_data, data)) { row_candidate = i; data = curr_data; if(__M4RI_GET_BIT(data, 0)) break; } } if(data) { *r = row_candidate; for(int l = 0; l < m4ri_radix; ++l) { if(__M4RI_GET_BIT(data, l)) { *c = wi * m4ri_radix + l; break; } } __M4RI_DD_RCI(*r); __M4RI_DD_RCI(*c); __M4RI_DD_INT(1); return 1; } } /* handle last word */ int const end_offset = (A->ncols % m4ri_radix) ? (A->ncols % m4ri_radix) : m4ri_radix; word const mask_end = __M4RI_LEFT_BITMASK(end_offset % m4ri_radix); wi_t wi = A->width - 1; for(rci_t i = start_row; i < nrows; ++i) { word const curr_data = A->rows[i][wi] & mask_end; if (m4ri_lesser_LSB(curr_data, data)) { row_candidate = i; data = curr_data; if(__M4RI_GET_BIT(data,0)) break; } } if(data) { *r = row_candidate; for(int l = 0; l < end_offset; ++l) { if(__M4RI_GET_BIT(data, l)) { *c = wi * m4ri_radix + l; break; } } __M4RI_DD_RCI(*r); __M4RI_DD_RCI(*c); __M4RI_DD_INT(1); return 1; } } __M4RI_DD_RCI(*r); __M4RI_DD_RCI(*c); __M4RI_DD_INT(0); return 0; } #define MASK(c) (((uint64_t)(-1)) / (__M4RI_TWOPOW(__M4RI_TWOPOW(c)) + 1)) #define COUNT(x,c) ((x) & MASK(c)) + (((x) >> (__M4RI_TWOPOW(c))) & MASK(c)) static inline int m4ri_bitcount(word w) { uint64_t n = __M4RI_CONVERT_TO_UINT64_T(w); n = COUNT(n, 0); n = COUNT(n, 1); n = COUNT(n, 2); n = COUNT(n, 3); n = COUNT(n, 4); n = COUNT(n, 5); return (int)n; } double _mzd_density(mzd_t const *A, wi_t res, rci_t r, rci_t c) { size_t count = 0; size_t total = 0; if(A->width == 1) { for(rci_t i = r; i < A->nrows; ++i) for(rci_t j = c; j < A->ncols; ++j) if(mzd_read_bit(A, i, j)) ++count; return ((double)count)/(1.0 * A->ncols * A->nrows); } if(res == 0) res = A->width / 100; if (res < 1) res = 1; for(rci_t i = r; i < A->nrows; ++i) { word *truerow = 
A->rows[i]; for(rci_t j = c; j < m4ri_radix-A->offset; ++j) if(mzd_read_bit(A, i, j)) ++count; total += m4ri_radix - A->offset; for(wi_t j = MAX(1, (c + A->offset) / m4ri_radix); j < A->width - 1; j += res) { count += m4ri_bitcount(truerow[j]); total += m4ri_radix; } for(int j = 0; j < (A->ncols + A->offset) % m4ri_radix; ++j) if(mzd_read_bit(A, i, m4ri_radix * ((A->ncols + A->offset) / m4ri_radix) + j)) ++count; total += (A->ncols + A->offset) % m4ri_radix; } return (double)count / total; } double mzd_density(mzd_t const *A, wi_t res) { return _mzd_density(A, res, 0, 0); } rci_t mzd_first_zero_row(mzd_t const *A) { word const mask_begin = __M4RI_RIGHT_BITMASK(m4ri_radix - A->offset); word const mask_end = __M4RI_LEFT_BITMASK((A->ncols + A->offset) % m4ri_radix); wi_t const end = A->width - 1; word *row; for(rci_t i = A->nrows - 1; i >= 0; --i) { row = A->rows[i]; word tmp = row[0] & mask_begin; for (wi_t j = 1; j < end; ++j) tmp |= row[j]; tmp |= row[end] & mask_end; if(tmp) { __M4RI_DD_INT(i + 1); return i + 1; } } __M4RI_DD_INT(0); return 0; } mzd_t *mzd_extract_u(mzd_t *U, mzd_t const *A) { rci_t k = MIN(A->nrows, A->ncols); if (U == NULL) U = mzd_submatrix(NULL, A, 0, 0, k, k); else assert(U->nrows == k && U->ncols == k); for(rci_t i=1; inrows; i++) { for(wi_t j=0; jrows[i][j] = 0; } if(i%m4ri_radix) mzd_clear_bits(U, i, (i/m4ri_radix)*m4ri_radix, i%m4ri_radix); } return U; } mzd_t *mzd_extract_l(mzd_t *L, mzd_t const *A) { rci_t k = MIN(A->nrows, A->ncols); if (L == NULL) L = mzd_submatrix(NULL, A, 0, 0, k, k); else assert(L->nrows == k && L->ncols == k); for(rci_t i=0; inrows-1; i++) { if(m4ri_radix - (i+1)%m4ri_radix) mzd_clear_bits(L, i, i+1, m4ri_radix - (i+1)%m4ri_radix); for(wi_t j=(i/m4ri_radix+1); jwidth; j++) { L->rows[i][j] = 0; } } return L; } libm4ri-20130416/src/mzd.h000066400000000000000000001125101212302366200150450ustar00rootroot00000000000000/** * \file mzd.h * \brief Dense matrices over GF(2) represented as a bit field. 
* * \author Gregory Bard * \author Martin Albrecht * \author Carlo Wood */ #ifndef M4RI_MZD #define M4RI_MZD /******************************************************************* * * M4RI: Linear Algebra over GF(2) * * Copyright (C) 2007, 2008 Gregory Bard * Copyright (C) 2008-2010 Martin Albrecht * Copyright (C) 2011 Carlo Wood * * Distributed under the terms of the GNU General Public License (GPL) * version 2 or higher. * * This code is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * The full text of the GPL is available at: * * http://www.gnu.org/licenses/ * ********************************************************************/ #include #include #include #include #if __M4RI_HAVE_SSE2 #include #endif #include #include #if __M4RI_HAVE_SSE2 /** * \brief SSE2 cutoff in words. * * Cutoff in words after which row length SSE2 instructions should be * used. */ #define __M4RI_SSE2_CUTOFF 10 #endif /** * Maximum number of words allocated for one mzd_t block. * * \note This value must fit in an int, even though it's type is size_t. */ #define __M4RI_MAX_MZD_BLOCKSIZE (((size_t)1) << 27) /** * \brief Matrix multiplication block-ing dimension. * * Defines the number of rows of the matrix A that are * processed as one block during the execution of a multiplication * algorithm. */ #define __M4RI_MUL_BLOCKSIZE MIN(((int)sqrt((double)(4 * __M4RI_CPU_L3_CACHE))) / 2, 2048) typedef struct { size_t size; word* begin; word* end; } mzd_block_t; /** * \brief Dense matrices over GF(2). * * The most fundamental data type in this library. */ typedef struct mzd_t { /** * Number of rows. */ rci_t nrows; /** * Number of columns. */ rci_t ncols; /** * Number of words with valid bits. * * width = ceil((ncols + offset) / m4ri_radix) */ wi_t width; /** * Offset in words between rows. 
* * rowstride = (width < mzd_paddingwidth || (width & 1) == 0) ? width : width + 1; * where width is the width of the underlying non-windowed matrix. */ wi_t rowstride; /** * Offset in words from start of block to first word. * * rows[0] = blocks[0].begin + offset_vector; * This, together with rowstride, makes the rows array obsolete. */ wi_t offset_vector; /** * Number of rows to the first row counting from the start of the first block. */ wi_t row_offset; /** * column offset of the first column. */ uint16_t offset; /** * Booleans to speed up things. * * The bits have the following meaning: * * 0: Has non-zero offset (and thus is windowed). * 1: Has non-zero excess. * 2: Is windowed, but has zero offset. * 3: Is windowed, but has zero excess. * 4: Is windowed, but owns the blocks allocations. * 5: Spans more than 1 block. */ uint8_t flags; /** * blockrows_log = log2(blockrows); * where blockrows is the number of rows in one block, which is a power of 2. */ uint8_t blockrows_log; #if 0 // Commented out in order to keep the size of mzd_t 64 bytes (one cache line). This could be added back if rows was ever removed. /** * blockrows_mask = blockrows - 1; * where blockrows is the number of rows in one block, which is a power of 2. */ int blockrows_mask; #endif /** * Mask for valid bits in the word with the highest index (width - 1). */ word high_bitmask; /** * Mask for valid bits in the word with the lowest index (0). */ word low_bitmask; /** * Contains pointers to the actual blocks of memory containing the * values packed into words of size m4ri_radix. */ mzd_block_t *blocks; /** * Address of first word in each row, so the first word of row i is * is m->rows[i] */ word **rows; } mzd_t; /** * \brief The minimum width where padding occurs. 
*/ static wi_t const mzd_paddingwidth = 3; static uint8_t const mzd_flag_nonzero_offset = 0x1; static uint8_t const mzd_flag_nonzero_excess = 0x2; static uint8_t const mzd_flag_windowed_zerooffset = 0x4; static uint8_t const mzd_flag_windowed_zeroexcess = 0x8; static uint8_t const mzd_flag_windowed_ownsblocks = 0x10; static uint8_t const mzd_flag_multiple_blocks = 0x20; /** * \brief Test if a matrix is windowed. * * \param M Matrix * * \return a non-zero value if the matrix is windowed, otherwise return zero. */ static inline int mzd_is_windowed(mzd_t const *M) { return M->flags & (mzd_flag_nonzero_offset | mzd_flag_windowed_zerooffset); } /** * \brief Test if this mzd_t should free blocks. * * \param M Matrix * * \return TRUE iff blocks is non-zero and should be freed upon a call to mzd_free. */ static inline int mzd_owns_blocks(mzd_t const *M) { return M->blocks && (!mzd_is_windowed(M) || ((M->flags & mzd_flag_windowed_ownsblocks))); } /** * \brief Get a pointer the first word. * * \param M Matrix * * \return a pointer to the first word of the first row. */ static inline word* mzd_first_row(mzd_t const *M) { word* result = M->blocks[0].begin + M->offset_vector; assert(M->nrows == 0 || result == M->rows[0]); return result; } /** * \brief Get a pointer to the first word in block n. * * Use mzd_first_row for block number 0. * * \param M Matrix * \param n The block number. Must be larger than 0. * * \return a pointer to the first word of the first row in block n. */ static inline word* mzd_first_row_next_block(mzd_t const* M, int n) { assert(n > 0); return M->blocks[n].begin + M->offset_vector - M->row_offset * M->rowstride; } /** * \brief Convert row to blocks index. * * \param M Matrix. * \param row The row to convert. * * \return the block number that contains this row. */ static inline int mzd_row_to_block(mzd_t const* M, rci_t row) { return (M->row_offset + row) >> M->blockrows_log; } /** * \brief Total number of rows in this block. 
* * Should be called with a constant n=0, or with * n > 0 when n is a variable, for optimization * reasons. * * \param M Matrix * \param n The block number. * * \return the total number of rows in this block. */ static inline wi_t mzd_rows_in_block(mzd_t const* M, int n) { if (__M4RI_UNLIKELY(M->flags & mzd_flag_multiple_blocks)) { if (__M4RI_UNLIKELY(n == 0)) { return (1 << M->blockrows_log) - M->row_offset; } else { int const last_block = mzd_row_to_block(M, M->nrows - 1); if (n < last_block) return (1 << M->blockrows_log); return M->nrows + M->row_offset - (n << M->blockrows_log); } } return n ? 0 : M->nrows; } /** * \brief Number of rows in this block including r * * \param M Matrix * \param rci_t row * * \return the number of rows with index >= r in this block */ static inline wi_t mzd_remaining_rows_in_block(mzd_t const* M, rci_t r) { const int n = mzd_row_to_block(M, r); r = (r - (n << M->blockrows_log)); if (__M4RI_UNLIKELY(M->flags & mzd_flag_multiple_blocks)) { if (__M4RI_UNLIKELY(n == 0)) { return (1 << M->blockrows_log) - M->row_offset - r; } else { int const last_block = mzd_row_to_block(M, M->nrows - 1); if (n < last_block) return (1 << M->blockrows_log) - r; return M->nrows + M->row_offset - (n << M->blockrows_log) - r; } } return n ? 0 : M->nrows - r; } /** * \brief Get pointer to first word of row. * * \param M Matrix * \param row The row index. * * \return pointer to first word of the row. */ static inline word* mzd_row(mzd_t const* M, rci_t row) { wi_t big_vector = M->offset_vector + row * M->rowstride; word* result = M->blocks[0].begin + big_vector; if (__M4RI_UNLIKELY(M->flags & mzd_flag_multiple_blocks)) { int const n = (M->row_offset + row) >> M->blockrows_log; result = M->blocks[n].begin + big_vector - n * (M->blocks[0].size / sizeof(word)); } assert(result == M->rows[row]); return result; } /** * \brief Create a new matrix of dimension r x c. * * Use mzd_free to kill it. 
* * \param r Number of rows * \param c Number of columns * */ mzd_t *mzd_init(rci_t const r, rci_t const c); /** * \brief Free a matrix created with mzd_init. * * \param A Matrix */ void mzd_free(mzd_t *A); /** * \brief Create a window/view into the matrix M. * * A matrix window for M is a meta structure on the matrix M. It is * setup to point into the matrix so M \em must \em not be freed while the * matrix window is used. * * This function puts the restriction on the provided parameters that * all parameters must be within range for M which is not enforced * currently . * * Use mzd_free_window to free the window. * * \param M Matrix * \param lowr Starting row (inclusive) * \param lowc Starting column (inclusive) * \param highr End row (exclusive) * \param highc End column (exclusive) * */ mzd_t *mzd_init_window(mzd_t *M, rci_t const lowr, rci_t const lowc, rci_t const highr, rci_t const highc); /** * \brief Create a const window/view into a const matrix M. * * See mzd_init_window, but for constant M. */ static inline mzd_t const *mzd_init_window_const(mzd_t const *M, rci_t const lowr, rci_t const lowc, rci_t const highr, rci_t const highc) { return mzd_init_window((mzd_t*)M, lowr, lowc, highr, highc); } /** * \brief Free a matrix window created with mzd_init_window. * * \param A Matrix */ #define mzd_free_window mzd_free /** * \brief Swap the two rows rowa and rowb starting at startblock. * * \param M Matrix with a zero offset. * \param rowa Row index. * \param rowb Row index. * \param startblock Start swapping only in this block. */ static inline void _mzd_row_swap(mzd_t *M, rci_t const rowa, rci_t const rowb, wi_t const startblock) { if ((rowa == rowb) || (startblock >= M->width)) return; /* This is the case since we're only called from _mzd_ple_mmpf, * which makes the same assumption. Therefore we don't need * to take a mask_begin into account. 
*/ assert(M->offset == 0); wi_t width = M->width - startblock - 1; word *a = M->rows[rowa] + startblock; word *b = M->rows[rowb] + startblock; word tmp; word const mask_end = __M4RI_LEFT_BITMASK((M->ncols + M->offset) % m4ri_radix); if (width != 0) { for(wi_t i = 0; i < width; ++i) { tmp = a[i]; a[i] = b[i]; b[i] = tmp; } } tmp = (a[width] ^ b[width]) & mask_end; a[width] ^= tmp; b[width] ^= tmp; __M4RI_DD_ROW(M, rowa); __M4RI_DD_ROW(M, rowb); } /** * \brief Swap the two rows rowa and rowb. * * \param M Matrix * \param rowa Row index. * \param rowb Row index. */ static inline void mzd_row_swap(mzd_t *M, rci_t const rowa, rci_t const rowb) { if(rowa == rowb) return; wi_t width = M->width - 1; word *a = M->rows[rowa]; word *b = M->rows[rowb]; word const mask_begin = __M4RI_RIGHT_BITMASK(m4ri_radix - M->offset); word const mask_end = __M4RI_LEFT_BITMASK((M->ncols + M->offset) % m4ri_radix); word tmp = (a[0] ^ b[0]) & mask_begin; if (width != 0) { a[0] ^= tmp; b[0] ^= tmp; for(wi_t i = 1; i < width; ++i) { tmp = a[i]; a[i] = b[i]; b[i] = tmp; } tmp = (a[width] ^ b[width]) & mask_end; a[width] ^= tmp; b[width] ^= tmp; } else { tmp &= mask_end; a[0] ^= tmp; b[0] ^= tmp; } __M4RI_DD_ROW(M, rowa); __M4RI_DD_ROW(M, rowb); } /** * \brief copy row j from A to row i from B. * * The offsets of A and B must match and the number of columns of A * must be less than or equal to the number of columns of B. * * \param B Target matrix. * \param i Target row index. * \param A Source matrix. * \param j Source row index. */ void mzd_copy_row(mzd_t *B, rci_t i, mzd_t const *A, rci_t j); /** * \brief Swap the two columns cola and colb. * * \param M Matrix. * \param cola Column index. * \param colb Column index. */ void mzd_col_swap(mzd_t *M, rci_t const cola, rci_t const colb); /** * \brief Swap the two columns cola and colb but only between start_row and stop_row. * * \param M Matrix. * \param cola Column index. * \param colb Column index. * \param start_row Row index. 
* \param stop_row Row index (exclusive). */ static inline void mzd_col_swap_in_rows(mzd_t *M, rci_t const cola, rci_t const colb, rci_t const start_row, rci_t const stop_row) { if (cola == colb) return; rci_t const _cola = cola + M->offset; rci_t const _colb = colb + M->offset; wi_t const a_word = _cola / m4ri_radix; wi_t const b_word = _colb / m4ri_radix; int const a_bit = _cola % m4ri_radix; int const b_bit = _colb % m4ri_radix; word* RESTRICT ptr = mzd_row(M, start_row); int max_bit = MAX(a_bit, b_bit); int count_remaining = stop_row - start_row; int min_bit = a_bit + b_bit - max_bit; int block = mzd_row_to_block(M, start_row); int offset = max_bit - min_bit; word mask = m4ri_one << min_bit; int count = MIN(mzd_remaining_rows_in_block(M, start_row), count_remaining); // Apparently we're calling with start_row == stop_row sometimes (seems a bug to me). if (count <= 0) return; if (a_word == b_word) { while(1) { count_remaining -= count; ptr += a_word; int fast_count = count / 4; int rest_count = count - 4 * fast_count; word xor_v[4]; wi_t const rowstride = M->rowstride; while (fast_count--) { xor_v[0] = ptr[0]; xor_v[1] = ptr[rowstride]; xor_v[2] = ptr[2 * rowstride]; xor_v[3] = ptr[3 * rowstride]; xor_v[0] ^= xor_v[0] >> offset; xor_v[1] ^= xor_v[1] >> offset; xor_v[2] ^= xor_v[2] >> offset; xor_v[3] ^= xor_v[3] >> offset; xor_v[0] &= mask; xor_v[1] &= mask; xor_v[2] &= mask; xor_v[3] &= mask; xor_v[0] |= xor_v[0] << offset; xor_v[1] |= xor_v[1] << offset; xor_v[2] |= xor_v[2] << offset; xor_v[3] |= xor_v[3] << offset; ptr[0] ^= xor_v[0]; ptr[rowstride] ^= xor_v[1]; ptr[2 * rowstride] ^= xor_v[2]; ptr[3 * rowstride] ^= xor_v[3]; ptr += 4 * rowstride; } while (rest_count--) { word xor_v = *ptr; xor_v ^= xor_v >> offset; xor_v &= mask; *ptr ^= xor_v | (xor_v << offset); ptr += rowstride; } block++; if ((count = MIN(mzd_rows_in_block(M, block), count_remaining)) <= 0) break; ptr = mzd_first_row_next_block(M, block); } } else { word* RESTRICT min_ptr; wi_t 
max_offset; if (min_bit == a_bit) { min_ptr = ptr + a_word; max_offset = b_word - a_word; } else { min_ptr = ptr + b_word; max_offset = a_word - b_word; } while(1) { count_remaining -= count; wi_t const rowstride = M->rowstride; while(count--) { word xor_v = (min_ptr[0] ^ (min_ptr[max_offset] >> offset)) & mask; min_ptr[0] ^= xor_v; min_ptr[max_offset] ^= xor_v << offset; min_ptr += rowstride; } block++; if ((count = MIN(mzd_rows_in_block(M,+block), count_remaining)) <= 0) break; ptr = mzd_first_row_next_block(M, block); if (min_bit == a_bit) min_ptr = ptr + a_word; else min_ptr = ptr + b_word; } } __M4RI_DD_MZD(M); } /** * \brief Read the bit at position M[row,col]. * * \param M Matrix * \param row Row index * \param col Column index * * \note No bounds checks whatsoever are performed. * */ static inline BIT mzd_read_bit(mzd_t const *M, rci_t const row, rci_t const col ) { return __M4RI_GET_BIT(M->rows[row][(col+M->offset)/m4ri_radix], (col+M->offset) % m4ri_radix); } /** * \brief Write the bit value to position M[row,col] * * \param M Matrix * \param row Row index * \param col Column index * \param value Either 0 or 1 * * \note No bounds checks whatsoever are performed. * */ static inline void mzd_write_bit(mzd_t *M, rci_t const row, rci_t const col, BIT const value) { __M4RI_WRITE_BIT(M->rows[row][(col + M->offset) / m4ri_radix], (col + M->offset) % m4ri_radix, value); } /** * \brief XOR n bits from values to M starting a position (x,y). * * \param M Source matrix. * \param x Starting row. * \param y Starting column. 
* \param n Number of bits (<= m4ri_radix); * \param values Word with values; */ static inline void mzd_xor_bits(mzd_t const *M, rci_t const x, rci_t const y, int const n, word values) { int const spot = (y + M->offset) % m4ri_radix; wi_t const block = (y + M->offset) / m4ri_radix; M->rows[x][block] ^= values << spot; int const space = m4ri_radix - spot; if (n > space) M->rows[x][block + 1] ^= values >> space; } /** * \brief AND n bits from values to M starting a position (x,y). * * \param M Source matrix. * \param x Starting row. * \param y Starting column. * \param n Number of bits (<= m4ri_radix); * \param values Word with values; */ static inline void mzd_and_bits(mzd_t const *M, rci_t const x, rci_t const y, int const n, word values) { /* This is the best way, since this will drop out once we inverse the bits in values: */ values >>= (m4ri_radix - n); /* Move the bits to the lowest columns */ int const spot = (y + M->offset) % m4ri_radix; wi_t const block = (y + M->offset) / m4ri_radix; M->rows[x][block] &= values << spot; int const space = m4ri_radix - spot; if (n > space) M->rows[x][block + 1] &= values >> space; } /** * \brief Clear n bits in M starting a position (x,y). * * \param M Source matrix. * \param x Starting row. * \param y Starting column. * \param n Number of bits (0 < n <= m4ri_radix); */ static inline void mzd_clear_bits(mzd_t const *M, rci_t const x, rci_t const y, int const n) { assert(n>0 && n <= m4ri_radix); word values = m4ri_ffff >> (m4ri_radix - n); int const spot = (y + M->offset) % m4ri_radix; wi_t const block = (y + M->offset) / m4ri_radix; M->rows[x][block] &= ~(values << spot); int const space = m4ri_radix - spot; if (n > space) M->rows[x][block + 1] &= ~(values >> space); } /** * \brief Add the rows sourcerow and destrow and stores the total in the row * destrow, but only begins at the column coloffset. 
* * \param M Matrix * \param dstrow Index of target row * \param srcrow Index of source row * \param coloffset Start column (0 <= coloffset < M->ncols) * * \warning This function expects that there is at least one word worth of work. */ static inline void mzd_row_add_offset(mzd_t *M, rci_t dstrow, rci_t srcrow, rci_t coloffset) { assert(dstrow < M->nrows && srcrow < M->nrows && coloffset < M->ncols); coloffset += M->offset; wi_t const startblock= coloffset/m4ri_radix; wi_t wide = M->width - startblock; word *src = M->rows[srcrow] + startblock; word *dst = M->rows[dstrow] + startblock; word const mask_begin = __M4RI_RIGHT_BITMASK(m4ri_radix - coloffset % m4ri_radix); word const mask_end = __M4RI_LEFT_BITMASK((M->ncols + M->offset) % m4ri_radix); *dst++ ^= *src++ & mask_begin; --wide; #if __M4RI_HAVE_SSE2 int not_aligned = __M4RI_ALIGNMENT(src,16) != 0; /* 0: Aligned, 1: Not aligned */ if (wide > not_aligned + 1) /* Speed up for small matrices */ { if (not_aligned) { *dst++ ^= *src++; --wide; } /* Now wide > 1 */ __m128i* __src = (__m128i*)src; __m128i* __dst = (__m128i*)dst; __m128i* const eof = (__m128i*)((unsigned long)(src + wide) & ~0xFUL); do { __m128i xmm1 = _mm_xor_si128(*__dst, *__src); *__dst++ = xmm1; } while(++__src < eof); src = (word*)__src; dst = (word*)__dst; wide = ((sizeof(word)*wide)%16)/sizeof(word); } #endif wi_t i = -1; while(++i < wide) dst[i] ^= src[i]; /* * Revert possibly non-zero excess bits. * Note that i == wide here, and wide can be 0. * But really, src[wide - 1] is M->rows[srcrow][M->width - 1] ;) * We use i - 1 here to let the compiler know these are the same addresses * that we last accessed, in the previous loop. */ dst[i - 1] ^= src[i - 1] & ~mask_end; __M4RI_DD_ROW(M, dstrow); } /** * \brief Add the rows sourcerow and destrow and stores the total in * the row destrow. * * \param M Matrix * \param sourcerow Index of source row * \param destrow Index of target row * * \note this can be done much faster with mzd_combine. 
*/ void mzd_row_add(mzd_t *M, rci_t const sourcerow, rci_t const destrow); /** * \brief Transpose a matrix. * * This function uses the fact that: \verbatim [ A B ]T [AT CT] [ C D ] = [BT DT] \endverbatim * and thus rearranges the blocks recursively. * * \param DST Preallocated return matrix, may be NULL for automatic creation. * \param A Matrix */ mzd_t *mzd_transpose(mzd_t *DST, mzd_t const *A); /** * \brief Naive cubic matrix multiplication. * * That is, compute C such that C == AB. * * \param C Preallocated product matrix, may be NULL for automatic creation. * \param A Input matrix A. * \param B Input matrix B. * * \note Normally, if you will multiply several times by b, it is * smarter to calculate bT yourself, and keep it, and then use the * function called _mzd_mul_naive * */ mzd_t *mzd_mul_naive(mzd_t *C, mzd_t const *A, mzd_t const *B); /** * \brief Naive cubic matrix multiplication and addition * * That is, compute C such that C == C + AB. * * \param C Preallocated product matrix. * \param A Input matrix A. * \param B Input matrix B. * * \note Normally, if you will multiply several times by b, it is * smarter to calculate bT yourself, and keep it, and then use the * function called _mzd_mul_naive */ mzd_t *mzd_addmul_naive(mzd_t *C, mzd_t const *A, mzd_t const *B); /** * \brief Naive cubic matrix multiplication with the pre-transposed B. * * That is, compute C such that C == AB^t. * * \param C Preallocated product matrix. * \param A Input matrix A. * \param B Pre-transposed input matrix B. * \param clear Whether to clear C before accumulating AB */ mzd_t *_mzd_mul_naive(mzd_t *C, mzd_t const *A, mzd_t const *B, int const clear); /** * \brief Matrix multiplication optimized for v*A where v is a vector. * * \param C Preallocated product matrix. * \param v Input matrix v. * \param A Input matrix A. * \param clear If set clear C first, otherwise add result to C. 
* */ mzd_t *_mzd_mul_va(mzd_t *C, mzd_t const *v, mzd_t const *A, int const clear); /** * \brief Fill matrix M with uniformly distributed bits. * * \param M Matrix * * \todo Allow the user to provide a RNG callback. * * \wordoffset */ void mzd_randomize(mzd_t *M); /** * \brief Set the matrix M to the value equivalent to the integer * value provided. * * Specifically, this function does nothing if value%2 == 0 and * returns the identity matrix if value%2 == 1. * * If the matrix is not square then the largest possible square * submatrix is set to the identity matrix. * * \param M Matrix * \param value Either 0 or 1 */ void mzd_set_ui(mzd_t *M, unsigned int const value); /** * \brief Gaussian elimination. * * This will do Gaussian elimination on the matrix m but will start * not at column 0 necc but at column startcol. If full=FALSE, then it * will do triangular style elimination, and if full=TRUE, it will do * Gauss-Jordan style, or full elimination. * * \param M Matrix * \param startcol First column to consider for reduction. * \param full Gauss-Jordan style or upper triangular form only. * * \wordoffset */ rci_t mzd_gauss_delayed(mzd_t *M, rci_t const startcol, int const full); /** * \brief Gaussian elimination. * * This will do Gaussian elimination on the matrix m. If full=FALSE, * then it will do triangular style elimination, and if full=TRUE, * it will do Gauss-Jordan style, or full elimination. * * \param M Matrix * \param full Gauss-Jordan style or upper triangular form only. * * \wordoffset * * \sa mzd_echelonize_m4ri(), mzd_echelonize_pluq() */ rci_t mzd_echelonize_naive(mzd_t *M, int const full); /** * \brief Return TRUE if A == B. * * \param A Matrix * \param B Matrix * * \wordoffset */ int mzd_equal(mzd_t const *A, mzd_t const *B); /** * \brief Return -1,0,1 if if A < B, A == B or A > B respectively. * * \param A Matrix. * \param B Matrix. 
*
 * \note This comparison is not well defined mathematically and
 * relatively arbitrary since elements of GF(2) don't have an
 * ordering.
 *
 * \wordoffset
 */
int mzd_cmp(mzd_t const *A, mzd_t const *B);

/**
 * \brief Copy matrix A to DST.
 *
 * \param DST May be NULL for automatic creation.
 * \param A Source matrix.
 */
mzd_t *mzd_copy(mzd_t *DST, mzd_t const *A);

/**
 * \brief Concatenate B to A and write the result to C.
 *
 * That is,
 *
\verbatim
[ A ], [ B ] -> [ A  B ] = C
\endverbatim
 *
 * The inputs are not modified; the result is written to C, which is
 * newly created when C is NULL.
 *
 * \param C Matrix, may be NULL for automatic creation
 * \param A Matrix
 * \param B Matrix
 *
 * \note This is sometimes called augment.
 *
 * \wordoffset
 */
mzd_t *mzd_concat(mzd_t *C, mzd_t const *A, mzd_t const *B);

/**
 * \brief Stack A on top of B and write the result to C.
 *
 * That is,
 *
\verbatim
[ A ], [ B ] -> [ A ] = C
                [ B ]
\endverbatim
 *
 * The inputs are not modified; the result is written to C, which is
 * newly created when C is NULL.
 *
 * \param C Matrix, may be NULL for automatic creation
 * \param A Matrix
 * \param B Matrix
 *
 * \wordoffset
 */
mzd_t *mzd_stack(mzd_t *C, mzd_t const *A, mzd_t const *B);

/**
 * \brief Copy a submatrix.
 *
 * Note that the upper bounds are not included, i.e. the copied
 * region is rows [lowr, highr) and columns [lowc, highc).
 *
 * \param S Preallocated space for submatrix, may be NULL for automatic creation.
 * \param M Matrix
 * \param lowr start row
 * \param lowc start column
 * \param highr stop row (this row is \em not included)
 * \param highc stop column (this column is \em not included)
 */
mzd_t *mzd_submatrix(mzd_t *S, mzd_t const *M, rci_t const lowr, rci_t const lowc, rci_t const highr, rci_t const highc);

/**
 * \brief Invert the matrix target using Gaussian elimination.
 *
 * To avoid recomputing the identity matrix over and over again, I may
 * be passed in as identity parameter.
 *
 * \param INV Preallocated space for inversion matrix, may be NULL for automatic creation.
 * \param A Matrix to be reduced.
 * \param I Identity matrix.
* * \wordoffset */ mzd_t *mzd_invert_naive(mzd_t *INV, mzd_t const *A, mzd_t const *I); /** * \brief Set C = A+B. * * C is also returned. If C is NULL then a new matrix is created which * must be freed by mzd_free. * * \param C Preallocated sum matrix, may be NULL for automatic creation. * \param A Matrix * \param B Matrix */ mzd_t *mzd_add(mzd_t *C, mzd_t const *A, mzd_t const *B); /** * \brief Same as mzd_add but without any checks on the input. * * \param C Preallocated sum matrix, may be NULL for automatic creation. * \param A Matrix * \param B Matrix * * \wordoffset */ mzd_t *_mzd_add(mzd_t *C, mzd_t const *A, mzd_t const *B); /** * \brief Same as mzd_add. * * \param C Preallocated difference matrix, may be NULL for automatic creation. * \param A Matrix * \param B Matrix * * \wordoffset */ #define mzd_sub mzd_add /** * \brief Same as mzd_sub but without any checks on the input. * * \param C Preallocated difference matrix, may be NULL for automatic creation. * \param A Matrix * \param B Matrix * * \wordoffset */ #define _mzd_sub _mzd_add /** * Get n bits starting a position (x,y) from the matrix M. * * \param M Source matrix. * \param x Starting row. * \param y Starting column. * \param n Number of bits (<= m4ri_radix); */ static inline word mzd_read_bits(mzd_t const *M, rci_t const x, rci_t const y, int const n) { int const spot = (y + M->offset) % m4ri_radix; wi_t const block = (y + M->offset) / m4ri_radix; int const spill = spot + n - m4ri_radix; word temp = (spill <= 0) ? M->rows[x][block] << -spill : (M->rows[x][block + 1] << (m4ri_radix - spill)) | (M->rows[x][block] >> spill); return temp >> (m4ri_radix - n); } /** * \brief row3[col3:] = row1[col1:] + row2[col2:] * * Adds row1 of SC1, starting with startblock1 to the end, to * row2 of SC2, starting with startblock2 to the end. This gets stored * in DST, in row3, starting with startblock3. 
* * \param DST destination matrix * \param row3 destination row for matrix dst * \param startblock3 starting block to work on in matrix dst * \param SC1 source matrix * \param row1 source row for matrix sc1 * \param startblock1 starting block to work on in matrix sc1 * \param SC2 source matrix * \param startblock2 starting block to work on in matrix sc2 * \param row2 source row for matrix sc2 * */ void mzd_combine(mzd_t *DST, rci_t const row3, wi_t const startblock3, mzd_t const *SC1, rci_t const row1, wi_t const startblock1, mzd_t const *SC2, rci_t const row2, wi_t const startblock2); /** * \brief c_row[c_startblock:] = a_row[a_startblock:] + b_row[b_startblock:] for different offsets * * Adds a_row of A, starting with a_startblock to the end, to * b_row of B, starting with b_startblock to the end. This gets stored * in C, in c_row, starting with c_startblock. * * \param C destination matrix * \param c_row destination row for matrix C * \param A source matrix * \param a_row source row for matrix A * \param B source matrix * \param b_row source row for matrix B * */ static inline void mzd_combine_weird(mzd_t *C, rci_t const c_row, mzd_t const *A, rci_t const a_row, mzd_t const *B, rci_t const b_row) { word tmp; rci_t i = 0; for(; i + m4ri_radix <= A->ncols; i += m4ri_radix) { tmp = mzd_read_bits(A, a_row, i, m4ri_radix) ^ mzd_read_bits(B, b_row, i, m4ri_radix); mzd_clear_bits(C, c_row, i, m4ri_radix); mzd_xor_bits(C, c_row, i, m4ri_radix, tmp); } if(A->ncols - i) { tmp = mzd_read_bits(A, a_row, i, (A->ncols - i)) ^ mzd_read_bits(B, b_row, i, (B->ncols - i)); mzd_clear_bits(C, c_row, i, (C->ncols - i)); mzd_xor_bits(C, c_row, i, (C->ncols - i), tmp); } __M4RI_DD_MZD(C); } /** * \brief a_row[a_startblock:] += b_row[b_startblock:] for offset 0 * * Adds a_row of A, starting with a_startblock to the end, to * b_row of B, starting with b_startblock to the end. This gets stored * in A, in a_row, starting with a_startblock. 
* * \param A destination matrix * \param a_row destination row for matrix C * \param a_startblock starting block to work on in matrix C * \param B source matrix * \param b_row source row for matrix B * \param b_startblock starting block to work on in matrix B * */ static inline void mzd_combine_even_in_place(mzd_t *A, rci_t const a_row, wi_t const a_startblock, mzd_t const *B, rci_t const b_row, wi_t const b_startblock) { wi_t wide = A->width - a_startblock - 1; word *a = A->rows[a_row] + a_startblock; word *b = B->rows[b_row] + b_startblock; #if __M4RI_HAVE_SSE2 if(wide > __M4RI_SSE2_CUTOFF) { /** check alignments **/ if (__M4RI_ALIGNMENT(a,16)) { *a++ ^= *b++; wide--; } if (__M4RI_ALIGNMENT(a, 16) == 0 && __M4RI_ALIGNMENT(b, 16) == 0) { __m128i *a128 = (__m128i*)a; __m128i *b128 = (__m128i*)b; const __m128i *eof = (__m128i*)((unsigned long)(a + wide) & ~0xFUL); do { *a128 = _mm_xor_si128(*a128, *b128); ++b128; ++a128; } while(a128 < eof); a = (word*)a128; b = (word*)b128; wide = ((sizeof(word) * wide) % 16) / sizeof(word); } } #endif // __M4RI_HAVE_SSE2 if (wide > 0) { wi_t n = (wide + 7) / 8; switch (wide % 8) { case 0: do { *(a++) ^= *(b++); case 7: *(a++) ^= *(b++); case 6: *(a++) ^= *(b++); case 5: *(a++) ^= *(b++); case 4: *(a++) ^= *(b++); case 3: *(a++) ^= *(b++); case 2: *(a++) ^= *(b++); case 1: *(a++) ^= *(b++); } while (--n > 0); } } *a ^= *b & __M4RI_LEFT_BITMASK(A->ncols%m4ri_radix); __M4RI_DD_MZD(A); } /** * \brief c_row[c_startblock:] = a_row[a_startblock:] + b_row[b_startblock:] for offset 0 * * Adds a_row of A, starting with a_startblock to the end, to * b_row of B, starting with b_startblock to the end. This gets stored * in C, in c_row, starting with c_startblock. 
* * \param C destination matrix * \param c_row destination row for matrix C * \param c_startblock starting block to work on in matrix C * \param A source matrix * \param a_row source row for matrix A * \param a_startblock starting block to work on in matrix A * \param B source matrix * \param b_row source row for matrix B * \param b_startblock starting block to work on in matrix B * */ static inline void mzd_combine_even(mzd_t *C, rci_t const c_row, wi_t const c_startblock, mzd_t const *A, rci_t const a_row, wi_t const a_startblock, mzd_t const *B, rci_t const b_row, wi_t const b_startblock) { wi_t wide = A->width - a_startblock - 1; word *a = A->rows[a_row] + a_startblock; word *b = B->rows[b_row] + b_startblock; word *c = C->rows[c_row] + c_startblock; /* /\* this is a corner case triggered by Strassen multiplication */ /* * which assumes certain (virtual) matrix sizes */ /* * 2011/03/07: I don't think this was ever correct *\/ */ /* if (a_row >= A->nrows) { */ /* assert(a_row < A->nrows); */ /* for(wi_t i = 0; i < wide; ++i) { */ /* c[i] = b[i]; */ /* } */ /* } else { */ #if __M4RI_HAVE_SSE2 if(wide > __M4RI_SSE2_CUTOFF) { /** check alignments **/ if (__M4RI_ALIGNMENT(a,16)) { *c++ = *b++ ^ *a++; wide--; } if ( (__M4RI_ALIGNMENT(b, 16) | __M4RI_ALIGNMENT(c, 16)) == 0) { __m128i *a128 = (__m128i*)a; __m128i *b128 = (__m128i*)b; __m128i *c128 = (__m128i*)c; const __m128i *eof = (__m128i*)((unsigned long)(a + wide) & ~0xFUL); do { *c128 = _mm_xor_si128(*a128, *b128); ++c128; ++b128; ++a128; } while(a128 < eof); a = (word*)a128; b = (word*)b128; c = (word*)c128; wide = ((sizeof(word) * wide) % 16) / sizeof(word); } } #endif // __M4RI_HAVE_SSE2 if (wide > 0) { wi_t n = (wide + 7) / 8; switch (wide % 8) { case 0: do { *(c++) = *(a++) ^ *(b++); case 7: *(c++) = *(a++) ^ *(b++); case 6: *(c++) = *(a++) ^ *(b++); case 5: *(c++) = *(a++) ^ *(b++); case 4: *(c++) = *(a++) ^ *(b++); case 3: *(c++) = *(a++) ^ *(b++); case 2: *(c++) = *(a++) ^ *(b++); case 1: *(c++) = *(a++) 
^ *(b++); } while (--n > 0); } } *c ^= ((*a ^ *b ^ *c) & __M4RI_LEFT_BITMASK(C->ncols%m4ri_radix)); __M4RI_DD_MZD(C); } /** * \brief Get n bits starting a position (x,y) from the matrix M. * * This function is in principle the same as mzd_read_bits, * but it explicitely returns an 'int' and is used as * index into an array (Gray code). */ static inline int mzd_read_bits_int(mzd_t const *M, rci_t const x, rci_t const y, int const n) { return __M4RI_CONVERT_TO_INT(mzd_read_bits(M, x, y, n)); } /** * \brief Zero test for matrix. * * \param A Input matrix. * */ int mzd_is_zero(mzd_t const *A); /** * \brief Clear the given row, but only begins at the column coloffset. * * \param M Matrix * \param row Index of row * \param coloffset Column offset */ void mzd_row_clear_offset(mzd_t *M, rci_t const row, rci_t const coloffset); /** * \brief Find the next nonzero entry in M starting at start_row and start_col. * * This function walks down rows in the inner loop and columns in the * outer loop. If a nonzero entry is found this function returns 1 and * zero otherwise. * * If and only if a nonzero entry is found r and c are updated. * * \param M Matrix * \param start_row Index of row where to start search * \param start_col Index of column where to start search * \param r Row index updated if pivot is found * \param c Column index updated if pivot is found */ int mzd_find_pivot(mzd_t const *M, rci_t start_row, rci_t start_col, rci_t *r, rci_t *c); /** * \brief Return the number of nonzero entries divided by nrows * * ncols * * If res = 0 then 100 samples per row are made, if res > 0 the * function takes res sized steps within each row (res = 1 uses every * word). * * \param A Matrix * \param res Resolution of sampling (in words) */ double mzd_density(mzd_t const *A, wi_t res); /** * \brief Return the number of nonzero entries divided by nrows * * ncols considering only the submatrix starting at (r,c). 
* * If res = 0 then 100 samples per row are made, if res > 0 the * function takes res sized steps within each row (res = 1 uses every * word). * * \param A Matrix * \param res Resolution of sampling (in words) * \param r Row to start counting * \param c Column to start counting */ double _mzd_density(mzd_t const *A, wi_t res, rci_t r, rci_t c); /** * \brief Return the first row with all zero entries. * * If no such row can be found returns nrows. * * \param A Matrix */ rci_t mzd_first_zero_row(mzd_t const *A); /** * \brief Return hash value for matrix. * * \param A Matrix */ static inline word mzd_hash(mzd_t const *A) { word hash = 0; for (rci_t r = 0; r < A->nrows; ++r) hash ^= rotate_word(calculate_hash(A->rows[r], A->width), r % m4ri_radix); return hash; } /** * Return upper triangular submatrix of A * * \param U Output matrix, if NULL a new matrix will be returned * \param A Source matrix * * \return U */ mzd_t *mzd_extract_u(mzd_t *U, mzd_t const *A); /** * Return lower triangular submatrix of A * * \param L Output matrix, if NULL a new matrix will be returned * \param A Source matrix * * \return L */ mzd_t *mzd_extract_l(mzd_t *L, mzd_t const *A); #endif // M4RI_MZD libm4ri-20130416/src/mzp.c000066400000000000000000000402071212302366200150570ustar00rootroot00000000000000/****************************************************************************** * * M4RI: Linear Algebra over GF(2) * * Copyright (C) 2008 Martin Albrecht * * Distributed under the terms of the GNU General Public License (GPL) * version 2 or higher. * * This code is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. 
* * The full text of the GPL is available at: * * http://www.gnu.org/licenses/ ******************************************************************************/ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include "mzp.h" #include "mzd.h" mzp_t *mzp_init(rci_t length) { mzp_t *P = (mzp_t*)m4ri_mm_malloc(sizeof(mzp_t)); P->values = (rci_t*)m4ri_mm_malloc(sizeof(rci_t) * length); P->length = length; for (rci_t i = 0; i < length; ++i) { P->values[i] = i; } return P; } void mzp_free(mzp_t *P) { m4ri_mm_free(P->values); m4ri_mm_free(P); } mzp_t *mzp_init_window(mzp_t *P, rci_t begin, rci_t end){ mzp_t *window = (mzp_t *)m4ri_mm_malloc(sizeof(mzp_t)); window->values = P->values + begin; window->length = end - begin; __M4RI_DD_MZP(window); return window; } void mzp_free_window(mzp_t *condemned){ m4ri_mm_free(condemned); } mzp_t *mzp_copy(mzp_t *P, const mzp_t *Q) { if(P == NULL) P = mzp_init(Q->length); for(rci_t i=0; ilength; i++) P->values[i] = Q->values[i]; return P; } void mzp_set_ui(mzp_t *P, unsigned int value) { assert(value == 1); for (rci_t i = 0; i < P->length; ++i) { P->values[i] = i; } } void mzd_apply_p_left(mzd_t *A, mzp_t const *P) { if(A->ncols == 0) return; rci_t const length = MIN(P->length, A->nrows); for (rci_t i = 0; i < length; ++i) { assert(P->values[i] >= i); mzd_row_swap(A, i, P->values[i]); } } void mzd_apply_p_left_trans(mzd_t *A, mzp_t const *P) { if(A->ncols == 0) return; rci_t const length = MIN(P->length, A->nrows); for (rci_t i = length - 1; i >= 0; --i) { assert(P->values[i] >= i); mzd_row_swap(A, i, P->values[i]); } } /* optimised column swap operations */ static inline void mzd_write_col_to_rows_blockd(mzd_t *A, mzd_t const *B, rci_t const *permutation, word const *write_mask, rci_t const start_row, rci_t const stop_row, rci_t length) { assert(A->offset == 0); for(rci_t i = 0; i < length; i += m4ri_radix) { /* optimisation for identity permutations */ if (write_mask[i / m4ri_radix] == m4ri_ffff) continue; int const todo = 
MIN(m4ri_radix, length - i); wi_t const a_word = (i + A->offset) / m4ri_radix; wi_t words[m4ri_radix]; int bits[m4ri_radix]; word bitmasks[m4ri_radix]; /* we pre-compute bit access in advance */ for(int k = 0; k < todo; ++k) { rci_t const colb = permutation[i + k] + B->offset; words[k] = colb / m4ri_radix; bits[k] = colb % m4ri_radix; bitmasks[k] = m4ri_one << bits[k]; } for (rci_t r = start_row; r < stop_row; ++r) { word const *Brow = B->rows[r-start_row]; word *Arow = A->rows[r]; register word value = 0; /* we gather the bits in a register word */ switch(todo-1) { case 63: value |= ((Brow[words[63]] & bitmasks[63]) >> bits[63]) << 63; case 62: value |= ((Brow[words[62]] & bitmasks[62]) >> bits[62]) << 62; case 61: value |= ((Brow[words[61]] & bitmasks[61]) >> bits[61]) << 61; case 60: value |= ((Brow[words[60]] & bitmasks[60]) >> bits[60]) << 60; case 59: value |= ((Brow[words[59]] & bitmasks[59]) >> bits[59]) << 59; case 58: value |= ((Brow[words[58]] & bitmasks[58]) >> bits[58]) << 58; case 57: value |= ((Brow[words[57]] & bitmasks[57]) >> bits[57]) << 57; case 56: value |= ((Brow[words[56]] & bitmasks[56]) >> bits[56]) << 56; case 55: value |= ((Brow[words[55]] & bitmasks[55]) >> bits[55]) << 55; case 54: value |= ((Brow[words[54]] & bitmasks[54]) >> bits[54]) << 54; case 53: value |= ((Brow[words[53]] & bitmasks[53]) >> bits[53]) << 53; case 52: value |= ((Brow[words[52]] & bitmasks[52]) >> bits[52]) << 52; case 51: value |= ((Brow[words[51]] & bitmasks[51]) >> bits[51]) << 51; case 50: value |= ((Brow[words[50]] & bitmasks[50]) >> bits[50]) << 50; case 49: value |= ((Brow[words[49]] & bitmasks[49]) >> bits[49]) << 49; case 48: value |= ((Brow[words[48]] & bitmasks[48]) >> bits[48]) << 48; case 47: value |= ((Brow[words[47]] & bitmasks[47]) >> bits[47]) << 47; case 46: value |= ((Brow[words[46]] & bitmasks[46]) >> bits[46]) << 46; case 45: value |= ((Brow[words[45]] & bitmasks[45]) >> bits[45]) << 45; case 44: value |= ((Brow[words[44]] & bitmasks[44]) >> 
bits[44]) << 44; case 43: value |= ((Brow[words[43]] & bitmasks[43]) >> bits[43]) << 43; case 42: value |= ((Brow[words[42]] & bitmasks[42]) >> bits[42]) << 42; case 41: value |= ((Brow[words[41]] & bitmasks[41]) >> bits[41]) << 41; case 40: value |= ((Brow[words[40]] & bitmasks[40]) >> bits[40]) << 40; case 39: value |= ((Brow[words[39]] & bitmasks[39]) >> bits[39]) << 39; case 38: value |= ((Brow[words[38]] & bitmasks[38]) >> bits[38]) << 38; case 37: value |= ((Brow[words[37]] & bitmasks[37]) >> bits[37]) << 37; case 36: value |= ((Brow[words[36]] & bitmasks[36]) >> bits[36]) << 36; case 35: value |= ((Brow[words[35]] & bitmasks[35]) >> bits[35]) << 35; case 34: value |= ((Brow[words[34]] & bitmasks[34]) >> bits[34]) << 34; case 33: value |= ((Brow[words[33]] & bitmasks[33]) >> bits[33]) << 33; case 32: value |= ((Brow[words[32]] & bitmasks[32]) >> bits[32]) << 32; case 31: value |= ((Brow[words[31]] & bitmasks[31]) >> bits[31]) << 31; case 30: value |= ((Brow[words[30]] & bitmasks[30]) >> bits[30]) << 30; case 29: value |= ((Brow[words[29]] & bitmasks[29]) >> bits[29]) << 29; case 28: value |= ((Brow[words[28]] & bitmasks[28]) >> bits[28]) << 28; case 27: value |= ((Brow[words[27]] & bitmasks[27]) >> bits[27]) << 27; case 26: value |= ((Brow[words[26]] & bitmasks[26]) >> bits[26]) << 26; case 25: value |= ((Brow[words[25]] & bitmasks[25]) >> bits[25]) << 25; case 24: value |= ((Brow[words[24]] & bitmasks[24]) >> bits[24]) << 24; case 23: value |= ((Brow[words[23]] & bitmasks[23]) >> bits[23]) << 23; case 22: value |= ((Brow[words[22]] & bitmasks[22]) >> bits[22]) << 22; case 21: value |= ((Brow[words[21]] & bitmasks[21]) >> bits[21]) << 21; case 20: value |= ((Brow[words[20]] & bitmasks[20]) >> bits[20]) << 20; case 19: value |= ((Brow[words[19]] & bitmasks[19]) >> bits[19]) << 19; case 18: value |= ((Brow[words[18]] & bitmasks[18]) >> bits[18]) << 18; case 17: value |= ((Brow[words[17]] & bitmasks[17]) >> bits[17]) << 17; case 16: value |= ((Brow[words[16]] & 
bitmasks[16]) >> bits[16]) << 16; case 15: value |= ((Brow[words[15]] & bitmasks[15]) >> bits[15]) << 15; case 14: value |= ((Brow[words[14]] & bitmasks[14]) >> bits[14]) << 14; case 13: value |= ((Brow[words[13]] & bitmasks[13]) >> bits[13]) << 13; case 12: value |= ((Brow[words[12]] & bitmasks[12]) >> bits[12]) << 12; case 11: value |= ((Brow[words[11]] & bitmasks[11]) >> bits[11]) << 11; case 10: value |= ((Brow[words[10]] & bitmasks[10]) >> bits[10]) << 10; case 9: value |= ((Brow[words[ 9]] & bitmasks[ 9]) >> bits[ 9]) << 9; case 8: value |= ((Brow[words[ 8]] & bitmasks[ 8]) >> bits[ 8]) << 8; case 7: value |= ((Brow[words[ 7]] & bitmasks[ 7]) >> bits[ 7]) << 7; case 6: value |= ((Brow[words[ 6]] & bitmasks[ 6]) >> bits[ 6]) << 6; case 5: value |= ((Brow[words[ 5]] & bitmasks[ 5]) >> bits[ 5]) << 5; case 4: value |= ((Brow[words[ 4]] & bitmasks[ 4]) >> bits[ 4]) << 4; case 3: value |= ((Brow[words[ 3]] & bitmasks[ 3]) >> bits[ 3]) << 3; case 2: value |= ((Brow[words[ 2]] & bitmasks[ 2]) >> bits[ 2]) << 2; case 1: value |= ((Brow[words[ 1]] & bitmasks[ 1]) >> bits[ 1]) << 1; case 0: value |= ((Brow[words[ 0]] & bitmasks[ 0]) >> bits[ 0]) << 0; default: break; } /* for(int k = 0; k < todo; ++k) { */ /* value |= ((Brow[words[k]] & bitmasks[k]) << bits[k]) >> k; */ /* } */ /* and write the word once */ Arow[a_word] |= value; } } __M4RI_DD_MZD(A); } /** * Implements both apply_p_right and apply_p_right_trans. 
*/ void _mzd_apply_p_right_even(mzd_t *A, mzp_t const *P, rci_t start_row, rci_t start_col, int notrans) { assert(A->offset == 0); if(A->nrows - start_row == 0) return; rci_t const length = MIN(P->length, A->ncols); wi_t const width = A->width; int step_size = MIN(A->nrows - start_row, MAX((__M4RI_CPU_L1_CACHE >> 3) / A->width, 1)); /* our temporary where we store the columns we want to swap around */ mzd_t *B = mzd_init(step_size, A->ncols); word *Arow; word *Brow; /* setup mathematical permutation */ rci_t *permutation = (rci_t*)m4ri_mm_calloc(A->ncols, sizeof(rci_t)); for(rci_t i = 0; i < A->ncols; ++i) permutation[i] = i; if (!notrans) { for(rci_t i = start_col; i < length; ++i) { rci_t t = permutation[i]; permutation[i] = permutation[P->values[i]]; permutation[P->values[i]] = t; } } else { for(rci_t i = start_col; i < length; ++i) { rci_t t = permutation[length - i - 1]; permutation[length - i - 1] = permutation[P->values[length - i - 1]]; permutation[P->values[length - i - 1]] = t; } } /* we have a bitmask to encode where to write to */ word *write_mask = (word*)m4ri_mm_calloc(width, sizeof(word)); #ifdef M4RI_WRAPWORD word::init_array(write_mask, width); #endif for(rci_t i = 0; i < A->ncols; i += m4ri_radix) { int const todo = MIN(m4ri_radix, A->ncols - i); for(int k = 0; k < todo; ++k) { if(permutation[i + k] == i + k) { write_mask[i / m4ri_radix] |= m4ri_one << k; } } } for(rci_t i = start_row; i < A->nrows; i += step_size) { step_size = MIN(step_size, A->nrows - i); for(int k = 0; k < step_size; ++k) { Arow = A->rows[i+k]; Brow = B->rows[k]; /*copy row & clear those values which will be overwritten */ for(wi_t j = 0; j < width; ++j) { Brow[j] = Arow[j]; Arow[j] = Arow[j] & write_mask[j]; } } /* here we actually write out the permutation */ mzd_write_col_to_rows_blockd(A, B, permutation, write_mask, i, i + step_size, length); } m4ri_mm_free(permutation); m4ri_mm_free(write_mask); mzd_free(B); __M4RI_DD_MZD(A); } void _mzd_apply_p_right_trans(mzd_t *A, 
mzp_t const *P) { if(A->nrows == 0) return; rci_t const length = MIN(P->length, A->ncols); int const step_size = MAX((__M4RI_CPU_L1_CACHE >> 3) / A->width, 1); for(rci_t j = 0; j < A->nrows; j += step_size) { rci_t stop_row = MIN(j + step_size, A->nrows); for (rci_t i = 0; i < length; ++i) { assert(P->values[i] >= i); mzd_col_swap_in_rows(A, i, P->values[i], j, stop_row); } } /* for (i=0; ilength; i++) { */ /* assert(P->values[i] >= i); */ /* mzd_col_swap(A, i, P->values[i]); */ /* } */ __M4RI_DD_MZD(A); } void _mzd_apply_p_right(mzd_t *A, mzp_t const *P) { if(A->nrows == 0) return; int const step_size = MAX((__M4RI_CPU_L1_CACHE >> 3) / A->width, 1); for(rci_t j = 0; j < A->nrows; j += step_size) { rci_t stop_row = MIN(j + step_size, A->nrows); for (rci_t i = P->length - 1; i >= 0; --i) { assert(P->values[i] >= i); mzd_col_swap_in_rows(A, i, P->values[i], j, stop_row); } } /* long i; */ /* for (i=P->length-1; i>=0; i--) { */ /* assert(P->values[i] >= i); */ /* mzd_col_swap(A, i, P->values[i]); */ /* } */ __M4RI_DD_MZD(A); } void mzd_apply_p_right_trans(mzd_t *A, mzp_t const *P) { if(!A->nrows) return; if(A->offset) { _mzd_apply_p_right_trans(A,P); return; } _mzd_apply_p_right_even(A, P, 0, 0, 0); } void mzd_apply_p_right(mzd_t *A, mzp_t const *P) { if(!A->nrows) return; if(A->offset) { _mzd_apply_p_right(A,P); return; } _mzd_apply_p_right_even(A, P, 0, 0, 1); } void mzd_apply_p_right_trans_even_capped(mzd_t *A, mzp_t const *P, rci_t start_row, rci_t start_col) { assert(A->offset == 0); if(!A->nrows) return; _mzd_apply_p_right_even(A, P, start_row, start_col, 0); } void mzd_apply_p_right_even_capped(mzd_t *A, mzp_t const *P, rci_t start_row, rci_t start_col) { assert(A->offset == 0); if(!A->nrows) return; _mzd_apply_p_right_even(A, P, start_row, start_col, 1); } void mzp_print(mzp_t const *P) { printf("[ "); for(rci_t i = 0; i < P->length; ++i) { printf("%zd ", (size_t)P->values[i]); } printf("]"); } void mzd_apply_p_right_trans_tri(mzd_t *A, mzp_t const *P) { 
assert(P->length == A->ncols); int const step_size = MAX((__M4RI_CPU_L1_CACHE >> 2) / A->width, 1); for(rci_t r = 0; r < A->nrows; r += step_size) { rci_t const row_bound = MIN(r + step_size, A->nrows); for (rci_t i =0 ; i < A->ncols; ++i) { assert(P->values[i] >= i); mzd_col_swap_in_rows(A, i, P->values[i], r, MIN(row_bound, i)); } } __M4RI_DD_MZD(A); } void _mzd_compress_l(mzd_t *A, rci_t r1, rci_t n1, rci_t r2) { /** * We are compressing this matrix \verbatim r1 n1 ------------------------------------------ | \ \____|___ | A01 | | \ | \ | | r1------------------------------------------ | | | | \ \_____ | | L1| | | \ \________| | | | | L2| | ------------------------------------------ \endverbatim * * to this matrix * \verbatim r1 n1 ------------------------------------------ | \ \____|___ | A01 | | \ | \ | | r1------------------------------------------ | \ | | \_____ | | \ | | \________| | | | | | ------------------------------------------ \endverbatim */ if (r1 == n1) return; #if 0 mzp_t *shift = mzp_init(A->ncols); for (rci_t i=r1,j=n1;ivalues[i] = j; } mzd_apply_p_right_trans_even_capped(A, shift, r1+r2, 0); mzp_free(shift); #else for (rci_t i = r1, j = n1; i < r1 + r2; ++i, ++j){ mzd_col_swap_in_rows(A, i, j, i, r1 + r2); } word tmp; wi_t block; for(rci_t i = r1 + r2; i < A->nrows; ++i) { rci_t j = r1; /* first we deal with the rest of the current word we need to write */ int const rest = m4ri_radix - ((j + A->offset) % m4ri_radix); tmp = mzd_read_bits(A, i, n1, rest); mzd_clear_bits(A, i, j, rest); mzd_xor_bits(A, i, j, rest, tmp); j += rest; /* now each write is simply a word write */ block = (n1 + j - r1 + A->offset) / m4ri_radix; if (rest % m4ri_radix == 0) { for( ; j + m4ri_radix <= r1 + r2; j += m4ri_radix, ++block) { tmp = A->rows[i][block]; A->rows[i][j / m4ri_radix] = tmp; } } else { for(; j + m4ri_radix <= r1 + r2; j += m4ri_radix, ++block) { tmp = (A->rows[i][block] >> rest) | ( A->rows[i][block + 1] << (m4ri_radix - rest)); A->rows[i][j / 
m4ri_radix] = tmp; } } /* we deal with the remaining bits. While we could write past the end of r1+r2 here, but we have no guarantee that we can read past the end of n1+r2. */ if (j < r1 + r2) { tmp = mzd_read_bits(A, i, n1 + j - r1, r1 + r2 - j); A->rows[i][j / m4ri_radix] = tmp; } /* now clear the rest of L2 */ j = r1 + r2; mzd_clear_bits(A, i, j, m4ri_radix - ((j + A->offset) % m4ri_radix)); j += m4ri_radix - ((j + A->offset) % m4ri_radix); /* it's okay to write the full word, i.e. past n1+r2, because everything is zero there anyway. Thus, we can omit the code which deals with last few bits. */ for(; j < n1 + r2; j += m4ri_radix) { A->rows[i][j / m4ri_radix] = 0; } } #endif __M4RI_DD_MZD(A); } libm4ri-20130416/src/mzp.h000066400000000000000000000116601212302366200150650ustar00rootroot00000000000000/** * \file mzp.h * * \brief Permutation matrices. * * \author Martin Albrecht * */ /****************************************************************************** * * M4RI: Linear Algebra over GF(2) * * Copyright (C) 2008 Martin Albrecht * * Distributed under the terms of the GNU General Public License (GPL) * version 2 or higher. * * This code is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * The full text of the GPL is available at: * * http://www.gnu.org/licenses/ ******************************************************************************/ #ifndef M4RI_MZP #define M4RI_MZP #include /** * \brief Permutations. */ typedef struct mzp_t { /** * The swap operations in LAPACK format. */ rci_t *values; /** * The length of the swap array. */ rci_t length; } mzp_t; // note that this is NOT mpz_t /** * Construct an identity permutation. * * \param length Length of the permutation. */ mzp_t *mzp_init(rci_t length); /** * Free a mzp_t. * * \param P Permutation to free. 
*/
void mzp_free(mzp_t *P);

/**
 * \brief Create a window/view into the permutation P.
 *
 * The window shares its values with P. Use mzp_free_window() to free
 * the window.
 *
 * \param P Permutation matrix
 * \param begin Starting index (inclusive)
 * \param end Ending index (exclusive)
 *
 */
mzp_t *mzp_init_window(mzp_t *P, rci_t begin, rci_t end);

/**
 * \brief Free a permutation window created with
 * mzp_init_window().
 *
 * \param condemned Permutation Matrix
 */
void mzp_free_window(mzp_t *condemned);

/**
 * \brief copy permutation Q to P
 *
 * \param P Target permutation matrix (may be NULL)
 * \param Q Source permutation matrix (must not be NULL)
 */
mzp_t *mzp_copy(mzp_t *P, const mzp_t *Q);

/**
 * \brief Set the permutation P to the identity permutation. The only
 * allowed value is 1.
 *
 *
 * \param P Permutation
 * \param value 1
 *
 * \note This interface was chosen to be similar to mzd_set_ui().
 */
void mzp_set_ui(mzp_t *P, unsigned int value);

/**
 * Apply the permutation P to A from the left.
 *
 * This is equivalent to row swaps walking from 0 to length-1.
 *
 * \param A Matrix.
 * \param P Permutation.
 */
void mzd_apply_p_left(mzd_t *A, mzp_t const *P);

/**
 * Apply the permutation P to A from the left but transpose P before.
 *
 * This is equivalent to row swaps walking from length-1 to 0.
 *
 * \param A Matrix.
 * \param P Permutation.
 */
void mzd_apply_p_left_trans(mzd_t *A, mzp_t const *P);

/**
 * Apply the permutation P to A from the right.
 *
 * This is equivalent to column swaps walking from length-1 to 0.
 *
 * \param A Matrix.
 * \param P Permutation.
 */
void mzd_apply_p_right(mzd_t *A, mzp_t const *P);

/**
 * Apply the permutation P to A from the right but transpose P before.
 *
 * This is equivalent to column swaps walking from 0 to length-1.
 *
 * \param A Matrix.
 * \param P Permutation.
 */
void mzd_apply_p_right_trans(mzd_t *A, mzp_t const *P);

/**
 * Apply the permutation P to A from the right starting at start_row.
 *
 * This is equivalent to column swaps walking from length-1 to 0.
 *
 * \param A Matrix.
* \param P Permutation.
 * \param start_row Start swapping at this row.
 * \param start_col Start swapping at this column.
 *
 * \wordoffset
 */
void mzd_apply_p_right_even_capped(mzd_t *A, mzp_t const *P, rci_t start_row, rci_t start_col);

/**
 * Apply the permutation P^T to A from the right starting at start_row.
 *
 * This is equivalent to column swaps walking from 0 to length-1.
 *
 * \param A Matrix.
 * \param P Permutation.
 * \param start_row Start swapping at this row.
 * \param start_col Start swapping at this column.
 *
 * \wordoffset
 */
void mzd_apply_p_right_trans_even_capped(mzd_t *A, mzp_t const *P, rci_t start_row, rci_t start_col);

/**
 * Apply the mzp_t P to A from the right but transpose P before.
 *
 * This is equivalent to column swaps walking from 0 to length-1.
 *
 * \note This prototype repeats the earlier declaration of
 * mzd_apply_p_right_trans() in this header; the repetition is harmless.
 *
 * \param A Matrix.
 * \param P Permutation.
 */
void mzd_apply_p_right_trans(mzd_t *A, mzp_t const *P);

/**
 * Apply the permutation P to A from the right but transpose P before,
 * touching only the part of the matrix A above the main diagonal.
 *
 * This is equivalent to column swaps walking from 0 to length-1,
 * where swap i is applied only to the rows above row i.
 *
 * \param A Matrix.
 * \param Q Permutation, its length must equal the number of columns of A.
 */
void mzd_apply_p_right_trans_tri(mzd_t *A, mzp_t const *Q);

/**
 * Print the mzp_t P
 *
 * \param P Permutation.
 */
void mzp_print(mzp_t const *P);

/**
 * Compresses the matrix L in a step in blockwise-recursive PLE
 * decomposition.
 *
 * \param A Matrix.
 * \param r1 Rank of left matrix.
 * \param n1 Column cut which separates left and right matrix.
 * \param r2 Rank of right matrix.
 */
void _mzd_compress_l(mzd_t *A, rci_t r1, rci_t n1, rci_t r2);

#endif // M4RI_MZP
libm4ri-20130416/src/parity.h000066400000000000000000000071531212302366200155710ustar00rootroot00000000000000
#ifndef M4RI_PARITY_H
#define M4RI_PARITY_H

/*******************************************************************
 *
 * M4RI: Linear Algebra over GF(2)
 *
 * Copyright (C) 2008 David Harvey <dmharvey@cims.nyu.edu>
 *
 * Distributed under the terms of the GNU General Public License (GPL)
 * version 2 or higher.
* * This code is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * The full text of the GPL is available at: * * http://www.gnu.org/licenses/ * ********************************************************************/ /** * \file parity.h * * \brief Compute the parity of 64 words in parallel. * * \author David Harvey */ #include /** * \brief Step for mixing two 64-bit words to compute their parity. */ #define __M4RI_MIX32(a, b) (((((a) >> 32) ^ (a)) << 32) | \ ((((b) << 32) ^ (b)) >> 32)) /** * \brief Step for mixing two 64-bit words to compute their parity. */ #define __M4RI_MIX16(a, b) (((((a) << 16) ^ (a)) & __M4RI_CONVERT_TO_WORD(0xFFFF0000FFFF0000ull)) | \ ((((b) >> 16) ^ (b)) & __M4RI_CONVERT_TO_WORD(0x0000FFFF0000FFFFull))); /** * \brief Step for mixing two 64-bit words to compute their parity. */ #define __M4RI_MIX8(a, b) (((((a) << 8) ^ (a)) & __M4RI_CONVERT_TO_WORD(0xFF00FF00FF00FF00ull)) | \ ((((b) >> 8) ^ (b)) & __M4RI_CONVERT_TO_WORD(0x00FF00FF00FF00FFull))); /** * \brief Step for mixing two 64-bit words to compute their parity. */ #define __M4RI_MIX4(a, b) (((((a) << 4) ^ (a)) & __M4RI_CONVERT_TO_WORD(0xF0F0F0F0F0F0F0F0ull)) | \ ((((b) >> 4) ^ (b)) & __M4RI_CONVERT_TO_WORD(0x0F0F0F0F0F0F0F0Full))); /** * \brief Step for mixing two 64-bit words to compute their parity. */ #define __M4RI_MIX2(a, b) (((((a) << 2) ^ (a)) & __M4RI_CONVERT_TO_WORD(0xCCCCCCCCCCCCCCCCull)) | \ ((((b) >> 2) ^ (b)) & __M4RI_CONVERT_TO_WORD(0x3333333333333333ull))); /** * \brief Step for mixing two 64-bit words to compute their parity. */ #define __M4RI_MIX1(a, b) (((((a) << 1) ^ (a)) & __M4RI_CONVERT_TO_WORD(0xAAAAAAAAAAAAAAAAull)) | \ ((((b) >> 1) ^ (b)) & __M4RI_CONVERT_TO_WORD(0x5555555555555555ull))); /** * \brief See parity64. 
*/ static inline word m4ri_parity64_helper(word *buf) { word a0, a1, b0, b1, c0, c1; a0 = __M4RI_MIX32(buf[0x20], buf[0x00]); a1 = __M4RI_MIX32(buf[0x30], buf[0x10]); b0 = __M4RI_MIX16(a1, a0); a0 = __M4RI_MIX32(buf[0x28], buf[0x08]); a1 = __M4RI_MIX32(buf[0x38], buf[0x18]); b1 = __M4RI_MIX16(a1, a0); c0 = __M4RI_MIX8(b1, b0); a0 = __M4RI_MIX32(buf[0x24], buf[0x04]); a1 = __M4RI_MIX32(buf[0x34], buf[0x14]); b0 = __M4RI_MIX16(a1, a0); a0 = __M4RI_MIX32(buf[0x2C], buf[0x0C]); a1 = __M4RI_MIX32(buf[0x3C], buf[0x1C]); b1 = __M4RI_MIX16(a1, a0); c1 = __M4RI_MIX8(b1, b0); return __M4RI_MIX4(c1, c0); } /** * \brief Computes parity of each of buf[0], buf[1], ..., buf[63]. * Returns single word whose bits are the parities of buf[0], ..., * buf[63]. * * \param buf buffer of words of length 64 */ static inline word m4ri_parity64(word *buf) { word d0, d1, e0, e1; d0 = m4ri_parity64_helper(buf); d1 = m4ri_parity64_helper(buf + 2); e0 = __M4RI_MIX2(d1, d0); d0 = m4ri_parity64_helper(buf + 1); d1 = m4ri_parity64_helper(buf + 3); e1 = __M4RI_MIX2(d1, d0); return __M4RI_MIX1(e1, e0); } #endif // M4RI_PARITY_H libm4ri-20130416/src/ple.c000066400000000000000000000176201212302366200150340ustar00rootroot00000000000000/******************************************************************* * * M4RI: Linear Algebra over GF(2) * * Copyright (C) 2008 Clement Pernet * * Distributed under the terms of the GNU General Public License (GPL) * version 2 or higher. * * This code is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. 
* * The full text of the GPL is available at: * * http://www.gnu.org/licenses/ * ********************************************************************/ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include "mzd.h" #include "triangular.h" #include "parity.h" #include "ple_russian.h" #include "strassen.h" #include "ple.h" rci_t mzd_ple(mzd_t *A, mzp_t *P, mzp_t *Q, int const cutoff) { if (P->length != A->nrows) m4ri_die("mzd_ple: Permutation P length (%d) must match A nrows (%d)\n", P->length, A->nrows); if (Q->length != A->ncols) m4ri_die("mzd_ple: Permutation Q length (%d) must match A ncols (%d)\n", Q->length, A->ncols); return _mzd_ple(A, P, Q, cutoff); } rci_t mzd_pluq (mzd_t *A, mzp_t *P, mzp_t *Q, int const cutoff) { if (P->length != A->nrows) m4ri_die("mzd_pluq: Permutation P length (%d) must match A nrows (%d)\n", P->length, A->nrows); if (Q->length != A->ncols) m4ri_die("mzd_pluq: Permutation Q length (%d) must match A ncols (%d)\n", Q->length, A->ncols); rci_t r = _mzd_pluq(A, P, Q, cutoff); return r; } rci_t _mzd_pluq(mzd_t *A, mzp_t *P, mzp_t *Q, int const cutoff) { rci_t r = _mzd_ple(A, P, Q, cutoff); if(r && r < A->nrows) { mzd_t *A0 = mzd_init_window(A, 0, 0, r, A->ncols); mzd_apply_p_right_trans_tri(A0, Q); mzd_free_window(A0); } else { mzd_apply_p_right_trans_tri(A, Q); } return r; } rci_t _mzd_ple(mzd_t *A, mzp_t *P, mzp_t *Q, int const cutoff) { assert(A->offset == 0); rci_t ncols = A->ncols; #if 1 rci_t nrows = mzd_first_zero_row(A); for(rci_t i = nrows; i < A->nrows; ++i) P->values[i] = i; for(rci_t i = 0; i < A->ncols; ++i) Q->values[i] = i; if(!nrows) { return 0; } #else rci_t nrows = A->nrows; #endif if (ncols <= m4ri_radix || A->width * A->nrows <= __M4RI_PLE_CUTOFF) { /* if(ncols <= __M4RI_PLUQ_CUTOFF) { */ /* this improves data locality and runtime considerably */ mzd_t *Abar = mzd_copy(NULL, A); // FIXME: Why is A copied to Abar rci_t r = _mzd_ple_russian(Abar, P, Q, 0); // changed //rci_t r = _mzd_ple_naive(Abar, P, Q); 
mzd_copy(A, Abar); // and copied back to A? Can't we work on A directly? mzd_free(Abar); return r; } { /* Block divide and conquer algorithm */ /* n1 * ------------------------------------------ * | A0 | A1 | * | | | * | | | * | | | * ------------------------------------------ */ rci_t n1 = (((ncols - 1) / m4ri_radix + 1) >> 1) * m4ri_radix; mzd_t *A0 = mzd_init_window(A, 0, 0, nrows, n1); mzd_t *A1 = mzd_init_window(A, 0, n1, nrows, ncols); /* First recursive call */ mzp_t *P1 = mzp_init_window(P, 0, nrows); mzp_t *Q1 = mzp_init_window(Q, 0, A0->ncols); rci_t r1 = _mzd_ple(A0, P1, Q1, cutoff); /* r1 n1 * ------------------------------------------ * | A00 | | A01 | * | | | | * r1------------------------------------------ * * | A10 | | A11 | * | | | | * ------------------------------------------ */ mzd_t *A00 = mzd_init_window(A, 0, 0, r1, r1); mzd_t *A10 = mzd_init_window(A, r1, 0, nrows, r1); mzd_t *A01 = mzd_init_window(A, 0, n1, r1, ncols); mzd_t *A11 = mzd_init_window(A, r1, n1, nrows, ncols); if (r1) { /* Computation of the Schur complement */ mzd_apply_p_left(A1, P1); _mzd_trsm_lower_left(A00, A01, cutoff); mzd_addmul(A11, A10, A01, cutoff); } mzp_free_window(P1); mzp_free_window(Q1); /* Second recursive call */ mzp_t *P2 = mzp_init_window(P, r1, nrows); mzp_t *Q2 = mzp_init_window(Q, n1, ncols); rci_t r2 = _mzd_ple(A11, P2, Q2, cutoff); /* n * ------------------- * | A0b | * | | * r1----------------- * | A1b | * | | * ------------------- */ /* Update A10 */ mzd_apply_p_left(A10, P2); /* Update P */ for (rci_t i = 0; i < nrows - r1; ++i) P2->values[i] += r1; // Update the A0b block (permutation + rotation) for(rci_t i = 0, j = n1; j < ncols; ++i, ++j) Q2->values[i] += n1; for(rci_t i = n1, j = r1; i < n1 + r2; ++i, ++j) Q->values[j] = Q->values[i]; /* Compressing L */ _mzd_compress_l(A, r1, n1, r2); mzp_free_window(Q2); mzp_free_window(P2); mzd_free_window(A0); mzd_free_window(A1); mzd_free_window(A00); mzd_free_window(A01); mzd_free_window(A10); 
mzd_free_window(A11); __M4RI_DD_MZD(A); __M4RI_DD_MZP(P); __M4RI_DD_MZP(Q); __M4RI_DD_RCI(r1 + r2); return r1 + r2; } } rci_t _mzd_pluq_naive(mzd_t *A, mzp_t *P, mzp_t *Q) { rci_t curr_pos = 0; for (curr_pos = 0; curr_pos < A->ncols; ) { int found = 0; /* search for some pivot */ rci_t i, j; for (j = curr_pos; j < A->ncols; ++j) { for (i = curr_pos; i< A->nrows; ++i ) { if (mzd_read_bit(A, i, j)) { found = 1; break; } } if(found) break; } if(found) { P->values[curr_pos] = i; Q->values[curr_pos] = j; mzd_row_swap(A, curr_pos, i); mzd_col_swap(A, curr_pos, j); /* clear below but preserve transformation matrix */ if (curr_pos +1 < A->ncols){ for(rci_t l = curr_pos + 1; l < A->nrows; ++l) { if (mzd_read_bit(A, l, curr_pos)) { mzd_row_add_offset(A, l, curr_pos, curr_pos + 1); } } } ++curr_pos; } else { break; } } for (rci_t i = curr_pos; i < A->nrows; ++i) P->values[i] = i; for (rci_t i = curr_pos; i < A->ncols; ++i) Q->values[i] = i; __M4RI_DD_MZD(A); __M4RI_DD_MZP(P); __M4RI_DD_MZP(Q); __M4RI_DD_RCI(curr_pos); return curr_pos; } rci_t _mzd_ple_naive(mzd_t *A, mzp_t *P, mzp_t *Q) { rci_t col_pos = 0; rci_t row_pos = 0; /* search for some pivot */ while (row_pos < A->nrows && col_pos < A->ncols) { int found = 0; rci_t i, j; for (j = col_pos; j < A->ncols; ++j) { for (i = row_pos; i < A->nrows; ++i) { if (mzd_read_bit(A, i, j)) { found = 1; break; } } if(found) break; } if(found) { P->values[row_pos] = i; Q->values[row_pos] = j; mzd_row_swap(A, row_pos, i); //mzd_col_swap(A, curr_pos, j); /* clear below but preserve transformation matrix */ if (j + 1 < A->ncols){ for(rci_t l = row_pos + 1; l < A->nrows; ++l) { if (mzd_read_bit(A, l, j)) { mzd_row_add_offset(A, l, row_pos, j + 1); } } } ++row_pos; col_pos = j + 1; } else { break; } } for (rci_t i = row_pos; i < A->nrows; ++i) P->values[i] = i; for (rci_t i = row_pos; i < A->ncols; ++i) Q->values[i] = i; /* Now compressing L */ for (rci_t j = 0; j < row_pos; ++j){ if (Q->values[j] > j) { // To be optimized by a copy_row 
function mzd_col_swap_in_rows (A,Q->values[j], j, j, A->nrows); } } __M4RI_DD_MZD(A); __M4RI_DD_MZP(P); __M4RI_DD_MZP(Q); __M4RI_DD_RCI(row_pos); return row_pos; } libm4ri-20130416/src/ple.h000066400000000000000000000111171212302366200150340ustar00rootroot00000000000000/** * \file ple.h * * \brief PLE and PLUQ matrix decomposition routines. * * \author Clement Pernet * */ #ifndef M4RI_PLUQ_H #define M4RI_PLUQ_H /******************************************************************* * * M4RI: Linear Algebra over GF(2) * * Copyright (C) 2008, 2009 Clement Pernet * * Distributed under the terms of the GNU General Public License (GPL) * version 2 or higher. * * This code is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * The full text of the GPL is available at: * * http://www.gnu.org/licenses/ * ********************************************************************/ #include #include /** * Crossover point for PLUQ factorization. */ #define __M4RI_PLE_CUTOFF MIN(524288, __M4RI_CPU_L3_CACHE >> 3) /** * \brief PLUQ matrix decomposition. * * Returns (P,L,U,Q) satisfying PLUQ = A where P and Q are two * permutation matrices, of dimension respectively m x m and n x n, L * is m x r unit lower triangular and U is r x n upper triangular. * * P and Q must be preallocated but they don't have to be * identity permutations. If cutoff is zero a value is chosen * automatically. It is recommended to set cutoff to zero for most * applications. * * The row echelon form (not reduced) can be read from the upper * triangular matrix U. See mzd_echelonize_pluq() for details. * * This is the wrapper function including bounds checks. See * _mzd_pluq() for implementation details. 
*
 * \param A Input m x n matrix
 * \param P Output row permutation of length m
 * \param Q Output column permutation of length n
 * \param cutoff Minimal dimension for Strassen recursion.
 *
 * \sa _mzd_pluq() _mzd_pluq_mmpf() mzd_echelonize_pluq()
 *
 * \wordoffset
 *
 * \return Rank of A.
 */

rci_t mzd_pluq(mzd_t *A, mzp_t *P, mzp_t *Q, const int cutoff);

/**
 * \brief PLE matrix decomposition.
 *
 * Computes the PLE matrix decomposition using a block recursive
 * algorithm.
 *
 * Returns (P,L,S,Q) satisfying PLS = A where P is a permutation matrix
 * of dimension m x m, L is m x r unit lower triangular and S is an r
 * x n matrix which is upper triangular except that its columns are
 * permuted, that is S = UQ for U r x n upper triangular and Q is a n
 * x n permutation matrix. (S is the factor referred to as "E" in the
 * name PLE.) The matrices L and S are stored in place over A.
 *
 * P and Q must be preallocated but they don't have to be
 * identity permutations. If cutoff is zero a value is chosen
 * automatically. It is recommended to set cutoff to zero for most
 * applications.
 *
 * This is the wrapper function including bounds checks. See
 * _mzd_ple() for implementation details.
 *
 * \param A Input m x n matrix
 * \param P Output row permutation of length m
 * \param Q Output column permutation of length n
 * \param cutoff Minimal dimension for Strassen recursion.
 *
 * \sa _mzd_ple() _mzd_pluq() _mzd_pluq_mmpf() mzd_echelonize_pluq()
 *
 * \wordoffset
 *
 * \return Rank of A.
 */

rci_t mzd_ple(mzd_t *A, mzp_t *P, mzp_t *Q, const int cutoff);

/**
 * \brief PLUQ matrix decomposition.
 *
 * See mzd_pluq() for details.
 *
 * \param A Input matrix
 * \param P Output row permutation
 * \param Q Output column permutation
 * \param cutoff Minimal dimension for Strassen recursion.
 *
 * \sa mzd_pluq()
 *
 * \wordoffset
 * \return Rank of A.
 */

rci_t _mzd_pluq(mzd_t *A, mzp_t *P, mzp_t *Q, const int cutoff);

/**
 * \brief PLE matrix decomposition.
 *
 * See mzd_ple() for details.
 *
 * \param A Input matrix
 * \param P Output row permutation
 * \param Qt Output column permutation
 * \param cutoff Minimal dimension for Strassen recursion.
 *
 * \sa mzd_ple()
 *
 * \wordoffset
 * \return Rank of A.
 */

rci_t _mzd_ple(mzd_t *A, mzp_t *P, mzp_t *Qt, const int cutoff);

/**
 * \brief PLUQ matrix decomposition (naive base case).
 *
 * See mzd_pluq() for details.
 *
 * \param A Input matrix
 * \param P Output row permutation
 * \param Q Output column permutation
 *
 * \sa mzd_pluq()
 *
 * \wordoffset
 * \return Rank of A.
 */

rci_t _mzd_pluq_naive(mzd_t *A, mzp_t *P, mzp_t *Q);

/**
 * \brief PLE matrix decomposition (naive base case).
 *
 * See mzd_ple() for details.
 *
 * \param A Input matrix
 * \param P Output row permutation
 * \param Qt Output column permutation
 *
 * \sa mzd_ple()
 *
 * \wordoffset
 * \return Rank of A.
 */

rci_t _mzd_ple_naive(mzd_t *A, mzp_t *P, mzp_t *Qt);

#endif // M4RI_PLUQ_H
libm4ri-20130416/src/ple_russian.c000066400000000000000000001152251212302366200166000ustar00rootroot00000000000000/*******************************************************************
 *
 *                 M4RI: Linear Algebra over GF(2)
 *
 *    Copyright (C) 2008-2011 Martin Albrecht <M.R.Albrecht@rhul.ac.uk>
 *
 *  Distributed under the terms of the GNU General Public License (GPL)
 *  version 2 or higher.
 *
 *    This code is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *    General Public License for more details.
* * The full text of the GPL is available at: * * http://www.gnu.org/licenses/ * ********************************************************************/ #include "m4ri_config.h" #include #if __M4RI_HAVE_SSE2 #include #endif #include "ple_russian.h" #include "brilliantrussian.h" #include "graycode.h" #include "xor.h" #ifndef NDEBUG #include "mmc.h" #endif /** the number of tables used in PLE decomposition **/ #define __M4RI_PLE_NTABLES 5 static inline rci_t _max_value(rci_t *data, int length) { rci_t max = 0; for(int i = 0; i < length; ++i) { max = MAX(max, data[i]); } return max; } static inline void _kk_setup(int const kk, int const knar, int *k_, int *knar_, int const *pivots, int const ntables) { int i,j, rem; int lb[__M4RI_PLE_NTABLES], ub[__M4RI_PLE_NTABLES]; assert(ntables <= __M4RI_PLE_NTABLES && ntables > 0); switch(ntables) { case 6: rem = kk % 6; k_[0] = kk / 6 + ((rem >= 5) ? 1 : 0); k_[1] = kk / 6 + ((rem >= 4) ? 1 : 0); k_[2] = kk / 6 + ((rem >= 3) ? 1 : 0); k_[3] = kk / 6 + ((rem >= 2) ? 1 : 0); k_[4] = kk / 6 + ((rem >= 1) ? 1 : 0);; k_[5] = kk / 6; knar_[0] = 0; knar_[1] = 0; knar_[2] = 0; knar_[3] = 0; knar_[4] = 0; knar_[5] = 0; lb[0] = 0; lb[1] = k_[0]; lb[2] = lb[1]+k_[1]; lb[3] = lb[2]+k_[2]; lb[4] = lb[3]+k_[3]; lb[5] = lb[4]+k_[4]; ub[0] = 0+k_[0]; ub[1] = ub[0]+k_[1]; ub[2] = ub[1]+k_[2]; ub[3] = ub[2]+k_[3]; ub[4] = ub[3]+k_[4]; ub[5] = ub[4]+k_[5]; assert((k_[0] > 0) && (k_[1] > 0) && (k_[2] > 0) && (k_[3] > 0) && (k_[4] > 0) && (k_[5] > 0)); break; case 5: rem = kk % 5; k_[0] = kk / 5 + ((rem >= 4) ? 1 : 0); k_[1] = kk / 5 + ((rem >= 3) ? 1 : 0); k_[2] = kk / 5 + ((rem >= 2) ? 1 : 0); k_[3] = kk / 5 + ((rem >= 1) ? 
1 : 0); k_[4] = kk / 5; knar_[0] = 0; knar_[1] = 0; knar_[2] = 0; knar_[3] = 0; knar_[4] = 0; lb[0] = 0; lb[1] = k_[0]; lb[2] = lb[1]+k_[1]; lb[3] = lb[2]+k_[2]; lb[4] = lb[3]+k_[3]; ub[0] = 0+k_[0]; ub[1] = ub[0]+k_[1]; ub[2] = ub[1]+k_[2]; ub[3] = ub[2]+k_[3]; ub[4] = ub[3]+k_[4]; assert((k_[0] > 0) && (k_[1] > 0) && (k_[2] > 0) && (k_[3] > 0) && (k_[4] > 0)); break; case 4: rem = kk % 4; k_[0] = kk / 4 + ((rem >= 3) ? 1 : 0); k_[1] = kk / 4 + ((rem >= 2) ? 1 : 0); k_[2] = kk / 4 + ((rem >= 1) ? 1 : 0); k_[3] = kk / 4; knar_[0] = 0; knar_[1] = 0; knar_[2] = 0; knar_[3] = 0; lb[0] = 0; lb[1] = k_[0]; lb[2] = lb[1]+k_[1]; lb[3] = lb[2]+k_[2]; ub[0] = 0+k_[0]; ub[1] = ub[0]+k_[1]; ub[2] = ub[1]+k_[2]; ub[3] = ub[2]+k_[3]; assert((k_[0] > 0) && (k_[1] > 0) && (k_[2] > 0) && (k_[3] > 0)); break; case 3: rem = kk % 3; k_[0] = kk / 3 + ((rem >= 2) ? 1 : 0); k_[1] = kk / 3 + ((rem >= 1) ? 1 : 0); k_[2] = kk / 3; knar_[0] = 0; knar_[1] = 0; knar_[2] = 0; lb[0] = 0; lb[1] = k_[0]; lb[2] = lb[1]+k_[1]; ub[0] = 0+k_[0]; ub[1] = ub[0]+k_[1]; ub[2] = ub[1]+k_[2]; assert((k_[0] > 0) && (k_[1] > 0) && (k_[2] > 0)); break; case 2: k_[0] = kk / 2; k_[1] = kk - k_[0]; knar_[0] = 0; knar_[1] = 0; lb[0] = 0; lb[1] = k_[0]; ub[0] = 0+k_[0]; ub[1] = ub[0]+k_[1]; assert((k_[0] > 0) && (k_[1] > 0)); break; case 1: k_[0] = kk; knar_[0] = 0; lb[0] = 0; ub[0] = 0+k_[0]; break; default: m4ri_die("Only %d tables are supported at the moment.", __M4RI_PLE_NTABLES); } for(i=0; i= lb[j] && pivots[i] < ub[j]) { knar_[j]++; } } } int _mzd_ple_submatrix(mzd_t *A, rci_t const start_row, rci_t const stop_row, rci_t const start_col, int const k, mzp_t *P, mzp_t *Q, rci_t *pivots, rci_t *done, rci_t *done_row, wi_t const splitblock) { word bm[__M4RI_PLE_NTABLES * __M4RI_MAXKAY]; wi_t os[__M4RI_PLE_NTABLES * __M4RI_MAXKAY]; /* we're essentially constructing a submatrix but cheaply */ wi_t const width = A->width; rci_t const ncols = A->ncols; int const flags = A->flags; word low_bitmask = A->low_bitmask; 
word high_bitmask = A->high_bitmask; if (A->width > splitblock) { A->width = splitblock; A->ncols = splitblock * m4ri_radix; assert(A->offset == 0); A->flags &= mzd_flag_multiple_blocks; A->flags |= (mzd_flag_windowed_zerooffset | mzd_flag_windowed_zeroexcess); A->high_bitmask = A->low_bitmask = m4ri_ffff; /* No need to set mzd_flag_windowed_ownsblocks, because we won't free A until it's elements are restored below. */ } int curr_pos; int rank = 0; for(curr_pos = 0; curr_pos < k; ++curr_pos) { os[curr_pos] = (start_col + curr_pos) / m4ri_radix; bm[curr_pos] = m4ri_one << ((start_col + curr_pos) % m4ri_radix); int found = 0; /* search for some pivot */ rci_t i; for(i = start_row + rank; i < stop_row; ++i) { word const tmp = mzd_read_bits(A, i, start_col, curr_pos + 1); if(tmp) { word *Arow = A->rows[i]; /* clear before but preserve transformation matrix */ for (rci_t l = 0; l < rank; ++l) if(done[l] < i) { if((Arow[os[pivots[l]]] & bm[pivots[l]])) mzd_row_add_offset(A, i, start_row + l, start_col + pivots[l] + 1); done[l] = i; /* encode up to which row we added for l already */ } if(mzd_read_bit(A, i, start_col + curr_pos)) { found = 1; break; } } } if (found) { P->values[start_row + rank] = i; mzd_row_swap(A, i, start_row + rank); Q->values[start_row + rank] = start_col + curr_pos; pivots[rank] = curr_pos; done[rank] = i; rank++; } } /* finish submatrix */ *done_row = _max_value(done, rank); for(rci_t c2 = 0; c2 < rank && start_col + pivots[c2] < A->ncols -1; ++c2) for(rci_t r2 = done[c2] + 1; r2 <= *done_row; ++r2) if(mzd_read_bit(A, r2, start_col + pivots[c2])) mzd_row_add_offset(A, r2, start_row + c2, start_col + pivots[c2] + 1); /* reset to original size */ A->ncols = ncols; A->width = width; A->flags = flags; A->low_bitmask = low_bitmask; A->high_bitmask = high_bitmask; __M4RI_DD_MZD(A); __M4RI_DD_MZP(P); __M4RI_DD_MZP(Q); __M4RI_DD_INT(curr_pos); return rank; } /* create a table of all 2^k linear combinations */ void mzd_make_table_ple(mzd_t const *M, rci_t 
r, rci_t c, int k, int knar, mzd_t *T, rci_t *Le, rci_t *Lm, rci_t *offsets, int base) { // Note that this restricts the number of columns of any matrix to // __M4RI_MAX_MZD_BLOCKSIZE * radix / twokay = 268 million. assert(!(T->flags & mzd_flag_multiple_blocks)); wi_t const blockoffset= c / m4ri_radix; int const twokay= __M4RI_TWOPOW(knar); wi_t const wide = T->width - blockoffset; wi_t const count = (wide + 7) / 8; int const entry_point = wide % 8; wi_t const next_row_offset = blockoffset + T->rowstride - T->width; word *ti, *ti1, *m; ti1 = T->rows[0] + blockoffset; ti = ti1 + T->rowstride; Le[0] = 0; Lm[0] = 0; for (int i = 1; i < twokay; ++i) { rci_t rowneeded = r + m4ri_codebook[knar]->inc[i - 1]; m = M->rows[rowneeded] + blockoffset; /* Duff's device loop unrolling */ wi_t n = count; switch (entry_point) { case 0: do { *(ti++) = *(m++) ^ *(ti1++); case 7: *(ti++) = *(m++) ^ *(ti1++); case 6: *(ti++) = *(m++) ^ *(ti1++); case 5: *(ti++) = *(m++) ^ *(ti1++); case 4: *(ti++) = *(m++) ^ *(ti1++); case 3: *(ti++) = *(m++) ^ *(ti1++); case 2: *(ti++) = *(m++) ^ *(ti1++); case 1: *(ti++) = *(m++) ^ *(ti1++); } while (--n > 0); } ti += next_row_offset; ti1 += next_row_offset; /* U is a basis but not the canonical basis, so we need to read what element we just created from T */ Le[mzd_read_bits_int(T,i,c,k)] = i; Lm[m4ri_spread_bits(m4ri_codebook[k]->ord[i],offsets,knar,base)] = i; } /* We need fix the table to update the transformation matrix correctly; e.g. if the first row has [1 0 1] and we clear a row below with [1 0 1] we need to encode that this row is cleared by adding the first row only ([1 0 0]). 
*/ for(int i = 1; i < twokay; ++i) { word const correction = m4ri_spread_bits(__M4RI_CONVERT_TO_WORD(m4ri_codebook[k]->ord[i]), offsets, knar,base); mzd_xor_bits(T, i,c, k, correction); } __M4RI_DD_MZD(T); __M4RI_DD_RCI_ARRAY(Le, twokay); __M4RI_DD_RCI_ARRAY(Lm, twokay); } static inline int _mzd_read_bits_int_raw(word *row, int const spot, wi_t const block, int const spill, int const n) { word temp = (spill <= 0) ? row[block] << -spill : (row[block + 1] << (m4ri_radix - spill)) | (row[block] >> spill); return temp >> (m4ri_radix - n); } void mzd_process_rows2_ple(mzd_t *M, rci_t startrow, rci_t stoprow, rci_t startcol, int const k0, mzd_t const *T0, rci_t const *E0, int const k1, mzd_t const *T1, rci_t const *E1) { assert(k0+k1 <= m4ri_radix); int const spot0 = (startcol) % m4ri_radix; int const spot1 = (startcol + k0) % m4ri_radix; wi_t const block0 = startcol / m4ri_radix; wi_t const block1 = (startcol + k0) / m4ri_radix; int const spill0 = spot0 + k0 - m4ri_radix; int const spill1 = spot1 + k1 - m4ri_radix; wi_t const blockdiff1 = block1 - block0; wi_t wide = M->width - block0; if(wide < 3) { mzd_process_rows(M, startrow, stoprow, startcol, k0, T0, E0); mzd_process_rows(M, startrow, stoprow, startcol + k0, k1, T1, E1); return; } for(rci_t r = startrow; r < stoprow; ++r) { word *m0 = M->rows[r+0] + block0; rci_t const x0 = E0[ _mzd_read_bits_int_raw(m0, spot0, 0, spill0, k0) ]; word const *t0 = T0->rows[x0] + block0; m0[0] ^= t0[0]; m0[1] ^= t0[1]; t0 += 2; rci_t const x1 = E1[ _mzd_read_bits_int_raw(m0, spot1, blockdiff1, spill1, k1) ]; word const *t1 = T1->rows[x1] + block1; switch(blockdiff1) { case 0: m0[0] ^= t1[0 - blockdiff1]; case 1: m0[1] ^= t1[1 - blockdiff1]; break; } t1 += 2 - blockdiff1; _mzd_combine2(m0+2, t0, t1, wide-2); } __M4RI_DD_MZD(M); } void mzd_process_rows3_ple(mzd_t *M, rci_t startrow, rci_t stoprow, rci_t startcol, int const k0, mzd_t const *T0, rci_t const *E0, int const k1, mzd_t const *T1, rci_t const *E1, int const k2, mzd_t const 
*T2, rci_t const *E2) { int const spot0 = (startcol) % m4ri_radix; int const spot1 = (startcol + k0) % m4ri_radix; int const spot2 = (startcol + k0 + k1) % m4ri_radix; wi_t const block0 = startcol / m4ri_radix; wi_t const block1 = (startcol + k0) / m4ri_radix; wi_t const block2 = (startcol + k0 + k1) / m4ri_radix; int const spill0 = spot0 + k0 - m4ri_radix; int const spill1 = spot1 + k1 - m4ri_radix; int const spill2 = spot2 + k2 - m4ri_radix; wi_t const blockdiff1 = block1 - block0; wi_t const blockdiff2 = block2 - block0; wi_t wide = M->width - block0; if(wide < 3) { mzd_process_rows(M, startrow, stoprow, startcol, k0, T0, E0); mzd_process_rows(M, startrow, stoprow, startcol + k0, k1, T1, E1); mzd_process_rows(M, startrow, stoprow, startcol + k0 + k1, k2, T2, E2); return; } for(rci_t r = startrow; r < stoprow; ++r) { word *m0 = M->rows[r] + block0; rci_t const x0 = E0[_mzd_read_bits_int_raw(m0, spot0, 0, spill0, k0)]; word const *t0 = T0->rows[x0] + block0; m0[0] ^= t0[0]; m0[1] ^= t0[1]; t0 += 2; rci_t const x1 = E1[ _mzd_read_bits_int_raw(m0, spot1, blockdiff1, spill1, k1) ]; word *t1 = T1->rows[x1] + block1; switch(blockdiff1) { case 0: m0[0] ^= t1[0 - blockdiff1]; case 1: m0[1] ^= t1[1 - blockdiff1]; break; } t1 += 2 - blockdiff1; rci_t const x2 = E2[ _mzd_read_bits_int_raw(m0, spot2, blockdiff2, spill2, k2) ]; word *t2 = T2->rows[x2] + block2; switch(blockdiff2) { case 0: m0[0] ^= t2[0 - blockdiff2]; case 1: m0[1] ^= t2[1 - blockdiff2]; break; } t2 += 2 - blockdiff2; _mzd_combine3(m0+2,t0,t1,t2,wide-2); } __M4RI_DD_MZD(M); } void mzd_process_rows4_ple(mzd_t *M, rci_t startrow, rci_t stoprow, rci_t startcol, int const k0, mzd_t const *T0, rci_t const *E0, int const k1, mzd_t const *T1, rci_t const *E1, int const k2, mzd_t const *T2, rci_t const *E2, int const k3, mzd_t const *T3, rci_t const *E3) { assert(k0+k1+k2+k3 <= m4ri_radix); int const spot0 = (startcol) % m4ri_radix; int const spot1 = (startcol + k0) % m4ri_radix; int const spot2 = (startcol + k0 + 
k1) % m4ri_radix; int const spot3 = (startcol + k0 + k1 + k2) % m4ri_radix; wi_t const block0 = startcol / m4ri_radix; wi_t const block1 = (startcol + k0) / m4ri_radix; wi_t const block2 = (startcol + k0 + k1) / m4ri_radix; wi_t const block3 = (startcol + k0 + k1 + k2) / m4ri_radix; int const spill0 = spot0 + k0 - m4ri_radix; int const spill1 = spot1 + k1 - m4ri_radix; int const spill2 = spot2 + k2 - m4ri_radix; int const spill3 = spot3 + k3 - m4ri_radix; wi_t const blockdiff1 = block1 - block0; wi_t const blockdiff2 = block2 - block0; wi_t const blockdiff3 = block3 - block0; wi_t wide = M->width - block0; if(wide < 3) { mzd_process_rows(M, startrow, stoprow, startcol, k0, T0, E0); mzd_process_rows(M, startrow, stoprow, startcol + k0, k1, T1, E1); mzd_process_rows(M, startrow, stoprow, startcol + k0 + k1, k2, T2, E2); mzd_process_rows(M, startrow, stoprow, startcol + k0 + k1 + k2, k3, T3, E3); return; } for(rci_t r = startrow; r < stoprow; ++r) { word *m0 = M->rows[r] + block0; rci_t const x0 = E0[_mzd_read_bits_int_raw(m0, spot0, 0, spill0, k0)]; word *t0 = T0->rows[x0] + block0; m0[0] ^= t0[0]; m0[1] ^= t0[1]; t0 += 2; rci_t const x1 = E1[ _mzd_read_bits_int_raw(m0, spot1, blockdiff1, spill1, k1) ]; word *t1 = T1->rows[x1] + block1; switch(blockdiff1) { case 0: m0[0] ^= t1[0 - blockdiff1]; case 1: m0[1] ^= t1[1 - blockdiff1]; break; } t1 += 2 - blockdiff1; rci_t const x2 = E2[ _mzd_read_bits_int_raw(m0, spot2, blockdiff2, spill2, k2) ]; word *t2 = T2->rows[x2] + block2; switch(blockdiff2) { case 0: m0[0] ^= t2[0 - blockdiff2]; case 1: m0[1] ^= t2[1 - blockdiff2]; break; } t2 += 2 - blockdiff2; rci_t const x3 = E3[ _mzd_read_bits_int_raw(m0, spot3, blockdiff3, spill3, k3) ]; word *t3 = T3->rows[x3] + block3; switch(blockdiff3) { case 0: m0[0] ^= t3[0 - blockdiff3]; case 1: m0[1] ^= t3[1 - blockdiff3]; break; } t3 += 2 - blockdiff3; _mzd_combine4(m0+2, t0, t1, t2, t3, wide-2); } __M4RI_DD_MZD(M); } void mzd_process_rows5_ple(mzd_t *M, rci_t startrow, rci_t stoprow, 
rci_t startcol, int const k0, mzd_t const *T0, rci_t const *E0, int const k1, mzd_t const *T1, rci_t const *E1, int const k2, mzd_t const *T2, rci_t const *E2, int const k3, mzd_t const *T3, rci_t const *E3, int const k4, mzd_t const *T4, rci_t const *E4) { assert(k0+k1+k2+k3+k4 <= m4ri_radix); int const spot0 = (startcol) % m4ri_radix; int const spot1 = (startcol + k0) % m4ri_radix; int const spot2 = (startcol + k0 + k1) % m4ri_radix; int const spot3 = (startcol + k0 + k1 + k2) % m4ri_radix; int const spot4 = (startcol + k0 + k1 + k2 + k3) % m4ri_radix; wi_t const block0 = startcol / m4ri_radix; wi_t const block1 = (startcol + k0) / m4ri_radix; wi_t const block2 = (startcol + k0 + k1) / m4ri_radix; wi_t const block3 = (startcol + k0 + k1 + k2) / m4ri_radix; wi_t const block4 = (startcol + k0 + k1 + k2 + k3) / m4ri_radix; int const spill0 = spot0 + k0 - m4ri_radix; int const spill1 = spot1 + k1 - m4ri_radix; int const spill2 = spot2 + k2 - m4ri_radix; int const spill3 = spot3 + k3 - m4ri_radix; int const spill4 = spot4 + k4 - m4ri_radix; wi_t const blockdiff1 = block1 - block0; wi_t const blockdiff2 = block2 - block0; wi_t const blockdiff3 = block3 - block0; wi_t const blockdiff4 = block4 - block0; wi_t wide = M->width - block0; if(wide < 3) { mzd_process_rows(M, startrow, stoprow, startcol, k0, T0, E0); mzd_process_rows(M, startrow, stoprow, startcol + k0, k1, T1, E1); mzd_process_rows(M, startrow, stoprow, startcol + k0 + k1, k2, T2, E2); mzd_process_rows(M, startrow, stoprow, startcol + k0 + k1 + k2, k3, T3, E3); mzd_process_rows(M, startrow, stoprow, startcol + k0 + k1 + k2 + k3, k4, T4, E4); return; } for(rci_t r = startrow; r < stoprow; ++r) { word *m0 = M->rows[r] + block0; rci_t const x0 = E0[_mzd_read_bits_int_raw(m0, spot0, 0, spill0, k0)]; word *t0 = T0->rows[x0] + block0; m0[0] ^= t0[0]; m0[1] ^= t0[1]; t0 += 2; rci_t const x1 = E1[ _mzd_read_bits_int_raw(m0, spot1, blockdiff1, spill1, k1) ]; word *t1 = T1->rows[x1] + block1; switch(blockdiff1) { case 
0: m0[0] ^= t1[0 - blockdiff1]; case 1: m0[1] ^= t1[1 - blockdiff1]; break; } t1 += 2 - blockdiff1; rci_t const x2 = E2[ _mzd_read_bits_int_raw(m0, spot2, blockdiff2, spill2, k2) ]; word *t2 = T2->rows[x2] + block2; switch(blockdiff2) { case 0: m0[0] ^= t2[0 - blockdiff2]; case 1: m0[1] ^= t2[1 - blockdiff2]; break; } t2 += 2 - blockdiff2; rci_t const x3 = E3[ _mzd_read_bits_int_raw(m0, spot3, blockdiff3, spill3, k3) ]; word *t3 = T3->rows[x3] + block3; switch(blockdiff3) { case 0: m0[0] ^= t3[0 - blockdiff3]; case 1: m0[1] ^= t3[1 - blockdiff3]; break; } t3 += 2 - blockdiff3; rci_t const x4 = E4[ _mzd_read_bits_int_raw(m0, spot4, blockdiff4, spill4, k4) ]; word *t4 = T4->rows[x4] + block4; switch(blockdiff4) { case 0: m0[0] ^= t4[0 - blockdiff4]; case 1: m0[1] ^= t4[1 - blockdiff4]; break; } t4 += 2 - blockdiff4; _mzd_combine5(m0+2, t0, t1, t2, t3, t4, wide-2); } __M4RI_DD_MZD(M); } void mzd_process_rows6_ple(mzd_t *M, rci_t startrow, rci_t stoprow, rci_t startcol, int const k0, mzd_t const *T0, rci_t const *E0, int const k1, mzd_t const *T1, rci_t const *E1, int const k2, mzd_t const *T2, rci_t const *E2, int const k3, mzd_t const *T3, rci_t const *E3, int const k4, mzd_t const *T4, rci_t const *E4, int const k5, mzd_t const *T5, rci_t const *E5) { assert(k0+k1+k2+k3+k4+k5 <= m4ri_radix); int const spot0 = (startcol) % m4ri_radix; int const spot1 = (startcol + k0) % m4ri_radix; int const spot2 = (startcol + k0 + k1) % m4ri_radix; int const spot3 = (startcol + k0 + k1 + k2) % m4ri_radix; int const spot4 = (startcol + k0 + k1 + k2 + k3) % m4ri_radix; int const spot5 = (startcol + k0 + k1 + k2 + k3 + k4) % m4ri_radix; wi_t const block0 = startcol / m4ri_radix; wi_t const block1 = (startcol + k0) / m4ri_radix; wi_t const block2 = (startcol + k0 + k1) / m4ri_radix; wi_t const block3 = (startcol + k0 + k1 + k2) / m4ri_radix; wi_t const block4 = (startcol + k0 + k1 + k2 + k3) / m4ri_radix; wi_t const block5 = (startcol + k0 + k1 + k2 + k3 + k4) / m4ri_radix; int const 
spill0 = spot0 + k0 - m4ri_radix; int const spill1 = spot1 + k1 - m4ri_radix; int const spill2 = spot2 + k2 - m4ri_radix; int const spill3 = spot3 + k3 - m4ri_radix; int const spill4 = spot4 + k4 - m4ri_radix; int const spill5 = spot5 + k5 - m4ri_radix; wi_t const blockdiff1 = block1 - block0; wi_t const blockdiff2 = block2 - block0; wi_t const blockdiff3 = block3 - block0; wi_t const blockdiff4 = block4 - block0; wi_t const blockdiff5 = block5 - block0; wi_t wide = M->width - block0; if(wide < 3) { mzd_process_rows(M, startrow, stoprow, startcol, k0, T0, E0); mzd_process_rows(M, startrow, stoprow, startcol + k0, k1, T1, E1); mzd_process_rows(M, startrow, stoprow, startcol + k0 + k1, k2, T2, E2); mzd_process_rows(M, startrow, stoprow, startcol + k0 + k1 + k2, k3, T3, E3); mzd_process_rows(M, startrow, stoprow, startcol + k0 + k1 + k2 + k3, k4, T4, E4); mzd_process_rows(M, startrow, stoprow, startcol + k0 + k1 + k2 + k3 + k4, k5, T5, E5); return; } #if __M4RI_HAVE_OPENMP #pragma omp parallel for schedule(static,512) #endif for(rci_t r = startrow; r < stoprow; ++r) { word *m0 = M->rows[r] + block0; rci_t const x0 = E0[_mzd_read_bits_int_raw(m0, spot0, 0, spill0, k0)]; word *t0 = T0->rows[x0] + block0; m0[0] ^= t0[0]; m0[1] ^= t0[1]; t0 += 2; rci_t const x1 = E1[ _mzd_read_bits_int_raw(m0, spot1, blockdiff1, spill1, k1) ]; word *t1 = T1->rows[x1] + block1; switch(blockdiff1) { case 0: m0[0] ^= t1[0 - blockdiff1]; case 1: m0[1] ^= t1[1 - blockdiff1]; break; } t1 += 2 - blockdiff1; rci_t const x2 = E2[ _mzd_read_bits_int_raw(m0, spot2, blockdiff2, spill2, k2) ]; word *t2 = T2->rows[x2] + block2; switch(blockdiff2) { case 0: m0[0] ^= t2[0 - blockdiff2]; case 1: m0[1] ^= t2[1 - blockdiff2]; break; } t2 += 2 - blockdiff2; rci_t const x3 = E3[ _mzd_read_bits_int_raw(m0, spot3, blockdiff3, spill3, k3) ]; word *t3 = T3->rows[x3] + block3; switch(blockdiff3) { case 0: m0[0] ^= t3[0 - blockdiff3]; case 1: m0[1] ^= t3[1 - blockdiff3]; break; } t3 += 2 - blockdiff3; rci_t const 
x4 = E4[ _mzd_read_bits_int_raw(m0, spot4, blockdiff4, spill4, k4) ]; word *t4 = T4->rows[x4] + block4; switch(blockdiff4) { case 0: m0[0] ^= t4[0 - blockdiff4]; case 1: m0[1] ^= t4[1 - blockdiff4]; break; } t4 += 2 - blockdiff4; rci_t const x5 = E5[ _mzd_read_bits_int_raw(m0, spot5, blockdiff5, spill5, k5) ]; word *t5 = T5->rows[x5] + block5; switch(blockdiff5) { case 0: m0[0] ^= t5[0 - blockdiff5]; case 1: m0[1] ^= t5[1 - blockdiff5]; break; } t5 += 2 - blockdiff5; _mzd_combine6(m0+2, t0, t1, t2, t3, t4, t5, wide-2); } __M4RI_DD_MZD(M); } void _mzd_ple_a10(mzd_t *A, mzp_t const *P, rci_t const start_row, rci_t const start_col, wi_t const addblock, int const k, rci_t *pivots) { /* perform needed row swaps */ for(rci_t i = start_row; i < start_row + k; ++i) { _mzd_row_swap(A, i, P->values[i], addblock); } for(int i = 1; i < k; ++i) { word const tmp = mzd_read_bits(A, start_row + i, start_col, pivots[i]); word *target = A->rows[start_row + i]; for(int j = 0; j < i; ++j) { if((tmp & m4ri_one << pivots[j])) { word const *source = A->rows[start_row + j]; for(wi_t w = addblock; w < A->width; ++w) { target[w] ^= source[w]; } } } } __M4RI_DD_MZD(A); __M4RI_DD_MZP(P); } void _mzd_ple_a11_1(mzd_t *A, rci_t const start_row, rci_t const stop_row, rci_t const start_col, wi_t const addblock, int const k, int const knar, mzd_t const *T0, rci_t const *M0) { wi_t const wide = A->width - addblock; if (wide <= 0) return; for(rci_t i = start_row + knar; i < stop_row; ++i) { rci_t x0 = M0[mzd_read_bits_int(A,i,start_col, k)]; word const *s0 = T0->rows[x0] + addblock; word *t = A->rows[i] + addblock; _mzd_combine(t, s0, wide); } __M4RI_DD_MZD(A); } void _mzd_ple_a11_2(mzd_t *A, rci_t const start_row, rci_t const stop_row, rci_t const start_col, wi_t const addblock, int const k0, int const knar0, mzd_t const *T0, rci_t const *M0, int const k1, int const knar1, mzd_t const *T1, rci_t const *M1) { wi_t const wide = A->width - addblock; if (wide <= 0) return; for(rci_t i = start_row + 
knar0 + knar1; i < stop_row; ++i) { rci_t x0 = M0[mzd_read_bits_int(A,i,start_col,k0)]; rci_t x1 = M1[mzd_read_bits_int(A,i,start_col+k0,k1)]; word const *s0 = T0->rows[x0] + addblock; word const *s1 = T1->rows[x1] + addblock; word *t = A->rows[i] + addblock; _mzd_combine2(t, s0, s1, wide); } __M4RI_DD_MZD(A); } void _mzd_ple_a11_3(mzd_t *A, rci_t const start_row, rci_t const stop_row, rci_t const start_col, wi_t const addblock, int const k0, int const knar0, mzd_t const *T0, rci_t const *M0, int const k1, int const knar1, mzd_t const *T1, rci_t const *M1, int const k2, int const knar2, mzd_t const *T2, rci_t const *M2) { wi_t const wide = A->width - addblock; if (wide <= 0) return; for(rci_t i = start_row + knar0 + knar1 + knar2; i < stop_row; ++i) { rci_t x0 = M0[mzd_read_bits_int(A,i,start_col, k0)]; rci_t x1 = M1[mzd_read_bits_int(A,i,start_col+k0, k1)]; rci_t x2 = M2[mzd_read_bits_int(A,i,start_col+k0+k1, k2)]; word const *s0 = T0->rows[x0] + addblock; word const *s1 = T1->rows[x1] + addblock; word const *s2 = T2->rows[x2] + addblock; word *t = A->rows[i] + addblock; _mzd_combine3(t, s0, s1, s2, wide); } __M4RI_DD_MZD(A); } void _mzd_ple_a11_4(mzd_t *A, rci_t const start_row, rci_t const stop_row, rci_t const start_col, wi_t const addblock, int const k0, int const knar0, mzd_t const *T0, rci_t const *M0, int const k1, int const knar1, mzd_t const *T1, rci_t const *M1, int const k2, int const knar2, mzd_t const *T2, rci_t const *M2, int const k3, int const knar3, mzd_t const *T3, rci_t const *M3) { wi_t const wide = A->width - addblock; if(wide <= 0) return; for(rci_t i = start_row + knar0 + knar1 + knar2 + knar3; i < stop_row; ++i) { rci_t x0 = M0[mzd_read_bits_int(A,i,start_col, k0)]; rci_t x1 = M1[mzd_read_bits_int(A,i,start_col+k0, k1)]; rci_t x2 = M2[mzd_read_bits_int(A,i,start_col+k0+k1, k2)]; rci_t x3 = M3[mzd_read_bits_int(A,i,start_col+k0+k1+k2, k3)]; word const *s0 = T0->rows[x0] + addblock; word const *s1 = T1->rows[x1] + addblock; word const *s2 = 
T2->rows[x2] + addblock; word const *s3 = T3->rows[x3] + addblock; word *t = A->rows[i] + addblock; _mzd_combine4(t, s0, s1, s2, s3, wide); } __M4RI_DD_MZD(A); } void _mzd_ple_a11_5(mzd_t *A, rci_t const start_row, rci_t const stop_row, rci_t const start_col, wi_t const addblock, int const k0, int const knar0, mzd_t const *T0, rci_t const *M0, int const k1, int const knar1, mzd_t const *T1, rci_t const *M1, int const k2, int const knar2, mzd_t const *T2, rci_t const *M2, int const k3, int const knar3, mzd_t const *T3, rci_t const *M3, int const k4, int const knar4, mzd_t const *T4, rci_t const *M4) { wi_t const wide = A->width - addblock; if(wide <= 0) return; for(rci_t i = start_row + knar0 + knar1 + knar2 + knar3 + knar4; i < stop_row; ++i) { rci_t x0 = M0[mzd_read_bits_int(A,i,start_col, k0)]; rci_t x1 = M1[mzd_read_bits_int(A,i,start_col+k0, k1)]; rci_t x2 = M2[mzd_read_bits_int(A,i,start_col+k0+k1, k2)]; rci_t x3 = M3[mzd_read_bits_int(A,i,start_col+k0+k1+k2, k3)]; rci_t x4 = M4[mzd_read_bits_int(A,i,start_col+k0+k1+k2+k3, k4)]; word const *s0 = T0->rows[x0] + addblock; word const *s1 = T1->rows[x1] + addblock; word const *s2 = T2->rows[x2] + addblock; word const *s3 = T3->rows[x3] + addblock; word const *s4 = T4->rows[x4] + addblock; word *t = A->rows[i] + addblock; _mzd_combine5(t, s0, s1, s2, s3, s4, wide); } __M4RI_DD_MZD(A); } void _mzd_ple_a11_6(mzd_t *A, rci_t const start_row, rci_t const stop_row, rci_t const start_col, wi_t const addblock, int const k0, int const knar0, mzd_t const *T0, rci_t const *M0, int const k1, int const knar1, mzd_t const *T1, rci_t const *M1, int const k2, int const knar2, mzd_t const *T2, rci_t const *M2, int const k3, int const knar3, mzd_t const *T3, rci_t const *M3, int const k4, int const knar4, mzd_t const *T4, rci_t const *M4, int const k5, int const knar5, mzd_t const *T5, rci_t const *M5) { wi_t const wide = A->width - addblock; if(wide <= 0) return; for(rci_t i = start_row + knar0 + knar1 + knar2 + knar3 + knar4 + 
knar5; i < stop_row; ++i) { rci_t x0 = M0[mzd_read_bits_int(A,i,start_col, k0)]; rci_t x1 = M1[mzd_read_bits_int(A,i,start_col+k0, k1)]; rci_t x2 = M2[mzd_read_bits_int(A,i,start_col+k0+k1, k2)]; rci_t x3 = M3[mzd_read_bits_int(A,i,start_col+k0+k1+k2, k3)]; rci_t x4 = M4[mzd_read_bits_int(A,i,start_col+k0+k1+k2+k3, k4)]; rci_t x5 = M4[mzd_read_bits_int(A,i,start_col+k0+k1+k2+k3+k4, k5)]; word const *s0 = T0->rows[x0] + addblock; word const *s1 = T1->rows[x1] + addblock; word const *s2 = T2->rows[x2] + addblock; word const *s3 = T3->rows[x3] + addblock; word const *s4 = T4->rows[x4] + addblock; word const *s5 = T5->rows[x5] + addblock; word *t = A->rows[i] + addblock; _mzd_combine6(t, s0, s1, s2, s3, s4, s5, wide); } __M4RI_DD_MZD(A); } /* extract E from A for table creation */ mzd_t *_mzd_ple_to_e(mzd_t *E, mzd_t const *A, rci_t r, rci_t c, int k, rci_t *offsets) { /* this function call is now rather cheap, but it could be avoided completetly if needed */ assert(E->offset == 0); assert(A->offset == 0); rci_t startcol = (c / m4ri_radix) * m4ri_radix; mzd_submatrix(E, A, r, 0, r+k, A->ncols); for(rci_t i = 0; i < k; ++i) { for(rci_t j = startcol; j < c + offsets[i]; j+=m4ri_radix) mzd_clear_bits(E, i, j, MIN(c + offsets[i] - j, m4ri_radix)); } __M4RI_DD_MZD(E); return E; } /* method of many people factorisation */ rci_t _mzd_ple_russian(mzd_t *A, mzp_t *P, mzp_t *Q, int k) { assert(A->offset == 0); rci_t const nrows = A->nrows; rci_t const ncols = A->ncols; rci_t curr_row = 0; rci_t curr_col = 0; rci_t done_row = 0; int knar = 0; /** compute good k **/ if(k == 0) { /* __M4RI_CPU_L2_CACHE == __M4RI_PLE_NTABLES * 2^k * B->width * 8 */ k = (int)log2((__M4RI_CPU_L2_CACHE/8)/(double)A->width/(double)__M4RI_PLE_NTABLES); rci_t const klog = round(0.75 * log2_floor(MIN(nrows, ncols))); if(klog < k) k = klog; if (k<2) k=2; else if(k>8) k=8; } int kk = __M4RI_PLE_NTABLES * k; assert(kk <= m4ri_radix); /** initialise permutations as identity **/ for(rci_t i = 0; i < ncols; ++i) 
Q->values[i] = i; for(rci_t i = 0; i < nrows; ++i) P->values[i] = i; mzd_t *T[__M4RI_PLE_NTABLES]; for(int i=0; i<__M4RI_PLE_NTABLES; i++) T[i] = mzd_init(__M4RI_TWOPOW(k), ncols); mzd_t *U = mzd_init(kk, ncols); /* these are the elimination lookups */ rci_t *ebuf = (rci_t*)m4ri_mm_calloc(__M4RI_PLE_NTABLES * __M4RI_TWOPOW(k), sizeof(rci_t)); rci_t *E[__M4RI_PLE_NTABLES]; for(int i=0; i<__M4RI_PLE_NTABLES; i++) E[i] = ebuf + i*__M4RI_TWOPOW(k); /* these are the multiplication lookups */ rci_t *mbuf = (rci_t*)m4ri_mm_calloc(__M4RI_PLE_NTABLES * __M4RI_TWOPOW(k), sizeof(rci_t)); rci_t *M[__M4RI_PLE_NTABLES]; for(int i=0; i<__M4RI_PLE_NTABLES; i++) M[i] = mbuf + i*__M4RI_TWOPOW(k); rci_t *done = (rci_t*)m4ri_mm_malloc(kk * sizeof(rci_t)); rci_t *pivots = (rci_t*)m4ri_mm_malloc(kk * sizeof(rci_t)); /** * The algorithm proceeds as follows */ while(curr_col < ncols && curr_row < nrows) { if(curr_col + kk > ncols) kk = ncols - curr_col; /** * 1. compute PLE factorisation for the knar x knar submatrix A00 \verbatim m4ri_radix * splitblock -------------------------------------- | A00 | A10 | | | | -------------------------------------- knar | A01 | A11 | | | | -------------------------------------- done_row | A02 | A12 | | | | | | | | | | | | | | | | -------------------------------------- \endverbatim */ wi_t splitblock = (curr_col + kk) / m4ri_radix + 1; knar = _mzd_ple_submatrix(A, curr_row, nrows, curr_col, kk, P, Q, pivots, done, &done_row, splitblock); /** * 2. update A10 */ _mzd_ple_a10(A, P, curr_row, curr_col, splitblock, knar, pivots); /** * 3. 
extract U from A0 = (A00 | A10) */ _mzd_ple_to_e(U, A, curr_row, curr_col, knar, pivots); // treat no pivot was found case if (knar == 0) { curr_col += kk; curr_row += knar; rci_t i = curr_row; rci_t j = curr_col; int found = mzd_find_pivot(A, curr_row, curr_col, &i, &j); if(found) { P->values[curr_row] = i; Q->values[curr_row] = j; mzd_row_swap(A, curr_row, i); wi_t const wrd = j / m4ri_radix; word const bm = m4ri_one << (j % m4ri_radix); if (j + 1 < A->ncols) for(rci_t l = curr_row + 1; l < nrows; ++l) if(A->rows[l][wrd] & bm) mzd_row_add_offset(A, l, curr_row, j + 1); curr_col = j + 1; ++curr_row; } else { break; } continue; } int k_[__M4RI_PLE_NTABLES], knar_[__M4RI_PLE_NTABLES], ntables = 0; if (__M4RI_PLE_NTABLES >= 6 && kk >= 5*k && kk >= 6) { ntables = 6; } else if (__M4RI_PLE_NTABLES >= 5 && kk >= 4*k && kk >= 5) { ntables = 5; } else if (__M4RI_PLE_NTABLES >= 4 && kk >= 3*k && kk >= 4) { ntables = 4; } else if (__M4RI_PLE_NTABLES >= 3 && kk >= 2*k && kk >= 3) { ntables = 3; } else if (__M4RI_PLE_NTABLES >= 2 && kk >= k && kk >= 2) { ntables = 2; } else { ntables = 1; } _kk_setup(kk, knar, k_, knar_, pivots, ntables); /** * 4. generate multiplication and inversion tables T amd E from U */ rci_t i_knar = 0; rci_t i_curr_col = curr_col; rci_t *i_pivots = pivots; int i_base = 0; for(int i=0; i= 6 case 6: /** * 5. update A1 = (A01 | A11) */ _mzd_ple_a11_6(A, curr_row, done_row+1, curr_col, splitblock, k_[0], knar_[0], T[0], M[0], k_[1], knar_[1], T[1], M[1], k_[2], knar_[2], T[2], M[2], k_[3], knar_[3], T[3], M[3], k_[4], knar_[4], T[4], M[4], k_[5], knar_[5], T[5], M[5]); /** * 6. 
update A2 = (A02 | A12) */ if (done_row < nrows) { mzd_process_rows6_ple(A, done_row + 1, nrows, curr_col, k_[0], T[0], E[0], k_[1], T[1], E[1], k_[2], T[2], E[2], k_[3], T[3], E[3], k_[4], T[4], E[4], k_[5], T[5], E[5]); } break; #endif #if __M4RI_PLE_NTABLES >= 5 case 5: _mzd_ple_a11_5(A, curr_row, done_row+1, curr_col, splitblock, k_[0], knar_[0], T[0], M[0], k_[1], knar_[1], T[1], M[1], k_[2], knar_[2], T[2], M[2], k_[3], knar_[3], T[3], M[3], k_[4], knar_[4], T[4], M[4]); if (done_row < nrows) { mzd_process_rows5_ple(A, done_row + 1, nrows, curr_col, k_[0], T[0], E[0], k_[1], T[1], E[1], k_[2], T[2], E[2], k_[3], T[3], E[3], k_[4], T[4], E[4]); } break; #endif #if __M4RI_PLE_NTABLES >= 4 case 4: _mzd_ple_a11_4(A, curr_row, done_row+1, curr_col, splitblock, k_[0], knar_[0], T[0], M[0], k_[1], knar_[1], T[1], M[1], k_[2], knar_[2], T[2], M[2], k_[3], knar_[3], T[3], M[3]); if (done_row < nrows) { mzd_process_rows4_ple(A, done_row + 1, nrows, curr_col, k_[0], T[0], E[0], k_[1], T[1], E[1], k_[2], T[2], E[2], k_[3], T[3], E[3]); } break; #endif #if __M4RI_PLE_NTABLES >= 3 case 3: _mzd_ple_a11_3(A, curr_row, done_row+1, curr_col, splitblock, k_[0], knar_[0], T[0], M[0], k_[1], knar_[1], T[1], M[1], k_[2], knar_[2], T[2], M[2]); if (done_row < nrows) { mzd_process_rows3_ple(A, done_row + 1, nrows, curr_col, k_[0], T[0], E[0], k_[1], T[1], E[1], k_[2], T[2], E[2]); } break; #endif #if __M4RI_PLE_NTABLES >= 2 case 2: _mzd_ple_a11_2(A, curr_row, done_row+1, curr_col, splitblock, k_[0], knar_[0], T[0], M[0], k_[1], knar_[1], T[1], M[1]); if(done_row < nrows) { mzd_process_rows2_ple(A, done_row + 1, nrows, curr_col, k_[0], T[0], E[0], k_[1], T[1], E[1]); } break; #endif case 1: _mzd_ple_a11_1(A, curr_row, done_row+1, curr_col, splitblock, kk, knar, T[0], M[0]); if(done_row < nrows) { mzd_process_rows(A, done_row + 1, nrows, curr_col, kk, T[0], E[0]); } break; default: m4ri_die("ntables = %d not supported.\n",ntables); } curr_col += kk; curr_row += knar; } /* Now 
compressing L */ for (rci_t j = 0; j < curr_row; ++j){ if (Q->values[j] > j) { mzd_col_swap_in_rows(A, Q->values[j], j, j, curr_row); } } mzp_t *Qbar = mzp_init_window(Q, 0, curr_row); mzd_apply_p_right_trans_even_capped(A, Qbar, curr_row, 0); mzp_free_window(Qbar); mzd_free(U); for(int i=0; i<__M4RI_PLE_NTABLES; i++) mzd_free(T[i]); m4ri_mm_free(ebuf); m4ri_mm_free(mbuf); m4ri_mm_free(done); m4ri_mm_free(pivots); __M4RI_DD_MZD(A); __M4RI_DD_MZP(P); __M4RI_DD_MZP(Q); __M4RI_DD_RCI(curr_row); return curr_row; } /* PLUQ via PLE: runs _mzd_ple_russian(), then calls mzd_apply_p_right_trans_tri(A, Q); returns the value returned by _mzd_ple_russian(). */ rci_t _mzd_pluq_russian(mzd_t *A, mzp_t *P, mzp_t *Q, int const k) { rci_t r = _mzd_ple_russian(A, P, Q, k); mzd_apply_p_right_trans_tri(A, Q); return r; } libm4ri-20130416/src/ple_russian.h000066400000000000000000000151021212302366200165760ustar00rootroot00000000000000/** * \file ple_russian.h * \brief PLE and PLUQ factorization using Gray codes. * * \author Martin Albrecht * * \example testsuite/test_ple.c */ #ifndef M4RI_PLE_RUSSIAN #define M4RI_PLE_RUSSIAN /******************************************************************* * * M4RI: Linear Algebra over GF(2) * * Copyright (C) 2008-2011 Martin Albrecht * * Distributed under the terms of the GNU General Public License (GPL) * version 2 or higher. * * This code is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * The full text of the GPL is available at: * * http://www.gnu.org/licenses/ * ********************************************************************/ #include #include /** * \brief PLE matrix decomposition of A using Gray codes. * * Returns (P,L,E,Q) satisfying PLE = A where P is a permutation * matrix of dimension m x m, L is m x r unit lower triangular and E * is an r x n matrix which is upper triangular except that its * columns are permuted, that is E = UQ for U r x n upper triangular * and Q is a n x n permutation matrix. 
The matrices L and E are stored * in place over A. * * \param A Matrix. * \param P Preallocated row permutation. * \param Q Preallocated column permutation. * \param k Size of Gray code tables (0 lets the function choose a value). * * \wordoffset * * \return Rank of A. */ rci_t _mzd_ple_russian(mzd_t *A, mzp_t *P, mzp_t *Q, int k); /** * \brief PLUQ matrix decomposition of A using Gray codes. * * Returns (P,L,U,Q) satisfying PLUQ = A where P and Q are two * permutation matrices, of dimension respectively m x m and n x n, L * is m x r unit lower triangular and U is r x n upper triangular. * * \param A Matrix. * \param P Preallocated row permutation. * \param Q Preallocated column permutation. * \param k Size of Gray code tables. * * \wordoffset * * \return Rank of A. */ rci_t _mzd_pluq_russian(mzd_t *A, mzp_t *P, mzp_t *Q, int k); /** * \brief PLE matrix decomposition of a submatrix for up to k columns * starting at (r,c). * * Updates P and Q and modifies A in place. The buffer done afterwards * holds how far a particular row was already eliminated. * * \param A Matrix. * \param start_row Row Offset. * \param stop_row Up to which row the matrix should be processed (exclusive). * \param start_col Column Offset. * \param k Size of Gray code tables. * \param P Preallocated row permutation. * \param Q Preallocated column permutation. * \param pivots which column holds the i-th pivot * \param done Preallocated temporary buffer. * \param done_row Stores the last row which is already reduced after function execution. * \param splitblock First block which is not considered by this function. * * \retval knar rank of the considered submatrix */ int _mzd_ple_submatrix(mzd_t *A, rci_t const start_row, rci_t const stop_row, rci_t const start_col, int const k, mzp_t *P, mzp_t *Q, rci_t *pivots, rci_t *done, rci_t *done_row, wi_t const splitblock); /** * \brief Extract the k x A::ncols echelon form submatrix of A starting at row r and column c. * * \param E Storage for k x A::ncols matrix. 
* \param A Source matrix. * \param r Row index. * \param c Column index. * \param k Rank of E. * \param offsets Map from i to the column offset (relative to c) of the i-th pivot. */ mzd_t *_mzd_ple_to_e(mzd_t *E, mzd_t const *A, rci_t r, rci_t c, int k, rci_t *offsets); /** * \brief add rows of T0,T1 (selected through E0,E1) to M between startrow and stoprow, starting at startcol. * * \param M Matrix * \param startrow Start processing in this row * \param stoprow Stop processing in this row * \param startcol Start processing in this column * \param k0 Number of bits to read for E0 * \param T0 2^k0 x A::ncols table * \param E0 Lookup index -> row of T0 * \param k1 Number of bits to read for E1 * \param T1 2^k1 x A::ncols table * \param E1 Lookup index -> row of T1 */ void mzd_process_rows2_ple(mzd_t *M, rci_t startrow, rci_t stoprow, rci_t startcol, int const k0, mzd_t const *T0, rci_t const *E0, int const k1, mzd_t const *T1, rci_t const *E1); /** * \brief add rows of T0,T1,T2 (selected through E0,E1,E2) to M between startrow and stoprow, starting at startcol. * * \param M Matrix * \param startrow Start processing in this row * \param stoprow Stop processing in this row * \param startcol Start processing in this column * \param k0 Number of bits to read for E0 * \param T0 2^k0 x A::ncols table * \param E0 Lookup index -> row of T0 * \param k1 Number of bits to read for E1 * \param T1 2^k1 x A::ncols table * \param E1 Lookup index -> row of T1 * \param k2 Number of bits to read for E2 * \param T2 2^k2 x A::ncols table * \param E2 Lookup index -> row of T2 */ void mzd_process_rows3_ple(mzd_t *M, rci_t startrow, rci_t stoprow, rci_t startcol, int const k0, mzd_t const *T0, rci_t const *E0, int const k1, mzd_t const *T1, rci_t const *E1, int const k2, mzd_t const *T2, rci_t const *E2); /** * \brief add rows of T0,T1,T2,T3 (selected through E0,E1,E2,E3) to M between startrow and stoprow, starting at startcol. 
* * \param M Matrix * \param startrow Start processing in this row * \param stoprow Stop processing in this row * \param startcol Start processing in this column * \param k0 Number of bits to read for E0 * \param T0 2^k0 x A::ncols table * \param E0 Lookup index -> row of T0 * \param k1 Number of bits to read for E1 * \param T1 2^k1 x A::ncols table * \param E1 Lookup index -> row of T1 * \param k2 Number of bits to read for E2 * \param T2 2^k2 x A::ncols table * \param E2 Lookup index -> row of T2 * \param k3 Number of bits to read for E3 * \param T3 2^k3 x A::ncols table * \param E3 Lookup index -> row of T3 */ void mzd_process_rows4_ple(mzd_t *M, rci_t startrow, rci_t stoprow, rci_t startcol, int const k0, mzd_t const *T0, rci_t const *E0, int const k1, mzd_t const *T1, rci_t const *E1, int const k2, mzd_t const *T2, rci_t const *E2, int const k3, mzd_t const *T3, rci_t const *E3); #endif // M4RI_PLE_RUSSIAN libm4ri-20130416/src/solve.c000066400000000000000000000122421212302366200153770ustar00rootroot00000000000000 /******************************************************************* * * M4RI: Linear Algebra over GF(2) * * Copyright (C) 2008 Jean-Guillaume.Dumas@imag.fr * * Distributed under the terms of the GNU General Public License (GPL) * version 2 or higher. * * This code is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. 
* * The full text of the GPL is available at: * * http://www.gnu.org/licenses/ * ********************************************************************/ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include "solve.h" #include "strassen.h" #include "ple.h" #include "triangular.h" #include "mzp.h" int mzd_solve_left(mzd_t *A, mzd_t *B, int const cutoff, int const inconsistency_check) { if(A->ncols > B->nrows) m4ri_die("mzd_solve_left: A ncols (%d) must be smaller than B nrows (%d).\n", A->ncols, B->nrows); return _mzd_solve_left(A, B, cutoff, inconsistency_check); } int mzd_pluq_solve_left (mzd_t const *A, rci_t rank, mzp_t const *P, mzp_t const *Q, mzd_t *B, int const cutoff, int const inconsistency_check) { if(A->ncols > B->nrows) m4ri_die("mzd_pluq_solve_left: A ncols (%d) need to be lower than B nrows (%d).\n", A->ncols, B->nrows); if(P->length != A->nrows) m4ri_die("mzd_pluq_solve_left: A nrows (%d) need to match P size (%d).\n", A->nrows, P->length); if(Q->length != A->ncols) m4ri_die("mzd_pluq_solve_left: A ncols (%d) need to match Q size (%d).\n", A->ncols, P->length); return _mzd_pluq_solve_left (A, rank, P, Q, B, cutoff, inconsistency_check); } int _mzd_pluq_solve_left(mzd_t const *A, rci_t rank, mzp_t const *P, mzp_t const *Q, mzd_t *B, int const cutoff, int const inconsistency_check) { /** A is supposed to store L lower triangular and U upper triangular * B is modified in place * (Bi's in the comments are just modified versions of B) * PLUQ = A * 1) P B2 = B1 * 2) L B3 = B2 * 3) U B4 = B3 * 4) Q B5 = B4 */ int retval = 0; /* P B2 = B1 or B2 = P^T B1 */ mzd_apply_p_left(B, P); /* L B3 = B2 */ /* view on the upper part of L */ mzd_t const *LU = mzd_init_window_const(A, 0, 0, rank, rank); mzd_t *Y1 = mzd_init_window(B, 0, 0, rank, B->ncols); mzd_trsm_lower_left(LU, Y1, cutoff); if (inconsistency_check) { /* Check for inconsistency */ /** FASTER without this check; update with the lower part of L */ mzd_t const *H = mzd_init_window_const(A, rank, 0, 
A->nrows, rank); mzd_t *Y2 = mzd_init_window(B, rank, 0, A->nrows, B->ncols); if(A->nrows < B->nrows) { mzd_t *Y3 = mzd_init_window(B, A->nrows, 0, B->nrows, B->ncols); mzd_set_ui(Y3, 0); mzd_free_window(Y3); } mzd_addmul(Y2, H, Y1, cutoff); /* * test whether Y2 is the zero matrix */ if(!mzd_is_zero(Y2)) { retval = -1; } mzd_free_window((mzd_t*)H); mzd_free_window(Y2); } /* U B4 = B3 */ mzd_trsm_upper_left(LU, Y1, cutoff); mzd_free_window((mzd_t*)LU); mzd_free_window(Y1); if (!inconsistency_check) { /** Default is to set the undefined bits to zero if inconsistency * has been checked then Y2 bits are already all zeroes thus this * clearing is not needed */ for(rci_t i = rank; i < B->nrows; ++i) { for(rci_t j = 0; j < B->ncols; j += m4ri_radix) { mzd_clear_bits(B, i, j, MIN(m4ri_radix, B->ncols - j)); } } } /* Q B5 = B4 or B5 = Q^T B4 */ mzd_apply_p_left_trans(B, Q); /* P L U Q B5 = B1 */ __M4RI_DD_MZD(B); __M4RI_DD_INT(retval); return retval; } int _mzd_solve_left(mzd_t *A, mzd_t *B, int const cutoff, int const inconsistency_check) { /** * B is modified in place * (Bi's in the comments are just modified versions of B) * 1) PLUQ = A * 2) P B2 = B1 * 3) L B3 = B2 * 4) U B4 = B3 * 5) Q B5 = B4 */ mzp_t *P = mzp_init(A->nrows); mzp_t *Q = mzp_init(A->ncols); /* PLUQ = A */ rci_t rank = _mzd_pluq(A, P, Q, cutoff); /* 2, 3, 4, 5 */ int retval = mzd_pluq_solve_left(A, rank, P, Q, B, cutoff, inconsistency_check); mzp_free(P); mzp_free(Q); __M4RI_DD_MZD(A); __M4RI_DD_MZD(B); return retval; } mzd_t *mzd_kernel_left_pluq(mzd_t *A, int const cutoff) { mzp_t *P = mzp_init(A->nrows); mzp_t *Q = mzp_init(A->ncols); rci_t r = mzd_pluq(A, P, Q, cutoff); if (r == A->ncols) { mzp_free(P); mzp_free(Q); __M4RI_DD_MZD(A); return NULL; } mzd_t *U = mzd_init_window(A, 0, 0, r, r); mzd_t *B = mzd_init_window(A, 0, r, r, A->ncols); mzd_trsm_upper_left(U, B, cutoff); mzd_t *R = mzd_init(A->ncols, A->ncols - r); mzd_t *RU = mzd_init_window(R, 0, 0, r, R->ncols); mzd_copy(RU, B); for(rci_t i = 
0; i < R->ncols; ++i) { mzd_write_bit(R, r + i, i, 1); } mzd_apply_p_left_trans(R, Q); mzp_free(P); mzp_free(Q); mzd_free_window(RU); mzd_free_window(U); mzd_free_window(B); __M4RI_DD_MZD(A); __M4RI_DD_MZD(R); return R; } libm4ri-20130416/src/solve.h000066400000000000000000000120471212302366200154070ustar00rootroot00000000000000/** * \file solve.h * * \brief System solving with matrix routines. * * \author Jean-Guillaume Dumas * */ #ifndef M4RI_SOLVE_H #define M4RI_SOLVE_H /******************************************************************* * * M4RI: Linear Algebra over GF(2) * * Copyright (C) 2008 Jean-Guillaume.Dumas@imag.fr * * Distributed under the terms of the GNU General Public License (GPL) * * This code is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * The full text of the GPL is available at: * * http://www.gnu.org/licenses/ * ********************************************************************/ #include #include /** * \brief Solves A X = B with A and B matrices. * * The solution X is stored inplace on B. * * \param A Input matrix (overwritten). * \param B Input matrix, being overwritten by the solution matrix X * \param cutoff Minimal dimension for Strassen recursion (default: 0). * \param inconsistency_check decide wether or not to perform a check * for incosistency (faster without but output not defined if * system is not consistent). * \return 0 if a solution was found, -1 otherwise */ int mzd_solve_left(mzd_t *A, mzd_t *B, int const cutoff, int const inconsistency_check); /** * \brief Solves (P L U Q) X = B * * A is an input matrix supposed to store both: * \li an upper right triangular matrix U * \li a lower left unitary triangular matrix L. 
* * The solution X is stored inplace on B * * This version assumes that the matrices are at an even position on * the m4ri_radix grid and that their dimension is a multiple of m4ri_radix. * * \param A Input upper/lower triangular matrices. * \param rank is rank of A. * \param P Input row permutation matrix. * \param Q Input column permutation matrix. * \param B Input matrix, being overwritten by the solution matrix X. * \param cutoff Minimal dimension for Strassen recursion (default: 0). * \param inconsistency_check decide whether or not to perform a check * for inconsistency (faster without but output not defined if * system is not consistent). * \return 0 if a solution was found, -1 otherwise */ int mzd_pluq_solve_left (mzd_t const *A, rci_t rank, mzp_t const *P, mzp_t const *Q, mzd_t *B, int const cutoff, int const inconsistency_check); /** * \brief Solves (P L U Q) X = B * * A is an input matrix supposed to store both: * \li an upper right triangular matrix U * \li a lower left unitary triangular matrix L. * The solution X is stored inplace on B. * * This version assumes that the matrices are at an even position on * the m4ri_radix grid and that their dimension is a multiple of m4ri_radix. * * \param A Input upper/lower triangular matrices. * \param rank is rank of A. * \param P Input row permutation matrix. * \param Q Input column permutation matrix. * \param B Input matrix, being overwritten by the solution matrix X. * \param cutoff Minimal dimension for Strassen recursion (default: 0). * \param inconsistency_check decide whether or not to perform a check * for inconsistency (faster without but output not defined if * system is not consistent). 
* \return 0 if a solution was found, -1 otherwise */ int _mzd_pluq_solve_left(mzd_t const *A, rci_t rank, mzp_t const *P, mzp_t const *Q, mzd_t *B, int const cutoff, int const inconsistency_check); /** * \brief Solves A X = B with A and B matrices. * * The solution X is stored inplace on B. * * This version assumes that the matrices are at an even position on * the m4ri_radix grid and that their dimension is a multiple of m4ri_radix. * * \param A Input matrix (overwritten). * \param B Input matrix, being overwritten by the solution matrix X. * \param cutoff Minimal dimension for Strassen recursion (default: 0). * \param inconsistency_check decide whether or not to perform a check * for inconsistency (faster without but output not defined if * system is not consistent). * \return 0 if a solution was found, -1 otherwise */ int _mzd_solve_left(mzd_t *A, mzd_t *B, int const cutoff, int const inconsistency_check); /** * \brief Solve X for A X = 0. * * If r is the rank of the nr x nc matrix A, return the nc x (nc-r) * matrix X such that A*X == 0 and that the columns of X are linearly * independent. * * \param A Input matrix (overwritten). * \param cutoff Minimal dimension for Strassen recursion (default: 0). * * \wordoffset * * \sa mzd_pluq() * * \return X, NULL if kernel is empty */ mzd_t *mzd_kernel_left_pluq(mzd_t *A, int const cutoff); #endif // M4RI_SOLVE_H libm4ri-20130416/src/strassen.c000066400000000000000000001103731212302366200161150ustar00rootroot00000000000000/******************************************************************* * * M4RI: Linear Algebra over GF(2) * * Copyright (C) 2008 Martin Albrecht * Copyright (C) 2008 Clement Pernet * Copyright (C) 2008 Marco Bodrato * * Distributed under the terms of the GNU General Public License (GPL) * version 2 or higher. 
* * This code is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * The full text of the GPL is available at: * * http://www.gnu.org/licenses/ * ********************************************************************/ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include "graycode.h" #include "strassen.h" #include "parity.h" #ifndef MIN #define MIN(a,b) (((a)<(b))?(a):(b)) #endif #if __M4RI_HAVE_OPENMP #include #endif // Returns true if a is closer to cutoff than a/2. static inline int CLOSER(rci_t a, int cutoff) { return 3 * a < 4 * cutoff; } /** * Simple blockwise product */ mzd_t *_mzd_addmul_mp_even(mzd_t *C, mzd_t const *A, mzd_t const *B, int cutoff); mzd_t *_mzd_mul_even(mzd_t *C, mzd_t const *A, mzd_t const *B, int cutoff) { rci_t mmm, kkk, nnn; if(C->nrows == 0 || C->ncols == 0) return C; rci_t m = A->nrows; rci_t k = A->ncols; rci_t n = B->ncols; /* handle case first, where the input matrices are too small already */ if (CLOSER(m, cutoff) || CLOSER(k, cutoff) || CLOSER(n, cutoff)) { /* we copy the matrices first since it is only constant memory overhead and improves data locality */ if(mzd_is_windowed(A)|mzd_is_windowed(B)|mzd_is_windowed(C)) { mzd_t *Abar = mzd_copy(NULL, A); mzd_t *Bbar = mzd_copy(NULL, B); mzd_t *Cbar = mzd_init(m, n); _mzd_mul_m4rm(Cbar, Abar, Bbar, 0, FALSE); mzd_copy(C, Cbar); mzd_free(Cbar); mzd_free(Bbar); mzd_free(Abar); } else { _mzd_mul_m4rm(C, A, B, 0, TRUE); } return C; } /* adjust cutting numbers to work on words */ { rci_t mult = m4ri_radix; rci_t width = MIN(MIN(m, n), k) / 2; while (width > cutoff) { width /= 2; mult *= 2; } mmm = (((m - m % mult) / m4ri_radix) >> 1) * m4ri_radix; kkk = (((k - k % mult) / m4ri_radix) >> 1) * m4ri_radix; nnn = (((n - n % mult) / m4ri_radix) >> 1) * m4ri_radix; } /* |A | |B | |C | * Compute | | x | | = | | */ { mzd_t 
const *A11 = mzd_init_window_const(A, 0, 0, mmm, kkk); mzd_t const *A12 = mzd_init_window_const(A, 0, kkk, mmm, 2*kkk); mzd_t const *A21 = mzd_init_window_const(A, mmm, 0, 2*mmm, kkk); mzd_t const *A22 = mzd_init_window_const(A, mmm, kkk, 2*mmm, 2*kkk); mzd_t const *B11 = mzd_init_window_const(B, 0, 0, kkk, nnn); mzd_t const *B12 = mzd_init_window_const(B, 0, nnn, kkk, 2*nnn); mzd_t const *B21 = mzd_init_window_const(B, kkk, 0, 2*kkk, nnn); mzd_t const *B22 = mzd_init_window_const(B, kkk, nnn, 2*kkk, 2*nnn); mzd_t *C11 = mzd_init_window(C, 0, 0, mmm, nnn); mzd_t *C12 = mzd_init_window(C, 0, nnn, mmm, 2*nnn); mzd_t *C21 = mzd_init_window(C, mmm, 0, 2*mmm, nnn); mzd_t *C22 = mzd_init_window(C, mmm, nnn, 2*mmm, 2*nnn); /** * \note See Marco Bodrato; "A Strassen-like Matrix Multiplication * Suited for Squaring and Highest Power Computation"; * http://bodrato.it/papres/#CIVV2008 for reference on the used * sequence of operations. */ /* change this to mzd_init(mmm, MAX(nnn,kkk)) to fix the todo below */ mzd_t *Wmk = mzd_init(mmm, kkk); mzd_t *Wkn = mzd_init(kkk, nnn); _mzd_add(Wkn, B22, B12); /* Wkn = B22 + B12 */ _mzd_add(Wmk, A22, A12); /* Wmk = A22 + A12 */ _mzd_mul_even(C21, Wmk, Wkn, cutoff);/* C21 = Wmk * Wkn */ _mzd_add(Wmk, A22, A21); /* Wmk = A22 - A21 */ _mzd_add(Wkn, B22, B21); /* Wkn = B22 - B21 */ _mzd_mul_even(C22, Wmk, Wkn, cutoff);/* C22 = Wmk * Wkn */ _mzd_add(Wkn, Wkn, B12); /* Wkn = Wkn + B12 */ _mzd_add(Wmk, Wmk, A12); /* Wmk = Wmk + A12 */ _mzd_mul_even(C11, Wmk, Wkn, cutoff);/* C11 = Wmk * Wkn */ _mzd_add(Wmk, Wmk, A11); /* Wmk = Wmk - A11 */ _mzd_mul_even(C12, Wmk, B12, cutoff);/* C12 = Wmk * B12 */ _mzd_add(C12, C12, C22); /* C12 = C12 + C22 */ /** * \todo ideally we would use the same Wmk throughout the function * but some called function doesn't like that and we end up with a * wrong result if we use virtual Wmk matrices. Ideally, this should * be fixed not worked around. 
The check whether the bug has been * fixed, use only one Wmk and check if mzd_mul(4096, 3528, * 4096, 2124) still returns the correct answer. */ mzd_free(Wmk); Wmk = mzd_mul(NULL, A12, B21, cutoff);/*Wmk = A12 * B21 */ _mzd_add(C11, C11, Wmk); /* C11 = C11 + Wmk */ _mzd_add(C12, C11, C12); /* C12 = C11 - C12 */ _mzd_add(C11, C21, C11); /* C11 = C21 - C11 */ _mzd_add(Wkn, Wkn, B11); /* Wkn = Wkn - B11 */ _mzd_mul_even(C21, A21, Wkn, cutoff); /* C21 = A21 * Wkn */ mzd_free(Wkn); _mzd_add(C21, C11, C21); /* C21 = C11 - C21 */ _mzd_add(C22, C22, C11); /* C22 = C22 + C11 */ _mzd_mul_even(C11, A11, B11, cutoff); /* C11 = A11 * B11 */ _mzd_add(C11, C11, Wmk); /* C11 = C11 + Wmk */ /* clean up */ mzd_free_window((mzd_t*)A11); mzd_free_window((mzd_t*)A12); mzd_free_window((mzd_t*)A21); mzd_free_window((mzd_t*)A22); mzd_free_window((mzd_t*)B11); mzd_free_window((mzd_t*)B12); mzd_free_window((mzd_t*)B21); mzd_free_window((mzd_t*)B22); mzd_free_window(C11); mzd_free_window(C12); mzd_free_window(C21); mzd_free_window(C22); mzd_free(Wmk); } /* deal with rest */ nnn *= 2; if (n > nnn) { /* |AA| | B| | C| * Compute |AA| x | B| = | C| */ mzd_t const *B_last_col = mzd_init_window_const(B, 0, nnn, k, n); mzd_t *C_last_col = mzd_init_window(C, 0, nnn, m, n); _mzd_mul_m4rm(C_last_col, A, B_last_col, 0, TRUE); mzd_free_window((mzd_t*)B_last_col); mzd_free_window(C_last_col); } mmm *= 2; if (m > mmm) { /* | | |B | | | * Compute |AA| x |B | = |C | */ mzd_t const *A_last_row = mzd_init_window_const(A, mmm, 0, m, k); mzd_t const *B_first_col= mzd_init_window_const(B, 0, 0, k, nnn); mzd_t *C_last_row = mzd_init_window(C, mmm, 0, m, nnn); _mzd_mul_m4rm(C_last_row, A_last_row, B_first_col, 0, TRUE); mzd_free_window((mzd_t*)A_last_row); mzd_free_window((mzd_t*)B_first_col); mzd_free_window(C_last_row); } kkk *= 2; if (k > kkk) { /* Add to | | | B| |C | * result |A | x | | = | | */ mzd_t const *A_last_col = mzd_init_window_const(A, 0, kkk, mmm, k); mzd_t const *B_last_row = 
mzd_init_window_const(B, kkk, 0, k, nnn); mzd_t *C_bulk = mzd_init_window(C, 0, 0, mmm, nnn); mzd_addmul_m4rm(C_bulk, A_last_col, B_last_row, 0); mzd_free_window((mzd_t*)A_last_col); mzd_free_window((mzd_t*)B_last_row); mzd_free_window(C_bulk); } __M4RI_DD_MZD(C); return C; } mzd_t *_mzd_sqr_even(mzd_t *C, mzd_t const *A, int cutoff) { rci_t m; m = A->nrows; /* handle case first, where the input matrices are too small already */ if (CLOSER(m, cutoff)) { /* we copy the matrices first since it is only constant memory overhead and improves data locality */ if(mzd_is_windowed(A)|mzd_is_windowed(C)) { mzd_t *Abar = mzd_copy(NULL, A); mzd_t *Cbar = mzd_init(m, m); _mzd_mul_m4rm(Cbar, Abar, Abar, 0, FALSE); mzd_copy(C, Cbar); mzd_free(Cbar); mzd_free(Abar); } else { _mzd_mul_m4rm(C, A, A, 0, TRUE); } return C; } /* adjust cutting numbers to work on words */ rci_t mmm; { rci_t mult = m4ri_radix; rci_t width = m / 2; while (width > cutoff) { width /= 2; mult *= 2; } mmm = (((m - m % mult) / m4ri_radix) >> 1) * m4ri_radix; } /* |A | |A | |C | * Compute | | x | | = | | */ { mzd_t const *A11 = mzd_init_window_const(A, 0, 0, mmm, mmm); mzd_t const *A12 = mzd_init_window_const(A, 0, mmm, mmm, 2*mmm); mzd_t const *A21 = mzd_init_window_const(A, mmm, 0, 2*mmm, mmm); mzd_t const *A22 = mzd_init_window_const(A, mmm, mmm, 2*mmm, 2*mmm); mzd_t *C11 = mzd_init_window(C, 0, 0, mmm, mmm); mzd_t *C12 = mzd_init_window(C, 0, mmm, mmm, 2*mmm); mzd_t *C21 = mzd_init_window(C, mmm, 0, 2*mmm, mmm); mzd_t *C22 = mzd_init_window(C, mmm, mmm, 2*mmm, 2*mmm); /** * \note See Marco Bodrato; "A Strassen-like Matrix Multiplication * Suited for Squaring and Highest Power Computation"; * http://bodrato.it/papres/#CIVV2008 for reference on the used * sequence of operations. 
*/ mzd_t *Wmk; mzd_t *Wkn = mzd_init(mmm, mmm); _mzd_add(Wkn, A22, A12); /* Wkn = A22 + A12 */ _mzd_sqr_even(C21, Wkn, cutoff); /* C21 = Wkn^2 */ _mzd_add(Wkn, A22, A21); /* Wkn = A22 - A21 */ _mzd_sqr_even(C22, Wkn, cutoff); /* C22 = Wkn^2 */ _mzd_add(Wkn, Wkn, A12); /* Wkn = Wkn + A12 */ _mzd_sqr_even(C11, Wkn, cutoff); /* C11 = Wkn^2 */ _mzd_add(Wkn, Wkn, A11); /* Wkn = Wkn - A11 */ _mzd_mul_even(C12, Wkn, A12, cutoff);/* C12 = Wkn * A12 */ _mzd_add(C12, C12, C22); /* C12 = C12 + C22 */ Wmk = mzd_mul(NULL, A12, A21, cutoff);/*Wmk = A12 * A21 */ _mzd_add(C11, C11, Wmk); /* C11 = C11 + Wmk */ _mzd_add(C12, C11, C12); /* C12 = C11 - C12 */ _mzd_add(C11, C21, C11); /* C11 = C21 - C11 */ _mzd_mul_even(C21, A21, Wkn, cutoff);/* C21 = A21 * Wkn */ mzd_free(Wkn); _mzd_add(C21, C11, C21); /* C21 = C11 - C21 */ _mzd_add(C22, C22, C11); /* C22 = C22 + C11 */ _mzd_sqr_even(C11, A11, cutoff); /* C11 = A11^2 */ _mzd_add(C11, C11, Wmk); /* C11 = C11 + Wmk */ /* clean up */ mzd_free_window((mzd_t*)A11); mzd_free_window((mzd_t*)A12); mzd_free_window((mzd_t*)A21); mzd_free_window((mzd_t*)A22); mzd_free_window(C11); mzd_free_window(C12); mzd_free_window(C21); mzd_free_window(C22); mzd_free(Wmk); } /* deal with rest */ mmm *= 2; if (m > mmm) { /* |AA| | A| | C| * Compute |AA| x | A| = | C| */ { mzd_t const *A_last_col = mzd_init_window_const(A, 0, mmm, m, m); mzd_t *C_last_col = mzd_init_window(C, 0, mmm, m, m); _mzd_mul_m4rm(C_last_col, A, A_last_col, 0, TRUE); mzd_free_window((mzd_t*)A_last_col); mzd_free_window(C_last_col); } /* | | |A | | | * Compute |AA| x |A | = |C | */ { mzd_t const *A_last_row = mzd_init_window_const(A, mmm, 0, m, m); mzd_t const *A_first_col= mzd_init_window_const(A, 0, 0, m, mmm); mzd_t *C_last_row = mzd_init_window(C, mmm, 0, m, mmm); _mzd_mul_m4rm(C_last_row, A_last_row, A_first_col, 0, TRUE); mzd_free_window((mzd_t*)A_last_row); mzd_free_window((mzd_t*)A_first_col); mzd_free_window(C_last_row); } /* Add to | | | A| |C | * result |A | x | | = | | */ { 
mzd_t const *A_last_col = mzd_init_window_const(A, 0, mmm, mmm, m); mzd_t const *A_last_row = mzd_init_window_const(A, mmm, 0, m, mmm); mzd_t *C_bulk = mzd_init_window(C, 0, 0, mmm, mmm); mzd_addmul_m4rm(C_bulk, A_last_col, A_last_row, 0); mzd_free_window((mzd_t*)A_last_col); mzd_free_window((mzd_t*)A_last_row); mzd_free_window(C_bulk); } } __M4RI_DD_MZD(C); return C; } #if __M4RI_HAVE_OPENMP mzd_t *_mzd_addmul_mp_even(mzd_t *C, mzd_t const *A, mzd_t const *B, int cutoff) { /** * \todo make sure not to overwrite crap after ncols and before width * m4ri_radix */ rci_t a = A->nrows; rci_t b = A->ncols; rci_t c = B->ncols; /* handle case first, where the input matrices are too small already */ if (CLOSER(A->nrows, cutoff) || CLOSER(A->ncols, cutoff) || CLOSER(B->ncols, cutoff)) { /* we copy the matrix first since it is only constant memory overhead and improves data locality, if you remove it make sure there are no speed regressions */ /* C = _mzd_mul_m4rm(C, A, B, 0, TRUE); */ mzd_t *Cbar = mzd_init(C->nrows, C->ncols); Cbar = _mzd_mul_m4rm(Cbar, A, B, 0, FALSE); mzd_copy(C, Cbar); mzd_free(Cbar); return C; } /* adjust cutting numbers to work on words */ { rci_t mult = 2 * m4ri_radix; /* rci_t width = a; */ /* while (width > 2 * cutoff) { */ /* width /= 2; */ /* mult *= 2; */ /* } */ a -= a % mult; b -= b % mult; c -= c % mult; } rci_t anr = ((a / m4ri_radix) >> 1) * m4ri_radix; rci_t anc = ((b / m4ri_radix) >> 1) * m4ri_radix; rci_t bnr = anc; rci_t bnc = ((c / m4ri_radix) >> 1) * m4ri_radix; mzd_t const *A00 = mzd_init_window_const(A, 0, 0, anr, anc); mzd_t const *A01 = mzd_init_window_const(A, 0, anc, anr, 2*anc); mzd_t const *A10 = mzd_init_window_const(A, anr, 0, 2*anr, anc); mzd_t const *A11 = mzd_init_window_const(A, anr, anc, 2*anr, 2*anc); mzd_t const *B00 = mzd_init_window_const(B, 0, 0, bnr, bnc); mzd_t const *B01 = mzd_init_window_const(B, 0, bnc, bnr, 2*bnc); mzd_t const *B10 = mzd_init_window_const(B, bnr, 0, 2*bnr, bnc); mzd_t const *B11 = 
mzd_init_window_const(B, bnr, bnc, 2*bnr, 2*bnc); mzd_t *C00 = mzd_init_window(C, 0, 0, anr, bnc); mzd_t *C01 = mzd_init_window(C, 0, bnc, anr, 2*bnc); mzd_t *C10 = mzd_init_window(C, anr, 0, 2*anr, bnc); mzd_t *C11 = mzd_init_window(C, anr, bnc, 2*anr, 2*bnc); #pragma omp parallel sections num_threads(4) { #pragma omp section { _mzd_addmul_even(C00, A00, B00, cutoff); _mzd_addmul_even(C00, A01, B10, cutoff); } #pragma omp section { _mzd_addmul_even(C01, A00, B01, cutoff); _mzd_addmul_even(C01, A01, B11, cutoff); } #pragma omp section { _mzd_addmul_even(C10, A10, B00, cutoff); _mzd_addmul_even(C10, A11, B10, cutoff); } #pragma omp section { _mzd_addmul_even(C11, A10, B01, cutoff); _mzd_addmul_even(C11, A11, B11, cutoff); } } /* deal with rest */ if (B->ncols > 2 * bnc) { mzd_t const *B_last_col = mzd_init_window_const(B, 0, 2*bnc, A->ncols, B->ncols); mzd_t *C_last_col = mzd_init_window(C, 0, 2*bnc, A->nrows, C->ncols); mzd_addmul_m4rm(C_last_col, A, B_last_col, 0); mzd_free_window((mzd_t*)B_last_col); mzd_free_window(C_last_col); } if (A->nrows > 2 * anr) { mzd_t const *A_last_row = mzd_init_window_const(A, 2*anr, 0, A->nrows, A->ncols); mzd_t const *B_bulk = mzd_init_window_const(B, 0, 0, B->nrows, 2*bnc); mzd_t *C_last_row = mzd_init_window(C, 2*anr, 0, C->nrows, 2*bnc); mzd_addmul_m4rm(C_last_row, A_last_row, B_bulk, 0); mzd_free_window((mzd_t*)A_last_row); mzd_free_window((mzd_t*)B_bulk); mzd_free_window(C_last_row); } if (A->ncols > 2 * anc) { mzd_t const *A_last_col = mzd_init_window_const(A, 0, 2*anc, 2*anr, A->ncols); mzd_t const *B_last_row = mzd_init_window_const(B, 2*bnr, 0, B->nrows, 2*bnc); mzd_t *C_bulk = mzd_init_window(C, 0, 0, 2*anr, 2*bnc); mzd_addmul_m4rm(C_bulk, A_last_col, B_last_row, 0); mzd_free_window((mzd_t*)A_last_col); mzd_free_window((mzd_t*)B_last_row); mzd_free_window(C_bulk); } /* clean up */ mzd_free_window((mzd_t*)A00); mzd_free_window((mzd_t*)A01); mzd_free_window((mzd_t*)A10); mzd_free_window((mzd_t*)A11); 
mzd_free_window((mzd_t*)B00); mzd_free_window((mzd_t*)B01); mzd_free_window((mzd_t*)B10); mzd_free_window((mzd_t*)B11); mzd_free_window(C00); mzd_free_window(C01); mzd_free_window(C10); mzd_free_window(C11); __M4RI_DD_MZD(C); return C; } #endif mzd_t *mzd_mul(mzd_t *C, mzd_t const *A, mzd_t const *B, int cutoff) { if(A->ncols != B->nrows) m4ri_die("mzd_mul: A ncols (%d) need to match B nrows (%d).\n", A->ncols, B->nrows); if (cutoff < 0) m4ri_die("mzd_mul: cutoff must be >= 0.\n"); if(cutoff == 0) { cutoff = __M4RI_STRASSEN_MUL_CUTOFF; } cutoff = cutoff / m4ri_radix * m4ri_radix; if (cutoff < m4ri_radix) { cutoff = m4ri_radix; }; if (C == NULL) { C = mzd_init(A->nrows, B->ncols); } else if (C->nrows != A->nrows || C->ncols != B->ncols){ m4ri_die("mzd_mul: C (%d x %d) has wrong dimensions, expected (%d x %d)\n", C->nrows, C->ncols, A->nrows, B->ncols); } if(A->offset || B->offset || C->offset) { mzd_set_ui(C, 0); mzd_addmul(C, A, B, cutoff); return C; } C = (A == B) ? _mzd_sqr_even(C, A, cutoff) : _mzd_mul_even(C, A, B, cutoff); return C; } mzd_t *_mzd_addmul_even(mzd_t *C, mzd_t const *A, mzd_t const *B, int cutoff) { /** * \todo make sure not to overwrite crap after ncols and before width * m4ri_radix */ if(C->nrows == 0 || C->ncols == 0) return C; rci_t m = A->nrows; rci_t k = A->ncols; rci_t n = B->ncols; /* handle case first, where the input matrices are too small already */ if (CLOSER(m, cutoff) || CLOSER(k, cutoff) || CLOSER(n, cutoff)) { /* we copy the matrices first since it is only constant memory overhead and improves data locality */ if(mzd_is_windowed(A)|mzd_is_windowed(B)|mzd_is_windowed(C)) { mzd_t *Abar = mzd_copy(NULL, A); mzd_t *Bbar = mzd_copy(NULL, B); mzd_t *Cbar = mzd_copy(NULL, C); mzd_addmul_m4rm(Cbar, Abar, Bbar, 0); mzd_copy(C, Cbar); mzd_free(Cbar); mzd_free(Bbar); mzd_free(Abar); } else { mzd_addmul_m4rm(C, A, B, 0); } return C; } /* adjust cutting numbers to work on words */ rci_t mmm, kkk, nnn; { rci_t mult = m4ri_radix; rci_t width = 
MIN(MIN(m, n), k) / 2; while (width > cutoff) { width /= 2; mult *= 2; } mmm = (((m - m % mult) / m4ri_radix) >> 1) * m4ri_radix; kkk = (((k - k % mult) / m4ri_radix) >> 1) * m4ri_radix; nnn = (((n - n % mult) / m4ri_radix) >> 1) * m4ri_radix; } /* |C | |A | |B | * Compute | | += | | x | | */ { mzd_t const *A11 = mzd_init_window_const(A, 0, 0, mmm, kkk); mzd_t const *A12 = mzd_init_window_const(A, 0, kkk, mmm, 2*kkk); mzd_t const *A21 = mzd_init_window_const(A, mmm, 0, 2*mmm, kkk); mzd_t const *A22 = mzd_init_window_const(A, mmm, kkk, 2*mmm, 2*kkk); mzd_t const *B11 = mzd_init_window_const(B, 0, 0, kkk, nnn); mzd_t const *B12 = mzd_init_window_const(B, 0, nnn, kkk, 2*nnn); mzd_t const *B21 = mzd_init_window_const(B, kkk, 0, 2*kkk, nnn); mzd_t const *B22 = mzd_init_window_const(B, kkk, nnn, 2*kkk, 2*nnn); mzd_t *C11 = mzd_init_window(C, 0, 0, mmm, nnn); mzd_t *C12 = mzd_init_window(C, 0, nnn, mmm, 2*nnn); mzd_t *C21 = mzd_init_window(C, mmm, 0, 2*mmm, nnn); mzd_t *C22 = mzd_init_window(C, mmm, nnn, 2*mmm, 2*nnn); /** * \note See Marco Bodrato; "A Strassen-like Matrix Multiplication * Suited for Squaring and Highest Power Computation"; * http://bodrato.it/papres/#CIVV2008 for reference on the used * sequence of operations. 
*/ mzd_t *S = mzd_init(mmm, kkk); mzd_t *T = mzd_init(kkk, nnn); mzd_t *U = mzd_init(mmm, nnn); /* The numbers in the comments below are the step numbers of the operation sequence; the calls are issued in a different order. Every step is carried out with _mzd_add, so '+' and '-' in the comments denote the same operation. */ _mzd_add(S, A22, A21); /* 1 S = A22 - A21 */ _mzd_add(T, B22, B21); /* 2 T = B22 - B21 */ _mzd_mul_even(U, S, T, cutoff); /* 3 U = S*T */ _mzd_add(C22, U, C22); /* 4 C22 = U + C22 */ _mzd_add(C12, U, C12); /* 5 C12 = U + C12 */ _mzd_mul_even(U, A12, B21, cutoff); /* 8 U = A12*B21 */ _mzd_add(C11, U, C11); /* 9 C11 = U + C11 */ _mzd_addmul_even(C11, A11, B11, cutoff); /* 11 C11 = A11*B11 + C11 */ _mzd_add(S, S, A12); /* 6 S = S - A12 */ _mzd_add(T, T, B12); /* 7 T = T - B12 */ _mzd_addmul_even(U, S, T, cutoff); /* 10 U = S*T + U */ _mzd_add(C12, C12, U); /* 15 C12 = U + C12 */ _mzd_add(S, A11, S); /* 12 S = A11 - S */ _mzd_addmul_even(C12, S, B12, cutoff); /* 14 C12 = S*B12 + C12 */ _mzd_add(T, B11, T); /* 13 T = B11 - T */ _mzd_addmul_even(C21, A21, T, cutoff); /* 16 C21 = A21*T + C21 */ _mzd_add(S, A22, A12); /* 17 S = A22 + A12 */ _mzd_add(T, B22, B12); /* 18 T = B22 + B12 */ _mzd_addmul_even(U, S, T, cutoff); /* 19 U = U - S*T */ _mzd_add(C21, C21, U); /* 20 C21 = C21 - U */ _mzd_add(C22, C22, U); /* 21 C22 = C22 - U */ /* clean up */ mzd_free_window((mzd_t*)A11); mzd_free_window((mzd_t*)A12); mzd_free_window((mzd_t*)A21); mzd_free_window((mzd_t*)A22); mzd_free_window((mzd_t*)B11); mzd_free_window((mzd_t*)B12); mzd_free_window((mzd_t*)B21); mzd_free_window((mzd_t*)B22); mzd_free_window(C11); mzd_free_window(C12); mzd_free_window(C21); mzd_free_window(C22); mzd_free(S); mzd_free(T); mzd_free(U); } /* deal with rest */ nnn *= 2; if (n > nnn) { /* | C| |AA| | B| * Compute | C| += |AA| x | B| */ mzd_t const *B_last_col = mzd_init_window_const(B, 0, nnn, k, n); mzd_t *C_last_col = mzd_init_window(C, 0, nnn, m, n); mzd_addmul_m4rm(C_last_col, A, B_last_col, 0); mzd_free_window((mzd_t*)B_last_col); mzd_free_window(C_last_col); } mmm *= 2; if (m > mmm) { /* | | | | |B | * Compute |C | += |AA| x |B | */ mzd_t const *A_last_row = mzd_init_window_const(A, mmm, 0, m, k); mzd_t
const *B_first_col= mzd_init_window_const(B, 0, 0, k, nnn); mzd_t *C_last_row = mzd_init_window(C, mmm, 0, m, nnn); mzd_addmul_m4rm(C_last_row, A_last_row, B_first_col, 0); mzd_free_window((mzd_t*)A_last_row); mzd_free_window((mzd_t*)B_first_col); mzd_free_window(C_last_row); } kkk *= 2; if (k > kkk) { /* Add to | | | B| |C | * result |A | x | | = | | */ mzd_t const *A_last_col = mzd_init_window_const(A, 0, kkk, mmm, k); mzd_t const *B_last_row = mzd_init_window_const(B, kkk, 0, k, nnn); mzd_t *C_bulk = mzd_init_window(C, 0, 0, mmm, nnn); mzd_addmul_m4rm(C_bulk, A_last_col, B_last_row, 0); mzd_free_window((mzd_t*)A_last_col); mzd_free_window((mzd_t*)B_last_row); mzd_free_window(C_bulk); } __M4RI_DD_MZD(C); return C; } mzd_t *_mzd_addsqr_even(mzd_t *C, mzd_t const *A, int cutoff) { /** * \todo make sure not to overwrite crap after ncols and before width * m4ri_radix */ if(C->nrows == 0) return C; rci_t m = A->nrows; /* handle case first, where the input matrices are too small already */ if (CLOSER(m, cutoff)) { /* we copy the matrices first since it is only constant memory overhead and improves data locality */ if(mzd_is_windowed(A)|mzd_is_windowed(C)) { mzd_t *Cbar = mzd_copy(NULL, C); mzd_t *Abar = mzd_copy(NULL, A); mzd_addmul_m4rm(Cbar, Abar, Abar, 0); mzd_copy(C, Cbar); mzd_free(Cbar); mzd_free(Abar); } else { mzd_addmul_m4rm(C, A, A, 0); } return C; } /* adjust cutting numbers to work on words */ rci_t mmm; { rci_t mult = m4ri_radix; rci_t width = m / 2; while (width > cutoff) { width /= 2; mult *= 2; } mmm = (((m - m % mult) / m4ri_radix) >> 1) * m4ri_radix; } /* |C | |A | |B | * Compute | | += | | x | | */ { mzd_t const *A11 = mzd_init_window_const(A, 0, 0, mmm, mmm); mzd_t const *A12 = mzd_init_window_const(A, 0, mmm, mmm, 2*mmm); mzd_t const *A21 = mzd_init_window_const(A, mmm, 0, 2*mmm, mmm); mzd_t const *A22 = mzd_init_window_const(A, mmm, mmm, 2*mmm, 2*mmm); mzd_t *C11 = mzd_init_window(C, 0, 0, mmm, mmm); mzd_t *C12 = mzd_init_window(C, 0, mmm, mmm, 
2*mmm); mzd_t *C21 = mzd_init_window(C, mmm, 0, 2*mmm, mmm); mzd_t *C22 = mzd_init_window(C, mmm, mmm, 2*mmm, 2*mmm); /** * \note See Marco Bodrato; "A Strassen-like Matrix Multiplication * Suited for Squaring and Highest Power Computation"; on-line v. * http://bodrato.it/papres/#CIVV2008 for reference on the used * sequence of operations. */ mzd_t *S = mzd_init(mmm, mmm); mzd_t *U = mzd_init(mmm, mmm); /* The numbers in the comments below are the step numbers of the operation sequence; the calls are issued in a different order. Both operands are A here, so a single temporary S is used for the operand sums. */ _mzd_add(S, A22, A21); /* 1 S = A22 - A21 */ _mzd_sqr_even(U, S, cutoff); /* 3 U = S^2 */ _mzd_add(C22, U, C22); /* 4 C22 = U + C22 */ _mzd_add(C12, U, C12); /* 5 C12 = U + C12 */ _mzd_mul_even(U, A12, A21, cutoff); /* 8 U = A12*A21 */ _mzd_add(C11, U, C11); /* 9 C11 = U + C11 */ _mzd_addsqr_even(C11, A11, cutoff); /* 11 C11 = A11^2 + C11 */ _mzd_add(S, S, A12); /* 6 S = S + A12 */ _mzd_addsqr_even(U, S, cutoff); /* 10 U = S^2 + U */ _mzd_add(C12, C12, U); /* 15 C12 = U + C12 */ _mzd_add(S, A11, S); /* 12 S = A11 - S */ _mzd_addmul_even(C12, S, A12, cutoff); /* 14 C12 = S*A12 + C12 */ _mzd_addmul_even(C21, A21, S, cutoff); /* 16 C21 = A21*S + C21 */ _mzd_add(S, A22, A12); /* 17 S = A22 + A12 */ _mzd_addsqr_even(U, S, cutoff); /* 19 U = U - S^2 */ _mzd_add(C21, C21, U); /* 20 C21 = C21 - U */ _mzd_add(C22, C22, U); /* 21 C22 = C22 - U */ /* clean up */ mzd_free_window((mzd_t*)A11); mzd_free_window((mzd_t*)A12); mzd_free_window((mzd_t*)A21); mzd_free_window((mzd_t*)A22); mzd_free_window(C11); mzd_free_window(C12); mzd_free_window(C21); mzd_free_window(C22); mzd_free(S); mzd_free(U); } /* deal with rest */ mmm *= 2; if (m > mmm) { /* | C| |AA| | A| * Compute | C| += |AA| x | A| */ { mzd_t const *A_last_col = mzd_init_window_const(A, 0, mmm, m, m); mzd_t *C_last_col = mzd_init_window(C, 0, mmm, m, m); mzd_addmul_m4rm(C_last_col, A, A_last_col, 0); mzd_free_window((mzd_t*)A_last_col); mzd_free_window(C_last_col); } /* | | | | |A | * Compute |C | += |AA| x |A | */ { mzd_t const *A_last_row = mzd_init_window_const(A, mmm, 0, m, m); mzd_t const *A_first_col=
mzd_init_window_const(A, 0, 0, m, mmm); mzd_t *C_last_row = mzd_init_window(C, mmm, 0, m, mmm); mzd_addmul_m4rm(C_last_row, A_last_row, A_first_col, 0); mzd_free_window((mzd_t*)A_last_row); mzd_free_window((mzd_t*)A_first_col); mzd_free_window(C_last_row); } /* Add to | | | B| |C | * result |A | x | | = | | */ { mzd_t const *A_last_col = mzd_init_window_const(A, 0, mmm, mmm, m); mzd_t const *A_last_row = mzd_init_window_const(A, mmm, 0, m, mmm); mzd_t *C_bulk = mzd_init_window(C, 0, 0, mmm, mmm); mzd_addmul_m4rm(C_bulk, A_last_col, A_last_row, 0); mzd_free_window((mzd_t*)A_last_col); mzd_free_window((mzd_t*)A_last_row); mzd_free_window(C_bulk); } } __M4RI_DD_MZD(C); return C; } mzd_t *_mzd_addmul(mzd_t *C, mzd_t const *A, mzd_t const *B, int cutoff) { /** * Assumes that B and C are aligned in the same manner (as in a Schur complement) */ if (!A->offset){ if (!B->offset) /* A even, B even */ return (A == B) ? _mzd_addsqr_even(C, A, cutoff) : _mzd_addmul_even(C, A, B, cutoff); else { /* A even, B weird */ int const bnc = m4ri_radix - B->offset; if (B->ncols <= bnc){ _mzd_addmul_even_weird (C, A, B, cutoff); } else { mzd_t const *B0 = mzd_init_window_const (B, 0, 0, B->nrows, bnc); mzd_t *C0 = mzd_init_window (C, 0, 0, C->nrows, bnc); mzd_t const *B1 = mzd_init_window_const (B, 0, bnc, B->nrows, B->ncols); mzd_t *C1 = mzd_init_window (C, 0, bnc, C->nrows, C->ncols); _mzd_addmul_even_weird (C0, A, B0, cutoff); _mzd_addmul_even(C1, A, B1, cutoff); mzd_free_window ((mzd_t*)B0); mzd_free_window ((mzd_t*)B1); mzd_free_window (C0); mzd_free_window (C1); } } } else if (B->offset) { /* A weird, B weird */ int const anc = m4ri_radix - A->offset; int const bnc = m4ri_radix - B->offset; if (B->ncols <= bnc){ if (A->ncols <= anc) _mzd_addmul_weird_weird (C, A, B); else { mzd_t const *A0 = mzd_init_window_const (A, 0, 0, A->nrows, anc); mzd_t const *A1 = mzd_init_window_const (A, 0, anc, A->nrows, A->ncols); mzd_t const *B0 = mzd_init_window_const (B, 0, 0, anc, B->ncols); mzd_t 
const *B1 = mzd_init_window_const (B, anc, 0, B->nrows, B->ncols); _mzd_addmul_weird_weird (C, A0, B0); _mzd_addmul_even_weird (C, A1, B1, cutoff); mzd_free_window ((mzd_t*)A0); mzd_free_window ((mzd_t*)A1); mzd_free_window ((mzd_t*)B0); mzd_free_window ((mzd_t*)B1); } } else if (A->ncols <= anc) { mzd_t const *B0 = mzd_init_window_const (B, 0, 0, B->nrows, bnc); mzd_t const *B1 = mzd_init_window_const (B, 0, bnc, B->nrows, B->ncols); mzd_t *C0 = mzd_init_window (C, 0, 0, C->nrows, bnc); mzd_t *C1 = mzd_init_window (C, 0, bnc, C->nrows, C->ncols); _mzd_addmul_weird_weird (C0, A, B0); _mzd_addmul_weird_even (C1, A, B1, cutoff); mzd_free_window ((mzd_t*)B0); mzd_free_window ((mzd_t*)B1); mzd_free_window (C0); mzd_free_window (C1); } else { mzd_t const *A0 = mzd_init_window_const (A, 0, 0, A->nrows, anc); mzd_t const *A1 = mzd_init_window_const (A, 0, anc, A->nrows, A->ncols); mzd_t const *B00 = mzd_init_window_const (B, 0, 0, anc, bnc); mzd_t const *B01 = mzd_init_window_const (B, 0, bnc, anc, B->ncols); mzd_t const *B10 = mzd_init_window_const (B, anc, 0, B->nrows, bnc); mzd_t const *B11 = mzd_init_window_const (B, anc, bnc, B->nrows, B->ncols); mzd_t *C0 = mzd_init_window (C, 0, 0, C->nrows, bnc); mzd_t *C1 = mzd_init_window (C, 0, bnc, C->nrows, C->ncols); _mzd_addmul_weird_weird (C0, A0, B00); _mzd_addmul_even_weird (C0, A1, B10, cutoff); _mzd_addmul_weird_even (C1, A0, B01, cutoff); _mzd_addmul_even (C1, A1, B11, cutoff); mzd_free_window ((mzd_t*)A0); mzd_free_window ((mzd_t*)A1); mzd_free_window (C0); mzd_free_window (C1); mzd_free_window ((mzd_t*)B00); mzd_free_window ((mzd_t*)B01); mzd_free_window ((mzd_t*)B10); mzd_free_window ((mzd_t*)B11); } } else { /* A weird, B even */ int const anc = m4ri_radix - A->offset; if (A->ncols <= anc){ _mzd_addmul_weird_even (C, A, B, cutoff); } else { mzd_t const *A0 = mzd_init_window_const (A, 0, 0, A->nrows, anc); mzd_t const *A1 = mzd_init_window_const (A, 0, anc, A->nrows, A->ncols); mzd_t const *B0 = 
mzd_init_window_const (B, 0, 0, anc, B->ncols); mzd_t const *B1 = mzd_init_window_const (B, anc, 0, B->nrows, B->ncols); _mzd_addmul_weird_even (C, A0, B0, cutoff); _mzd_addmul_even (C, A1, B1, cutoff); mzd_free_window ((mzd_t*)A0); mzd_free_window ((mzd_t*)A1); mzd_free_window ((mzd_t*)B0); mzd_free_window ((mzd_t*)B1); } } __M4RI_DD_MZD(C); return C; } mzd_t *_mzd_addmul_weird_even (mzd_t *C, mzd_t const *A, mzd_t const *B, int cutoff) { mzd_t *tmp = mzd_init(A->nrows, MIN((rci_t)m4ri_radix - A->offset, A->ncols)); for (rci_t i = 0; i < A->nrows; ++i){ tmp->rows[i][0] = (A->rows[i][0] >> A->offset); } _mzd_addmul_even (C, tmp, B, cutoff); mzd_free(tmp); return C; } mzd_t *_mzd_addmul_even_weird (mzd_t *C, mzd_t const *A, mzd_t const *B, int cutoff) { assert(B->width == 1 && C->width == 1); assert(mzd_is_windowed(B)); // Otherwise the whole copying makes no sense. // D will contain the first 64 columns of B. mzd_t *D = mzd_init(B->nrows, (rci_t)m4ri_radix); // Make a backup of the shape of C. int const offset = C->offset; rci_t const cncols = C->ncols; int const flags = C->flags; word const bitmask = C->low_bitmask; // Extend it's columns to the right to include the whole first word, and only the first word. C->offset = 0; C->ncols = m4ri_radix; C->flags = (flags & (mzd_flag_multiple_blocks | mzd_flag_windowed_ownsblocks)) | (mzd_flag_windowed_zerooffset | mzd_flag_windowed_zeroexcess); C->low_bitmask = C->high_bitmask = m4ri_ffff; // Run Bw and Dw over all (now single word) rows of B and D respectively. word* RESTRICT Bw = mzd_first_row(B); word* RESTRICT Dw = mzd_first_row(D); int Bblock = 0; int Dblock = 0; int Bcount = mzd_rows_in_block(B, 0); int Dcount = mzd_rows_in_block(D, 0); // This is true because D->row_offset == 0, and D contains much less columns // (well, at least, it's rowstride is less than or equal B's rowstride). // It can at most contain more rows in the first block than B. 
assert(Bcount <= Dcount); int count = Bcount; // Already substract 'count' from Bcount and Dcount. Dcount -= Bcount; Bcount = 0; wi_t const Browstride = B->rowstride; word const mask = B->low_bitmask; while(1) { // Make count even. if ((count & 1)) { *Dw = *Bw & mask; Bw += Browstride; Dw += 1; } // Unroll the loop a factor of two. count >>= 1; while (count--) { // Inner loop. Copy the first word of B to D, setting the extra columns to zero. Dw[0] = *Bw & mask; Bw += Browstride; Dw[1] = *Bw & mask; Bw += Browstride; Dw += 2; // D->rowstride == 1 } // Unless we have more than one block, we're done. // This is always the case the first time we get here, and almost always true subsequent times. if (__M4RI_LIKELY(Bcount == 0)) { // This is true if we just processed the last block; optimize for the case // of a single block matrix and mark it as likely. if (__M4RI_LIKELY((Bcount = mzd_rows_in_block(B, ++Bblock)) <= 0)) break; // Put Bw at the start of the next block. Bw = mzd_first_row_next_block(B, Bblock); } else { // then Dcount == 0. Do the same as above but for D. if ((Dcount = mzd_rows_in_block(D, ++Dblock)) <= 0) break; Dw = mzd_first_row_next_block(D, Dblock); } count = MIN(Bcount, Dcount); Bcount -= count; Dcount -= count; } _mzd_addmul_even (C, A, D, cutoff); C->offset = offset; C->ncols = cncols; C->flags = flags; C->low_bitmask = C->high_bitmask = bitmask; mzd_free(D); return C; } mzd_t *_mzd_addmul_weird_weird (mzd_t *C, mzd_t const *A, mzd_t const *B) { mzd_t *BT = mzd_init( B->ncols, B->nrows ); for (rci_t i = 0; i < B->ncols; ++i) { word *dstp = BT->rows[i]; wi_t const ii = (i + B->offset) / m4ri_radix; int const is = (i + B->offset) % m4ri_radix; word const mask = m4ri_one << is; int const ke = (is - A->offset < 0) ? 
-1 : is - A->offset; rci_t k = B->nrows - 1; while (k > ke) { *dstp |= (B->rows[k][ii] & mask) << (k - ke); --k; } while (k >= 0) { *dstp |= (B->rows[k][ii] & mask) >> (ke - k); --k; } } assert(C->offset + C->ncols - 1 < 64); word parity[64]; memset(parity, 0, sizeof(parity)); #ifdef M4RI_WRAPWORD word::init_array(parity, 64); #endif for (rci_t i = 0; i < A->nrows; ++i) { word *a = A->rows[i]; word *c = C->rows[i]; for (rci_t k = 0; k < C->ncols; ++k) { word *b = BT->rows[k]; parity[(k + C->offset)] = (*a) & (*b); } word par = m4ri_parity64(parity); *c ^= par;//m4ri_parity64(parity); } mzd_free (BT); __M4RI_DD_MZD(C); return C; } mzd_t *mzd_addmul(mzd_t *C, mzd_t const *A, mzd_t const *B, int cutoff) { if(A->ncols != B->nrows) m4ri_die("mzd_addmul: A ncols (%d) need to match B nrows (%d).\n", A->ncols, B->nrows); if (cutoff < 0) m4ri_die("mzd_addmul: cutoff must be >= 0.\n"); if(cutoff == 0) { cutoff = __M4RI_STRASSEN_MUL_CUTOFF; } cutoff = cutoff / m4ri_radix * m4ri_radix; if (cutoff < m4ri_radix) { cutoff = m4ri_radix; }; if (C == NULL) { C = mzd_init(A->nrows, B->ncols); } else if (C->nrows != A->nrows || C->ncols != B->ncols){ m4ri_die("mzd_addmul: C (%d x %d) has wrong dimensions, expected (%d x %d)\n", C->nrows, C->ncols, A->nrows, B->ncols); } if(A->nrows == 0 || A->ncols == 0 || B->ncols == 0) { __M4RI_DD_MZD(C); return C; } C = _mzd_addmul(C, A, B, cutoff); __M4RI_DD_MZD(C); return C; } libm4ri-20130416/src/strassen.h000066400000000000000000000116171212302366200161230ustar00rootroot00000000000000/** * \file strassen.h * * \brief Matrix operations using Strassen's formulas including * Winograd's improvements. 
* * \author Gregory Bard * \author Martin Albrecht */ #ifndef M4RI_STRASSEN_H #define M4RI_STRASSEN_H /******************************************************************* * * M4RI: Linear Algebra over GF(2) * * Copyright (C) 2008 Martin Albrecht * Copyright (C) 2008 Clement Pernet * * Distributed under the terms of the GNU General Public License (GPL) * version 2 or higher. * * This code is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * The full text of the GPL is available at: * * http://www.gnu.org/licenses/ * ********************************************************************/ #include #include #include /** * \brief Matrix multiplication via the Strassen-Winograd matrix * multiplication algorithm, i.e. compute C = AB. * * This is the wrapper function including bounds checks. See * _mzd_mul_even for implementation details. * * \param C Preallocated product matrix, may be NULL for automatic creation. * \param A Input matrix A * \param B Input matrix B * \param cutoff Minimal dimension for Strassen recursion. */ mzd_t *mzd_mul(mzd_t *C, mzd_t const *A, mzd_t const *B, int cutoff); /** * \brief Matrix multiplication and in-place addition via the * Strassen-Winograd matrix multiplication algorithm, i.e. compute * C = C+ AB. * * This is the wrapper function including bounds checks. See * _mzd_addmul_even for implementation details. * * \param C product matrix * \param A Input matrix A * \param B Input matrix B * \param cutoff Minimal dimension for Strassen recursion. */ mzd_t *mzd_addmul(mzd_t *C, mzd_t const *A, mzd_t const *B, int cutoff); /** * \brief Matrix multiplication via the Strassen-Winograd matrix * multiplication algorithm, i.e. compute C = AB. * * This is the actual implementation. 
Any matrix where either the * number of rows or the number of columns is smaller than cutoff is * processed using the M4RM algorithm. * * \param C Preallocated product matrix, may be NULL for automatic creation. * \param A Input matrix A * \param B Input matrix B * \param cutoff Minimal dimension for Strassen recursion. * * \note This implementation is heavily inspired by the function * strassen_window_multiply_c in Sage 3.0; For reference see * http://www.sagemath.org */ mzd_t *_mzd_mul_even(mzd_t *C, mzd_t const *A, mzd_t const *B, int cutoff); /** * \brief Matrix multiplication and in-place addition via the * Strassen-Winograd matrix multiplication algorithm, i.e. compute * C = C+ AB. * * This is the actual implementation. Any matrix where either the * number of rows or the number of columns is smaller than cutoff is * processed using the M4RM algorithm. * * \param C Preallocated product matrix, may be NULL for automatic creation. * \param A Input matrix A * \param B Input matrix B * \param cutoff Minimal dimension for Strassen recursion. * * \note This implementation is heavily inspired by the function * strassen_window_multiply_c in Sage 3.0; For reference see * http://www.sagemath.org */ mzd_t *_mzd_addmul_even(mzd_t *C, mzd_t const *A, mzd_t const *B, int cutoff); /** * \brief Matrix multiplication and in-place addition via the * Strassen-Winograd matrix multiplication algorithm, i.e. compute * C = C + AB. * * The matrices A and B are respectively m x k and k x n, and can be not * aligned on the m4ri_radix grid. * * \param C Preallocated product matrix, may be NULL for automatic creation. * \param A Input matrix A * \param B Input matrix B * \param cutoff Minimal dimension for Strassen recursion. * */ mzd_t *_mzd_addmul (mzd_t *C, mzd_t const *A, mzd_t const *B, int cutoff); /** * C = A*B + C for matrices with offsets != 0 * * This is scratch code. 
/**
 * The default cutoff for Strassen-Winograd multiplication.
 *
 * The value n = sqrt(4 * __M4RI_CPU_L3_CACHE) is chosen such that
 * 2 * (n^2)/8 equals __M4RI_CPU_L3_CACHE; the result is capped at 4096.
 * Because of the #ifndef guard the default can be overridden by
 * defining __M4RI_STRASSEN_MUL_CUTOFF before this point.
 *
 * NOTE(review): this comment used to say "L2 cache" although the
 * expression reads __M4RI_CPU_L3_CACHE -- confirm which is intended.
 */

#ifndef __M4RI_STRASSEN_MUL_CUTOFF
#define __M4RI_STRASSEN_MUL_CUTOFF MIN(((int)sqrt((double)(4 * __M4RI_CPU_L3_CACHE))), 4096)
#endif
/**
 * \brief Checked entry point for solving X U = B with U upper triangular.
 *
 * Verifies that U is square and that its size matches the number of
 * columns of B, reporting violations through m4ri_die(), and then
 * forwards to _mzd_trsm_upper_right().
 *
 * \param U Upper triangular matrix.
 * \param B Right-hand side, passed on to be overwritten with X.
 * \param cutoff Forwarded unchanged to _mzd_trsm_upper_right().
 */
void mzd_trsm_upper_right(mzd_t const *U, mzd_t *B, const int cutoff) {
  rci_t const u_rows = U->nrows;
  rci_t const u_cols = U->ncols;
  rci_t const b_cols = B->ncols;

  if (u_rows != b_cols) {
    m4ri_die("mzd_trsm_upper_right: U nrows (%d) need to match B ncols (%d).\n",
             u_rows, b_cols);
  }
  if (u_rows != u_cols) {
    m4ri_die("mzd_trsm_upper_right: U must be square and is found to be (%d) x (%d).\n",
             u_rows, u_cols);
  }

  _mzd_trsm_upper_right(U, B, cutoff);
}
/**
 * \brief Solve X U = B in place for operands living inside one word.
 *
 * Only word 0 of every row of U and B is accessed; bit positions are
 * shifted by U->offset and B->offset respectively. The diagonal of U is
 * never read.
 *
 * For each column c >= 1 the strictly-upper entries of column c of U are
 * collected into a mask, the dot product of that mask with every row of
 * B is computed 64 rows at a time through m4ri_parity64(), and bit c of
 * each row of B whose dot product is 1 is flipped.
 *
 * \param U Upper triangular matrix.
 * \param B Right-hand side, overwritten with the solution.
 */
void _mzd_trsm_upper_right_weird(mzd_t const *U, mzd_t *B) {
  rci_t const nrows = B->nrows;
  rci_t const ncols = B->ncols;
  int const b_shift = B->offset;

  for (rci_t col = 1; col < ncols; ++col) {
    /* Computes X_col = B_col + X_{0..col-1} U_{0..col-1,col} */
    word ucol = 0;
    for (rci_t r = 0; r < col; ++r) {
      if (__M4RI_GET_BIT(U->rows[r][0], col + U->offset)) {
        __M4RI_SET_BIT(ucol, r + b_shift);
      }
    }

    /* Walk over B in chunks of m4ri_radix rows; the last chunk may be
     * shorter and is padded with zero words for m4ri_parity64(). */
    word tmp[64];
    rci_t first = 0;
    do {
      rci_t const remaining = nrows - first;
      int const count = (remaining < m4ri_radix) ? (int)remaining : m4ri_radix;

      for (int s = 0; s < count; ++s) {
        tmp[s] = B->rows[first + s][0] & ucol;
      }
      for (int s = count; s < 64; ++s) {
        tmp[s] = 0;
      }

      word const dotprod = m4ri_parity64(tmp);

      for (int s = 0; s < count; ++s) {
        if (__M4RI_GET_BIT(dotprod, s)) {
          __M4RI_FLIP_BIT(B->rows[first + s][0], col + b_shift);
        }
      }

      first += m4ri_radix;
    } while (first < nrows);
  }

  __M4RI_DD_MZD(B);
}
mzd_copy(B, C); mzd_free(C); mzd_free(u); } void _mzd_trsm_upper_right_even(mzd_t const *U, mzd_t *B, const int cutoff) { rci_t const mb = B->nrows; rci_t const nb = B->ncols; if(nb <= TRSM_THRESHOLD) { /* base case */ _mzd_trsm_upper_right_base (U, B); return; } else if(nb <= __M4RI_MUL_BLOCKSIZE) { _mzd_trsm_upper_right_trtri(U, B); return; } rci_t const nb1 = (((nb - 1) / m4ri_radix + 1) >> 1) * m4ri_radix; mzd_t *B0 = mzd_init_window(B, 0, 0, mb, nb1); mzd_t *B1 = mzd_init_window(B, 0, nb1, mb, nb); mzd_t const *U00 = mzd_init_window_const(U, 0, 0, nb1, nb1); mzd_t const *U01 = mzd_init_window_const(U, 0, nb1, nb1, nb); mzd_t const *U11 = mzd_init_window_const(U, nb1, nb1, nb, nb); _mzd_trsm_upper_right_even (U00, B0, cutoff); mzd_addmul (B1, B0, U01, cutoff); _mzd_trsm_upper_right_even (U11, B1, cutoff); mzd_free_window(B0); mzd_free_window(B1); mzd_free_window((mzd_t*)U00); mzd_free_window((mzd_t*)U01); mzd_free_window((mzd_t*)U11); __M4RI_DD_MZD(B); } void _mzd_trsm_upper_right_base(mzd_t const *U, mzd_t *B) { rci_t const mb = B->nrows; rci_t const nb = B->ncols; assert(m4ri_radix == 64); for(rci_t i = 1; i < nb; ++i) { /* Computes X_i = B_i + X_{0..i-1} U_{0..i-1,i} */ register word ucol = 0; for(rci_t k = 0; k < i; ++k) { if(__M4RI_GET_BIT(U->rows[k][0], i)) __M4RI_SET_BIT(ucol, k); } /* doing 64 dotproducts at a time, to use the m4ri_parity64 parallelism */ rci_t giantstep; word tmp[64]; for(giantstep = 0; giantstep + m4ri_radix < mb; giantstep += m4ri_radix) { /* for(int babystep = 0; babystep < m4ri_radix; ++babystep) */ /* tmp[babystep] = B->rows[giantstep + babystep][0] & ucol; */ word **src = B->rows + giantstep; tmp[ 0] = src[ 0][0] & ucol, tmp[ 1] = src[ 1][0] & ucol, tmp[ 2] = src[ 2][0] & ucol, tmp[ 3] = src[ 3][0] & ucol; tmp[ 4] = src[ 4][0] & ucol, tmp[ 5] = src[ 5][0] & ucol, tmp[ 6] = src[ 6][0] & ucol, tmp[ 7] = src[ 7][0] & ucol; tmp[ 8] = src[ 8][0] & ucol, tmp[ 9] = src[ 9][0] & ucol, tmp[10] = src[10][0] & ucol, tmp[11] = src[11][0] & 
ucol; tmp[12] = src[12][0] & ucol, tmp[13] = src[13][0] & ucol, tmp[14] = src[14][0] & ucol, tmp[15] = src[15][0] & ucol; tmp[16] = src[16][0] & ucol, tmp[17] = src[17][0] & ucol, tmp[18] = src[18][0] & ucol, tmp[19] = src[19][0] & ucol; tmp[20] = src[20][0] & ucol, tmp[21] = src[21][0] & ucol, tmp[22] = src[22][0] & ucol, tmp[23] = src[23][0] & ucol; tmp[24] = src[24][0] & ucol, tmp[25] = src[25][0] & ucol, tmp[26] = src[26][0] & ucol, tmp[27] = src[27][0] & ucol; tmp[28] = src[28][0] & ucol, tmp[29] = src[29][0] & ucol, tmp[30] = src[30][0] & ucol, tmp[31] = src[31][0] & ucol; tmp[32] = src[32][0] & ucol, tmp[33] = src[33][0] & ucol, tmp[34] = src[34][0] & ucol, tmp[35] = src[35][0] & ucol; tmp[36] = src[36][0] & ucol, tmp[37] = src[37][0] & ucol, tmp[38] = src[38][0] & ucol, tmp[39] = src[39][0] & ucol; tmp[40] = src[40][0] & ucol, tmp[41] = src[41][0] & ucol, tmp[42] = src[42][0] & ucol, tmp[43] = src[43][0] & ucol; tmp[44] = src[44][0] & ucol, tmp[45] = src[45][0] & ucol, tmp[46] = src[46][0] & ucol, tmp[47] = src[47][0] & ucol; tmp[48] = src[48][0] & ucol, tmp[49] = src[49][0] & ucol, tmp[50] = src[50][0] & ucol, tmp[51] = src[51][0] & ucol; tmp[52] = src[52][0] & ucol, tmp[53] = src[53][0] & ucol, tmp[54] = src[54][0] & ucol, tmp[55] = src[55][0] & ucol; tmp[56] = src[56][0] & ucol, tmp[57] = src[57][0] & ucol, tmp[58] = src[58][0] & ucol, tmp[59] = src[59][0] & ucol; tmp[60] = src[60][0] & ucol, tmp[61] = src[61][0] & ucol, tmp[62] = src[62][0] & ucol, tmp[63] = src[63][0] & ucol; word const dotprod = m4ri_parity64(tmp); /* for(int babystep = 0; babystep < m4ri_radix; ++babystep) */ /* if(__M4RI_GET_BIT(dotprod, babystep)) */ /* __M4RI_FLIP_BIT(B->rows[giantstep + babystep][0], i); */ src[ 0][0] ^= ((dotprod>> 0)&m4ri_one)<> 1)&m4ri_one)<> 2)&m4ri_one)<> 3)&m4ri_one)<> 4)&m4ri_one)<> 5)&m4ri_one)<> 6)&m4ri_one)<> 7)&m4ri_one)<> 8)&m4ri_one)<> 
9)&m4ri_one)<>10)&m4ri_one)<>11)&m4ri_one)<>12)&m4ri_one)<>13)&m4ri_one)<>14)&m4ri_one)<>15)&m4ri_one)<>16)&m4ri_one)<>17)&m4ri_one)<>18)&m4ri_one)<>19)&m4ri_one)<>20)&m4ri_one)<>21)&m4ri_one)<>22)&m4ri_one)<>23)&m4ri_one)<>24)&m4ri_one)<>25)&m4ri_one)<>26)&m4ri_one)<>27)&m4ri_one)<>28)&m4ri_one)<>29)&m4ri_one)<>30)&m4ri_one)<>31)&m4ri_one)<>32)&m4ri_one)<>33)&m4ri_one)<>34)&m4ri_one)<>35)&m4ri_one)<>36)&m4ri_one)<>37)&m4ri_one)<>38)&m4ri_one)<>39)&m4ri_one)<>40)&m4ri_one)<>41)&m4ri_one)<>42)&m4ri_one)<>43)&m4ri_one)<>44)&m4ri_one)<>45)&m4ri_one)<>46)&m4ri_one)<>47)&m4ri_one)<>48)&m4ri_one)<>49)&m4ri_one)<>50)&m4ri_one)<>51)&m4ri_one)<>52)&m4ri_one)<>53)&m4ri_one)<>54)&m4ri_one)<>55)&m4ri_one)<>56)&m4ri_one)<>57)&m4ri_one)<>58)&m4ri_one)<>59)&m4ri_one)<>60)&m4ri_one)<>61)&m4ri_one)<>62)&m4ri_one)<>63)&m4ri_one)<rows[giantstep + babystep][0] & ucol; for(int babystep = mb - giantstep; babystep < 64; ++babystep) tmp[babystep] = 0; word const dotprod = m4ri_parity64(tmp); for(int babystep = 0; giantstep + babystep < mb; ++babystep) if(__M4RI_GET_BIT(dotprod, babystep)) __M4RI_FLIP_BIT(B->rows[giantstep + babystep][0], i); } __M4RI_DD_MZD(B); } /***************** * LOWER RIGHT ****************/ /* * Variant where L and B start at an odd bit position Assumes that * L->ncols < 64 */ void _mzd_trsm_lower_right_weird(mzd_t const *L, mzd_t *B); /* * Variant where L and B start at an even bit position Assumes that * L->ncols < 64 */ void _mzd_trsm_lower_right_even(mzd_t const *L, mzd_t *B, const int cutoff); void _mzd_trsm_lower_right_base(mzd_t const *L, mzd_t *B); void mzd_trsm_lower_right(mzd_t const *L, mzd_t *B, const int cutoff) { if(L->nrows != B->ncols) m4ri_die("mzd_trsm_lower_right: L nrows (%d) need to match B ncols (%d).\n", L->nrows, B->ncols); if(L->nrows != L->ncols) m4ri_die("mzd_trsm_lower_right: L must be square and is found to be (%d) x (%d).\n", L->nrows, L->ncols); _mzd_trsm_lower_right (L, B, cutoff); } void _mzd_trsm_lower_right(mzd_t const *L, mzd_t 
*B, const int cutoff) { rci_t const nb = B->ncols; rci_t const mb = B->nrows; int const n1 = m4ri_radix-B->offset; if(nb <= n1) _mzd_trsm_lower_right_weird (L, B); else{ /** \verbatim |\ | \ | \ |L00\ |____\ | |\ | | \ | | \ |L10 |L11\ |____|____\ _________ |B0 |B1 | |____|____| \endverbatim * \li L00 and B0 are possibly located at uneven locations. * \li Their column dimension is lower than 64. * \li The first column of L10, L11, B1 are aligned to words. */ mzd_t *B0 = mzd_init_window (B, 0, 0, mb, n1); mzd_t *B1 = mzd_init_window (B, 0, n1, mb, nb); mzd_t const *L00 = mzd_init_window_const (L, 0, 0, n1, n1); mzd_t const *L10 = mzd_init_window_const (L, n1, 0, nb, n1); mzd_t const *L11 = mzd_init_window_const (L, n1, n1, nb, nb); _mzd_trsm_lower_right_even (L11, B1, cutoff); mzd_addmul (B0, B1, L10, cutoff); _mzd_trsm_lower_right_weird (L00, B0); mzd_free_window(B0); mzd_free_window(B1); mzd_free_window((mzd_t*)L00); mzd_free_window((mzd_t*)L10); mzd_free_window((mzd_t*)L11); } __M4RI_DD_MZD(B); } void _mzd_trsm_lower_right_weird(mzd_t const *L, mzd_t *B) { rci_t const mb = B->nrows; rci_t const nb = B->ncols; int const offset = B->offset; for(rci_t i = nb - 1; i >= 0; --i) { /* Computes X_i = B_i + X_{i+1,n} L_{i+1..n,i} */ register word ucol = 0; for(rci_t k = i + 1; k < nb; ++k) { if(__M4RI_GET_BIT(L->rows[k][0], i + L->offset)) __M4RI_SET_BIT(ucol, k + offset); } /* doing 64 dotproducts at a time, to use the m4ri_parity64 parallelism */ rci_t giantstep; word tmp[64]; for(giantstep = 0; giantstep + m4ri_radix < mb; giantstep += m4ri_radix) { for(int babystep = 0; babystep < m4ri_radix; ++babystep) tmp[babystep] = B->rows[giantstep + babystep][0] & ucol; word const dotprod = m4ri_parity64(tmp); for(int babystep = 0; babystep < m4ri_radix; ++babystep) if(__M4RI_GET_BIT(dotprod, babystep)) __M4RI_FLIP_BIT(B->rows[giantstep + babystep][0], i + offset); } for(int babystep = 0; giantstep + babystep < mb; ++babystep){ tmp[babystep] = B->rows[giantstep + babystep][0] & 
ucol; } for(int babystep = mb - giantstep; babystep < 64; ++babystep){ tmp[babystep] = 0; } word const dotprod = m4ri_parity64(tmp); for(int babystep = 0; giantstep + babystep < mb; ++babystep) if(__M4RI_GET_BIT(dotprod, babystep)) __M4RI_FLIP_BIT(B->rows[giantstep + babystep ][0], i + offset); } __M4RI_DD_MZD(B); } void _mzd_trsm_lower_right_even(mzd_t const *L, mzd_t *B, const int cutoff) { rci_t const mb = B->nrows; rci_t const nb = B->ncols; if(nb <= TRSM_THRESHOLD){ /* base case */ _mzd_trsm_lower_right_base (L, B); } else { rci_t const nb1 = (((nb - 1) / m4ri_radix + 1) >> 1) * m4ri_radix; mzd_t *B0 = mzd_init_window(B, 0, 0, mb, nb1); mzd_t *B1 = mzd_init_window(B, 0, nb1, mb, nb); mzd_t const *L00 = mzd_init_window_const(L, 0, 0, nb1, nb1); mzd_t const *L10 = mzd_init_window_const(L, nb1, 0, nb, nb1); mzd_t const *L11 = mzd_init_window_const(L, nb1, nb1, nb, nb); _mzd_trsm_lower_right_even (L11, B1, cutoff); mzd_addmul (B0, B1, L10, cutoff); _mzd_trsm_lower_right_even (L00, B0, cutoff); mzd_free_window(B0); mzd_free_window(B1); mzd_free_window((mzd_t*)L00); mzd_free_window((mzd_t*)L10); mzd_free_window((mzd_t*)L11); } __M4RI_DD_MZD(B); } void _mzd_trsm_lower_right_base(mzd_t const *L, mzd_t *B) { rci_t const mb = B->nrows; rci_t const nb = B->ncols; for(rci_t i = nb - 1; i >= 0; --i) { /* Computes X_i = B_i + X_{i+1,n} L_{i+1..n,i} */ register word ucol = 0; for(rci_t k = i + 1; k < nb; ++k) { if(__M4RI_GET_BIT(L->rows[k][0], i)) __M4RI_SET_BIT(ucol, k); } /* doing 64 dotproducts at a time, to use the parity64 parallelism */ rci_t giantstep; word tmp[64]; for(giantstep = 0; giantstep + m4ri_radix < mb; giantstep += m4ri_radix) { for(int babystep = 0; babystep < m4ri_radix; ++babystep) tmp[babystep] = B->rows[giantstep + babystep][0] & ucol; word const dotprod = m4ri_parity64(tmp); for(int babystep = 0; babystep < m4ri_radix; ++babystep) if(__M4RI_GET_BIT(dotprod, babystep)) __M4RI_FLIP_BIT(B->rows[giantstep + babystep][0], i); } for(int babystep = 0; 
giantstep + babystep < mb; ++babystep) tmp[babystep] = B->rows[giantstep + babystep][0] & ucol; for(int babystep = mb - giantstep; babystep < 64; ++babystep) tmp[babystep] = 0; word const dotprod = m4ri_parity64(tmp); for(int babystep = 0; giantstep + babystep < mb; ++babystep) if(__M4RI_GET_BIT(dotprod, babystep)) __M4RI_FLIP_BIT(B->rows[giantstep + babystep][0], i); } __M4RI_DD_MZD(B); } /***************** * LOWER LEFT ****************/ /* * Variant where U and B start at an odd bit position. Assumes that * L->ncols < 64 */ void _mzd_trsm_lower_left_weird(mzd_t const *L, mzd_t *B); /* * This version assumes that the matrices are at an even position on * the m4ri_radix grid and that their dimension is a multiple of m4ri_radix. */ void _mzd_trsm_lower_left_even(mzd_t const *L, mzd_t *B, const int cutoff); void mzd_trsm_lower_left(mzd_t const *L, mzd_t *B, const int cutoff) { if(L->ncols != B->nrows) m4ri_die("mzd_trsm_lower_left: L ncols (%d) need to match B nrows (%d).\n", L->ncols, B->nrows); if(L->nrows != L->ncols) m4ri_die("mzd_trsm_lower_left: L must be square and is found to be (%d) x (%d).\n", L->nrows, L->ncols); _mzd_trsm_lower_left (L, B, cutoff); } void _mzd_trsm_lower_left(mzd_t const *L, mzd_t *B, const int cutoff) { if(!L->offset) _mzd_trsm_lower_left_even(L, B, cutoff); else{ rci_t const nb = B->ncols; rci_t const mb = B->nrows; int const m1 = m4ri_radix - L->offset; if(mb <= m1) { _mzd_trsm_lower_left_weird (L, B); return; } /** \verbatim |\ ______ | \ | | | \ | B0 | |L00\ | | |____\ |______| | |\ | | | | \ | | | | \ | B1 | |L10 |L11\ | | |____|____\ |______| \endverbatim * \li L00 L10 B0 and B1 are possibly located at uneven locations. * \li Their column dimension is lower than 64. * \li The first column of L01, L11, B1 are aligned to words. 
*/ mzd_t *B0 = mzd_init_window (B, 0, 0, m1, nb); mzd_t *B1 = mzd_init_window (B, m1, 0, mb, nb); mzd_t const *L00 = mzd_init_window_const (L, 0, 0, m1, m1); mzd_t const *L10 = mzd_init_window_const (L, m1, 0, mb, m1); mzd_t const *L11 = mzd_init_window_const (L, m1, m1, mb, mb); _mzd_trsm_lower_left_weird (L00, B0); mzd_addmul (B1, L10, B0, cutoff); _mzd_trsm_lower_left_even (L11, B1, cutoff); mzd_free_window(B0); mzd_free_window(B1); mzd_free_window((mzd_t*)L00); mzd_free_window((mzd_t*)L10); mzd_free_window((mzd_t*)L11); } __M4RI_DD_MZD(B); } void _mzd_trsm_lower_left_weird(mzd_t const *L, mzd_t *B) { rci_t const mb = B->nrows; rci_t const nb = B->ncols; int const Boffset = B->offset; int const nbrest = (nb + Boffset) % m4ri_radix; if(nb + B->offset > m4ri_radix) { // Large B word const mask_begin = __M4RI_RIGHT_BITMASK(m4ri_radix-B->offset); word const mask_end = __M4RI_LEFT_BITMASK(nbrest); // L[0,0] = 1, so no work required for i=0 for(rci_t i = 1; i < mb; ++i) { /* Computes X_i = B_i + L_{i,0..i-1} X_{0..i-1} */ /** * \todo needs to be optimized! **/ word *Lrow = L->rows[i]; word *Brow = B->rows[i]; for(rci_t k = 0; k < i; ++k) { if(__M4RI_GET_BIT(Lrow[0], k + L->offset)) { Brow[0] ^= B->rows[k][0] & mask_begin; for(wi_t j = 1; j < B->width - 1; ++j) Brow[j] ^= B->rows[k][j]; Brow[B->width - 1] ^= B->rows[k][B->width - 1] & mask_end; } } } } else { // Small B word const mask = __M4RI_MIDDLE_BITMASK(nb, B->offset); for(rci_t i = 1; i < mb; ++i) { /* Computes X_i = B_i + L_{i,0..i-1} X_{0..i-1} */ /** * \todo needs to be optimized! 
**/ word *Lrow = L->rows[i]; word *Brow = B->rows[i]; for(rci_t k = 0; k < i; ++k) { if(__M4RI_GET_BIT(Lrow[0], k + L->offset)) { Brow[0] ^= B->rows[k][0] & mask; } } } } __M4RI_DD_MZD(B); } void _mzd_trsm_lower_left_even(mzd_t const *L, mzd_t *B, const int cutoff) { rci_t const mb = B->nrows; rci_t const nb = B->ncols; int const Boffset = B->offset; int const nbrest = (nb + Boffset) % m4ri_radix; if(mb <= m4ri_radix){ /* base case */ if(nb + B->offset > m4ri_radix) { // B is large word const mask_begin = __M4RI_RIGHT_BITMASK(m4ri_radix-B->offset); word const mask_end = __M4RI_LEFT_BITMASK(nbrest); for(rci_t i = 1; i < mb; ++i) { /* Computes X_i = B_i + L_{i,0..i-1} X_{0..i-1} */ /** * \todo needs to be optimized! **/ word *Lrow = L->rows[i]; word *Brow = B->rows[i]; for(rci_t k = 0; k < i; ++k) { if(__M4RI_GET_BIT(Lrow[0], k)){ Brow[0] ^= B->rows[k][0] & mask_begin; for(wi_t j = 1; j < B->width - 1; ++j) Brow[j] ^= B->rows[k][j]; Brow[B->width - 1] ^= B->rows[k][B->width - 1] & mask_end; } } } } else { // B is small word const mask = __M4RI_MIDDLE_BITMASK(nb, B->offset); for(rci_t i = 1; i < mb; ++i) { /* Computes X_i = B_i + L_{i,0..i-1} X_{0..i-1} */ /** Need to be optimized !!! 
**/ word *Lrow = L->rows [i]; word *Brow = B->rows [i]; for(rci_t k = 0; k < i; ++k) { if(__M4RI_GET_BIT(Lrow[0], k)){ Brow[0] ^= B->rows[k][0] & mask; } } } } } else { rci_t const mb1 = (((mb - 1) / m4ri_radix + 1) >> 1) * m4ri_radix; mzd_t *B0 = mzd_init_window(B, 0, 0, mb1, nb); mzd_t *B1 = mzd_init_window(B, mb1, 0, mb, nb); mzd_t const *L00 = mzd_init_window_const(L, 0, 0, mb1, mb1); mzd_t const *L10 = mzd_init_window_const(L, mb1, 0, mb, mb1); mzd_t const *L11 = mzd_init_window_const(L, mb1, mb1, mb, mb); _mzd_trsm_lower_left_even (L00, B0, cutoff); mzd_addmul (B1, L10, B0, cutoff); _mzd_trsm_lower_left_even (L11, B1, cutoff); mzd_free_window(B0); mzd_free_window(B1); mzd_free_window((mzd_t*)L00); mzd_free_window((mzd_t*)L10); mzd_free_window((mzd_t*)L11); } __M4RI_DD_MZD(B); } /***************** * UPPER LEFT ****************/ /* * Variant where U and B start at an odd bit position * Assumes that U->ncols < 64 */ void _mzd_trsm_upper_left_weird (mzd_t const *U, mzd_t *B); void _mzd_trsm_upper_left_even(mzd_t const *U, mzd_t *B, const int cutoff); void mzd_trsm_upper_left(mzd_t const *U, mzd_t *B, const int cutoff) { if(U->ncols != B->nrows) m4ri_die("mzd_trsm_upper_left: U ncols (%d) need to match B nrows (%d).\n", U->ncols, B->nrows); if(U->nrows != U->ncols) m4ri_die("mzd_trsm_upper_left: U must be square and is found to be (%d) x (%d).\n", U->nrows, U->ncols); _mzd_trsm_upper_left (U, B, cutoff); } void _mzd_trsm_upper_left(mzd_t const *U, mzd_t *B, const int cutoff) { if(!U->offset) _mzd_trsm_upper_left_even (U, B, cutoff); else{ rci_t const nb = B->ncols; rci_t const mb = B->nrows; int const m1 = m4ri_radix - U->offset; if(mb <= m1) { _mzd_trsm_upper_left_weird (U, B); return; } /** \verbatim __________ ______ \ U00| | | | \ |U01 | | | \ | | | B0 | \ | | | | \|____| |______| \ | | | \U11| | | \ | | B1 | \ | | | \| |______| \endverbatim * \li U00, B0 and B1 are possibly located at uneven locations. 
/**
 * \brief Solve U X = B in place by row-wise back substitution, for a U
 * whose rows are read from a single word at bit offset U->offset.
 *
 * Rows are processed from the second to last up to the first; the last
 * row is left untouched and the diagonal of U is never read. Row i of B
 * gets row k (k > i) added whenever bit k + U->offset of word 0 of row i
 * of U is set. The first and last word of B are masked so that only the
 * bits inside B's column range change.
 *
 * \param U Upper triangular matrix.
 * \param B Right-hand side, overwritten with the solution.
 */
void _mzd_trsm_upper_left_weird (mzd_t const *U, mzd_t *B) {
  rci_t const nrows = B->nrows;
  rci_t const ncols = B->ncols;
  int const b_offset = B->offset;
  int const tail_bits = (ncols + b_offset) % m4ri_radix;

  if (ncols + b_offset <= m4ri_radix) {
    /* Small B: everything lives in word 0. */
    word const mask = __M4RI_MIDDLE_BITMASK(ncols, B->offset);

    for (rci_t row = nrows - 2; row >= 0; --row) {
      /* Computes X_row = B_row + U_{row,row+1..mb} X_{row+1..mb} */
      word const *u_bits = U->rows[row];
      word *target = B->rows[row];
      for (rci_t later = row + 1; later < nrows; ++later) {
        if (__M4RI_GET_BIT(u_bits[0], later + U->offset)) {
          target[0] ^= B->rows[later][0] & mask;
        }
      }
    }

    __M4RI_DD_MZD(B);
    return;
  }

  /* Large B: several words per row, mask the first and the last one. */
  word const first_mask = __M4RI_RIGHT_BITMASK(m4ri_radix - B->offset);
  word const last_mask = __M4RI_LEFT_BITMASK(tail_bits);

  for (rci_t row = nrows - 2; row >= 0; --row) {
    /* Computes X_row = B_row + U_{row,row+1..mb} X_{row+1..mb} */
    word const *u_bits = U->rows[row];
    word *target = B->rows[row];
    for (rci_t later = row + 1; later < nrows; ++later) {
      if (!__M4RI_GET_BIT(u_bits[0], later + U->offset)) {
        continue;
      }
      word const *source = B->rows[later];
      target[0] ^= source[0] & first_mask;
      for (wi_t w = 1; w < B->width - 1; ++w) {
        target[w] ^= source[w];
      }
      target[B->width - 1] ^= source[B->width - 1] & last_mask;
    }
  }

  __M4RI_DD_MZD(B);
}
B->ncols; int const Boffset = B->offset; int const nbrest = (nb + Boffset) % m4ri_radix; if(mb <= m4ri_radix) { /* base case */ if(nb + B->offset > m4ri_radix) { // B is large word const mask_begin = __M4RI_RIGHT_BITMASK(m4ri_radix-B->offset); word const mask_end = __M4RI_LEFT_BITMASK(nbrest); // U[mb-1,mb-1] = 1, so no work required for i=mb-1 for(rci_t i = mb - 2; i >= 0; --i) { /* Computes X_i = B_i + U_{i,i+1..mb} X_{i+1..mb} */ word *Urow = U->rows[i]; word *Brow = B->rows[i]; for(rci_t k = i + 1; k < mb; ++k) { if(__M4RI_GET_BIT(Urow[0], k)){ Brow[0] ^= B->rows[k][0] & mask_begin; for(wi_t j = 1; j < B->width - 1; ++j) Brow[j] ^= B->rows[k][j]; Brow[B->width - 1] ^= B->rows[k][B->width - 1] & mask_end; } } } } else { // B is small word const mask = __M4RI_MIDDLE_BITMASK(nb, B->offset); // U[mb-1,mb-1] = 1, so no work required for i=mb-1 for(rci_t i = mb - 2; i >= 0; --i) { /* Computes X_i = B_i + U_{i,i+1..mb} X_{i+1..mb} */ word *Urow = U->rows [i]; word *Brow = B->rows [i]; for(rci_t k = i + 1; k < mb; ++k) { if(__M4RI_GET_BIT(Urow[0], k)){ Brow[0] ^= B->rows[k][0] & mask; } } } } } else if(mb <= __M4RI_MUL_BLOCKSIZE) { _mzd_trsm_upper_left_russian(U, B, 0); } else { rci_t const mb1 = (((mb-1) / m4ri_radix + 1) >> 1) * m4ri_radix; mzd_t *B0 = mzd_init_window(B, 0, 0, mb1, nb); mzd_t *B1 = mzd_init_window(B, mb1, 0, mb, nb); mzd_t const *U00 = mzd_init_window_const(U, 0, 0, mb1, mb1); mzd_t const *U01 = mzd_init_window_const(U, 0, mb1, mb1, mb); mzd_t const *U11 = mzd_init_window_const(U, mb1, mb1, mb, mb); _mzd_trsm_upper_left_even (U11, B1, cutoff); _mzd_addmul (B0, U01, B1, cutoff); _mzd_trsm_upper_left_even (U00, B0, cutoff); mzd_free_window(B0); mzd_free_window(B1); mzd_free_window((mzd_t*)U00); mzd_free_window((mzd_t*)U01); mzd_free_window((mzd_t*)U11); } __M4RI_DD_MZD(B); } mzd_t *mzd_trtri_upper(mzd_t *U) { if (U->nrows*U->ncols < __M4RI_CPU_L3_CACHE<<1) { mzd_trtri_upper_russian(U,0); } else { rci_t const n = U->nrows; rci_t n2 = (((n - 1) / 
m4ri_radix + 1) >> 1); #if __M4RI_HAVE_SSE2 if (n2%2) n2 += 1; #endif n2 *= m4ri_radix; assert(n2 < n); mzd_t *U00 = mzd_init_window(U, 0, 0, n2, n2); mzd_t *U01 = mzd_init_window(U, 0, n2, n2, n ); mzd_t *U11 = mzd_init_window(U, n2, n2 , n , n ); _mzd_trsm_upper_left_even( U00, U01, 0); _mzd_trsm_upper_right_even(U11, U01, 0); mzd_trtri_upper(U00); mzd_trtri_upper(U11); mzd_free_window((mzd_t*)U00); mzd_free_window((mzd_t*)U01); mzd_free_window((mzd_t*)U11); } return U; } libm4ri-20130416/src/triangular.h000066400000000000000000000120241212302366200164220ustar00rootroot00000000000000/** * \file trsm.h * * \brief Triangular system solving with Matrix routines. * * \author Clement Pernet */ #ifndef M4RI_TRSM_H #define M4RI_TRSM_H /******************************************************************* * * M4RI: Linear Algebra over GF(2) * * Copyright (C) 2008 Clement Pernet * * Distributed under the terms of the GNU General Public License (GPL) * version 2 or higher. * * This code is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * The full text of the GPL is available at: * * http://www.gnu.org/licenses/ * ********************************************************************/ #include /** * \brief Solves X U = B with X and B matrices and U upper triangular. * * X is stored inplace on B. * * \attention Note, that the 'right' variants of TRSM are slower than * the 'left' variants. * * This is the wrapper function including bounds checks. See * _mzd_trsm_upper_right() for implementation details. * * \param U Input upper triangular matrix. * \param B Input matrix, being overwritten by the solution matrix X * \param cutoff Minimal dimension for Strassen recursion. 
/**
 * \brief Solves X L = B with X and B matrices and L lower triangular.
 *
 * X is stored inplace on B.
 *
 * This is the wrapper function including bounds checks. See
 * _mzd_trsm_lower_right() for implementation details.
 *
 * \attention Note, that the 'right' variants of TRSM are slower than the 'left'
 * variants.
 *
 * \param L Input lower triangular matrix.
 * \param B Input matrix, being overwritten by the solution matrix X
 * \param cutoff Minimal dimension for Strassen recursion.
 */

void mzd_trsm_lower_right(mzd_t const *L, mzd_t *B, const int cutoff);
* \param B Input matrix, being overwritten by the solution matrix X * \param cutoff Minimal dimension for Strassen recursion. */ void mzd_trsm_lower_left(mzd_t const *L, mzd_t *B, const int cutoff); /** * \brief Solves L X = B with X and B matrices and L lower triangular. * * X is stored inplace on B. * * \param L Input lower triangular matrix. * \param B Input matrix, being overwritten by the solution matrix X * \param cutoff Minimal dimension for Strassen recursion. */ void _mzd_trsm_lower_left(mzd_t const *L, mzd_t *B, const int cutoff); /** * \brief Solves U X = B with X and B matrices and U upper triangular. * * X is stored inplace on B. * * This is the wrapper function including bounds checks. See * _mzd_trsm_upper_left() for implementation details. * * \param U Input upper triangular matrix. * \param B Input matrix, being overwritten by the solution matrix X * \param cutoff Minimal dimension for Strassen recursion. */ void mzd_trsm_upper_left(mzd_t const *U, mzd_t *B, const int cutoff); /** * \brief Solves U X = B with X and B matrices and U upper triangular. * * X is stored inplace on B. * * \param U Input upper triangular matrix. * \param B Input matrix, being overwritten by the solution matrix X * \param cutoff Minimal dimension for Strassen recursion. */ void _mzd_trsm_upper_left (mzd_t const *U, mzd_t *B, const int cutoff); /** * \brief Invert the upper triangular matrix A by reduction to matrix multiplication. * * \param A Matrix to be inverted (overwritten). * \param k Table size parameter, may be 0 for automatic choice. 
* * \wordoffset * * \return Inverse of A or throws an error */ mzd_t *mzd_trtri_upper(mzd_t *A); #endif // M4RI_TRSM_H libm4ri-20130416/src/triangular_russian.c000066400000000000000000000231221212302366200201620ustar00rootroot00000000000000#include "triangular_russian.h" #include "graycode.h" #include "brilliantrussian.h" #include "ple_russian.h" #include "xor.h" void _mzd_trsm_upper_left_submatrix(mzd_t const *U, mzd_t *B, rci_t const start_row, int const k, word const mask_begin, word const mask_end) { for (int i = 0; i < k; ++i) { for (int j = 0; j < i; ++j) { if (mzd_read_bit(U, start_row+(k-i-1), start_row+(k-i)+j)) { word *a = B->rows[start_row+(k-i-1)]; word *b = B->rows[start_row+(k-i)+j]; *a++ ^= *b++ & mask_begin; wi_t ii; for(ii = 1; ii + 8 <= B->width - 1; ii += 8) { *a++ ^= *b++; *a++ ^= *b++; *a++ ^= *b++; *a++ ^= *b++; *a++ ^= *b++; *a++ ^= *b++; *a++ ^= *b++; *a++ ^= *b++; } switch(B->width - ii) { case 8: *a++ ^= *b++; case 7: *a++ ^= *b++; case 6: *a++ ^= *b++; case 5: *a++ ^= *b++; case 4: *a++ ^= *b++; case 3: *a++ ^= *b++; case 2: *a++ ^= *b++; case 1: *a++ ^= (*b++ & mask_end); } } } } __M4RI_DD_MZD(B); } void _mzd_trsm_upper_left_russian(mzd_t const *U, mzd_t *B, int k) { wi_t const wide = B->width; int const blocksize = __M4RI_MUL_BLOCKSIZE; word mask_begin = __M4RI_RIGHT_BITMASK(m4ri_radix - B->offset); word mask_end = __M4RI_LEFT_BITMASK((B->ncols + B->offset) % m4ri_radix); if (B->width == 1) mask_begin = mask_begin & mask_end; if (k == 0) { k = m4ri_opt_k(blocksize, B->nrows, B->ncols); if (k > 3) k -= 2; /* reduce k further if that has a chance of hitting L1 */ size_t const tsize = (int)(0.8 * (__M4RI_TWOPOW(k) * B->nrows)); if(__M4RI_CPU_L1_CACHE < tsize && tsize <= 2 * __M4RI_CPU_L1_CACHE) k -= 1; } mzd_t *T0 = mzd_init(__M4RI_TWOPOW(k), B->ncols + B->offset); mzd_t *T1 = mzd_init(__M4RI_TWOPOW(k), B->ncols + B->offset); mzd_t *T2 = mzd_init(__M4RI_TWOPOW(k), B->ncols + B->offset); mzd_t *T3 = mzd_init(__M4RI_TWOPOW(k), B->ncols + 
B->offset); mzd_t *T4 = mzd_init(__M4RI_TWOPOW(k), B->ncols + B->offset); mzd_t *T5 = mzd_init(__M4RI_TWOPOW(k), B->ncols + B->offset); mzd_t *T6 = mzd_init(__M4RI_TWOPOW(k), B->ncols + B->offset); mzd_t *T7 = mzd_init(__M4RI_TWOPOW(k), B->ncols + B->offset); rci_t *L0 = (rci_t*)m4ri_mm_calloc(__M4RI_TWOPOW(k), sizeof(rci_t)); rci_t *L1 = (rci_t*)m4ri_mm_calloc(__M4RI_TWOPOW(k), sizeof(rci_t)); rci_t *L2 = (rci_t*)m4ri_mm_calloc(__M4RI_TWOPOW(k), sizeof(rci_t)); rci_t *L3 = (rci_t*)m4ri_mm_calloc(__M4RI_TWOPOW(k), sizeof(rci_t)); rci_t *L4 = (rci_t*)m4ri_mm_calloc(__M4RI_TWOPOW(k), sizeof(rci_t)); rci_t *L5 = (rci_t*)m4ri_mm_calloc(__M4RI_TWOPOW(k), sizeof(rci_t)); rci_t *L6 = (rci_t*)m4ri_mm_calloc(__M4RI_TWOPOW(k), sizeof(rci_t)); rci_t *L7 = (rci_t*)m4ri_mm_calloc(__M4RI_TWOPOW(k), sizeof(rci_t)); int kk = 8 * k; rci_t i = 0; for (; i < B->nrows - kk; i += kk) { _mzd_trsm_upper_left_submatrix(U, B, B->nrows-i-kk, kk, mask_begin, mask_end); mzd_make_table(B, B->nrows - i - 8*k, 0, k, T7, L7); mzd_make_table(B, B->nrows - i - 7*k, 0, k, T6, L6); mzd_make_table(B, B->nrows - i - 6*k, 0, k, T5, L5); mzd_make_table(B, B->nrows - i - 5*k, 0, k, T4, L4); mzd_make_table(B, B->nrows - i - 4*k, 0, k, T3, L3); mzd_make_table(B, B->nrows - i - 3*k, 0, k, T2, L2); mzd_make_table(B, B->nrows - i - 2*k, 0, k, T1, L1); mzd_make_table(B, B->nrows - i - 1*k, 0, k, T0, L0); for(rci_t j = 0; j < B->nrows - i - kk; ++j) { rci_t const x7 = L7[ mzd_read_bits_int(U, j, B->nrows - i - 8*k, k) ]; rci_t const x6 = L6[ mzd_read_bits_int(U, j, B->nrows - i - 7*k, k) ]; rci_t const x5 = L5[ mzd_read_bits_int(U, j, B->nrows - i - 6*k, k) ]; rci_t const x4 = L4[ mzd_read_bits_int(U, j, B->nrows - i - 5*k, k) ]; rci_t const x3 = L3[ mzd_read_bits_int(U, j, B->nrows - i - 4*k, k) ]; rci_t const x2 = L2[ mzd_read_bits_int(U, j, B->nrows - i - 3*k, k) ]; rci_t const x1 = L1[ mzd_read_bits_int(U, j, B->nrows - i - 2*k, k) ]; rci_t const x0 = L0[ mzd_read_bits_int(U, j, B->nrows - i - 1*k, k) ]; 
word *b = B->rows[j]; word *t7 = T7->rows[x7]; word *t6 = T6->rows[x6]; word *t5 = T5->rows[x5]; word *t4 = T4->rows[x4]; word *t3 = T3->rows[x3]; word *t2 = T2->rows[x2]; word *t1 = T1->rows[x1]; word *t0 = T0->rows[x0]; _mzd_combine8(b, t0, t1, t2, t3, t4, t5, t6, t7, wide); } } /* handle stuff that doesn't fit in multiples of kk */ for ( ;i < B->nrows; i += k) { if (i > B->nrows - k) k = B->nrows - i; _mzd_trsm_upper_left_submatrix(U, B, B->nrows-i-k, k, mask_begin, mask_end); mzd_make_table(B, B->nrows - i - 1*k, 0, k, T0, L0); for(rci_t j = 0; j < B->nrows - i - k; ++j) { rci_t const x0 = L0[ mzd_read_bits_int(U, j, B->nrows - i - 1*k, k) ]; word *b = B->rows[j]; word *t0 = T0->rows[x0]; for (wi_t ii = 0; ii < wide; ++ii) b[ii] ^= t0[ii]; } } mzd_free(T0); mzd_free(T1); mzd_free(T2); mzd_free(T3); mzd_free(T4); mzd_free(T5); mzd_free(T6); mzd_free(T7); m4ri_mm_free(L0); m4ri_mm_free(L1); m4ri_mm_free(L2); m4ri_mm_free(L3); m4ri_mm_free(L4); m4ri_mm_free(L5); m4ri_mm_free(L6); m4ri_mm_free(L7); __M4RI_DD_MZD(B); } void mzd_make_table_trtri(mzd_t const *M, rci_t r, rci_t c, int k, mzd_t *T, rci_t *L) { assert(!(T->flags & mzd_flag_multiple_blocks)); wi_t const blockoffset= c / m4ri_radix; int const twokay= __M4RI_TWOPOW(k); wi_t const wide = T->width - blockoffset; wi_t const count = (wide + 7) / 8; int const entry_point = wide % 8; wi_t const next_row_offset = blockoffset + T->rowstride - T->width; word *ti, *ti1, *m; ti1 = T->rows[0] + blockoffset; ti = ti1 + T->rowstride; L[0] = 0; for (int i = 1; i < twokay; ++i) { rci_t rowneeded = r + m4ri_codebook[k]->inc[i - 1]; m = M->rows[rowneeded] + blockoffset; wi_t n = count; switch (entry_point) { case 0: do { *(ti++) = *(m++) ^ *(ti1++); case 7: *(ti++) = *(m++) ^ *(ti1++); case 6: *(ti++) = *(m++) ^ *(ti1++); case 5: *(ti++) = *(m++) ^ *(ti1++); case 4: *(ti++) = *(m++) ^ *(ti1++); case 3: *(ti++) = *(m++) ^ *(ti1++); case 2: *(ti++) = *(m++) ^ *(ti1++); case 1: *(ti++) = *(m++) ^ *(ti1++); } while (--n > 0); } 
ti += next_row_offset; ti1 += next_row_offset; L[m4ri_codebook[k]->ord[i]] = i; } for(int i=1; iord[i]); } #define __M4RI_TRTRI_NTABLES 4 static inline void _mzd_trtri_upper_submatrix(mzd_t *A, rci_t pivot_r, rci_t elim_r, const int k) { for(rci_t i=pivot_r; incols ) mzd_row_add_offset(A, j, i, i+1); } mzd_t *mzd_trtri_upper_russian(mzd_t *A, int k) { assert(A->nrows == A->ncols && A->offset == 0); if (k == 0) { k = m4ri_opt_k(A->nrows, A->ncols, 0); if (k >= 7) k = 7; if (0.75 * __M4RI_TWOPOW(k) *A->ncols > __M4RI_CPU_L3_CACHE / 2.0) k -= 1; } const int kk = __M4RI_TRTRI_NTABLES*k; mzd_t *T[__M4RI_TRTRI_NTABLES]; rci_t *L[__M4RI_TRTRI_NTABLES]; mzd_t *U[__M4RI_TRTRI_NTABLES]; for(int i=0; i<__M4RI_TRTRI_NTABLES; i++) { T[i] = mzd_init(__M4RI_TWOPOW(k), A->ncols); L[i] = (rci_t*)m4ri_mm_calloc(__M4RI_TWOPOW(k), sizeof(rci_t)); U[i] = mzd_init(k, A->ncols); } /** dummy offsets table for make_table_ple**/ rci_t id[m4ri_radix]; for(int i=0; inrows) { /*** * ---------------------------- * [ ....................... ] * [ ... U00 U01 U02 U03 ... ] * [ ... U10 U12 U13 ... ] * ---------------------------- r * [ ... U22 U23 ... ] * [ ... U33 ... ] * ---------------------------- * * Assume [ U00 U01 ] was already inverted and multiplied with [ U02 U03 ... ] * [ U10 ] [ U12 U13 ... ] * * We then invert U22 and construct a table for [U22 U23 ... ], then we * invert [U33] and multiply it with [U23]. Then we construct a table for [U23 ... 
] **/ _mzd_trtri_upper_submatrix(A, r, r, k); _mzd_ple_to_e(U[0], A, r, r, k, id); mzd_make_table_trtri(U[0], 0, r, k, T[0], L[0]); _mzd_trtri_upper_submatrix(A, r+k, r, k); _mzd_ple_to_e(U[1], A, r+k, r+k, k, id); mzd_make_table_trtri(U[1], 0, r+k, k, T[1], L[1]); _mzd_trtri_upper_submatrix(A, r+2*k, r, k); _mzd_ple_to_e(U[2], A, r+2*k, r+2*k, k, id); mzd_make_table_trtri(U[2], 0, r+2*k, k, T[2], L[2]); _mzd_trtri_upper_submatrix(A, r+3*k, r, k); _mzd_ple_to_e(U[3], A, r+3*k, r+3*k, k, id); mzd_make_table_trtri(U[3], 0, r+3*k, k, T[3], L[3]); mzd_process_rows4_ple(A, 0, r, r, k, T[0], L[0], k, T[1], L[1], k, T[2], L[2], k, T[3], L[3]); r += kk; } /** deal with the rest **/ while(r < A->nrows) { if (A->nrows - r < k) k = A->nrows - r; for(rci_t i=0; incols ) mzd_row_add_offset(A, r+j, r+i, r+i+1); _mzd_ple_to_e(U[0], A, r, r, k, id); mzd_make_table_trtri(U[0], 0, r, k, T[0], L[0]); mzd_process_rows(A, 0, r, r, k, T[0], L[0]); r += k; } for(int i=0; i<__M4RI_TRTRI_NTABLES; i++) { mzd_free(T[i]); m4ri_mm_free(L[i]); mzd_free(U[i]); } __M4RI_DD_MZD(A); return A; } libm4ri-20130416/src/triangular_russian.h000066400000000000000000000025571212302366200202000ustar00rootroot00000000000000/** * \file triangular_russian.h * \brief TRSM and TRTRI via Gray code tables. * * \author Martin Albrecht */ #ifndef M4RI_TRIANGULAR_RUSSIAN #define M4RI_TRIANGULAR_RUSSIAN /******************************************************************* * * M4RI: Linear Algebra over GF(2) * * Copyright (C) 2008-2011 Martin Albrecht * * Distributed under the terms of the GNU General Public License (GPL) * version 2 or higher. * * This code is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. 
* * The full text of the GPL is available at: * * http://www.gnu.org/licenses/ * ********************************************************************/ #include void _mzd_trsm_upper_left_russian(mzd_t const *U, mzd_t *B, int k); /** * \brief Invert the upper triangular matrix A using Kronrod's method. * * \param A Matrix to be inverted (overwritten). * \param k Table size parameter, may be 0 for automatic choice. * * \wordoffset * * \return Inverse of A or throws an error */ mzd_t *mzd_trtri_upper_russian(mzd_t *A, int k); #endif //M4RI_TRIANGULAR_RUSSIAN libm4ri-20130416/src/wordwrapper.h000066400000000000000000000130511212302366200166270ustar00rootroot00000000000000/** * \file wordwrapper.h * * \brief C++ class wrapper for a word. * * \author Carlo Wood * * To use the wrapper class, configure with CC (not CXX) set to a C++ compiler. * For example: * * CFLAGS="-O2" CC="g++" ./configure --enable-debug */ /****************************************************************************** * * M4RI: Linear Algebra over GF(2) * * Copyright (C) 2011 Carlo Wood * * Distributed under the terms of the GNU General Public License (GPL) * version 2 or higher. * * This code is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * The full text of the GPL is available at: * * http://www.gnu.org/licenses/ ******************************************************************************/ #ifndef M4RI_DOXYGEN class word { private: bool M_initialized; uint64_t M_word; public: // Default constructor. Construct uninitialized word. word(void) : M_initialized(false), M_word(0xdead12344321deadUL) { } // Construct a zeroed word from the int 0. word(int value) : M_initialized(true), M_word(0) { assert(value == 0); } // Construct a word from a given uint64_t integer value. 
explicit word(uint64_t value) : M_initialized(true), M_word(value) { } // Copy constructor. word(word const& w) : M_initialized(w.M_initialized), M_word(w.M_word) { assert(M_initialized); } // Destructor. ~word() { M_initialized = false; M_word = 0xdeaddeaddeaddeadUL; } // Assignment operators. word& operator=(word const& w) { assert(w.M_initialized); M_initialized = w.M_initialized; M_word = w.M_word; return *this; } // Assign 0 to a word. word& operator=(int value) { assert(value == 0); // Only 0 may be assigned. M_initialized = true; M_word = 0; return *this; } // Compare two words. friend bool operator==(word const& w1, word const& w2) { assert(w1.M_initialized && w2.M_initialized); return w1.M_word == w2.M_word; } friend bool operator!=(word const& w1, word const& w2) { assert(w1.M_initialized && w2.M_initialized); return w1.M_word != w2.M_word; } // Invert all bits in a word. word operator~(void) const { return word(~M_word); } // Convert word as boolean to a mask with all zeroes (false) or all ones (true), by negating it. word operator-(void) const { assert((M_word & ~1UL) == 0); return word(-M_word); } // Bit-wise binary operators. friend word operator^(word const& w1, word const& w2) { assert(w1.M_initialized && w2.M_initialized); return word(w1.M_word ^ w2.M_word); } friend word operator&(word const& w1, word const& w2) { assert(w1.M_initialized && w2.M_initialized); return word(w1.M_word & w2.M_word); } friend word operator|(word const& w1, word const& w2) { assert(w1.M_initialized && w2.M_initialized); return word(w1.M_word | w2.M_word); } word& operator^=(word const& w) { assert(M_initialized && w.M_initialized); M_word ^= w.M_word; return *this; } word& operator&=(word const& w) { assert(M_initialized && w.M_initialized); M_word &= w.M_word; return *this; } word& operator|=(word const& w) { assert(M_initialized && w.M_initialized); M_word |= w.M_word; return *this; } // Shift operators. 
friend word operator<<(word const& w, size_t shift) { assert(w.M_initialized); assert(shift < 64); return word(w.M_word << shift); } friend word operator<<(word const& w, int shift) { assert(w.M_initialized); assert(shift >= 0 && shift < 64); return word(w.M_word << shift); } friend word operator>>(word const& w, size_t shift) { assert(w.M_initialized); assert(shift < 64); return word(w.M_word >> shift); } friend word operator>>(word const& w, int shift) { assert(w.M_initialized); assert(shift >= 0 && shift < 64); return word(w.M_word >> shift); } word& operator<<=(int shift) { assert(M_initialized); assert(shift >= 0 && shift < 64); M_word <<= shift; return *this; } word& operator>>=(int shift) { assert(M_initialized); assert(shift >= 0 && shift < 64); M_word >>= shift; return *this; } // Initialize an array of words with zero. static void init_array(word* const& a, wi_t size) { for (wi_t i = 0; i < size; ++i) a[i] = 0; } // Perform explicit conversions. BIT convert_to_BIT(void) const { assert(M_initialized); assert((M_word & ~1UL) == 0); // May only be 0 or 1. return M_word; } int convert_to_int(void) const { assert(M_initialized); assert(M_word <= 0x7fffffffU); // Make sure the value doesn't exceed the maximum value of an int. return M_word; } uint64_t convert_to_uint64_t(void) const { assert(M_initialized); return M_word; } // NOT operator. Returns true if all bits are zero. bool operator!(void) const { return !M_word; } // Automatic conversion to boolean. operator bool(void) const { assert(M_initialized); return M_word != 0; } private: // Disallow conversion to int (this protects us from accidental conversion to bool (see above) and from there to int without us noticing that). 
operator int(void) const { assert(false); return 0; } }; #define __M4RI_CONVERT_TO_BIT(w) ((w).convert_to_BIT()) #define __M4RI_CONVERT_TO_INT(w) ((w).convert_to_int()) #define __M4RI_CONVERT_TO_UINT64_T(w) ((w).convert_to_uint64_t()) #define __M4RI_CONVERT_TO_WORD(i) word((uint64_t)(i)) #endif // M4RI_DOXYGEN libm4ri-20130416/src/xor.h000066400000000000000000000310571212302366200150710ustar00rootroot00000000000000/** * \file xor.h * \brief Functions for adding vectors. * * \author Martin Albrecht * * \todo start counting at 0! */ #ifndef M4RI_XOR_H #define M4RI_XOR_H /******************************************************************* * * M4RI: Linear Algebra over GF(2) * * Copyright (C) 2008-2010 Martin Albrecht * * Distributed under the terms of the GNU General Public License (GPL) * version 2 or higher. * * This code is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * The full text of the GPL is available at: * * http://www.gnu.org/licenses/ * ********************************************************************/ #include #if __M4RI_HAVE_SSE2 #include #endif #include /** * Compute c[i] += t1[i] + t2[i] + t3[i] + t4[i] + t5[i] + t6[i] + t7[i] + t8[i] for 0 <= i < wide * * \todo the non SSE2 version of this code is slow, replace by code * from mzd_process_rows8 */ static inline void _mzd_combine8(word *c, word const *t1, word const *t2, word const *t3, word const *t4, word const *t5, word const *t6, word const *t7, word const *t8, wi_t wide_in) { wi_t wide = wide_in; #if __M4RI_HAVE_SSE2 /* assuming t1 ... 
t8 are aligned, but c might not be */ if (__M4RI_ALIGNMENT(c,16)==0) { __m128i *__c = (__m128i*)c; __m128i *__t1 = (__m128i*)t1; __m128i *__t2 = (__m128i*)t2; __m128i *__t3 = (__m128i*)t3; __m128i *__t4 = (__m128i*)t4; __m128i *__t5 = (__m128i*)t5; __m128i *__t6 = (__m128i*)t6; __m128i *__t7 = (__m128i*)t7; __m128i *__t8 = (__m128i*)t8; const __m128i *eof = (__m128i*)((unsigned long)(c + wide) & ~0xFUL); __m128i xmm1; while(__c < eof) { xmm1 = _mm_xor_si128(*__c, *__t1++); xmm1 = _mm_xor_si128(xmm1, *__t2++); xmm1 = _mm_xor_si128(xmm1, *__t3++); xmm1 = _mm_xor_si128(xmm1, *__t4++); xmm1 = _mm_xor_si128(xmm1, *__t5++); xmm1 = _mm_xor_si128(xmm1, *__t6++); xmm1 = _mm_xor_si128(xmm1, *__t7++); xmm1 = _mm_xor_si128(xmm1, *__t8++); *__c++ = xmm1; } c = (word*)__c; t1 = (word*)__t1; t2 = (word*)__t2; t3 = (word*)__t3; t4 = (word*)__t4; t5 = (word*)__t5; t6 = (word*)__t6; t7 = (word*)__t7; t8 = (word*)__t8; wide = ((sizeof(word) * wide) % 16) / sizeof(word); } #endif for(wi_t i = 0; i < wide; ++i) { c[i] ^= t1[i] ^ t2[i] ^ t3[i] ^ t4[i] ^ t5[i] ^ t6[i] ^ t7[i] ^ t8[i]; } __M4RI_DD_RAWROW(c, wide_in); } /** * Compute c[i] += t1[i] + t2[i] + t3[i] + t4[i] + t5[i] + t6[i] for 0 <= i < wide * */ static inline void _mzd_combine6(word *c, word const *t1, word const *t2, word const *t3, word const *t4, word const *t5, word const *t6, wi_t wide_in) { wi_t wide = wide_in; #if __M4RI_HAVE_SSE2 /* assuming t1 ... 
t4 are aligned, but c might not be */ assert(__M4RI_ALIGNMENT(c,16) == 8 || __M4RI_ALIGNMENT(c,16) == 0); assert(__M4RI_ALIGNMENT(c,16) == __M4RI_ALIGNMENT(t1,16)); if (__M4RI_ALIGNMENT(c,16) == 8) { *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++ ^ *t5++ ^ *t6++; wide--; } __m128i *__c = (__m128i*)c; __m128i *__t1 = (__m128i*)t1; __m128i *__t2 = (__m128i*)t2; __m128i *__t3 = (__m128i*)t3; __m128i *__t4 = (__m128i*)t4; __m128i *__t5 = (__m128i*)t5; __m128i *__t6 = (__m128i*)t6; const __m128i *eof = (__m128i*)((unsigned long)(c + wide) & ~0xFUL); __m128i xmm1; while(__c < eof) { xmm1 = _mm_xor_si128(*__c, *__t1++); xmm1 = _mm_xor_si128(xmm1, *__t2++); xmm1 = _mm_xor_si128(xmm1, *__t3++); xmm1 = _mm_xor_si128(xmm1, *__t4++); xmm1 = _mm_xor_si128(xmm1, *__t5++); xmm1 = _mm_xor_si128(xmm1, *__t6++); *__c++ = xmm1; } c = (word*)__c; t1 = (word*)__t1; t2 = (word*)__t2; t3 = (word*)__t3; t4 = (word*)__t4; t5 = (word*)__t5; t6 = (word*)__t6; wide = ((sizeof(word) * wide) % 16) / sizeof(word); if(wide) { *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++ ^ *t5++ ^ *t6++; } __M4RI_DD_RAWROW(c, wide_in); return; #else wi_t n = (wide + 7) / 8; switch (wide % 8) { case 0: do { *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++ ^ *t5++ ^ *t6++; case 7: *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++ ^ *t5++ ^ *t6++; case 6: *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++ ^ *t5++ ^ *t6++; case 5: *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++ ^ *t5++ ^ *t6++; case 4: *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++ ^ *t5++ ^ *t6++; case 3: *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++ ^ *t5++ ^ *t6++; case 2: *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++ ^ *t5++ ^ *t6++; case 1: *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++ ^ *t5++ ^ *t6++; } while (--n > 0); } __M4RI_DD_RAWROW(c, wide_in); return; #endif // __M4RI_HAVE_SSE2 } /** * Compute c[i] += t1[i] + t2[i] + t3[i] + t4[i] + t5[i] for 0 <= i < wide * */ static inline void _mzd_combine5(word *c, word const *t1, word const *t2, word const *t3, word const *t4, word const *t5, wi_t wide_in) { wi_t wide = wide_in; #if 
__M4RI_HAVE_SSE2 /* assuming t1 ... t4 are aligned, but c might not be */ assert(__M4RI_ALIGNMENT(c,16) == 8 || __M4RI_ALIGNMENT(c,16) == 0); assert(__M4RI_ALIGNMENT(c,16) == __M4RI_ALIGNMENT(t1,16)); if (__M4RI_ALIGNMENT(c,16) == 8) { *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++ ^ *t5++; wide--; } __m128i *__c = (__m128i*)c; __m128i *__t1 = (__m128i*)t1; __m128i *__t2 = (__m128i*)t2; __m128i *__t3 = (__m128i*)t3; __m128i *__t4 = (__m128i*)t4; __m128i *__t5 = (__m128i*)t5; const __m128i *eof = (__m128i*)((unsigned long)(c + wide) & ~0xFUL); __m128i xmm1; while(__c < eof) { xmm1 = _mm_xor_si128(*__c, *__t1++); xmm1 = _mm_xor_si128(xmm1, *__t2++); xmm1 = _mm_xor_si128(xmm1, *__t3++); xmm1 = _mm_xor_si128(xmm1, *__t4++); xmm1 = _mm_xor_si128(xmm1, *__t5++); *__c++ = xmm1; } c = (word*)__c; t1 = (word*)__t1; t2 = (word*)__t2; t3 = (word*)__t3; t4 = (word*)__t4; t5 = (word*)__t5; wide = ((sizeof(word) * wide) % 16) / sizeof(word); if(wide) { *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++ ^ *t5++; } __M4RI_DD_RAWROW(c, wide_in); return; #else wi_t n = (wide + 7) / 8; switch (wide % 8) { case 0: do { *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++ ^ *t5++; case 7: *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++ ^ *t5++; case 6: *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++ ^ *t5++; case 5: *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++ ^ *t5++; case 4: *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++ ^ *t5++; case 3: *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++ ^ *t5++; case 2: *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++ ^ *t5++; case 1: *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++ ^ *t5++; } while (--n > 0); } __M4RI_DD_RAWROW(c, wide_in); return; #endif // __M4RI_HAVE_SSE2 } /** * Compute c[i] += t1[i] + t2[i] + t3[i] + t4[i] for 0 <= i < wide * */ static inline void _mzd_combine4(word *c, word const *t1, word const *t2, word const *t3, word const *t4, wi_t wide_in) { wi_t wide = wide_in; #if __M4RI_HAVE_SSE2 /* assuming t1 ... 
t4 are aligned, but c might not be */ assert(__M4RI_ALIGNMENT(c,16) == 8 || __M4RI_ALIGNMENT(c,16) == 0); assert(__M4RI_ALIGNMENT(c,16) == __M4RI_ALIGNMENT(t1,16)); if (__M4RI_ALIGNMENT(c,16) == 8) { *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++; wide--; } __m128i *__c = (__m128i*)c; __m128i *__t1 = (__m128i*)t1; __m128i *__t2 = (__m128i*)t2; __m128i *__t3 = (__m128i*)t3; __m128i *__t4 = (__m128i*)t4; const __m128i *eof = (__m128i*)((unsigned long)(c + wide) & ~0xFUL); __m128i xmm1; while(__c < eof) { xmm1 = _mm_xor_si128(*__c, *__t1++); xmm1 = _mm_xor_si128(xmm1, *__t2++); xmm1 = _mm_xor_si128(xmm1, *__t3++); xmm1 = _mm_xor_si128(xmm1, *__t4++); *__c++ = xmm1; } c = (word*)__c; t1 = (word*)__t1; t2 = (word*)__t2; t3 = (word*)__t3; t4 = (word*)__t4; wide = ((sizeof(word) * wide) % 16) / sizeof(word); if(wide) { *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++; } __M4RI_DD_RAWROW(c, wide_in); return; #else wi_t n = (wide + 7) / 8; switch (wide % 8) { case 0: do { *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++; case 7: *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++; case 6: *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++; case 5: *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++; case 4: *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++; case 3: *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++; case 2: *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++; case 1: *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++; } while (--n > 0); } __M4RI_DD_RAWROW(c, wide_in); return; #endif // __M4RI_HAVE_SSE2 } /** * Compute c[i] += t1[i] + t2[i] + t3[i] for 0 <= i < wide * */ static inline void _mzd_combine3(word *c, word const *t1, word const *t2, word const *t3, wi_t wide_in) { wi_t wide = wide_in; #if __M4RI_HAVE_SSE2 /* assuming t1 ... 
t3 are aligned, but c might not be */ if (__M4RI_ALIGNMENT(c,16)==0) { __m128i *__c = (__m128i*)c; __m128i *__t1 = (__m128i*)t1; __m128i *__t2 = (__m128i*)t2; __m128i *__t3 = (__m128i*)t3; const __m128i *eof = (__m128i*)((unsigned long)(c + wide) & ~0xFUL); __m128i xmm1; while(__c < eof) { xmm1 = _mm_xor_si128(*__c, *__t1++); xmm1 = _mm_xor_si128(xmm1, *__t2++); xmm1 = _mm_xor_si128(xmm1, *__t3++); *__c++ = xmm1; } c = (word*)__c; t1 = (word*)__t1; t2 = (word*)__t2; t3 = (word*)__t3; wide = ((sizeof(word) * wide) % 16) / sizeof(word); } if(!wide) { __M4RI_DD_RAWROW(c, wide_in); return; } #endif // __M4RI_HAVE_SSE2 wi_t n = (wide + 7) / 8; switch (wide % 8) { case 0: do { *c++ ^= *t1++ ^ *t2++ ^ *t3++; case 7: *c++ ^= *t1++ ^ *t2++ ^ *t3++; case 6: *c++ ^= *t1++ ^ *t2++ ^ *t3++; case 5: *c++ ^= *t1++ ^ *t2++ ^ *t3++; case 4: *c++ ^= *t1++ ^ *t2++ ^ *t3++; case 3: *c++ ^= *t1++ ^ *t2++ ^ *t3++; case 2: *c++ ^= *t1++ ^ *t2++ ^ *t3++; case 1: *c++ ^= *t1++ ^ *t2++ ^ *t3++; } while (--n > 0); } __M4RI_DD_RAWROW(c, wide_in); } /** * Compute c[i] += t1[i] + t2[i] for 0 <= i < wide * */ static inline void _mzd_combine2(word *c, word const *t1, word const *t2, wi_t wide_in) { wi_t wide = wide_in; #if __M4RI_HAVE_SSE2 /* assuming t1 ... 
t2 are aligned, but c might not be */ if (__M4RI_ALIGNMENT(c,16)==0) { __m128i *__c = (__m128i*)c; __m128i *__t1 = (__m128i*)t1; __m128i *__t2 = (__m128i*)t2; const __m128i *eof = (__m128i*)((unsigned long)(c + wide) & ~0xFUL); __m128i xmm1; while(__c < eof) { xmm1 = _mm_xor_si128(*__c, *__t1++); xmm1 = _mm_xor_si128(xmm1, *__t2++); *__c++ = xmm1; } c = (word*)__c; t1 = (word*)__t1; t2 = (word*)__t2; wide = ((sizeof(word) * wide) % 16) / sizeof(word); } if(!wide) { __M4RI_DD_RAWROW(c, wide_in); return; } #endif // __M4RI_HAVE_SSE2 wi_t n = (wide + 7) / 8; switch (wide % 8) { case 0: do { *c++ ^= *t1++ ^ *t2++; case 7: *c++ ^= *t1++ ^ *t2++; case 6: *c++ ^= *t1++ ^ *t2++; case 5: *c++ ^= *t1++ ^ *t2++; case 4: *c++ ^= *t1++ ^ *t2++; case 3: *c++ ^= *t1++ ^ *t2++; case 2: *c++ ^= *t1++ ^ *t2++; case 1: *c++ ^= *t1++ ^ *t2++; } while (--n > 0); } __M4RI_DD_RAWROW(c, wide_in); } /** * Compute c[i] += t1[i] for 0 <= i < wide * */ static inline void _mzd_combine(word *c, word const *t1, wi_t wide_in) { wi_t wide = wide_in; #if __M4RI_HAVE_SSE2 /* assuming c, t1 are alligned the same way */ if (__M4RI_ALIGNMENT(c,16)==8 && wide) { *c++ ^= *t1++; wide--; } __m128i *__c = (__m128i*)c; __m128i *__t1 = (__m128i*)t1; const __m128i *eof = (__m128i*)((unsigned long)(c + wide) & ~0xFUL); __m128i xmm1; while(__c < eof-1) { xmm1 = _mm_xor_si128(*__c, *__t1++); *__c++ = xmm1; xmm1 = _mm_xor_si128(*__c, *__t1++); *__c++ = xmm1; } if(__c < eof) { xmm1 = _mm_xor_si128(*__c, *__t1++); *__c++ = xmm1; } c = (word*)__c; t1 = (word*)__t1; wide = ((sizeof(word) * wide) % 16) / sizeof(word); if(!wide) { __M4RI_DD_RAWROW(c, wide_in); return; } #endif // __M4RI_HAVE_SSE2 wi_t n = (wide + 7) / 8; switch (wide % 8) { case 0: do { *c++ ^= *t1++; case 7: *c++ ^= *t1++; case 6: *c++ ^= *t1++; case 5: *c++ ^= *t1++; case 4: *c++ ^= *t1++; case 3: *c++ ^= *t1++; case 2: *c++ ^= *t1++; case 1: *c++ ^= *t1++; } while (--n > 0); } __M4RI_DD_RAWROW(c, wide_in); } #endif // M4RI_XOR_H 
libm4ri-20130416/testsuite/000077500000000000000000000000001212302366200153445ustar00rootroot00000000000000libm4ri-20130416/testsuite/Makefile.in000066400000000000000000000061441212302366200174160ustar00rootroot00000000000000# The use TOPSRCDIR allows to compile this testsuite for a clone (and different revision) # of this repository. If the environment variable topsrcdir is the top source directory # of the clone, compile this testsuite as: # # defines="-Dmzd_randomize=bench_randomize" # if ! grep lesser_LSB $topsrcdir/src/misc.h >/dev/null; then # defines="$defines -DBENCH_RANDOM_REVERSE" # fi # if ! grep rci_t $topsrcdir/src/misc.h >/dev/null; then # defines="$defines -Drci_t=size_t -Dwi_t=size_t" # fi # make bench_elimination bench_multiplication TOPSRCDIR="$topsrcdir" DEFINES="$defines" # # Note that if (once) the revision of TOPSRCDIR is new enough and contains the # current fast mzd_randomize (or when you don't care that it's slow) then you # can completely leave out DEFINES and just compile as: # # make TOPSRCDIR="$topsrcdir" # # Finally, if you also leave out TOPSRCDIR and just run 'make' then the testsuite # is compiled against the current source tree. CC = @CC@ DEFINES = TOPSRCDIR = .. 
CFLAGS = -I$(TOPSRCDIR) -D_XOPEN_SOURCE=600 @CFLAGS@ $(DEFINES) @OPENMP_CFLAGS@ STAGEDIR := $(realpath -s $(TOPSRCDIR)/.libs) LDFLAGS = -L$(STAGEDIR) -Wl,-rpath,$(STAGEDIR) -lm4ri -lm PAPI_FLAGS = @PAPI_LDFLAGS@ @PAPI_LIBS@ PAPI_CFLAGS = @PAPI_CFLAGS@ DEBUG = -ggdb TEST_PRGS = \ test_elimination \ test_multiplication \ test_trsm \ test_ple \ test_solve \ test_kernel \ test_random \ test_smallops \ test_transpose \ test_colswap \ test_misc \ test_invert BENCH_PRGS = \ bench_elimination \ bench_multiplication \ bench_ple \ bench_trsm \ bench_elimination_sparse \ bench_mzd \ bench_invert M4RI_HEADERS = \ ../src/config.h \ ../src/m4ri.h \ ../src/brilliantrussian.h \ ../src/echelonform.h \ ../src/graycode.h \ ../src/misc.h \ ../src/mmc.h \ ../src/parity.h \ ../src/ple.h \ ../src/ple_russian.h \ ../src/solve.h \ ../src/strassen.h \ ../src/triangular.h \ ../src/triangular_russian.h \ ../src/mzd.h \ ../src/mzp.h \ ../src/xor.h CPUCYCLES_DIR = ./cpucycles-20060326 .PHONY: all clean dist-clean all: $(TEST_PRGS) $(BENCH_PRGS) bench_%: cpucycles.o benchmarking.o bench_%.c Makefile cpucycles.h $(M4RI_HEADERS) $(CC) $(DEBUG) $(PAPI_CFLAGS) $(CFLAGS) $@.c cpucycles.o benchmarking.o $(LDFLAGS) $(PAPI_FLAGS) -lm -o $@ test_%: test_%.c testing.o Makefile $(M4RI_HEADERS) $(CC) $(DEBUG) $(CFLAGS) $@.c $(LDFLAGS) testing.o -o $@ clean: rm -f $(TEST_PRGS) rm -f $(BENCH_PRGS) rm -f *.o cpucycles.h rm -f $(CPUCYCLES_DIR)/cpucycles.o rm -f $(CPUCYCLES_DIR)/cpucycles.h distclean: clean @MAINTAINER_MODE_TRUE@ rm -f Makefile cpucycles.h: cpucycles.o cpucycles.o: (cd $(CPUCYCLES_DIR); sh do; cp cpucycles.o ..; cp cpucycles.h ..) testing.o: testing.c testing.h ../src/misc.h ../src/mzd.h $(CC) $(DEBUG) $(CFLAGS) -c testing.c $(LDFLAGS) -o $@ benchmarking.o: benchmarking.c benchmarking.h ../src/config.h ../src/misc.h $(CC) $(DEBUG) $(PAPI_CFLAGS) $(CFLAGS) -c benchmarking.c $(LDFLAGS) -o $@ Makefile: @MAINTAINER_MODE_TRUE@ Makefile.in ../config.status (cd .. 
&& /bin/sh ./config.status testsuite/Makefile) libm4ri-20130416/testsuite/bench_elimination.c000066400000000000000000000142451212302366200211650ustar00rootroot00000000000000#include #include #include #include "cpucycles.h" #include "benchmarking.h" #ifdef HAVE_LIBPAPI #define _GNU_SOURCE #include // papi.h needs caddr_t #include #include #endif struct elim_params { rci_t m; rci_t n; rci_t r; char const *algorithm; }; static unsigned long long loop_calibration[32]; int run_nothing(void *_p, unsigned long long *data, int *data_len) { struct elim_params *p = (struct elim_params *)_p; mzd_t *A = mzd_init(p->m, p->n); if(p->r != 0) { mzd_t *L, *U; L = mzd_init(p->m, p->m); U = mzd_init(p->m, p->n); mzd_randomize(U); mzd_randomize(L); for (rci_t i = 0; i < p->m; ++i) { for (rci_t j = i + 1; j < p->m; j+=m4ri_radix) { int const length = MIN(m4ri_radix, p->m - j); mzd_clear_bits(L, i, j, length); } mzd_write_bit(L,i,i, 1); for (rci_t j = 0; j < i; j+=m4ri_radix) { int const length = MIN(m4ri_radix, i - j); mzd_clear_bits(U, i, j, length); } if(i < p->r) { mzd_write_bit(U, i, i, 1); } else { for (rci_t j = i; j < p->n; j+=m4ri_radix) { int const length = MIN(m4ri_radix, p->n - i); mzd_clear_bits(U, i, j, length); } } } mzd_mul(A,L,U,0); mzd_free(L); mzd_free(U); } else { mzd_randomize(A); } #ifndef HAVE_LIBPAPI *data_len = 2; #else *data_len = MIN(papi_array_len + 1, *data_len); #endif int papi_res; #ifndef HAVE_LIBPAPI data[0] = walltime(0); data[1] = cpucycles(); #else int array_len = *data_len - 1; unsigned long long t0 = PAPI_get_virt_usec(); papi_res = PAPI_start_counters((int*)papi_events, array_len); if(papi_res) m4ri_die(""); #endif #ifndef HAVE_LIBPAPI data[1] = cpucycles() - data[1]; data[0] = walltime(data[0]); #else PAPI_stop_counters((long long*)&data[1], array_len); t0 = PAPI_get_virt_usec() - t0; data[0] = t0; for (int nv = 0; nv <= array_len; ++nv) { if (data[nv] < loop_calibration[nv]) loop_calibration[nv] = data[nv]; } #endif mzd_free(A); return (0); } int 
run(void *_p, unsigned long long *data, int *data_len) { struct elim_params *p = (struct elim_params *)_p; #ifndef HAVE_LIBPAPI *data_len = 2; #else *data_len = MIN(papi_array_len + 1, *data_len); #endif int papi_res; mzd_t *A = mzd_init(p->m, p->n); if(p->r != 0) { mzd_t *L, *U; L = mzd_init(p->m, p->m); U = mzd_init(p->m, p->n); mzd_randomize(U); mzd_randomize(L); for (rci_t i = 0; i < p->m; ++i) { for (rci_t j = i + 1; j < p->m; j+=m4ri_radix) { int const length = MIN(m4ri_radix, p->m - j); mzd_clear_bits(L, i, j, length); } mzd_write_bit(L,i,i, 1); for (rci_t j = 0; j < i; j+=m4ri_radix) { int const length = MIN(m4ri_radix, i - j); mzd_clear_bits(U, i, j, length); } if(i < p->r) { mzd_write_bit(U, i, i, 1); } else { for (rci_t j = i; j < p->n; j+=m4ri_radix) { int const length = MIN(m4ri_radix, p->n - i); mzd_clear_bits(U, i, j, length); } } } mzd_mul(A,L,U,0); mzd_free(L); mzd_free(U); } else { mzd_randomize(A); } #ifndef HAVE_LIBPAPI data[0] = walltime(0); data[1] = cpucycles(); #else int array_len = *data_len - 1; unsigned long long t0 = PAPI_get_virt_usec(); papi_res = PAPI_start_counters((int*)papi_events, array_len); if (papi_res) m4ri_die(""); #endif if(strcmp(p->algorithm, "m4ri") == 0) p->r = mzd_echelonize_m4ri(A, 1, 0); else if(strcmp(p->algorithm, "pluq") == 0) p->r = mzd_echelonize_pluq(A, 1); else if(strcmp(p->algorithm, "mmpf") == 0) p->r = _mzd_pluq_russian(A, mzp_init(A->nrows), mzp_init(A->ncols), 0); else if(strcmp(p->algorithm, "naive") == 0) p->r = mzd_echelonize_naive(A, 1); #ifndef HAVE_LIBPAPI data[1] = cpucycles() - data[1]; data[0] = walltime(data[0]); #else PAPI_stop_counters((long long*)&data[1], array_len); t0 = PAPI_get_virt_usec() - t0; data[0] = t0; for (int nv = 0; nv <= array_len; ++nv) { data[nv] -= loop_calibration[nv]; } #endif mzd_free(A); return 0; } void print_help_and_exit() { printf("Parameters m(, n, alg, r) expected.\n"); printf(" m -- integer > 0\n"); printf(" n -- integer > 0\n"); printf(" alg -- 'm4ri', 'pluq', 
'mmpf' or 'naive' (default: 'pluq')\n"); printf(" r -- target rank >= 0, if 0 then mzd_randomize() is called (default: MIN(m,n))\n"); printf("\n"); bench_print_global_options(stderr); m4ri_die(""); } int main(int argc, char **argv) { int opts = global_options(&argc, &argv); int data_len; #ifdef HAVE_LIBPAPI int papi_counters = PAPI_num_counters(); if (papi_counters < papi_array_len) { fprintf(stderr, "%s: Warning: there are only %d hardware counters available!\n", progname, papi_counters); papi_array_len = papi_counters; } if (papi_test(papi_events, papi_array_len)) exit(1); for (int nv = 0; nv <= papi_array_len; ++nv) loop_calibration[nv] = 100000000; data_len = papi_array_len + 1; #else data_len = 2; #endif if (opts < 0 || argc < 2 || argc > 5) { print_help_and_exit(); } struct elim_params params; params.m = atoi(argv[1]); if (argc >= 3) params.n = atoi(argv[2]); else params.n = params.m; if (argc >= 4) params.algorithm = argv[3]; else params.algorithm = "pluq"; if (argc >= 5) params.r = atoi(argv[4]); else params.r = params.m; srandom(17); unsigned long long data[16]; for (int i = 0; i < 4; ++i) run_nothing((void*)¶ms, data, &data_len); run_bench(run, (void*)¶ms, data, data_len); double cc_per_op = ((double)data[1])/ ( (double)params.m * (double)params.n * powl((double)params.r,0.807) ); printf("m: %5d, n: %5d, last r: %5d, cpu cycles: %12llu, cc/(mnr^0.807): %.5lf, ", params.m, params.n, params.r, data[1], cc_per_op); print_wall_time(data[0] / 1000000.0); printf("\n"); #ifdef HAVE_LIBPAPI for (int n = 1; n < data_len; ++n) { double tmp = ((double)data[n]) / powl((double)params.n,2.807); printf("%20s (%20llu) per bit (divided by n^2.807): %15.5f\n", papi_event_name(papi_events[n - 1]), data[n], tmp); } #endif } libm4ri-20130416/testsuite/bench_elimination_sparse.c000066400000000000000000000037031212302366200225370ustar00rootroot00000000000000#include #include #include #include "cpucycles.h" #include "benchmarking.h" struct elim_sparse_params { rci_t m; rci_t n; 
rci_t r; char const *algorithm; long density; int full; }; int run(void *_p, unsigned long long *data, int *data_len) { struct elim_sparse_params *p = (struct elim_sparse_params *)_p; *data_len = 2; mzd_t *A = mzd_init(p->m, p->n); for(rci_t i = 0; i < p->m; ++i) { for(rci_t j = 0; j < p->n; ++j) { if(random() <= p->density) { mzd_write_bit(A, i, j, 1); } } } data[0] = walltime(0); data[1] = cpucycles(); if(strcmp(p->algorithm, "m4ri") == 0) p->r = mzd_echelonize_m4ri(A, p->full, 0); else if(strcmp(p->algorithm, "cross") == 0) p->r = mzd_echelonize(A, p->full); else if(strcmp(p->algorithm, "pluq") == 0) p->r = mzd_echelonize_pluq(A, p->full); else if(strcmp(p->algorithm, "naive") == 0) p->r = mzd_echelonize_naive(A, p->full); data[1] = cpucycles() - data[1]; data[0] = walltime(data[0]); mzd_free(A); return 0; } int main(int argc, char **argv) { global_options(&argc, &argv); if (argc < 3) { m4ri_die("Parameters m,n, (alg,density,full) expected.\n"); } struct elim_sparse_params p; p.density = RAND_MAX / 10; // Use a density of 0.1 by default. 
p.full = 1; if (argc >= 4) p.algorithm = argv[3]; else p.algorithm = "m4ri"; if (argc >= 5) p.density = RAND_MAX * strtod(argv[4], NULL); if(argc >= 6) p.full = atoi(argv[5]); p.m = atoi(argv[1]); p.n = atoi(argv[2]); /* put this call in run() to benchmark one particular matrix over and over again instead of computing the average of various matrices.*/ srandom(17); unsigned long long data[2]; run_bench(run,(void*)&p, data, 2); printf("m: %5d, n: %5d, last r: %5d, density: %7.5f, cpu cycles: %10llu, wall time: %lf\n", p.m, p.n, p.r, (double)p.density / RAND_MAX, data[1], data[0] / 1000000.0); } libm4ri-20130416/testsuite/bench_invert.c000066400000000000000000000054721212302366200201660ustar00rootroot00000000000000#include #include #include #include "cpucycles.h" #include "benchmarking.h" struct inv_params { rci_t n; int direction; char const *algorithm; }; int run(void *_p, unsigned long long *data, int *data_len) { struct inv_params *p = (struct inv_params *)_p; *data_len = 2; mzd_t *A = NULL, *L = NULL, *U = NULL, *B = NULL; if (p->direction <= 0) { L = mzd_init(p->n, p->n); mzd_randomize(L); for (rci_t i = 0; i < p->n; ++i) { for (rci_t j = i + 1; j < p->n; ++j) mzd_write_bit(L,i,j, 0); mzd_write_bit(L,i,i, 1); } } if (p->direction >= 0) { U = mzd_init(p->n, p->n); mzd_randomize(U); for (rci_t i = 0; i < p->n; ++i) { for (rci_t j = 0; j < i; ++j) mzd_write_bit(U,i,j, 0); mzd_write_bit(U,i,i, 1); } } switch(p->direction) { case 0: A = mzd_mul(NULL, L, U, 0); mzd_free(L); mzd_free(U); break; case -1: A = L; break; case 1: A = U; break; default: m4ri_die("unknown direction '%d'",p->direction); }; data[0] = walltime(0); data[1] = cpucycles(); switch(p->direction) { case 0: if(strcmp(p->algorithm, "m4ri") == 0) B = mzd_inv_m4ri(NULL, A, 0); else m4ri_die("unknown algorithm: '%s'",p->algorithm); break; case 1: if(strcmp(p->algorithm, "m4ri") == 0) { mzd_trtri_upper_russian(A, 0); B = mzd_copy(NULL, A); } else if (strcmp(p->algorithm, "mm") == 0) { mzd_trtri_upper(A); B 
= mzd_copy(NULL, A); } else m4ri_die("unknown algorithm: '%s'",p->algorithm); break; case -1: m4ri_die("not implemented error"); break; } data[1] = cpucycles() - data[1]; data[0] = walltime(data[0]); mzd_free(A); mzd_free(B); return 0; } int main(int argc, char **argv) { int opts = global_options(&argc, &argv); if (opts < 0) { bench_print_global_options(stderr); exit(-1); } if (argc != 4) { printf("Parameters n,direction,alg expected.\n"); printf(" n -- integer > 0\n"); printf(" direction -- lower triangular (-1), full (0) or upper triangular (1).\n"); printf(" algorithm -- 'm4ri' or 'mm' (for direction 1)\n"); printf("\n"); bench_print_global_options(stderr); m4ri_die(""); } struct inv_params params; params.n = atoi(argv[1]); params.direction = atoi(argv[2]); params.algorithm = argv[3]; /* put this call in run() to benchmark one particular matrix over and over again instead of computing the average of various matrices.*/ srandom(17); unsigned long long data[2]; run_bench(run, (void*)¶ms, data, 2); double cc_per_op = ((double)data[1])/ powl((double)params.n,2.807); printf("n: %5d, cpu cycles: %10llu, cc/(n^2.807): %.5lf, wall time: %lf\n", params.n, data[1], cc_per_op, data[0] / 1000000.0); } libm4ri-20130416/testsuite/bench_multiplication.c000066400000000000000000000117341212302366200217120ustar00rootroot00000000000000#include #include #include "cpucycles.h" #include #include "benchmarking.h" #ifdef HAVE_LIBPAPI #define _GNU_SOURCE #include // papi.h needs caddr_t #include #include #endif struct mul_params { rci_t m; rci_t n; rci_t l; int cutoff; }; static unsigned long long loop_calibration[32]; int run_nothing(void *_p, unsigned long long *data, int *data_len) { struct mul_params *p = (struct mul_params *)_p; mzd_t *A = mzd_init(p->m, p->n); mzd_t *B = mzd_init(p->n, p->l); mzd_randomize(A); mzd_randomize(B); #ifndef HAVE_LIBPAPI *data_len = 2; #else *data_len = MIN(papi_array_len + 1, *data_len); #endif int papi_res; #ifndef HAVE_LIBPAPI data[0] = walltime(0); 
data[1] = cpucycles(); #else int array_len = *data_len - 1; unsigned long long t0 = PAPI_get_virt_usec(); papi_res = PAPI_start_counters((int*)papi_events, array_len); if(papi_res) m4ri_die(""); #endif #ifndef HAVE_LIBPAPI data[1] = cpucycles() - data[1]; data[0] = walltime(data[0]); #else PAPI_stop_counters((long long*)&data[1], array_len); t0 = PAPI_get_virt_usec() - t0; data[0] = t0; for (int nv = 0; nv <= array_len; ++nv) { if (data[nv] < loop_calibration[nv]) loop_calibration[nv] = data[nv]; } #endif mzd_free(A); mzd_free(B); return (0); } int run(void *_p, unsigned long long *data, int *data_len) { struct mul_params *p = (struct mul_params *)_p; #ifndef HAVE_LIBPAPI *data_len = 2; #else *data_len = MIN(papi_array_len + 1, *data_len); #endif int papi_res; mzd_t *A = mzd_init(p->m, p->n); mzd_t *B = mzd_init(p->n, p->l); mzd_randomize(A); mzd_randomize(B); #ifndef HAVE_LIBPAPI data[0] = walltime(0); data[1] = cpucycles(); #else int array_len = *data_len - 1; unsigned long long t0 = PAPI_get_virt_usec(); papi_res = PAPI_start_counters((int*)papi_events, array_len); if (papi_res) m4ri_die(""); #endif mzd_t *C = mzd_mul(NULL, A, B, p->cutoff); #ifndef HAVE_LIBPAPI data[1] = cpucycles() - data[1]; data[0] = walltime(data[0]); #else PAPI_stop_counters((long long*)&data[1], array_len); t0 = PAPI_get_virt_usec() - t0; data[0] = t0; for (int nv = 0; nv <= array_len; ++nv) { data[nv] -= loop_calibration[nv]; } #endif mzd_free(A); mzd_free(B); mzd_free(C); return (0); } void print_help_and_exit() { printf("Parameters expected.\n"); printf("Two combinations are supported:\n"); printf(" 1. n, cuttoff\n"); printf(" n -- matrix dimension, integer > 0\n"); printf(" cutoff -- integer >= 0 (optional, default: 0).\n\n"); printf(" 2. 
m, n, l, cuttoff\n"); printf(" m -- row dimension of A, integer > 0\n"); printf(" n -- column dimension of A, integer > 0\n"); printf(" l -- column dimension of B, integer > 0\n"); printf(" cutoff -- integer >= 0 (optional, default: 0).\n\n"); printf("\n"); bench_print_global_options(stderr); m4ri_die(""); } int main(int argc, char **argv) { int opts = global_options(&argc, &argv); int data_len; struct mul_params params; #ifdef HAVE_LIBPAPI int papi_counters = PAPI_num_counters(); if (papi_counters < papi_array_len) { fprintf(stderr, "%s: Warning: there are only %d hardware counters available!\n", progname, papi_counters); papi_array_len = papi_counters; } if (papi_test(papi_events, papi_array_len)) exit(1); for (int nv = 0; nv <= papi_array_len; ++nv) loop_calibration[nv] = 100000000; data_len = papi_array_len + 1; #else data_len = 2; #endif if (opts < 0) { print_help_and_exit(); } switch(argc) { case 2: params.m = atoi(argv[1]); params.n = atoi(argv[1]); params.l = atoi(argv[1]); params.cutoff = 0; break; case 3: params.m = atoi(argv[1]); params.n = atoi(argv[1]); params.l = atoi(argv[1]); params.cutoff = atoi(argv[2]); break; case 4: params.m = atoi(argv[1]); params.n = atoi(argv[2]); params.l = atoi(argv[3]); params.cutoff = 0; break; case 5: params.m = atoi(argv[1]); params.n = atoi(argv[2]); params.l = atoi(argv[3]); params.cutoff = atoi(argv[4]); break; default: print_help_and_exit(); } if (params.m <= 0 || params.n <= 0 || params.l <= 0) { m4ri_die("Parameters m,n,l must be > 0\n"); } srandom(17); unsigned long long data[16]; for (int i = 0; i < 100; ++i) run_nothing((void*)¶ms, data, &data_len); run_bench(run, (void*)¶ms, data, data_len); double cc_per_op = ((double)data[1])/ powl((double)params.n,2.807); printf("m: %5d, n: %5d, l: %5d, cutoff: %5d, cpu cycles: %12llu, cc/n^2.807: %.5lf, ", params.m, params.n, params.l, params.cutoff, data[1], cc_per_op); print_wall_time(data[0] / 1000000.0); printf("\n"); #ifdef HAVE_LIBPAPI for (int n = 1; n < data_len; 
++n) { double tmp = ((double)data[n]) / powl((double)params.n,2.807); printf("%20s (%20llu) per bit (divided by n^2.807): %15.5f\n", papi_event_name(papi_events[n - 1]), data[n], tmp); } #endif } libm4ri-20130416/testsuite/bench_mzd.c000066400000000000000000000732421212302366200174510ustar00rootroot00000000000000/* * bench_packedmatrix.c * * Application to test functionality of packedmatrix.c. * * Copyright (C) 2011 Carlo Wood * RSA-1024 0x624ACAD5 1997-01-26 Sign & Encrypt * Fingerprint16 = 32 EC A7 B6 AC DB 65 A6 F6 F6 55 DD 1C DC FF 61 * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/

#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS 1
#endif
/* NOTE(review): the header names after the bare #include directives below
 * are missing from this copy of the file -- restore before building. */
#include

#ifdef HAVE_LIBPAPI
#define _GNU_SOURCE
#include // papi.h needs caddr_t
#include
#include
#endif

#include
#include
#include
#include "cpucycles.h"
#include
#include "benchmarking.h"

/*
 * Parameters shared by all run_<function>() benchmarks in this file.
 * Which fields a benchmark reads depends on the function being measured.
 */
struct test_params {
  /* Matrix sizes, filled by decode_size() for the codes 'm', 'n', 'k', 'l'. */
  rci_t m;
  rci_t n;
  rci_t k;
  rci_t l;
  /* Row / column / word indices with their fill counts, appended by
   * decode_index() for the codes 'r', 'c' and 'w'. */
  rci_t row[3];
  int rows;
  rci_t col[3];
  int cols;
  wi_t wrd[3];
  int wrds;
  /* Number of timed iterations; the benchmarks copy it into loop_count. */
  uint64_t count;
  int cutoff;
  /* Values filled by decode_code() for the codes 'b' and 'n'. */
  int boolean;
  int integer;
  /* Name of the benchmarked function as given on the command line. */
  char const* funcname;
};

/* Signature shared by every generated run_<function>() benchmark. */
typedef int (*run_type)(void*, unsigned long long*, int*);

/* Smallest value measured so far for each data[] slot (PAPI build only). */
static unsigned long long loop_calibration[32];

/*
 * Benchmark scaffolding.  A benchmark is written as
 *
 *   BENCHMARK_PREFIX(name) { setup; TIME(call); cleanup; } BENCHMARK_POSTFIX
 *
 * BENCHMARK_PREFIX opens "int run_<name>(void*, unsigned long long*, int*)",
 * declares `p` and opens a `do`; BENCHMARK_POSTFIX closes it with `while(0)`
 * and returns.  TIME_BEGIN runs its argument once before the measurement
 * starts and opens a block that TIME_END closes, so the two must always be
 * used as a pair.  The macros are intentionally unbalanced on their own.
 */
#ifdef HAVE_LIBPAPI

#define BENCHMARK_PREFIX(mzd_func) \
 int run_##mzd_func(void *_p, unsigned long long *data, int *data_len) { \
 *data_len = MIN(papi_array_len + 1, *data_len); \
 struct test_params *p = (struct test_params *)_p; \
 int papi_res; \
 do

#define TIME_BEGIN(mzd_func_with_ARGS) \
 do { \
 int array_len = *data_len - 1; \
 mzd_func_with_ARGS; \
 unsigned long long t0 = PAPI_get_virt_usec(); \
 papi_res = PAPI_start_counters((int*)papi_events, array_len)

/* Stops the counters, stores the elapsed virtual microseconds in data[0],
 * lowers loop_calibration[] where a smaller value was seen and subtracts it
 * from every slot. */
#define TIME_END \
 PAPI_stop_counters((long long*)&data[1], array_len); \
 t0 = PAPI_get_virt_usec() - t0; \
 data[0] = t0; \
 for (int nv = 0; nv <= array_len; ++nv) \
 { \
 if (data[nv] < loop_calibration[nv]) \
 loop_calibration[nv] = data[nv]; \
 data[nv] -= loop_calibration[nv]; \
 } \
 } while(0)

#define BENCHMARK_POSTFIX \
 while(0); \
 return papi_res; \
 }

#else // HAVE_LIBPAPI

/* Without PAPI exactly two values are produced: data[0] from walltime()
 * and data[1] from cpucycles(). */
#define BENCHMARK_PREFIX(mzd_func) \
 int run_##mzd_func(void *_p, unsigned long long *data, int *data_len) { \
 *data_len = 2; \
 struct test_params *p = (struct test_params *)_p; \
 do

#define TIME_BEGIN(mzd_func_with_ARGS) \
 do { \
 mzd_func_with_ARGS; \
 data[0] = walltime(0); \
 data[1] = cpucycles()

#define TIME_END \
 data[1] = cpucycles() - data[1]; \
 data[0] = walltime(data[0]); \
 } while(0)

#define BENCHMARK_POSTFIX \
 while(0); \
 return 0; \
 }

#endif // HAVE_LIBPAPI

/* Convenience form: run the call once untimed, then loop_count times inside
 * the measurement.  Expects a variable named loop_count in scope. */
#define TIME(mzd_func_with_ARGS) \
 TIME_BEGIN(mzd_func_with_ARGS); \
 for (uint64_t i = 0; i < loop_count; 
++i) { mzd_func_with_ARGS; } \ TIME_END mzd_t* volatile vA; rci_t volatile vrowa; rci_t volatile vcola; rci_t volatile vrowb; rci_t volatile vcolb; wi_t volatile vstartblock; int volatile vn; int volatile vint; word volatile vword; BIT volatile vbit; BENCHMARK_PREFIX(bench_nothing) { mzd_t* const A = mzd_init(64, 64); mzd_randomize(A); uint64_t volatile loop_count = p->count; TIME(); mzd_free(A); } BENCHMARK_POSTFIX BENCHMARK_PREFIX(_mzd_row_swap) { mzd_t* const A = mzd_init(p->m, p->n); mzd_randomize(A); vA = A; vrowa = p->row[0]; vrowb = p->row[1]; vstartblock = p->wrd[0]; uint64_t const loop_count = p->count; TIME(_mzd_row_swap(vA, vrowa, vrowb, vstartblock)); mzd_free(A); } BENCHMARK_POSTFIX BENCHMARK_PREFIX(mzd_row_swap) { mzd_t* const A = mzd_init(p->m, p->n); mzd_randomize(A); rci_t const rowa = p->row[0]; rci_t const rowb = p->row[1]; uint64_t const loop_count = p->count; TIME(mzd_row_swap(A, rowa, rowb)); mzd_free(A); } BENCHMARK_POSTFIX BENCHMARK_PREFIX(mzd_copy_row) { mzd_t* const A = mzd_init(p->m, p->n); mzd_t* const B = mzd_init(p->m, p->n); mzd_randomize(A); rci_t const rowa = p->row[0]; rci_t const rowb = p->row[1]; uint64_t const loop_count = p->count; TIME(mzd_copy_row(B, rowb, A, rowa)); mzd_free(A); mzd_free(B); } BENCHMARK_POSTFIX BENCHMARK_PREFIX(mzd_col_swap) { mzd_t* const A = mzd_init(p->m, p->n); mzd_randomize(A); rci_t const cola = p->col[0]; rci_t const colb = p->col[1]; uint64_t const loop_count = p->count; TIME(mzd_col_swap(A, cola, colb)); mzd_free(A); } BENCHMARK_POSTFIX BENCHMARK_PREFIX(mzd_col_swap_in_rows) { mzd_t* const A = mzd_init(p->m, p->n); mzd_randomize(A); vA = A; vcola = p->col[0]; vcolb = p->col[1]; vrowa = p->row[0]; vrowb = p->row[1]; uint64_t const loop_count = p->count; TIME(mzd_col_swap_in_rows(vA, vcola, vcolb, vrowa, vrowb)); mzd_free(A); } BENCHMARK_POSTFIX BENCHMARK_PREFIX(mzd_read_bit) { mzd_t* const A = mzd_init(p->m, p->n); mzd_randomize(A); vA = A; vrowa = p->row[0]; vcola = p->col[0]; uint64_t const 
loop_count = p->count; TIME(vbit = mzd_read_bit(vA, vrowa, vcola)); mzd_free(A); } BENCHMARK_POSTFIX BENCHMARK_PREFIX(mzd_write_bit) { mzd_t* const A = mzd_init(p->m, p->n); vA = A; vrowa = p->row[0]; vcola = p->col[0]; vbit = 0; uint64_t const loop_count = p->count; TIME(mzd_write_bit(vA, vrowa, vcola, vbit); vbit = !vbit); mzd_free(A); } BENCHMARK_POSTFIX BENCHMARK_PREFIX(mzd_row_add_offset) { mzd_t* const A = mzd_init(p->m, p->n); mzd_randomize(A); vA = A; vrowa = p->row[0]; vrowb = p->row[1]; vcola = p->col[0]; uint64_t const loop_count = p->count; TIME(mzd_row_add_offset(vA, vrowa, vrowb, vcola)); mzd_free(A); } BENCHMARK_POSTFIX BENCHMARK_PREFIX(mzd_row_add) { mzd_t* const A = mzd_init(p->m, p->n); mzd_randomize(A); rci_t const rowa = p->row[0]; rci_t const rowb = p->row[1]; uint64_t const loop_count = p->count; TIME(mzd_row_add(A, rowa, rowb)); mzd_free(A); } BENCHMARK_POSTFIX BENCHMARK_PREFIX(mzd_transpose) { mzd_t* const A = mzd_init(p->m, p->n); mzd_t* const B = mzd_init(p->n, p->m); mzd_randomize(A); uint64_t const loop_count = p->count; TIME(mzd_transpose(B, A)); mzd_free(A); mzd_free(B); } BENCHMARK_POSTFIX BENCHMARK_PREFIX(mzd_mul_naive) { mzd_t* const A = mzd_init(p->m, p->l); mzd_t* const B = mzd_init(p->l, p->n); mzd_t* const C = mzd_init(p->m, p->n); mzd_randomize(A); mzd_randomize(B); uint64_t const loop_count = p->count; TIME(mzd_mul_naive(C, A, B)); mzd_free(A); mzd_free(B); mzd_free(C); } BENCHMARK_POSTFIX BENCHMARK_PREFIX(mzd_addmul_naive) { mzd_t* const A = mzd_init(p->m, p->l); mzd_t* const B = mzd_init(p->l, p->n); mzd_t* const C = mzd_init(p->m, p->n); mzd_randomize(A); mzd_randomize(B); uint64_t const loop_count = p->count; TIME(mzd_addmul_naive(C, A, B)); mzd_free(A); mzd_free(B); mzd_free(C); } BENCHMARK_POSTFIX BENCHMARK_PREFIX(_mzd_mul_naive) { mzd_t* const A = mzd_init(p->m, p->l); mzd_t* const B = mzd_init(p->n, p->l); mzd_t* const C = mzd_init(p->m, p->n); mzd_randomize(A); mzd_randomize(B); int const clear = p->boolean; uint64_t 
const loop_count = p->count; TIME(_mzd_mul_naive(C, A, B, clear)); mzd_free(A); mzd_free(B); mzd_free(C); } BENCHMARK_POSTFIX BENCHMARK_PREFIX(_mzd_mul_va) { mzd_t* const A = mzd_init(p->m, p->n); mzd_t* const V = mzd_init(1, p->m); mzd_t* const C = mzd_init(1, p->n); mzd_randomize(A); mzd_randomize(V); int const clear = p->boolean; uint64_t const loop_count = p->count; TIME(_mzd_mul_va(C, V, A, clear)); mzd_free(A); mzd_free(V); mzd_free(C); } BENCHMARK_POSTFIX BENCHMARK_PREFIX(mzd_gauss_delayed) { mzd_t** A = malloc(sizeof(mzd_t) * (p->count + 1)); rci_t const cola = p->col[0]; int const full = p->boolean; uint64_t const loop_count = p->count; rci_t result; for (int i = loop_count; i >= 0; --i) { A[i] = mzd_init(p->m, p->n); mzd_randomize(A[i]); } TIME_BEGIN(result = mzd_gauss_delayed(A[0], cola, full)); for (int i = loop_count; i > 0; --i) { result = mzd_gauss_delayed(A[i], cola, full); } TIME_END; for (int i = 0; i <= loop_count; ++i) mzd_free(A[i]); free(A); } BENCHMARK_POSTFIX BENCHMARK_PREFIX(mzd_echelonize_naive) { mzd_t** A = malloc(sizeof(mzd_t) * (p->count + 1)); int const full = p->boolean; uint64_t const loop_count = p->count; rci_t result; for (int i = loop_count; i >= 0; --i) { A[i] = mzd_init(p->m, p->n); mzd_randomize(A[i]); } TIME_BEGIN(result = mzd_echelonize_naive(A[0], full)); for (int i = loop_count; i > 0; --i) { result = mzd_echelonize_naive(A[i], full); } TIME_END; for (int i = 0; i <= loop_count; ++i) mzd_free(A[i]); free(A); } BENCHMARK_POSTFIX BENCHMARK_PREFIX(mzd_equal) { mzd_t* const A = mzd_init(p->m, p->n); mzd_randomize(A); mzd_t* const B = mzd_copy(NULL, A); uint64_t const loop_count = p->count; int volatile result; TIME(result = mzd_equal(A, B)); mzd_free(A); mzd_free(B); } BENCHMARK_POSTFIX BENCHMARK_PREFIX(mzd_cmp) { mzd_t* const A = mzd_init(p->m, p->n); mzd_randomize(A); mzd_t* const B = mzd_copy(NULL, A); uint64_t const loop_count = p->count; int volatile result; TIME(result = mzd_cmp(A, B)); mzd_free(A); mzd_free(B); } 
BENCHMARK_POSTFIX BENCHMARK_PREFIX(mzd_copy) { mzd_t* const A = mzd_init(p->m, p->n); mzd_t* const B = mzd_init(p->m, p->n); mzd_randomize(A); uint64_t const loop_count = p->count; TIME(mzd_copy(B, A)); mzd_free(A); mzd_free(B); } BENCHMARK_POSTFIX BENCHMARK_PREFIX(mzd_concat) { mzd_t* const A = mzd_init(p->m, p->k); mzd_t* const B = mzd_init(p->m, p->l); mzd_t* const C = mzd_init(p->m, p->k + p->l); mzd_randomize(A); mzd_randomize(B); uint64_t const loop_count = p->count; TIME(mzd_concat(C, A, B)); mzd_free(A); mzd_free(B); mzd_free(C); } BENCHMARK_POSTFIX BENCHMARK_PREFIX(mzd_stack) { mzd_t* const A = mzd_init(p->k, p->n); mzd_t* const B = mzd_init(p->l, p->n); mzd_t* const C = mzd_init(p->k + p->l, p->n); mzd_randomize(A); mzd_randomize(B); uint64_t const loop_count = p->count; TIME(mzd_stack(C, A, B)); mzd_free(A); mzd_free(B); mzd_free(C); } BENCHMARK_POSTFIX BENCHMARK_PREFIX(mzd_submatrix) { mzd_t* const A = mzd_init(p->m, p->n); mzd_randomize(A); rci_t const rowa = p->row[0]; rci_t const cola = p->col[0]; rci_t const rowb = p->row[1]; rci_t const colb = p->col[1]; mzd_t* const S = mzd_init(rowb - rowa, colb - cola); uint64_t const loop_count = p->count; TIME(mzd_submatrix(S, A, rowa, cola, rowb, colb)); mzd_free(A); mzd_free(S); } BENCHMARK_POSTFIX BENCHMARK_PREFIX(mzd_invert_naive) { mzd_t* const A = mzd_init(p->m, p->m); mzd_t* const I = mzd_init(p->m, p->m); mzd_t* const C = mzd_init(p->m, p->m); mzd_randomize(A); mzd_set_ui(I, 1); uint64_t const loop_count = p->count; TIME(mzd_invert_naive(C, A, I)); mzd_free(A); mzd_free(I); mzd_free(C); } BENCHMARK_POSTFIX BENCHMARK_PREFIX(mzd_add) { mzd_t* const A = mzd_init(p->m, p->n); mzd_t* const B = mzd_init(p->m, p->n); mzd_t* const C = mzd_init(p->m, p->n); mzd_randomize(A); mzd_randomize(B); uint64_t const loop_count = p->count; TIME(mzd_add(C, A, B)); mzd_free(A); mzd_free(B); mzd_free(C); } BENCHMARK_POSTFIX BENCHMARK_PREFIX(_mzd_add) { mzd_t* const A = mzd_init(p->m, p->n); mzd_t* const B = mzd_init(p->m, 
p->n); mzd_t* const C = mzd_init(p->m, p->n); mzd_randomize(A); mzd_randomize(B); uint64_t const loop_count = p->count; TIME(_mzd_add(C, A, B)); mzd_free(A); mzd_free(B); mzd_free(C); } BENCHMARK_POSTFIX BENCHMARK_PREFIX(mzd_combine) { mzd_t* const A = mzd_init(p->m, p->n); mzd_t* const B = mzd_init(p->m, p->n); mzd_t* const C = mzd_init(p->m, p->n); mzd_randomize(A); mzd_randomize(B); rci_t row1 = p->row[0]; rci_t row2 = p->row[1]; rci_t row3 = p->row[2]; wi_t startblock = p->wrd[0]; uint64_t const loop_count = p->count; TIME(mzd_combine(C, row3, startblock, A, row1, startblock, B, row2, startblock)); mzd_free(A); mzd_free(B); mzd_free(C); } BENCHMARK_POSTFIX BENCHMARK_PREFIX(mzd_read_bits) { mzd_t* const A = mzd_init(p->m, p->n); mzd_randomize(A); vA = A; vrowa = p->row[0]; vcola = p->col[0]; vn = p->integer; uint64_t const loop_count = p->count; TIME(vword = mzd_read_bits(vA, vrowa, vcola, vn)); mzd_free(A); } BENCHMARK_POSTFIX BENCHMARK_PREFIX(mzd_read_bits_int) { mzd_t* const A = mzd_init(p->m, p->n); mzd_randomize(A); vA = A; vrowa = p->row[0]; vcola = p->col[0]; vn = p->integer; uint64_t const loop_count = p->count; TIME(vint = mzd_read_bits_int(vA, vrowa, vcola, vn)); mzd_free(A); } BENCHMARK_POSTFIX BENCHMARK_PREFIX(mzd_xor_bits) { mzd_t* const A = mzd_init(p->m, p->n); mzd_randomize(A); vA = A; vrowa = p->row[0]; vcola = p->col[0]; vn = p->integer; vword = 0xffffffffffffffffULL; uint64_t const loop_count = p->count; TIME(mzd_xor_bits(vA, vrowa, vcola, vn, vword)); mzd_free(A); } BENCHMARK_POSTFIX BENCHMARK_PREFIX(mzd_and_bits) { mzd_t* const A = mzd_init(p->m, p->n); mzd_randomize(A); vA = A; vrowa = p->row[0]; vcola = p->col[0]; vn = p->integer; vword = 0xffffffffffffffffULL; uint64_t const loop_count = p->count; TIME(mzd_and_bits(vA, vrowa, vcola, vn, vword)); mzd_free(A); } BENCHMARK_POSTFIX BENCHMARK_PREFIX(mzd_clear_bits) { mzd_t* volatile A = mzd_init(p->m, p->n); mzd_randomize(A); vA = A; vrowa = p->row[0]; vcola = p->col[0]; vn = p->integer; 
uint64_t const loop_count = p->count; TIME(mzd_clear_bits(vA, vrowa, vcola, vn)); mzd_free(A); } BENCHMARK_POSTFIX BENCHMARK_PREFIX(mzd_is_zero) { mzd_t* const A = mzd_init(p->m, p->n); uint64_t const loop_count = p->count; int volatile result; TIME(result = mzd_is_zero(A)); mzd_free(A); } BENCHMARK_POSTFIX BENCHMARK_PREFIX(mzd_row_clear_offset) { mzd_t* const A = mzd_init(p->m, p->n); mzd_randomize(A); rci_t row = p->row[0]; rci_t col = p->col[0]; uint64_t const loop_count = p->count; TIME(mzd_row_clear_offset(A, row, col)); mzd_free(A); } BENCHMARK_POSTFIX BENCHMARK_PREFIX(mzd_find_pivot) { mzd_t* const A = mzd_init(p->m, p->n); mzd_randomize(A); rci_t row = p->row[0]; rci_t col = p->col[0]; uint64_t const loop_count = p->count; int volatile result; rci_t row_out; rci_t col_out; TIME(result = mzd_find_pivot(A, row, col, &row_out, &col_out)); mzd_free(A); } BENCHMARK_POSTFIX BENCHMARK_PREFIX(mzd_density) { mzd_t* const A = mzd_init(p->m, p->n); mzd_randomize(A); wi_t res = p->wrd[0]; uint64_t const loop_count = p->count; double volatile result; TIME(result = mzd_density(A, res)); mzd_free(A); } BENCHMARK_POSTFIX BENCHMARK_PREFIX(_mzd_density) { mzd_t* const A = mzd_init(p->m, p->n); mzd_randomize(A); rci_t row = p->row[0]; rci_t col = p->col[0]; wi_t res = p->wrd[0]; uint64_t const loop_count = p->count; double volatile result; TIME(result = _mzd_density(A, res, row, col)); mzd_free(A); } BENCHMARK_POSTFIX BENCHMARK_PREFIX(mzd_first_zero_row) { mzd_t* const A = mzd_init(p->m, p->m); mzd_set_ui(A, 1); uint64_t const loop_count = p->count; rci_t volatile result; TIME(result = mzd_first_zero_row(A)); mzd_free(A); } BENCHMARK_POSTFIX // Returns a number proportional with the ideal number of // mathematical operations for the given code. double complexity1(struct test_params *p, char code) { switch(code) { case 'k': return p->k; // Linear with size 'k' of a matrix. case 'l': return p->l; // Linear with size 'l' of a matrix. 
case 'm': return p->m; // Linear with the number of rows of the matrix. case 'n': return p->n; // Linear with the number of columns of the matrix. case 'W': assert(p->n > m4ri_radix * p->wrd[0]); // Linear with the number of processed columns. return p->n - m4ri_radix * p->wrd[0]; case 'D': assert(p->row[0] < p->row[1]); return p->row[1] - p->row[0]; // Linear with the number of rows between start_row and stop_row. case 'E': assert(p->col[0] < p->col[1]); return p->col[1] - p->col[0]; // Linear with the number of cols between start_col and stop_col. case 'C': assert(p->col[0] < p->n); return p->n - p->col[0]; // Linear with the number of columns of column col and beyond. } return 0.0; } char const* complexity1_human(struct test_params *p, char code) { switch(code) { case 'k': return "k"; case 'l': return "l"; case 'm': return "m"; case 'n': return "n"; case 'W': return "cols"; case 'D': return "rows"; case 'E': return "cols"; case 'C': return "cols"; } return "UNKNOWN"; } double complexity(struct test_params *p, char const* cp) { double c = 1; while (*cp) { c *= complexity1(p, *cp); ++cp; } return c; } void print_complexity_human(struct test_params *p, char const* cp) { int first = 1; char last_cp = 0; int power = 0; while (*cp) { if (*cp != last_cp) { if (power > 1) printf("^%d", power); if (!first && isupper(*cp)) printf("*"); printf("%s", complexity1_human(p, *cp)); power = 0; last_cp = *cp; } ++power; ++cp; } if (power > 1) printf("^%d", power); } struct function_st { char const* funcname; run_type run_func; char const* input_codes; char const* complexity_code; uint64_t count; }; typedef struct function_st function_st; static function_st function_mapper[] = { { "_mzd_row_swap", run__mzd_row_swap, "Rmn,ri,ri,wi", "W", 1000000000 }, { "mzd_row_swap", run_mzd_row_swap, "Rmn,ri,ri", "n", 1000000000 }, { "mzd_copy_row", run_mzd_copy_row, "Omn,ri,R,ri", "n", 1000000000 }, { "mzd_col_swap", run_mzd_col_swap, "Rmn,ci,ci", "m", 10000000 }, { "mzd_col_swap_in_rows", 
run_mzd_col_swap_in_rows, "Rmn,ci,ci,ri,ri", "D", 10000000 }, { "mzd_read_bit", run_mzd_read_bit, "Rmn,ri,ci", "", 100000000 }, { "mzd_write_bit", run_mzd_write_bit, "Omn,ri,ci", "", 100000000 }, { "mzd_row_add_offset", run_mzd_row_add_offset, "Rmn,ri,ri,ci", "C", 100000000 }, { "mzd_row_add", run_mzd_row_add, "Rmn,ri,ri", "n", 100000000 }, { "mzd_transpose", run_mzd_transpose, "Onm,Rmn", "mn", 10000000 }, { "mzd_mul_naive", run_mzd_mul_naive, "Omn,Rml,Rln", "mnl",10000000 }, { "mzd_addmul_naive", run_mzd_addmul_naive, "Omn,Rml,Rln", "mnl",10000000 }, { "_mzd_mul_naive", run__mzd_mul_naive, "Omn,Rml,Rnl,b", "mnl",10000000 }, { "_mzd_mul_va", run__mzd_mul_va, "O1n,V1m,Amn,b", "mn", 1000000000 }, { "mzd_gauss_delayed", run_mzd_gauss_delayed, "Rmn,ci,b", "mC", 10000000 }, { "mzd_echelonize_naive", run_mzd_echelonize_naive, "Rmn,b", "mn", 10000000 }, { "mzd_equal", run_mzd_equal, "Rmn,Rmn", "mn", 1000000000 }, { "mzd_cmp", run_mzd_cmp, "Rmn,Rmn", "mn", 1000000000 }, { "mzd_copy", run_mzd_copy, "Omn,Rmn", "mn", 1000000000 }, { "mzd_concat", run_mzd_concat, "Omn,Rmk,Rml", "mn", 10000000 }, { "mzd_stack", run_mzd_stack, "Omn,Rkn,Rln", "mn", 1000000000 }, { "mzd_submatrix", run_mzd_submatrix, "O,Rmn,ri,ci,ri,ci", "DE", 10000000 }, { "mzd_invert_naive", run_mzd_invert_naive, "Omm,Rmm,Imm", "mmm",10000000 }, { "mzd_add", run_mzd_add, "Omn,Rmn,Rmn", "mn", 10000000 }, { "_mzd_add", run__mzd_add, "Omn,Rmn,Rmn", "mn", 10000000 }, { "mzd_combine", run_mzd_combine, "Omn,ri,wi,R,ri,R,ri", "W", 10000000 }, { "mzd_read_bits", run_mzd_read_bits, "Rmn,ri,ci,n", "", 10000000 }, { "mzd_read_bits_int", run_mzd_read_bits_int, "Rmn,ri,ci,n", "", 10000000 }, { "mzd_xor_bits", run_mzd_xor_bits, "Rmn,ri,ci,n,w", "", 10000000 }, { "mzd_and_bits", run_mzd_and_bits, "Rmn,ri,ci,n,w", "", 10000000 }, { "mzd_clear_bits", run_mzd_clear_bits, "Rmn,ri,ci,n", "", 10000000 }, { "mzd_is_zero", run_mzd_is_zero, "Rmn", "mn", 10000000 }, { "mzd_row_clear_offset", run_mzd_row_clear_offset, "Omn,ri,ci", "C", 
10000000 }, { "mzd_find_pivot", run_mzd_find_pivot, "Rmn,ri,ci", "", 1000000 }, { "mzd_density", run_mzd_density, "Rmn,wi", "", 10000000 }, { "_mzd_density", run__mzd_density, "Rmn,wi,ri,ci", "", 10000000 }, { "mzd_first_zero_row", run_mzd_first_zero_row, "Rmm", "m", 10000000000 }, { "nothing", run_bench_nothing, "", "", 1 } }; int decode_size(char var, struct test_params* params, int* argcp, char*** argvp) { if (*argcp < 1) { fprintf(stderr, "%s: Not enough arguments. Expected matrix size: %c\n", progname, var); return 1; } --(*argcp); switch(var) { case 'k': params->k = atoi((*argvp)[0]); break; case 'l': params->l = atoi((*argvp)[0]); break; case 'm': params->m = atoi((*argvp)[0]); break; case 'n': params->n = atoi((*argvp)[0]); break; } ++(*argvp); return 0; } int decode_index(char idx, struct test_params* params, int* argcp, char*** argvp) { if (*argcp < 1) { int count = 0; switch(idx) { case 'r': count = params->rows; break; case 'c': count = params->cols; break; case 'w': count = params->wrds; break; } fprintf(stderr, "%s: Not enough arguments. Expected ", progname); switch(idx) { case 'r': fprintf(stderr, "row"); break; case 'c': fprintf(stderr, "column"); break; case 'w': fprintf(stderr, "word"); break; } fprintf(stderr, " index : %c%d\n", idx, count + 1); return 1; } --(*argcp); switch(idx) { case 'r': params->row[params->rows++] = atoi((*argvp)[0]); break; case 'c': params->col[params->cols++] = atoi((*argvp)[0]); break; case 'w': params->wrd[params->wrds++] = atoi((*argvp)[0]); break; } ++(*argvp); return 0; } int decode_code(char idx, struct test_params* params, int* argcp, char*** argvp) { if (*argcp < 1) { fprintf(stderr, "%s: Not enough arguments. 
Expected ", progname); switch(idx) { case 'b': printf("boolean"); break; case 'n': printf("integer"); break; default: printf("%c", idx); } fprintf(stderr, ".\n"); return 1; } --(*argcp); switch(idx) { case 'b': params->boolean = atoi((*argvp)[0]); if (params->boolean != 0 && params->boolean != 1) { fprintf(stderr, "%s: Expected boolean: %s\n", progname, (*argvp)[0]); return 1; } break; case 'n': params->integer = atoi((*argvp)[0]); break; } ++(*argvp); return 0; } int main(int argc, char** argv) { int opts = global_options(&argc, &argv); struct test_params params; unsigned long long data[8]; int data_len; #ifdef HAVE_LIBPAPI int papi_counters = PAPI_num_counters(); if (papi_counters < papi_array_len) { fprintf(stderr, "%s: Warning: there are only %d hardware counters available!\n", progname, papi_counters); papi_array_len = papi_counters; } int res = PAPI_start_counters((int*)papi_events, papi_array_len); switch(res) { case 0: { long long* tmp = (long long*)malloc(papi_array_len * sizeof(long long)); PAPI_stop_counters(tmp, papi_array_len); free(tmp); break; } case PAPI_ECNFLCT: { fprintf(stderr, "%s: %s: Conflicting event: The underlying counter hardware cannot count the specified events simultaneously.\n", progname, papi_event_name(papi_events[papi_array_len - 1])); fprintf(stderr, "Run `papi_event_chooser PRESET"); for (int nv = 0; nv < papi_array_len - 1; ++nv) fprintf(stderr, " %s", papi_event_name(papi_events[nv])); fprintf(stderr, "` to get a list of possible events that can be added.\n"); break; } case PAPI_ENOEVNT: { for (int nv = 0; nv < papi_array_len; ++nv) if ((res = PAPI_query_event(papi_events[nv])) != PAPI_OK) { fprintf(stderr, "%s: PAPI_start_counters: %s: %s.\n", progname, papi_event_name(papi_events[nv]), PAPI_strerror(res)); break; } break; } case PAPI_ESYS: fprintf(stderr, "%s: PAPI_start_counters: %s\n", progname, strerror(errno)); break; default: fprintf(stderr, "%s: PAPI_start_counters: %s.\n", progname, PAPI_strerror(res)); break; } if 
(res) return 1; for (int nv = 0; nv <= papi_array_len; ++nv) loop_calibration[nv] = 100000000; params.count = 1; data_len = papi_array_len + 1; for (int i = 0; i < 100; ++i) run_bench_nothing((void*)¶ms, data, &data_len); #endif int f; int found = 0; params.rows = 0; params.cols = 0; params.wrds = 0; params.cutoff = -1; if (argc >= 2) { params.funcname = argv[1]; for (f = 0; f < sizeof(function_mapper) / sizeof(function_mapper[0]); ++f) { if (strcmp(params.funcname, function_mapper[f].funcname) == 0) { found = 1; break; } } } if (!found) { if (argc >= 2) fprintf(stderr, "%s: function name \"%s\" not found.\n", progname, params.funcname); else { fprintf(stderr, "Usage: %s [OPTIONS] [ARGS]\n", progname); bench_print_global_options(stderr); } fprintf(stderr, "Possible values for :\n"); for (f = 0; f < sizeof(function_mapper) / sizeof(function_mapper[0]); ++f) { if (f != 0 && f % 4 == 0) fprintf(stderr, "\n"); fprintf(stderr, "%-22s", function_mapper[f].funcname); } fprintf(stderr, "\n"); return 1; } argc -= 2; // argc >= 1 if more arguments. argv += 2; // Next argument in argv[0] char* input_codes = strdup(function_mapper[f].input_codes); char* input_code[10]; char* p = input_codes; int codes = 0; while(*p) { input_code[codes++] = p++; while(*p && *p != ',') ++p; if (*p == ',') *p++ = '\0'; } int saw_var[4]; for (int var_index = 0; var_index < 4; ++var_index) saw_var[var_index] = 0; int saw_vars = 0; char usage[64]; char* usage_ptr = usage; int error = 0; for (int c = 0; ; ++c) { if (c < codes) { p = input_code[c]; if (isupper(*p)) { while(*++p) { if (*p != '1') { int var_index = *p - 'k'; assert(var_index >= 0 && var_index <= 3); // 'k', 'l', 'm' or 'n'. 
saw_var[var_index] = 1; saw_vars = 1; } } continue; } } if (saw_vars) { saw_vars = 0; for (int var_count = 2; var_count < 6; ++var_count) { int var_index = var_count % 4; if (saw_var[var_index] == 1) { *usage_ptr++ = ' '; *usage_ptr++ = 'k' + var_index; saw_var[var_index] = 2; if (!error && decode_size('k' + var_index, ¶ms, &argc, &argv)) error = 1; } } } if (c == codes) break; if (p[1] == 'i') { *usage_ptr++ = ' '; *usage_ptr++ = *p; switch(*p) { case 'r': *usage_ptr++ = '1' + params.rows; if (error) ++params.rows; break; case 'c': *usage_ptr++ = '1' + params.cols; if (error) ++params.cols; break; case 'w': *usage_ptr++ = '1' + params.wrds; if (error) ++params.wrds; break; } if (!error && decode_index(*p, ¶ms, &argc, &argv)) error = 1; } else { *usage_ptr++ = ' '; *usage_ptr++ = *p; if (!error && decode_code(*p, ¶ms, &argc, &argv)) error = 1; } } *usage_ptr = '\0'; if (argc != 0) error = 1; if (error) { if (argc != 0) fprintf(stderr, "%s %s: too many parameters.\n", progname, params.funcname); fprintf(stderr, "Usage: %s [OPTIONS] %s%s\n", progname, params.funcname, usage); if (opts <= 0) bench_print_global_options(stderr); return 1; } double cost = complexity(¶ms, function_mapper[f].complexity_code); params.count = bench_count ? 
bench_count : function_mapper[f].count / cost; if (params.count < 1) params.count = 1; bench_count = params.count; srandom(17); data_len = run_bench(function_mapper[f].run_func, (void*)¶ms, data, sizeof(data) / sizeof(unsigned long long)); printf("function: %s, count: %" PRId64 ", ", params.funcname, params.count); if (saw_var[2]) printf("m: %d, ", params.m); if (saw_var[3]) printf("n: %d, ", params.n); if (saw_var[0]) printf("k: %d, ", params.k); if (saw_var[1]) printf("l: %d, ", params.l); for (int i = 0; i < 3; ++i) { if (i < params.rows) printf("row%c: %d, ", 'a' + i, params.row[i]); if (i < params.cols) printf("col%c: %d, ", 'a' + i, params.col[i]); if (i < params.wrds) printf("word%c: %d, ", 'a' + i, params.wrd[i]); } if (params.cutoff != -1) printf("cutoff: %d, ", params.cutoff); print_wall_time(data[0] / 1000000.0 / params.count); printf(", cpu cycles: %llu", (data[1] + params.count / 2) / params.count); #ifndef HAVE_LIBPAPI printf(", cc/"); print_complexity_human(¶ms, function_mapper[f].complexity_code); printf(": %f\n", data[1] / (params.count * cost)); #else printf("\n"); for (int n = 1; n < data_len; ++n) { printf("%s (%f) per bit (divided by ", papi_event_name(papi_events[n - 1]), (double)data[n] / params.count); print_complexity_human(¶ms, function_mapper[f].complexity_code); printf("): %f\n", data[n] / (params.count * cost)); } #endif } libm4ri-20130416/testsuite/bench_ple.c000066400000000000000000000045711212302366200174360ustar00rootroot00000000000000#include #include #include #include "cpucycles.h" #include "benchmarking.h" struct pluq_params { rci_t m; rci_t n; rci_t r; char *what; }; int run(void *_p, unsigned long long *data, int *data_len) { struct pluq_params *p = (struct pluq_params *)_p; *data_len = 2; mzd_t *A = mzd_init(p->m, p->n); mzd_t *U; mzd_t *L; int halfrank = 0; if(halfrank) { U = mzd_init(p->m, p->n); L = mzd_init(p->m, p->m); mzd_randomize(U); mzd_randomize(L); #if 0 for (rci_t i = 0; i < p->m; ++i) { for (rci_t j = i + 1; j < 
p->m; ++j) mzd_write_bit(L,i,j, 0); mzd_write_bit(L,i,i, 1); } for(rci_t i = 0; i < MIN(p->m, p->n); ++i) { for (rci_t j = 0; j < i; ++j) mzd_write_bit(U,i,j, 0); mzd_write_bit(U,i,i, 1); } #endif for(rci_t i = 0; i < p->m; ++i) { mzd_write_bit(U,i,i, 1); for(rci_t j = 0; j < i; ++j) mzd_write_bit(U,i,j, 0); if ((i % 2)) for(rci_t j = i; j < p->n; ++j) mzd_write_bit(U,i,j, 0); for(rci_t j = i + 1; j < p->m; ++j) mzd_write_bit(L,i,j, 0); mzd_write_bit(L,i,i, 1); } mzd_mul(A,L,U,0); } else { mzd_randomize(A); } mzp_t *P = mzp_init(p->m); mzp_t *Q = mzp_init(p->n); data[0] = walltime(0); data[1] = cpucycles(); if(strcmp(p->what,"pluq")) p->r = mzd_pluq(A, P, Q, 0); else if (strcmp(p->what,"ple")) p->r = mzd_ple(A, P, Q, 0); else m4ri_die("Unknown task '%s'",p->what); data[0] = walltime(data[0]); data[1]= cpucycles() - data[1]; mzd_free(A); mzp_free(P); mzp_free(Q); if(halfrank) { mzd_free(U); mzd_free(L); } return 0; } int main(int argc, char **argv) { int opts = global_options(&argc, &argv); if (opts < 0) { bench_print_global_options(stderr); exit(-1); } if (argc != 4) { printf("Parameters m,n,what expected.\n"); printf(" m -- integer > 0\n"); printf(" n -- integer > 0\n"); printf(" what -- PLUQ or PLE.\n"); printf("\n"); bench_print_global_options(stderr); m4ri_die(""); } struct pluq_params p; p.m = atoi(argv[1]); p.n = atoi(argv[2]); p.what = argv[3]; srandom(17); unsigned long long data[2]; run_bench(run, (void*)&p, data, 2); printf("m: %5d, n: %5d, what: %s, r: %5d, cpu cycles: %12llu, wall time: %6.3lf\n", p.m, p.n, p.what, p.r, data[1], data[0] / 1000000.0); } libm4ri-20130416/testsuite/bench_trsm.c000066400000000000000000000047241212302366200176430ustar00rootroot00000000000000#include #include #include "cpucycles.h" #include #include "benchmarking.h" struct trsm_params { rci_t m; rci_t n; int upper; int left; char const *algorithm; }; mzd_t *mzd_random_lower(const rci_t n) { mzd_t *A = mzd_init(n,n); mzd_randomize(A); for(rci_t i=0; im, p->n); 
mzd_randomize(B); if (p->upper) { T = mzd_random_upper(p->m); } else { T = mzd_random_lower(p->m); } data[0] = walltime(0); data[1] = cpucycles(); switch(2*p->upper + p->left) { case 3: mzd_trsm_upper_left(T, B, 0); break; case 2: mzd_trsm_upper_right(T, B, 0); break; case 1: mzd_trsm_lower_left(T, B, 0); break; case 0: mzd_trsm_lower_right(T, B, 0); break; default: m4ri_die("Parameters for upper (=%d) or left (=%d) not supported",p->upper,p->left); } data[0] = walltime(data[0]); data[1] = cpucycles() - data[1]; mzd_free(B); mzd_free(T); return 0; } int main(int argc, char **argv) { int opts = global_options(&argc, &argv); if (opts < 0) { bench_print_global_options(stderr); exit(-1); } if (argc != 5) { printf("Parameters m,n,upper,left expected.\n"); printf(" m -- integer > 0\n"); printf(" n -- integer > 0\n"); printf(" upper -- 1 for upper triangular, 0 for lower triangular.\n"); printf(" left -- 1 for triangular matrix on left, 0 for right\n"); printf("\n"); bench_print_global_options(stderr); exit(-1); } struct trsm_params p; p.m = atoi(argv[1]); p.n = atoi(argv[2]); p.upper = atoi(argv[3]); p.left = atoi(argv[4]); srandom(17); unsigned long long data[2]; run_bench(run, (void*)&p, data, 2); /** this has no meaning if m << n **/ double cc_per_op = (4*(double)data[1])/ (p.m*powl((double)p.n,1.807)); printf("m: %5d, n: %5d, upper: %d, left: %d, cpu cycles: %llu, cc/(n^2.807): %.5lf, wall time: %lf\n", p.m, p.n, p.upper, p.left, data[1], cc_per_op, data[0] / 1000000.0); } libm4ri-20130416/testsuite/benchmarking.c000066400000000000000000000577151212302366200201570ustar00rootroot00000000000000/* * benchmarking.c * * Benchmark engine. 
* * Copyright (C) 2011 Carlo Wood * RSA-1024 0x624ACAD5 1997-01-26 Sign & Encrypt * Fingerprint16 = 32 EC A7 B6 AC DB 65 A6 F6 F6 55 DD 1C DC FF 61 * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. */ /* * Example usage: * * ./bench_elimination -s 0 -m 4 -c 90 -a 0.005 -d -t 30 -n 1000 1000 1000 * * would run at most 30 seconds (-t) or 1000 times (-n), whichever comes * first, or stop after the real average of wall time (-s 0) falls with 90% * certainty (-c) in a range that is +/- 0.005 times the observed mean (-a: accuracy), * but not before at least 4 (-m: minimum) measurements have been * done. It would also print (-d: dump) each measurement (0:microseconds 1:cpuclocks). * * Example output. * * 2416 6441500 * 2376 6335490 * 2360 6294450 * 2361 6295280 * 2371 6321440 * 2350 6266740 * 2362 6298700 * 2386 6362520 * 2344 6249890 * 2347 6260450 * 2346 6254590 * Total running time: 0.103 seconds. * Virtual time (s): Sample size: 11; mean: 0.002365; standard deviation: 0.000021 * Virtual time (s): 90% confidence interval: +/- 0.000012 (0.5%): [0.002354..0.002377] * * The last three lines can be suppressed by passing the option -q (quiet).
*/ #include #ifdef HAVE_LIBPAPI #define _GNU_SOURCE #include // papi.h needs caddr_t #include #include #endif #include #include #include #include #include #include #include #include "benchmarking.h" #include enum { C80, C90, C95, C98, C99 }; /* * Command line option decoding */ int bench_quiet = 0; // Set if -q is used. int bench_dump = 0; // Set if -d is used. int bench_minimum = 2; // Minimum number of measurements. Set with -m . int bench_maximum = 1000; // Maximum number of measurements. Set with -n . unsigned long long bench_maxtime = 60000000; // Maximum number of microseconds to run. Set with -t , in seconds (floating point). double bench_accuracy = 0.01; // The +/- range (where 1.0 is 100%) within that we want the real population mean to be with the given confidence. Set with -a int bench_confidence_index = C99; // The confidence that the real mean is within the given (or found) range. int bench_stats = 1; // The counter used for statistics (0 = realtime, 1 = cpuclocks). Set with -s . int bench_dump_counter = -1; // The counter to dump (see bench_stats). Set with -d . If not given all counters are dumped. char const* progname; // Set to argv[0]. /* * Command line option used by bench_packedmatrix.c */ uint64_t bench_count = 0; // Can be set by -x , otherwise a reasonable default is being used. #ifdef HAVE_LIBPAPI int bench_disregard_L2_misses = 0; // Set if -2 is used. /* * PAPI events being counted. */ int papi_events[32] = { PAPI_TOT_CYC, /* Total cycles. This must always be the first entry. */ }; int papi_array_len = 1; int bench_PAPI_L2_TCM_index; char* papi_event_name(int event) { // PAPI needs to be initialized before calling PAPI_event_code_to_name. 
if (PAPI_is_initialized() == PAPI_NOT_INITED) { int res = PAPI_library_init(PAPI_VER_CURRENT); if (res != PAPI_OK && res != PAPI_VER_CURRENT) { fprintf(stderr, "%s: PAPI_library_init: error code %d %s\n", progname, res, PAPI_strerror(res)); m4ri_die("PAPI failed to initialize.\n"); } } static char buf[PAPI_MAX_STR_LEN]; int res = PAPI_event_code_to_name(event, buf); if (res) snprintf(buf, PAPI_MAX_STR_LEN, "", event); return buf; } int papi_add_event(char const* event_name) { // PAPI needs to be initialized before calling PAPI_event_name_to_code. if (PAPI_is_initialized() == PAPI_NOT_INITED) { int res = PAPI_library_init(PAPI_VER_CURRENT); if (res != PAPI_OK && res != PAPI_VER_CURRENT) { fprintf(stderr, "%s: PAPI_library_init: error code %d %s\n", progname, res, PAPI_strerror(res)); m4ri_die("PAPI failed to initialize.\n"); } } int event; int res = PAPI_event_name_to_code((char*)event_name, &event); if (res != PAPI_OK) { if (res == PAPI_ENOEVNT) fprintf(stderr, "%s: %s: No such event.\n", progname, event_name); else fprintf(stderr, "%s: PAPI_event_name_to_code(\"%s\"): %s\n", progname, event_name, PAPI_strerror(res)); return res; } int found = 0; for (int nv = 0; nv < papi_array_len; ++nv) { if (papi_events[nv] == event) { found = 1; break; } } if (!found) papi_events[papi_array_len++] = event; return 0; } void papi_add_events(char* event_names) { char* tmpptr; char* name = strtok_r(event_names, ", ", &tmpptr); while (name) { papi_add_event(name); name = strtok_r(NULL, ", ", &tmpptr); } } #endif // HAVE_LIBPAPI int global_options(int* argcp, char*** argvp) { int result = 0; progname = (*argvp)[0]; while((*argcp) > 1) { if ((*argvp)[1][0] != '-' || (*argvp)[1][1] == '\0' || (*argvp)[1][2] != '\0') return result; switch((*argvp)[1][1]) { case 'd': bench_dump = 1; if (isdigit((*argvp)[2][0])) { ++*argvp; --*argcp; bench_dump_counter = atoi((*argvp)[1]); } break; case 'q': bench_quiet = 1; break; #ifdef HAVE_LIBPAPI case '2': { bench_disregard_L2_misses = 1; if 
(papi_add_event("PAPI_L2_TCM")) { fprintf(stderr, "%s: Ignoring -2: Level 2 cache misses cannot be detected with the current set of PAPI events (-p).\n", progname); bench_disregard_L2_misses = 0; } for (int nv = 0; nv < papi_array_len; ++nv) { if (papi_events[nv] == PAPI_L2_TCM) { bench_PAPI_L2_TCM_index = nv + 1; // +1 for in data[] inserted virtual time at index 0. break; } } break; } case 'p': { ++*argvp; --*argcp; papi_add_events((*argvp)[1]); break; } #endif case 'm': ++*argvp; --*argcp; bench_minimum = atoi((*argvp)[1]); break; case 'n': ++*argvp; --*argcp; bench_maximum = atoi((*argvp)[1]); if (bench_maximum < bench_minimum) bench_minimum = bench_maximum; break; case 't': ++*argvp; --*argcp; bench_maxtime = 1000000 * strtod((*argvp)[1], NULL); break; case 'a': ++*argvp; --*argcp; bench_accuracy = strtod((*argvp)[1], NULL); break; case 'c': { ++*argvp; --*argcp; int confidence = atoi((*argvp)[1]); switch (confidence) { case 80: bench_confidence_index = C80; break; case 90: bench_confidence_index = C90; break; case 95: bench_confidence_index = C95; break; case 98: bench_confidence_index = C98; break; case 99: bench_confidence_index = C99; break; default: m4ri_die("The only possible confidence percentages are 80, 90, 95, 98 and 99%\n"); break; } break; } case 'x': ++*argvp; --*argcp; bench_count = atoll((*argvp)[1]); break; case 's': ++*argvp; --*argcp; bench_stats = atoi((*argvp)[1]); break; default: return -1; } ++result; ++*argvp; --*argcp; } return result; } void bench_print_global_options(FILE* out) { fprintf(out, "OPTIONS\n"); fprintf(out, " -m Do at least number of measurements. Default 2.\n"); fprintf(out, " -n Do at most number of measurements. Default 1000.\n"); fprintf(out, " -t Stop after seconds. Default 60.0 seconds.\n"); fprintf(out, " -a Stop after has been reached. Default 0.01 (= 1%%).\n"); fprintf(out, " -c Stop when accuracy has been reached with this confidence. 
/*
 * vector implementation
 *
 * A growable array of doubles.
 *
 * vector_create:	Create vector with room for s elements (s may be 0).
 * vector_destruct:	Destruct vector (a null vector is ignored).
 * vector_resize:	Resize internal allocation; truncates the vector if needed.
 * vector_size:		Return number of elements.
 * vector_pushback:	Add one element at the end, growing the allocation if needed.
 * vector_get:		Get element at position index (not range checked).
 *
 * Running out of memory is fatal: a message is printed and abort() is called.
 */

struct vector_st {
  size_t alloc_size;	// Number of elements that fit in data.
  size_t size;		// Number of elements in use.
  double* data;		// NULL when alloc_size is 0.
};

typedef struct vector_st* vector;

// Report an allocation failure and terminate.
static void vector_out_of_memory(void)
{
  fprintf(stderr, "vector: out of memory\n");
  abort();
}

// (Re)allocate storage for s doubles; frees old and returns NULL when s is 0.
static double* vector_storage(double* old, size_t s)
{
  if (s == 0)
  {
    // realloc(ptr, 0) is not portable; release the storage explicitly.
    free(old);
    return NULL;
  }
  if (s > SIZE_MAX / sizeof(double))
    vector_out_of_memory();
  double* data = (double*)realloc(old, sizeof(double) * s);
  if (!data)
    vector_out_of_memory();
  return data;
}

vector vector_create(size_t s)
{
  vector v = (vector)malloc(sizeof(struct vector_st));
  if (!v)
    vector_out_of_memory();
  v->alloc_size = s;
  v->size = 0;
  v->data = vector_storage(NULL, s);
  return v;
}

void vector_destruct(vector v)
{
  if (!v)
    return;
  free(v->data);
  free(v);
}

void vector_resize(vector v, size_t s)
{
  v->data = vector_storage(v->data, s);
  v->alloc_size = s;
  if (v->size > v->alloc_size)
    v->size = v->alloc_size;
}

static inline size_t vector_size(vector v)
{
  return v->size;
}

void vector_pushback(vector v, double d)
{
  if (v->size == v->alloc_size)
  {
    if (v->alloc_size > SIZE_MAX / 2)
      vector_out_of_memory();
    // Doubling a capacity of zero would stay zero; start at one element.
    vector_resize(v, v->alloc_size ? v->alloc_size * 2 : 1);
  }
  v->data[v->size++] = d;
}

static inline double vector_get(vector v, int index)
{
  return v->data[index];
}
* * Returns -1 on failure (not enough data points), 0 otherwise. */ struct normal_st { int size; double mean; double sigma; }; typedef struct normal_st normal; int normal_calculate(vector v, normal* dist, double multiplier) { dist->size = vector_size(v); if (dist->size < 2) { dist->mean = vector_get(v, 0) * multiplier; dist->sigma = 0.0; return 0; } // Calculate the sum of all data. double sum = 0; for (int i = 0; i < dist->size; ++i) sum += vector_get(v, i) * multiplier; dist->mean = sum / dist->size; // Calculate the sum of the square of all differences with mean. sum = 0; for (int i = 0; i < dist->size; ++i) { double delta = vector_get(v, i) * multiplier - dist->mean; sum += delta * delta; } dist->sigma = sqrt(sum / (dist->size - 1)); return 0; } /* * T-Table */ static float student_t[5][34] = { { 3.078, 1.886, 1.638, 1.533, 1.476, 1.440, 1.415, 1.397, 1.383, 1.372, 1.363, 1.356, 1.350, 1.345, 1.341, 1.337, 1.333, 1.330, 1.328, 1.325, 1.323, 1.321, 1.319, 1.318, 1.316, 1.315, 1.314, 1.313, 1.311, 1.310, 1.303, 1.296, 1.289, 1.282 }, { 6.314, 2.920, 2.353, 2.132, 2.015, 1.943, 1.895, 1.860, 1.833, 1.812, 1.796, 1.782, 1.771, 1.761, 1.753, 1.746, 1.740, 1.734, 1.729, 1.725, 1.721, 1.717, 1.714, 1.711, 1.708, 1.706, 1.703, 1.701, 1.699, 1.697, 1.684, 1.671, 1.658, 1.645 }, { 12.706, 4.303, 3.182, 2.776, 2.571, 2.447, 2.365, 2.306, 2.262, 2.228, 2.201, 2.179, 2.160, 2.145, 2.131, 2.120, 2.110, 2.101, 2.093, 2.086, 2.080, 2.074, 2.069, 2.064, 2.060, 2.056, 2.052, 2.048, 2.045, 2.042, 2.021, 2.000, 1.980, 1.960 }, { 31.821, 6.965, 4.541, 3.747, 3.365, 3.143, 2.998, 2.896, 2.821, 2.764, 2.718, 2.681, 2.650, 2.624, 2.602, 2.583, 2.567, 2.552, 2.539, 2.528, 2.518, 2.508, 2.500, 2.492, 2.485, 2.479, 2.473, 2.467, 2.462, 2.457, 2.423, 2.390, 2.358, 2.326 }, { 63.657, 9.925, 5.841, 4.604, 4.032, 3.707, 3.499, 3.355, 3.250, 3.169, 3.106, 3.055, 3.012, 2.977, 2.947, 2.921, 2.898, 2.878, 2.861, 2.845, 2.831, 2.819, 2.807, 2.797, 2.787, 2.779, 2.771, 2.763, 2.756, 2.750, 2.704, 
2.660, 2.617, 2.576 } }; static float student_t_certainty[5] = { 0.2, 0.1, 0.05, 0.02, 0.01 }; // Two-tails. static float t_table(int confidence_index, int freedoms) { if (freedoms <= 30) return student_t[confidence_index][freedoms - 1]; double a, b, y1, y2, y3; long x1, x2; long x3 = 0; int i; if (freedoms <= 60) { i = 29; x1 = 30; x2 = 40; x3 = 60; } else if (freedoms <= 120) { i = 30; x1 = 40; x2 = 60; x3 = 120; } else { i = 31; x1 = 60; x2 = 120; /* x3 = infinity */ } y1 = student_t[confidence_index][i]; y2 = student_t[confidence_index][i + 1]; y3 = student_t[confidence_index][i + 2]; if (freedoms <= 120) { double c, d; d = (x1 * x1 * (x3 - x2) + x2 * x2 * (x1 - x3) + x3 * x3 * (x2 - x1)); a = - (x1 * (y3 - y2) + x2 * (y1 - y3) + x3 * (y2 - y1)) / d; b = (x1 * x1 * (y3 - y2) + x2 * x2 * (y1 - y3) + x3 * x3 * (y2 - y1)) / d; c = y2 - a * x2 * x2 - b * x2; return (a * freedoms * freedoms + b * freedoms + c); } double ln1, ln2; ln1 = log(y2 - y3); ln2 = log(y1 - y3); a = - ( ln1 - ln2) / (x1 - x2); b = (x1 * ln1 - x2 * ln2) / (x1 - x2); return (y3 + exp(a * freedoms + b)); } /* * walltime */ unsigned long long walltime(unsigned long long t0) { static time_t base_sec; struct timeval tp; gettimeofday(&tp, NULL); if (__M4RI_UNLIKELY(base_sec == 0)) base_sec = tp.tv_sec; return (tp.tv_sec - base_sec) * 1000000 + tp.tv_usec - t0; } /* * Printing doubles. 
*/ int bench_precision(double sigma) { if (sigma < 1E-10) return 12; int log_sigma = log10(sigma); if (log_sigma >= 2) return 0; return 2 - log_sigma; } void print_double(double d, int precision) { switch(precision) { case 0: printf("%.0f", d); break; case 1: printf("%.1f", d); break; case 2: printf("%.2f", d); break; case 3: printf("%.3f", d); break; case 4: printf("%.4f", d); break; case 5: printf("%.5f", d); break; case 6: printf("%.6f", d); break; case 7: printf("%.7f", d); break; case 8: printf("%.8f", d); break; case 9: printf("%.9f", d); break; case 10: printf("%.10f", d); break; case 11: printf("%.11f", d); break; case 12: printf("%.12f", d); break; } } /* * run_bench * * Benchmark main loop. */ int run_bench( int (*f)(void* params, unsigned long long* data, int *data_len), void* params, unsigned long long* data, int data_len) { double const CONFIDENCE = 1.0 - student_t_certainty[bench_confidence_index]; unsigned long long data_sum[32]; memset(data_sum, 0, sizeof(data_sum)); data_len = MIN(data_len, sizeof(data_sum) / sizeof(unsigned long long)); vector stats_data = vector_create(128); normal stats; #ifdef HAVE_LIBPAPI int total_calls = 0; #endif if (!bench_count) bench_count = 1; unsigned long long start_walltime = walltime(0); for (int n = 1; n <= bench_maximum; ++n) { if (!bench_quiet && !bench_dump) { printf("."); fflush(stdout); } do { int res = f(params, data, &data_len); if (res < 0) m4ri_die("benchmark function failed with exit code: %d\n", res); #ifdef HAVE_LIBPAPI ++total_calls; #endif } #ifdef HAVE_LIBPAPI while(bench_disregard_L2_misses && data[bench_PAPI_L2_TCM_index]); #else while(0); #endif if (bench_dump) { if (bench_dump_counter >= 0 && bench_dump_counter < data_len) printf("%llu", data[bench_dump_counter]); else { printf("%llu", data[0]); for (int nv = 1; nv < data_len; ++nv) printf(" %llu", data[nv]); } printf("\n"); fflush(stdout); } vector_pushback(stats_data, data[bench_stats]); for (int nv = 0; nv < data_len; ++nv) data_sum[nv] += 
data[nv]; if (n >= bench_minimum && normal_calculate(stats_data, &stats, (bench_stats == 0) ? 0.000001 : (1.0 / bench_count)) == 0) { double standard_error = stats.sigma / sqrt(stats.size); double critical_value = t_table(bench_confidence_index, stats.size - 1); // Stop when the real mean lays with CONFIDENCE in the range [mean * (1 - bench_accuracy), mean * (1 + bench_accuracy)]. // or when we're already running bench_maxtime seconds. if (standard_error * critical_value / stats.mean <= bench_accuracy || walltime(start_walltime) > bench_maxtime) break; } } for (int nv = 0; nv < data_len; ++nv) data[nv] = (data_sum[nv] + stats.size / 2) / stats.size; if (!bench_quiet) { if (!bench_quiet && !bench_dump) printf("\n"); printf("Total running time: %6.3f seconds.\n", walltime(start_walltime) / 1000000.0); #ifdef HAVE_LIBPAPI if (bench_disregard_L2_misses) printf("Samples disregarded because of level 2 cache misses: %d\n", total_calls - stats.size); #endif int precision = bench_precision(stats.sigma); #ifdef HAVE_LIBPAPI if (bench_stats) printf("%s: ", papi_event_name(papi_events[bench_stats - 1])); else printf("Virtual time (s): "); #endif printf("Sample size: %d; mean: ", stats.size); print_double(stats.mean, precision); printf("; standard deviation: "); print_double(stats.sigma, precision); printf("\n"); #ifdef HAVE_LIBPAPI if (bench_stats) printf("%s: ", papi_event_name(papi_events[bench_stats - 1])); else printf("Virtual time (s): "); #endif double standard_error = stats.sigma / sqrt(stats.size); double critical_value = t_table(bench_confidence_index, stats.size - 1); double accuracy = standard_error * critical_value; printf("%2.0f%% confidence interval: +/- ", CONFIDENCE * 100); print_double(accuracy, precision); printf(" (%.1f%%): [", accuracy / stats.mean * 100); print_double(stats.mean - accuracy, precision); printf(".."); print_double(stats.mean + accuracy, precision); printf("]\n"); } vector_destruct(stats_data); return data_len; } /* * Randomize */ // The same 
as m4ri_random_word. Duplicated here because it's // not available in older revisions that we want to benchmark against. word bench_random_word() { // random() only returns 31 bits, so we need three calls. word a0 = random(); word a1 = random(); word a2 = random(); word v = a0 ^ (a1 << 24) ^ a2 << 48; #ifdef BENCH_RANDOM_REVERSE v = ((v >> 1) & 0x5555555555555555ULL) | ((v & 0x5555555555555555ULL) << 1); v = ((v >> 2) & 0x3333333333333333ULL) | ((v & 0x3333333333333333ULL) << 2); v = ((v >> 4) & 0x0F0F0F0F0F0F0F0FULL) | ((v & 0x0F0F0F0F0F0F0F0FULL) << 4); v = ((v >> 8) & 0x00FF00FF00FF00FFULL) | ((v & 0x00FF00FF00FF00FFULL) << 8); v = ((v >> 16) & 0x0000FFFF0000FFFFULL) | ((v & 0x0000FFFF0000FFFFULL) << 16); v = (v >> 32) | (v << 32); #endif return v; } // Needed for mzd_t. #include // The same as m4ri_randomize. Duplicated here because it's // not available in older revisions that we want to benchmark against. void bench_randomize(mzd_t *A) { wi_t const width = A->width - 1; int const offset = A->offset; if(offset) { if(width == 0) { word const mask = __M4RI_RIGHT_BITMASK(m4ri_radix - offset) & __M4RI_LEFT_BITMASK((A->ncols + offset) % m4ri_radix); for(rci_t i = 0; i < A->nrows; ++i) #ifdef BENCH_RANDOM_REVERSE A->rows[i][0] ^= (A->rows[i][0] ^ (bench_random_word() >> offset)) & mask; #else A->rows[i][0] ^= (A->rows[i][0] ^ (bench_random_word() << offset)) & mask; #endif } else { word const mask_begin = __M4RI_RIGHT_BITMASK(m4ri_radix - offset); word const mask_end = __M4RI_LEFT_BITMASK((A->ncols + offset) % m4ri_radix); #ifdef BENCH_RANDOM_REVERSE int const need_last_bits = ((m4ri_one << (m4ri_radix - 1 - offset)) & mask_end) != 0; #else int const need_last_bits = ((m4ri_one << offset) & mask_end) != 0; #endif for(rci_t i = 0; i < A->nrows; ++i) { word prev_random_word; word random_word = bench_random_word(); #ifdef BENCH_RANDOM_REVERSE A->rows[i][0] ^= (A->rows[i][0] ^ (random_word >> offset)) & mask_begin; #else A->rows[i][0] ^= (A->rows[i][0] ^ (random_word << 
offset)) & mask_begin; #endif for(wi_t j = 1; j < width; ++j) { prev_random_word = random_word; random_word = bench_random_word(); #ifdef BENCH_RANDOM_REVERSE A->rows[i][j] = (random_word >> offset) | (prev_random_word << (m4ri_radix - offset)); #else A->rows[i][j] = (random_word << offset) | (prev_random_word >> (m4ri_radix - offset)); #endif } prev_random_word = random_word; random_word = 0; if (need_last_bits) random_word = bench_random_word(); #ifdef BENCH_RANDOM_REVERSE A->rows[i][width] ^= (A->rows[i][width] ^ ((random_word >> offset) | (prev_random_word << (m4ri_radix - offset)))) & mask_end; #else A->rows[i][width] ^= (A->rows[i][width] ^ ((random_word << offset) | (prev_random_word >> (m4ri_radix - offset)))) & mask_end; #endif } } } else { word const mask_end = __M4RI_LEFT_BITMASK(A->ncols % m4ri_radix); for(rci_t i = 0; i < A->nrows; ++i) { for(wi_t j = 0; j < width; ++j) A->rows[i][j] = bench_random_word(); A->rows[i][width] ^= (A->rows[i][width] ^ bench_random_word()) & mask_end; } } } /* * Random number generator */ static uint64_t bench_random_M; static uint64_t bench_random_modulo; void bench_random_init(uint64_t modulo) { // Set bench_random_M to the largest multiple of modulo, minus one, that fits in an uint64_t. // A modulo of zero is interpreted as 2^64, and thus returns 0xffffffffffffffff. bench_random_M = modulo ? -modulo / modulo * modulo - 1 : -1; bench_random_M += modulo; bench_random_modulo = modulo; } // Returns a uniformly distributed random number in the range [0, bench_random_modulo>. 
/*
 * bench_random
 *
 * Returns a uniformly distributed random number in the range
 * [0, bench_random_modulo>, using the modulo set with bench_random_init().
 *
 * A modulo of zero stands for 2^64: the random word is returned unchanged.
 * That is also what happens when bench_random_init() was never called.
 */
uint64_t bench_random(void)
{
  // R % 0 is undefined (it traps on most machines), and with a modulo of
  // 2^64 every word is acceptable anyway.
  if (bench_random_modulo == 0)
    return bench_random_word();

  for (;;)
  {
    // Reject the words above the largest multiple of the modulo, so that
    // every remainder is equally likely.
    word R = bench_random_word();
    if (R <= bench_random_M)
      return R % bench_random_modulo;
  }
}
*/ extern int bench_quiet; extern int bench_dump; extern int bench_minimum; extern int bench_maximum; extern unsigned long long bench_maxtime; extern double bench_accuracy; extern int bench_confidence_index; extern char const* progname; extern uint64_t bench_count; unsigned long long walltime(unsigned long long t0); int global_options(int* argcp, char*** argvp); void bench_print_global_options(FILE*); int run_bench( int (*f)(void* params, unsigned long long* data, int *data_len), void* params, unsigned long long* data, int data_len); #ifdef HAVE_LIBPAPI extern int papi_events[]; extern int papi_array_len; char* papi_event_name(int event); int papi_test(int * papi_events, int papi_array_len); #endif void print_wall_time(double seconds); #endif //BENCHMARKETING_H libm4ri-20130416/testsuite/cpucycles-20060326/000077500000000000000000000000001212302366200202365ustar00rootroot00000000000000libm4ri-20130416/testsuite/cpucycles-20060326/alpha.c000066400000000000000000000027351212302366200214760ustar00rootroot00000000000000/* cpucycles/alpha.c version 20060316 D. J. Bernstein Public domain. 
*/ #include #include #include static long long tod(void) { struct timeval t; gettimeofday(&t,(struct timezone *) 0); return t.tv_sec * (long long) 1000000 + t.tv_usec; } static long long rpcc(void) { unsigned long long t; asm volatile("rpcc %0" : "=r"(t)); return t & 0xffffffff; } static long long firstrpcc; static long long firsttod; static long long lastrpcc; static long long lasttod; static double mhz = 0; static void init(void) { firstrpcc = rpcc(); firsttod = tod(); do { lastrpcc = rpcc(); lasttod = tod(); } while (lasttod - firsttod < 10000); lastrpcc -= firstrpcc; lastrpcc &= 0xffffffff; lasttod -= firsttod; mhz = (double) lastrpcc / (double) lasttod; } long long cpucycles_alpha(void) { double x; long long y; if (!mhz) init(); lastrpcc = rpcc(); lasttod = tod(); lastrpcc -= firstrpcc; lastrpcc &= 0xffffffff; lasttod -= firsttod; /* Number of cycles since firstrpcc is lastrpcc + 2^32 y for unknown y. */ /* Number of microseconds since firsttod is lasttod. */ x = (lasttod * mhz - lastrpcc) * 0.00000000023283064365386962890625; y = x; while (x > y + 0.5) y += 1; while (x < y - 0.5) y -= 1; y *= 4294967296ULL; lastrpcc += y; mhz = (double) lastrpcc / (double) lasttod; return firstrpcc + lastrpcc; } long long cpucycles_alpha_persecond(void) { if (!mhz) init(); return 1000000.0 * mhz; } libm4ri-20130416/testsuite/cpucycles-20060326/alpha.h000066400000000000000000000007171212302366200215010ustar00rootroot00000000000000/* cpucycles alpha.h version 20060318 D. J. Bernstein Public domain. 
*/ #ifndef CPUCYCLES_alpha_h #define CPUCYCLES_alpha_h /* Declarations for the alpha implementation. The macros at the end map the generic names cpucycles, cpucycles_persecond and cpucycles_implementation onto it unless cpucycles_implementation is already defined. */ #ifdef __cplusplus extern "C" { #endif extern long long cpucycles_alpha(void); extern long long cpucycles_alpha_persecond(void); #ifdef __cplusplus } #endif #ifndef cpucycles_implementation #define cpucycles_implementation "alpha" #define cpucycles cpucycles_alpha #define cpucycles_persecond cpucycles_alpha_persecond #endif #endif libm4ri-20130416/testsuite/cpucycles-20060326/amd64cpuinfo.c000066400000000000000000000011101212302366200226720ustar00rootroot00000000000000#include #include /* Reads the time-stamp counter: the raw bytes 15,49 encode RDTSC; the high half in rdx is shifted left by 32 and OR-ed into rax to form one 64-bit result. */ long long cpucycles_amd64cpuinfo(void) { unsigned long long result; asm volatile(".byte 15;.byte 49;shlq $32,%%rdx;orq %%rdx,%%rax" : "=a" (result) :: "%rdx"); return result; } /* Scans /proc/cpuinfo for a "cpu MHz" entry and returns it scaled to cycles per second. Returns 0 if the file cannot be opened or the scan reaches end of input without a match. */ long long cpucycles_amd64cpuinfo_persecond(void) { FILE *f; double result; int s; f = fopen("/proc/cpuinfo","r"); if (!f) return 0; for (;;) { s = fscanf(f,"cpu MHz : %lf",&result); if (s > 0) break; /* No match here: discard the rest of the current line and retry. */ if (s == 0) s = fscanf(f,"%*[^\n]\n"); if (s < 0) { result = 0; break; } } fclose(f); return 1000000.0 * result; } libm4ri-20130416/testsuite/cpucycles-20060326/amd64cpuinfo.h000066400000000000000000000010071212302366200227040ustar00rootroot00000000000000/* cpucycles amd64cpuinfo.h version 20060318 D. J. Bernstein Public domain. 
*/ #ifndef CPUCYCLES_amd64cpuinfo_h #define CPUCYCLES_amd64cpuinfo_h /* Declarations for the amd64cpuinfo implementation. The macros at the end map the generic names cpucycles, cpucycles_persecond and cpucycles_implementation onto it unless cpucycles_implementation is already defined. */ #ifdef __cplusplus extern "C" { #endif extern long long cpucycles_amd64cpuinfo(void); extern long long cpucycles_amd64cpuinfo_persecond(void); #ifdef __cplusplus } #endif #ifndef cpucycles_implementation #define cpucycles_implementation "amd64cpuinfo" #define cpucycles cpucycles_amd64cpuinfo #define cpucycles_persecond cpucycles_amd64cpuinfo_persecond #endif #endif libm4ri-20130416/testsuite/cpucycles-20060326/amd64tscfreq.c000066400000000000000000000006371212302366200227130ustar00rootroot00000000000000#include #include /* Reads the time-stamp counter: the raw bytes 15,49 encode RDTSC; the high half in rdx is shifted left by 32 and OR-ed into rax to form one 64-bit result. */ long long cpucycles_amd64tscfreq(void) { unsigned long long result; asm volatile(".byte 15;.byte 49;shlq $32,%%rdx;orq %%rdx,%%rax" : "=a" (result) :: "%rdx"); return result; } /* Returns the value of the machdep.tsc_freq sysctl. The return value of sysctlbyname is not checked, so result keeps its initial 0 when the call does not fill it in. */ long long cpucycles_amd64tscfreq_persecond(void) { long result = 0; size_t resultlen = sizeof(long); sysctlbyname("machdep.tsc_freq",&result,&resultlen,0,0); return result; } libm4ri-20130416/testsuite/cpucycles-20060326/amd64tscfreq.h000066400000000000000000000010071212302366200227100ustar00rootroot00000000000000/* cpucycles amd64tscfreq.h version 20060318 D. J. Bernstein Public domain. 
*/ #ifndef CPUCYCLES_amd64tscfreq_h #define CPUCYCLES_amd64tscfreq_h /* Declarations for the amd64tscfreq implementation. The macros at the end map the generic names cpucycles, cpucycles_persecond and cpucycles_implementation onto it unless cpucycles_implementation is already defined. */ #ifdef __cplusplus extern "C" { #endif extern long long cpucycles_amd64tscfreq(void); extern long long cpucycles_amd64tscfreq_persecond(void); #ifdef __cplusplus } #endif #ifndef cpucycles_implementation #define cpucycles_implementation "amd64tscfreq" #define cpucycles cpucycles_amd64tscfreq #define cpucycles_persecond cpucycles_amd64tscfreq_persecond #endif #endif libm4ri-20130416/testsuite/cpucycles-20060326/clockmonotonic.c000066400000000000000000000012761212302366200234310ustar00rootroot00000000000000#include #include #include #include #include #include /* Cached cycles-per-second value; 0 means "not loaded yet". */ static double cpufrequency = 0; /* Loads cpufrequency from the machdep.tsc_freq sysctl; the call's return value is not checked, so the value stays 0 when the sysctl does not fill it in. */ static void init(void) { long result = 0; size_t resultlen = sizeof(long); sysctlbyname("machdep.tsc_freq",&result,&resultlen,0,0); cpufrequency = result; } /* Converts the CLOCK_MONOTONIC reading to seconds (as a double) and multiplies by cpufrequency to express it as a cycle count. */ long long cpucycles_clockmonotonic(void) { double result; struct timespec t; if (!cpufrequency) init(); clock_gettime(CLOCK_MONOTONIC,&t); result = t.tv_nsec; result *= 0.000000001; result += (double) t.tv_sec; result *= cpufrequency; return result; } /* Returns the cached cycles-per-second value, loading it first if it is still 0. */ long long cpucycles_clockmonotonic_persecond(void) { if (!cpufrequency) init(); return cpufrequency; } libm4ri-20130416/testsuite/cpucycles-20060326/clockmonotonic.h000066400000000000000000000010271212302366200234300ustar00rootroot00000000000000/* cpucycles clockmonotonic.h version 20060318 D. J. Bernstein Public domain. 
*/ #ifndef CPUCYCLES_clockmonotonic_h #define CPUCYCLES_clockmonotonic_h #ifdef __cplusplus extern "C" { #endif extern long long cpucycles_clockmonotonic(void); extern long long cpucycles_clockmonotonic_persecond(void); #ifdef __cplusplus } #endif #ifndef cpucycles_implementation #define cpucycles_implementation "clockmonotonic" #define cpucycles cpucycles_clockmonotonic #define cpucycles_persecond cpucycles_clockmonotonic_persecond #endif #endif libm4ri-20130416/testsuite/cpucycles-20060326/compile000077500000000000000000000012731212302366200216170ustar00rootroot00000000000000#!/bin/sh case "$COMPILER" in suncc) case "$ARCHITECTURE" in 64) /opt/SUNWspro/bin/cc -xarch=v9 -O2 "$@" ;; 32) /opt/SUNWspro/bin/cc -xarch=v8 -O2 "$@" ;; *) /opt/SUNWspro/bin/cc -O2 "$@" ;; esac ;; ibmcc) case "$ARCHITECTURE" in 64) xlc -q64 -O2 "$@" ;; 32) xlc -q32 -O2 "$@" ;; *) xlc -O2 "$@" ;; esac ;; hpcc) case "$ARCHITECTURE" in 64) /opt/ansic/bin/cc +DD64 -O2 "$@" ;; 32) /opt/ansic/bin/cc +DD32 -O2 "$@" ;; *) /opt/ansic/bin/cc -O2 "$@" ;; esac ;; *) case "$ARCHITECTURE" in 64) gcc -m64 -O2 "$@" ;; 32) gcc -m32 -O2 "$@" ;; *) gcc -O2 "$@" ;; esac ;; esac libm4ri-20130416/testsuite/cpucycles-20060326/cpucycles.html000066400000000000000000000316651212302366200231310ustar00rootroot00000000000000 cpucycles: counting CPU cycles

cpucycles: counting CPU cycles

A C or C++ program can call cpucycles() to receive a long long cycle count. The program has to
     #include "cpucycles.h"
and link to cpucycles.o. The program can look at the constant string cpucycles_implementation to see which implementation of cpucycles it's using. The program can also call cpucycles_persecond() to receive a long long estimate of the number of cycles per second.

Here's how to create cpucycles.h and cpucycles.o:

     wget http://ebats.cr.yp.to/cpucycles-20060326.tar.gz
     gunzip < cpucycles-20060326.tar.gz | tar -xf -
     cd cpucycles-20060326
     sh do
The do script creates cpucycles.h and cpucycles.o. It also prints one line of output showing the implementation selected, the number of cycles per second, a double-check of the number of cycles per second, and the differences between several adjacent calls to the cpucycles() function.

Some systems have multiple incompatible formats for executable programs. The most important reason is that some CPUs (the Athlon 64, for example, and the UltraSPARC) have two incompatible modes, a 32-bit mode and a 64-bit mode. On these systems, you can run

     env ARCHITECTURE=32 sh do
to create a 32-bit cpucycles.o or
     env ARCHITECTURE=64 sh do
to create a 64-bit cpucycles.o.

Notes on accuracy

Benchmarking tools are encouraged to record several timings of a function: call cpucycles(), function(), cpucycles(), function(), etc., and then print one line reporting the differences between successive cpucycles() results. The median of several differences is much more stable than the average.

Cycle counts continue to increase while other programs are running, while the operating system is handling an interruption such as a network packet, etc. This won't affect the median of several timings of a fast function---the function usually won't be interrupted---but it can affect the median of several timings of a slow function. Hopefully a benchmarking machine isn't running other programs.

On dual-CPU systems (and dual-core systems such as the Athlon 64 X2), the CPUs often don't have synchronized cycle counters, so a process that switches CPUs can have its cycle counts jump forwards or backwards. I've never seen this affect the median of several timings.

Some CPUs dynamically reduce CPU speed to save power, but deliberately keep their cycle counters running at full speed, the idea being that measuring time is more important than measuring cycles. Hopefully a benchmarking machine won't enter power-saving mode.

Cycle counts are occasionally off by a multiple of 2^32 on some CPUs, as discussed below. I've never seen this affect the median of several timings.

The estimate returned by cpucycles_persecond() may become more accurate after cpucycles() has been called repeatedly.

Implementations

alpha. The Alpha's built-in cycle-counting function counts cycles modulo 2^32. cpucycles usually manages to fix this by calling gettimeofday (which takes a large but low-variance number of cycles) and automatically estimating the chip speed. In extreme situations the resulting cycle counts could still be off by a multiple of 2^32.

Results on td161: alpha 499845359 499838717 423 360 336 349 353 348 469 329 348 345 348 345 348 345 348 345 348 345 348 348 348 345 348 345 348 345 348 348 348 345 348 345 348 345 348 348 348 345 348 345 348 345 348 348 348 345 348 345 348 345 348 348 348 468 318 348 345 348 345 348 345 348 345 348

amd64cpuinfo. cpucycles uses the CPU's RDTSC instruction to count cycles, and reads /proc/cpuinfo to see the kernel's estimate of cycles per second.

Results on dancer with ARCHITECTURE=64 (default): amd64cpuinfo 2002653000 2002526765 22 9 9 8 8 17 6 10 5 9 8 8 8 17 6 10 5 9 8 8 8 17 6 10 5 9 8 8 8 17 6 10 5 9 8 8 11 14 15 28 10 8 9 12 23 106 10 8 8 8 8 8 8 17 6 10 5 9 8 8 8 17 6 10

amd64tscfreq. cpucycles uses the CPU's RDTSC instruction to count cycles, and uses sysctlbyname("machdep.tsc_freq",...) to see the kernel's estimate of cycles per second.

clockmonotonic. Backup option, using the POSIX clock_gettime(CLOCK_MONOTONIC) function to count nanoseconds and using sysctlbyname("machdep.tsc_freq",...) to see the kernel's estimate of cycles per second. This often has much worse than microsecond precision.

Results on whisper (artificially induced): clockmonotonic 1298904202 1298866469 2177 1815 2177 2177 1814 2177 2178 2177 1814 2178 2177 1814 2177 2177 1815 2177 2177 1814 2177 2179 1813 2178 2177 1815 2177 2177 1814 2177 2177 2177 1815 2178 2177 1813 2178 2177 1815 2177 2177 1814 2177 2177 1815 2177 2177 1814 2177 2179 2177 1814 2177 2177 2177 1815 2177 2177 1814 2177 2178 1814 2178 2177 1814 2177

gettimeofday. Backup option, using the POSIX gettimeofday() function to count microseconds and /proc/cpuinfo to see the kernel's estimate of cycles per second. This often has much worse than microsecond precision.

Results on dancer (artificially induced) with ARCHITECTURE=32: gettimeofday 2002653000 2002307748 2002 2003 2003 2002 2003 2003 2002 2003 2003 2002 2003 2003 2002 2003 2003 2002 2003 2003 2002 2003 2003 2002 2003 2003 4005 0 4005 2003 0 4005 2003 2002 2003 2003 2002 2003 2003 2002 2003 2002 2003 2003 2002 2003 2003 2002 2003 2003 2002 2003 2003 2002 2003 2003 2002 2003 4005 0 2003 4005 0 4006 2002 2003

Results on dancer (artificially induced) with ARCHITECTURE=64 (default): gettimeofday 2002653000 2002293956 2560 1792 2048 1792 2048 2304 1792 2048 1792 0 2048 2304 2048 1792 1792 2048 0 2304 2048 1792 2048 1792 2304 0 2048 1792 2048 1792 2048 0 2304 2048 1792 1792 2048 2304 2048 0 1792 2048 1792 2304 2048 1792 2048 0 1792 2560 1792 1792 2048 1792 0 2560 25600 2048 1792 2560 1792 0 2048 1792 2048 2304

hppapstat. cpucycles uses the CPU's MFCTL %cr16 instruction to count cycles, and pstat(PSTAT_PROCESSOR,...) to see the kernel's estimate of cycles per second.

Results on hp400: hppapstat 440000000 439994653 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11

powerpcaix. cpucycles uses the CPU's MFTB instruction to count ``time base''; uses /usr/sbin/lsattr -E -l proc0 -a frequency to see the kernel's estimate of cycles per second; and spends some time comparing MFTB to gettimeofday() to figure out the number of time-base counts per second.

I've seen a 533MHz PowerPC G4 (7410) with a 16-cycle time base; a 668MHz POWER RS64 IV (SStar) system with a 1-cycle time base; a 1452MHz POWER with an 8-cycle time base; and a 2000MHz PowerPC G5 (970) with a 60-cycle time base.

Results on tigger: powerpcaix 1452000000 1451981436 56 64 64 64 56 64 64 64 56 64 64 64 56 64 64 64 56 64 64 64 56 64 64 64 56 64 64 64 56 64 64 64 56 64 64 64 56 64 64 64 56 64 64 64 56 64 64 64 56 64 64 64 56 64 64 64 56 64 64 64 56 64 64 64

powerpclinux. cpucycles uses the CPU's MFTB instruction to count ``time base''; reads /proc/cpuinfo to see the kernel's estimate of cycles per second; and spends some time comparing MFTB to gettimeofday() to figure out the number of time-base counts per second.

Results on gggg: powerpclinux 533000000 532650134 48 32 48 32 32 48 32 32 48 32 32 48 32 32 48 32 32 48 32 32 32 48 32 32 48 32 32 48 32 32 48 32 32 48 32 32 32 48 32 32 48 32 32 48 32 32 48 32 32 48 32 32 32 48 32 32 48 32 32 48 32 32 48 32

powerpcmacos. cpucycles uses the mach_absolute_time function to count ``time base''; uses sysctlbyname("hw.cpufrequency",...) to see the kernel's estimate of cycles per second; and uses sysctlbyname("hw.tbfrequency",...) to see the kernel's estimate of time-base counts per second.

Results on geespaz with ARCHITECTURE=32 (default): powerpcmacos 2000000000 1999891801 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60

Results on geespaz with ARCHITECTURE=64: powerpcmacos 2000000000 1999896339 420 60 60 60 60 60 60 0 60 60 60 60 60 60 60 60 0 60 60 60 60 60 60 60 60 60 0 60 60 60 60 60 60 60 60 60 0 60 60 60 60 60 60 60 0 60 60 60 60 60 60 60 60 0 60 60 60 60 60 60 60 60 0 60

sparc32psrinfo. cpucycles uses the CPU's RDTICK instruction in 32-bit mode to count cycles, and runs /usr/sbin/psrinfo -v to see the kernel's estimate of cycles per second.

Results on icarus with ARCHITECTURE=32 (default): sparc32psrinfo 900000000 899920056 297 23 23 18 22 23 18 17 22 18 17 22 23 18 17 129 17 17 17 17 17 17 17 97 17 17 17 17 17 17 17 85 17 17 17 17 17 17 17 97 17 17 17 17 17 17 17 85 17 17 17 17 17 17 17 97 17 17 17 17 17 17 17 85

Results on wessel with ARCHITECTURE=32 (default): sparc32psrinfo 900000000 899997269 39 23 18 22 18 25 72 17 22 18 17 22 23 26 71 17 17 17 17 17 17 17 85 17 17 17 17 17 17 17 97 17 17 17 17 17 17 17 85 17 17 17 17 17 17 17 97 17 17 17 17 17 17 17 85 17 17 17 17 17 17 17 109 17

sparcpsrinfo. cpucycles uses the CPU's RDTICK instruction in 64-bit mode to count cycles, and runs /usr/sbin/psrinfo -v to see the kernel's estimate of cycles per second.

Results on icarus with ARCHITECTURE=64: sparcpsrinfo 900000000 899920264 289 12 12 12 12 12 12 19 12 113 19 12 12 12 12 12 12 130 12 12 12 12 12 12 12 144 12 12 12 12 12 12 12 144 12 12 12 12 12 12 12 144 12 12 12 12 12 12 12 144 12 12 12 12 12 12 12 144 12 12 12 12 12 12

Results on wessel with ARCHITECTURE=64: sparcpsrinfo 900000000 899997032 29 19 12 19 19 19 12 12 123 12 12 12 12 12 12 12 174 12 12 12 12 12 12 12 174 12 12 12 12 12 12 12 174 12 12 12 12 12 12 12 174 12 12 12 12 12 12 12 174 12 12 12 12 12 12 12 174 12 12 12 12 12 12 12

x86cpuinfo. cpucycles uses the CPU's RDTSC instruction to count cycles, and reads /proc/cpuinfo to see the kernel's estimate of cycles per second. There have been reports of the 64-bit cycle counters on some x86 CPUs being occasionally off by 2^32; cpucycles makes no attempt to fix this.

Results on cruncher: x86cpuinfo 132957999 132951052 60 36 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32

Results on dali: x86cpuinfo 448882000 448881565 49 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45 45

Results on dancer with ARCHITECTURE=32: x86cpuinfo 2002653000 2002538651 26 11 9 11 10 17 11 10 10 10 9 10 9 12 9 173 11 10 10 10 10 17 11 10 10 10 9 10 9 17 11 10 10 10 9 10 9 17 11 10 10 10 9 10 9 17 11 10 10 10 9 10 9 17 11 10 10 10 9 10 9 17 11 10

Results on fireball: x86cpuinfo 1894550999 1894188944 104 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88

Results on neumann: x86cpuinfo 999534999 999456935 49 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44

Results on rzitsc: x86cpuinfo 2799309000 2799170567 132 96 100 104 100 100 96 96 96 100 96 108 104 104 112 96 112 96 108 96 112 96 96 96 100 112 120 100 96 100 104 112 96 96 96 88 96 128 108 96 116 96 100 100 108 96 100 96 108 96 104 100 112 96 100 96 100 100 88 108 100 108 92 96

Results on shell: x86cpuinfo 3391548999 3391341751 108 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88 88

Results on thoth: x86cpuinfo 900447000 900028758 67 19 18 18 19 188 16 16 16 19 19 18 19 147 16 16 16 19 19 17 16 16 16 16 16 19 19 17 16 16 16 16 16 19 19 17 16 16 16 16 16 19 19 18 19 156 16 16 16 19 19 18 19 147 16 16 16 19 19 18 19 147 16 16

x86tscfreq. cpucycles uses the CPU's RDTSC instruction to count cycles, and uses sysctlbyname("machdep.tsc_freq",...) to see the kernel's estimate of cycles per second.

Results on whisper: x86tscfreq 1298904202 1298892874 72 72 53 53 53 53 53 53 53 53 53 53 53 53 53 53 53 53 53 53 53 53 53 53 53 53 53 53 53 53 53 53 53 53 53 53 53 53 53 53 53 53 53 53 53 53 53 53 53 53 53 53 53 53 53 53 53 53 53 53 53 53 53 53

Version

This is the cpucycles-20060326.html web page. This web page is in the public domain. libm4ri-20130416/testsuite/cpucycles-20060326/do000066400000000000000000000021701212302366200205630ustar00rootroot00000000000000#!/bin/sh output="cpucycles.o cpucycles.h" cleanup="test cpucycles-impl.o cpucycles-impl.h cpucycles-impl.c" exec 2>do.notes rm -f $output $cleanup ( echo amd64tscfreq gcc echo amd64cpuinfo gcc echo x86tscfreq gcc echo x86cpuinfo gcc echo powerpclinux gcc echo powerpcmacos gcc echo powerpcaix gcc echo powerpcaix ibmcc echo sparcpsrinfo gcc echo sparcpsrinfo suncc echo sparc32psrinfo gcc echo sparc32psrinfo suncc echo hppapstat gcc echo hppapstat hpcc echo alpha gcc echo clockmonotonic gcc echo gettimeofday gcc ) | ( while read name compiler do echo ===== Trying $name.c with $compiler... >&2 rm -f $cleanup cp $name.c cpucycles-impl.c || continue cp $name.h cpucycles-impl.h || continue env COMPILER=$compiler ./compile -c cpucycles-impl.c || continue env COMPILER=$compiler ./compile -o test test.c cpucycles-impl.o || continue ./test || continue echo ===== Success. Using $name.c. >&2 mv cpucycles-impl.o cpucycles.o mv cpucycles-impl.h cpucycles.h rm -f $cleanup exit 0 done echo ===== Giving up. 
>&2 rm -f $output $cleanup exit 111 ) libm4ri-20130416/testsuite/cpucycles-20060326/gettimeofday.c000066400000000000000000000014651212302366200230710ustar00rootroot00000000000000#include #include #include #include static double cpufrequency = 0; static void init(void) { FILE *f; double result; int s; f = fopen("/proc/cpuinfo","r"); if (!f) return; for (;;) { s = fscanf(f,"cpu MHz : %lf",&result); if (s > 0) break; if (s == 0) s = fscanf(f,"%*[^\n]\n"); if (s < 0) { result = 0; break; } } fclose(f); cpufrequency = 1000000.0 * result; } long long cpucycles_gettimeofday(void) { double result; struct timeval t; if (!cpufrequency) init(); gettimeofday(&t,(struct timezone *) 0); result = t.tv_usec; result *= 0.000001; result += (double) t.tv_sec; result *= cpufrequency; return result; } long long cpucycles_gettimeofday_persecond(void) { if (!cpufrequency) init(); return cpufrequency; } libm4ri-20130416/testsuite/cpucycles-20060326/gettimeofday.h000066400000000000000000000010071212302366200230660ustar00rootroot00000000000000/* cpucycles gettimeofday.h version 20060318 D. J. Bernstein Public domain. 
*/ #ifndef CPUCYCLES_gettimeofday_h #define CPUCYCLES_gettimeofday_h /* Declarations for the gettimeofday implementation. The macros at the end map the generic names cpucycles, cpucycles_persecond and cpucycles_implementation onto it unless cpucycles_implementation is already defined. */ #ifdef __cplusplus extern "C" { #endif extern long long cpucycles_gettimeofday(void); extern long long cpucycles_gettimeofday_persecond(void); #ifdef __cplusplus } #endif #ifndef cpucycles_implementation #define cpucycles_implementation "gettimeofday" #define cpucycles cpucycles_gettimeofday #define cpucycles_persecond cpucycles_gettimeofday_persecond #endif #endif libm4ri-20130416/testsuite/cpucycles-20060326/hppapstat.c000066400000000000000000000010421212302366200224030ustar00rootroot00000000000000#include #include #include #include #include #include /* Reads control register 16 through the _MFCTL macro and returns its value. */ long long cpucycles_hppapstat(void) { register long long result; _MFCTL(16,result); return result; } /* Queries pstat(PSTAT_PROCESSOR) and returns psp_iticksperclktick multiplied by sysconf(_SC_CLK_TCK); returns 0 when pstat reports an error. */ long long cpucycles_hppapstat_persecond(void) { struct pst_processor pst; union pstun pu; double result; pu.pst_processor = &pst; if (pstat(PSTAT_PROCESSOR,pu,sizeof(pst),1,0) < 0) return 0; result = pst.psp_iticksperclktick; result *= (double) sysconf(_SC_CLK_TCK); return result; } libm4ri-20130416/testsuite/cpucycles-20060326/hppapstat.h000066400000000000000000000007571212302366200224240ustar00rootroot00000000000000/* cpucycles hppapstat.h version 20060319 D. J. Bernstein Public domain. 
*/ #ifndef CPUCYCLES_hppapstat_h #define CPUCYCLES_hppapstat_h /* Declarations for the hppapstat implementation. The macros at the end map the generic names cpucycles, cpucycles_persecond and cpucycles_implementation onto it unless cpucycles_implementation is already defined. */ #ifdef __cplusplus extern "C" { #endif extern long long cpucycles_hppapstat(void); extern long long cpucycles_hppapstat_persecond(void); #ifdef __cplusplus } #endif #ifndef cpucycles_implementation #define cpucycles_implementation "hppapstat" #define cpucycles cpucycles_hppapstat #define cpucycles_persecond cpucycles_hppapstat_persecond #endif #endif libm4ri-20130416/testsuite/cpucycles-20060326/powerpcaix.c000066400000000000000000000030351212302366200225640ustar00rootroot00000000000000#include #include #include #include #include /* Rounds u to the nearest long: a truncating conversion followed by correction loops. */ static long myround(double u) { long result = u; while (result + 0.5 < u) result += 1; while (result - 0.5 > u) result -= 1; return result; } /* Returns the current wall-clock time in microseconds, read with gettimeofday(). */ static long long microseconds(void) { struct timeval t; gettimeofday(&t,(struct timezone *) 0); return t.tv_sec * (long long) 1000000 + t.tv_usec; } /* Reads the 64-bit time base as upper, lower, upper halves and retries when the upper half changed between the two reads, so the halves belong together. */ static long long timebase(void) { unsigned long high; unsigned long low; unsigned long newhigh; unsigned long long result; asm volatile( "Lcpucycles:mftbu %0;mftb %1;mftbu %2;cmpw %0,%2;bne Lcpucycles" : "=r" (high), "=r" (low), "=r" (newhigh) ); result = high; result <<= 32; result |= low; return result; } /* cpufrequency: value parsed from lsattr output; tbcycles: CPU cycles per time-base tick. Both are 0 until init() succeeds. */ static double cpufrequency = 0; static long tbcycles = 0; /* Parses the "frequency" value printed by lsattr, then compares the time base against microseconds() over at least 10000 microseconds to derive tbcycles. Returns early, leaving tbcycles at 0, when any step fails. */ static void init(void) { FILE *f; long long tb0; long long us0; long long tb1; long long us1; f = popen("/usr/sbin/lsattr -E -l proc0 -a frequency","r"); if (!f) return; if (fscanf(f,"frequency %lf",&cpufrequency) < 1) cpufrequency = 0; pclose(f); if (!cpufrequency) return; tb0 = timebase(); us0 = microseconds(); do { tb1 = timebase(); us1 = microseconds(); } while (us1 - us0 < 10000); if (tb1 <= tb0) return; tb1 -= tb0; us1 -= us0; tbcycles = myround((cpufrequency * 0.000001 * (double) us1) / (double) tb1); } /* Returns the time base scaled by tbcycles; init() is retried on every call while tbcycles is still 0. */ long long cpucycles_powerpcaix(void) { if (!tbcycles) init(); return timebase() * tbcycles; } /* Returns the parsed CPU frequency, running init() first while tbcycles is still 0. */ long long cpucycles_powerpcaix_persecond(void) { if (!tbcycles) init(); return cpufrequency; } 
libm4ri-20130416/testsuite/cpucycles-20060326/powerpcaix.h000066400000000000000000000007671212302366200226020ustar00rootroot00000000000000/* cpucycles powerpcaix.h version 20060318 D. J. Bernstein Public domain. */ #ifndef CPUCYCLES_powerpcaix_h #define CPUCYCLES_powerpcaix_h #ifdef __cplusplus extern "C" { #endif extern long long cpucycles_powerpcaix(void); extern long long cpucycles_powerpcaix_persecond(void); #ifdef __cplusplus } #endif #ifndef cpucycles_implementation #define cpucycles_implementation "powerpcaix" #define cpucycles cpucycles_powerpcaix #define cpucycles_persecond cpucycles_powerpcaix_persecond #endif #endif libm4ri-20130416/testsuite/cpucycles-20060326/powerpclinux.c000066400000000000000000000032341212302366200231430ustar00rootroot00000000000000#include #include #include #include #include /* Rounds u to the nearest long: a truncating conversion followed by correction loops. */ static long myround(double u) { long result = u; while (result + 0.5 < u) result += 1; while (result - 0.5 > u) result -= 1; return result; } /* Returns the current wall-clock time in microseconds, read with gettimeofday(). */ static long long microseconds(void) { struct timeval t; gettimeofday(&t,(struct timezone *) 0); return t.tv_sec * (long long) 1000000 + t.tv_usec; } /* Reads the 64-bit time base as upper, lower, upper halves and retries when the upper half changed between the two reads. */ static long long timebase(void) { unsigned long high; unsigned long low; unsigned long newhigh; unsigned long long result; asm volatile( "Lcpucycles:mftbu %0;mftb %1;mftbu %2;cmpw %0,%2;bne Lcpucycles" : "=r" (high), "=r" (low), "=r" (newhigh) ); result = high; result <<= 32; result |= low; return result; } /* cpufrequency: cycles per second parsed from /proc/cpuinfo; tbcycles: CPU cycles per time-base tick. Both are 0 until init() succeeds. */ static double cpufrequency = 0; static long tbcycles = 0; /* Parses the "clock : ... MHz" entry of /proc/cpuinfo, then compares the time base against microseconds() over at least 10000 microseconds to derive tbcycles. Returns early, leaving tbcycles at 0, when any step fails. */ static void init(void) { FILE *f; int s; long long tb0; long long us0; long long tb1; long long us1; f = fopen("/proc/cpuinfo","r"); /* init() is void, so bail out with a plain return (the original returned a value here). */ if (!f) return; for (;;) { s = fscanf(f," clock : %lf MHz",&cpufrequency); if (s > 0) break; /* No match here: discard the rest of the current line and retry. */ if (s == 0) s = fscanf(f,"%*[^\n]\n"); if (s < 0) { cpufrequency = 0; break; } } fclose(f); if (!cpufrequency) return; cpufrequency *= 1000000.0; tb0 = timebase(); us0 = microseconds(); do { tb1 = timebase(); us1 = microseconds(); } while (us1 - us0 < 10000); if (tb1 <= tb0) return; tb1 -= 
tb0; us1 -= us0; tbcycles = myround((cpufrequency * 0.000001 * (double) us1) / (double) tb1); } /* Returns the time base scaled by tbcycles; init() is retried on every call while tbcycles is still 0. */ long long cpucycles_powerpclinux(void) { if (!tbcycles) init(); return timebase() * tbcycles; } /* Returns the parsed CPU frequency in cycles per second, running init() first while tbcycles is still 0. */ long long cpucycles_powerpclinux_persecond(void) { if (!tbcycles) init(); return cpufrequency; } libm4ri-20130416/testsuite/cpucycles-20060326/powerpclinux.h000066400000000000000000000010071212302366200231440ustar00rootroot00000000000000/* cpucycles powerpclinux.h version 20060319 D. J. Bernstein Public domain. */ #ifndef CPUCYCLES_powerpclinux_h #define CPUCYCLES_powerpclinux_h #ifdef __cplusplus extern "C" { #endif extern long long cpucycles_powerpclinux(void); extern long long cpucycles_powerpclinux_persecond(void); #ifdef __cplusplus } #endif #ifndef cpucycles_implementation #define cpucycles_implementation "powerpclinux" #define cpucycles cpucycles_powerpclinux #define cpucycles_persecond cpucycles_powerpclinux_persecond #endif #endif libm4ri-20130416/testsuite/cpucycles-20060326/powerpcmacos.c000066400000000000000000000017521212302366200231110ustar00rootroot00000000000000#include #include #include #define timebase mach_absolute_time /* sysctl MIB names for the CPU frequency and the time-base frequency. */ static int cpumib[2] = { CTL_HW, HW_CPU_FREQ } ; static int tbmib[2] = { CTL_HW, HW_TB_FREQ } ; /* Rounds u to the nearest long: a truncating conversion followed by correction loops. */ static long myround(double u) { long result = u; while (result + 0.5 < u) result += 1; while (result - 0.5 > u) result -= 1; return result; } /* CPU cycles per time-base tick; 0 until init() succeeds. */ static long tbcycles = 0; /* Reads both frequencies through sysctl and sets tbcycles to their rounded ratio when the time-base frequency is positive. The sysctl return values are not checked; both variables start at 0. */ static void init(void) { int cpufrequency = 0; size_t cpufrequencylen = sizeof(int); int tbfrequency = 0; size_t tbfrequencylen = sizeof(int); sysctl(cpumib,2,&cpufrequency,&cpufrequencylen,0,0); sysctl(tbmib,2,&tbfrequency,&tbfrequencylen,0,0); if (tbfrequency > 0) tbcycles = myround((double) cpufrequency / (double) tbfrequency); } /* Returns mach_absolute_time() scaled by tbcycles; init() is retried on every call while tbcycles is still 0. */ long long cpucycles_powerpcmacos(void) { if (!tbcycles) init(); return timebase() * tbcycles; } /* Returns the CPU frequency reported by sysctl (0 when the call does not fill it in). */ long long cpucycles_powerpcmacos_persecond(void) { int result = 0; size_t resultlen = sizeof(int); sysctl(cpumib,2,&result,&resultlen,0,0); return result; } 
libm4ri-20130416/testsuite/cpucycles-20060326/powerpcmacos.h000066400000000000000000000010071212302366200231070ustar00rootroot00000000000000/* cpucycles powerpcmacos.h version 20060319 D. J. Bernstein Public domain. */ #ifndef CPUCYCLES_powerpcmacos_h #define CPUCYCLES_powerpcmacos_h /* Declarations for the powerpcmacos implementation. The macros at the end map the generic names cpucycles, cpucycles_persecond and cpucycles_implementation onto it unless cpucycles_implementation is already defined. */ #ifdef __cplusplus extern "C" { #endif extern long long cpucycles_powerpcmacos(void); extern long long cpucycles_powerpcmacos_persecond(void); #ifdef __cplusplus } #endif #ifndef cpucycles_implementation #define cpucycles_implementation "powerpcmacos" #define cpucycles cpucycles_powerpcmacos #define cpucycles_persecond cpucycles_powerpcmacos_persecond #endif #endif libm4ri-20130416/testsuite/cpucycles-20060326/sparc32psrinfo.c000066400000000000000000000011611212302366200232570ustar00rootroot00000000000000#include #include /* Executes two hand-encoded instruction words followed by a mov and returns the output operand. NOTE(review): the package documentation describes this as RDTICK in 32-bit mode; the encodings themselves are not explained here. */ long long cpucycles_sparc32psrinfo(void) { long long result; asm volatile( ".word 0x93410000;.word 0x91327020;mov %%g0,%0" : "=r" (result) : : "%g0" ); return result; } /* Runs /usr/sbin/psrinfo -v and scans its output for a "The ... processor operates at ... MHz" line; returns that value scaled to cycles per second, or 0 if popen fails or the scan reaches end of input without a match. */ long long cpucycles_sparc32psrinfo_persecond(void) { FILE *f; double result; int s; f = popen("/usr/sbin/psrinfo -v","r"); if (!f) return 0; for (;;) { s = fscanf(f," The %*s processor operates at %lf MHz",&result); if (s > 0) break; /* No match here: discard the rest of the current line and retry. */ if (s == 0) s = fscanf(f,"%*[^\n]\n"); if (s < 0) { result = 0; break; } } pclose(f); return 1000000.0 * result; } libm4ri-20130416/testsuite/cpucycles-20060326/sparc32psrinfo.h000066400000000000000000000010271212302366200232650ustar00rootroot00000000000000/* cpucycles sparc32psrinfo.h version 20060319 D. J. Bernstein Public domain. 
*/ #ifndef CPUCYCLES_sparc32psrinfo_h #define CPUCYCLES_sparc32psrinfo_h /* Declarations for the sparc32psrinfo implementation. The macros at the end map the generic names cpucycles, cpucycles_persecond and cpucycles_implementation onto it unless cpucycles_implementation is already defined. */ #ifdef __cplusplus extern "C" { #endif extern long long cpucycles_sparc32psrinfo(void); extern long long cpucycles_sparc32psrinfo_persecond(void); #ifdef __cplusplus } #endif #ifndef cpucycles_implementation #define cpucycles_implementation "sparc32psrinfo" #define cpucycles cpucycles_sparc32psrinfo #define cpucycles_persecond cpucycles_sparc32psrinfo_persecond #endif #endif libm4ri-20130416/testsuite/cpucycles-20060326/sparcpsrinfo.c000066400000000000000000000010541212302366200231130ustar00rootroot00000000000000#include #include /* Reads the %tick register with a single rd instruction and returns its value. */ long long cpucycles_sparcpsrinfo(void) { long long result; asm volatile("rd %%tick,%0" : "=r" (result)); return result; } /* Runs /usr/sbin/psrinfo -v and scans its output for a "The ... processor operates at ... MHz" line; returns that value scaled to cycles per second, or 0 if popen fails or the scan reaches end of input without a match. */ long long cpucycles_sparcpsrinfo_persecond(void) { FILE *f; double result; int s; f = popen("/usr/sbin/psrinfo -v","r"); if (!f) return 0; for (;;) { s = fscanf(f," The %*s processor operates at %lf MHz",&result); if (s > 0) break; /* No match here: discard the rest of the current line and retry. */ if (s == 0) s = fscanf(f,"%*[^\n]\n"); if (s < 0) { result = 0; break; } } pclose(f); return 1000000.0 * result; } libm4ri-20130416/testsuite/cpucycles-20060326/sparcpsrinfo.h000066400000000000000000000010071212302366200231160ustar00rootroot00000000000000/* cpucycles sparcpsrinfo.h version 20060318 D. J. Bernstein Public domain. 
*/ #ifndef CPUCYCLES_sparcpsrinfo_h #define CPUCYCLES_sparcpsrinfo_h /* Declarations for the sparcpsrinfo implementation. The macros at the end map the generic names cpucycles, cpucycles_persecond and cpucycles_implementation onto it unless cpucycles_implementation is already defined. */ #ifdef __cplusplus extern "C" { #endif extern long long cpucycles_sparcpsrinfo(void); extern long long cpucycles_sparcpsrinfo_persecond(void); #ifdef __cplusplus } #endif #ifndef cpucycles_implementation #define cpucycles_implementation "sparcpsrinfo" #define cpucycles cpucycles_sparcpsrinfo #define cpucycles_persecond cpucycles_sparcpsrinfo_persecond #endif #endif libm4ri-20130416/testsuite/cpucycles-20060326/test.c000066400000000000000000000026631212302366200213700ustar00rootroot00000000000000#include #include #include #include #include "cpucycles-impl.h" /* Returns the current wall-clock time in microseconds, read with gettimeofday(). */ static long long tod(void) { struct timeval t; gettimeofday(&t,(struct timezone *) 0); return t.tv_sec * (long long) 1000000 + t.tv_usec; } /* Wall-clock and cycle-counter readings taken around the one-second sleep, plus the buffer of consecutive cpucycles() samples. */ long long todstart; long long todend; long long cpustart; long long cpuend; long long t[1001]; /* Self-test for the implementation selected through cpucycles-impl.h. Exits with status 100 (after printing diagnostics to stderr) if consecutive cpucycles() samples ever decrease, if the first and last of 1001 samples are equal, or if cpucycles_persecond() is not positive. Otherwise prints one line with the implementation name, the reported cycles per second, the rate measured across sleep(1), and 64 successive sample differences, and exits with status 0. The return type is spelled out because implicit int is not valid since C99. */ int main(void) { int i; for (i = 0;i <= 1000;++i) t[i] = cpucycles(); for (i = 0;i < 1000;++i) if (t[i] > t[i + 1]) { fprintf(stderr,"t[%d] = %lld\n",i,t[i]); fprintf(stderr,"t[%d] = %lld\n",i + 1,t[i + 1]); fprintf(stderr,"cpucycles_persecond() = %lld\n",cpucycles_persecond()); return 100; } if (t[0] == t[1000]) { fprintf(stderr,"t[%d] = %lld\n",0,t[0]); fprintf(stderr,"t[%d] = %lld\n",1000,t[1000]); fprintf(stderr,"cpucycles_persecond() = %lld\n",cpucycles_persecond()); return 100; } if (cpucycles_persecond() <= 0) { fprintf(stderr,"cpucycles_persecond() = %lld\n",cpucycles_persecond()); return 100; } todstart = tod(); cpustart = cpucycles(); sleep(1); todend = tod(); cpuend = cpucycles(); todend -= todstart; cpuend -= cpustart; for (i = 0;i <= 1000;++i) t[i] = cpucycles(); printf("%s",cpucycles_implementation); printf(" %lld",cpucycles_persecond()); printf(" %lld",(long long) (((double) cpuend) * 1000000.0 / (double) todend)); for (i = 0;i < 64;++i) printf(" %lld",t[i + 1] - t[i]); printf("\n"); return 0; } 
libm4ri-20130416/testsuite/cpucycles-20060326/x86cpuinfo.c000066400000000000000000000010151212302366200224100ustar00rootroot00000000000000#include #include /* Reads the time-stamp counter: the raw bytes 15,49 encode RDTSC and the "=A" output constraint collects the 64-bit result. */ long long cpucycles_x86cpuinfo(void) { long long result; asm volatile(".byte 15;.byte 49" : "=A" (result)); return result; } /* Scans /proc/cpuinfo for a "cpu MHz" entry and returns it scaled to cycles per second. Returns 0 if the file cannot be opened or the scan reaches end of input without a match. */ long long cpucycles_x86cpuinfo_persecond(void) { FILE *f; double result; int s; f = fopen("/proc/cpuinfo","r"); if (!f) return 0; for (;;) { s = fscanf(f,"cpu MHz : %lf",&result); if (s > 0) break; /* No match here: discard the rest of the current line and retry. */ if (s == 0) s = fscanf(f,"%*[^\n]\n"); if (s < 0) { result = 0; break; } } fclose(f); return 1000000.0 * result; } libm4ri-20130416/testsuite/cpucycles-20060326/x86cpuinfo.h000066400000000000000000000007671212302366200224320ustar00rootroot00000000000000/* cpucycles x86cpuinfo.h version 20060318 D. J. Bernstein Public domain. */ #ifndef CPUCYCLES_x86cpuinfo_h #define CPUCYCLES_x86cpuinfo_h /* Declarations for the x86cpuinfo implementation. The macros at the end map the generic names cpucycles, cpucycles_persecond and cpucycles_implementation onto it unless cpucycles_implementation is already defined. */ #ifdef __cplusplus extern "C" { #endif extern long long cpucycles_x86cpuinfo(void); extern long long cpucycles_x86cpuinfo_persecond(void); #ifdef __cplusplus } #endif #ifndef cpucycles_implementation #define cpucycles_implementation "x86cpuinfo" #define cpucycles cpucycles_x86cpuinfo #define cpucycles_persecond cpucycles_x86cpuinfo_persecond #endif #endif libm4ri-20130416/testsuite/cpucycles-20060326/x86tscfreq.c000066400000000000000000000005511212302366200224200ustar00rootroot00000000000000#include #include /* Reads the time-stamp counter: the raw bytes 15,49 encode RDTSC and the "=A" output constraint collects the 64-bit result. */ long long cpucycles_x86tscfreq(void) { long long result; asm volatile(".byte 15;.byte 49" : "=A" (result)); return result; } /* Returns the value of the machdep.tsc_freq sysctl. The return value of sysctlbyname is not checked, so result keeps its initial 0 when the call does not fill it in. */ long long cpucycles_x86tscfreq_persecond(void) { long result = 0; size_t resultlen = sizeof(long); sysctlbyname("machdep.tsc_freq",&result,&resultlen,0,0); return result; } libm4ri-20130416/testsuite/cpucycles-20060326/x86tscfreq.h000066400000000000000000000007671212302366200224360ustar00rootroot00000000000000/* cpucycles x86tscfreq.h version 20060318 D. J. Bernstein Public domain. 
*/ #ifndef CPUCYCLES_x86tscfreq_h #define CPUCYCLES_x86tscfreq_h #ifdef __cplusplus extern "C" { #endif extern long long cpucycles_x86tscfreq(void); extern long long cpucycles_x86tscfreq_persecond(void); #ifdef __cplusplus } #endif #ifndef cpucycles_implementation #define cpucycles_implementation "x86tscfreq" #define cpucycles cpucycles_x86tscfreq #define cpucycles_persecond cpucycles_x86tscfreq_persecond #endif #endif libm4ri-20130416/testsuite/test_colswap.c000066400000000000000000000045551212302366200202300ustar00rootroot00000000000000/* * test_colswap.c * * Application to test functionality of mzd_col_swap. * * Copyright (C) 2011 Carlo Wood * RSA-1024 0x624ACAD5 1997-01-26 Sign & Encrypt * Fingerprint16 = 32 EC A7 B6 AC DB 65 A6 F6 F6 55 DD 1C DC FF 61 * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #include int test_colswap(rci_t c1) { int failure = 0; printf("col_swap c1: %4d, (c2,offset): ", c1); rci_t const rows = 4 * c1 + 1; mzd_t* A = mzd_init(rows, 4 * c1 + 1); mzd_randomize(A); for (int c2 = 0; c2 < c1 && !failure; c2 += 13) { for (int offset = 0; offset < c1 && !failure; offset += MAX(1, c1 / 2)) { mzd_t* B = mzd_init_window(A, 0, offset, rows, offset + c1 + 1); mzd_t* C = mzd_copy(NULL, B); mzd_col_swap(B, c1, c2); for(int r = 0; r < rows; ++r) { if (mzd_read_bit(C, r, c1) != mzd_read_bit(B, r, c2) || mzd_read_bit(C, r, c2) != mzd_read_bit(B, r, c1)) { ++failure; break; } } mzd_col_swap(B, c2, c1); if (!mzd_equal(B, C)) { ++failure; } mzd_col_swap_in_rows(B, c1, c2, c2, c1); for(int r = c1; r < c2; ++r) { if (mzd_read_bit(C, r, c1) != mzd_read_bit(B, r, c2) || mzd_read_bit(C, r, c2) != mzd_read_bit(B, r, c1)) { ++failure; break; } } mzd_col_swap_in_rows(B, c2, c1, c2, c1); if (!mzd_equal(B, C)) { ++failure; } mzd_free(C); mzd_free(B); } } mzd_free(A); printf(" "); if (failure) { printf("FAILED\n"); } else printf("passed\n"); return failure; } int main() { int status = 0; for (int c1 = 1; c1 < 400; c1 += 15) { status += test_colswap(c1); } if (!status) { printf("All tests passed.\n"); } else { printf("TEST FAILED!\n"); return 1; } return 0; } libm4ri-20130416/testsuite/test_elimination.c000066400000000000000000000056311212302366200210640ustar00rootroot00000000000000#include #include #include int elim_test_equality(rci_t nr, rci_t nc) { int ret = 0; printf("elim: m: %4d, n: %4d ", nr, nc); mzd_t *A = mzd_init(nr, nc); mzd_randomize(A); mzd_t *B = mzd_copy(NULL, A); mzd_t *C = mzd_copy(NULL, A); mzd_t *D = mzd_copy(NULL, A); mzd_t *E = mzd_copy(NULL, A); mzd_t *F = mzd_copy(NULL, A); mzd_t *G = mzd_copy(NULL, A); /* M4RI k=auto */ rci_t ra = mzd_echelonize_m4ri(A, 1, 0); /* M4RI k=8 */ rci_t rb = mzd_echelonize_m4ri(B, 1, 8); /* M4RI Upper Triangular k=auto*/ rci_t rc = mzd_echelonize_m4ri(C, 0, 0); mzd_top_echelonize_m4ri(C, 0); /* M4RI Upper 
Triangular k=4*/ rci_t rd = mzd_echelonize_m4ri(D, 0, 4); mzd_top_echelonize_m4ri(D, 4); /* Gauss */ rci_t re = mzd_echelonize_naive(E, 1); /* Gauss Upper Triangular */ rci_t rf = mzd_echelonize_naive(F, 0); mzd_top_echelonize_m4ri(F, 0); /* PLUQ */ rci_t rg = mzd_echelonize_pluq(G, 1); if(mzd_equal(A, B) != TRUE || ra != rb) { printf("A != B "); ret -= 1; } if(mzd_equal(B, C) != TRUE || rb != rc) { printf("B != C "); ret -= 1; } if(mzd_equal(C, D) != TRUE || rc != rd) { printf("C != D "); ret -= 1; } if(mzd_equal(D, E) != TRUE || rd != re) { printf("D != E "); ret -= 1; } if(mzd_equal(E, F) != TRUE || re != rf) { printf("E != F "); ret -= 1; } if(mzd_equal(F, G) != TRUE || rf != rg) { printf("F != G "); ret -= 1; } if(mzd_equal(G, A) != TRUE || rg != ra) { printf("G != A "); ret -= 1; } mzd_free(A); mzd_free(B); mzd_free(C); mzd_free(D); mzd_free(E); mzd_free(F); mzd_free(G); if(ret == 0) { printf(" ... passed\n"); } else { printf(" ... FAILED\n"); } return ret; } int main() { int status = 0; srandom(17); status += elim_test_equality(4, 67); status += elim_test_equality(17, 121); status += elim_test_equality(65, 17); status += elim_test_equality(128, 128); status += elim_test_equality(1024, 1024); status += elim_test_equality(2047, 2047); status += elim_test_equality(65, 65); status += elim_test_equality(100, 100); status += elim_test_equality(21, 171); status += elim_test_equality(31, 121); status += elim_test_equality(193, 65); status += elim_test_equality(1025, 1025); status += elim_test_equality(2048, 2048); status += elim_test_equality(64, 64); status += elim_test_equality(128, 128); status += elim_test_equality(4096, 3528); status += elim_test_equality(1024, 1025); status += elim_test_equality(1000, 1000); status += elim_test_equality(1000, 10); status += elim_test_equality(1710, 1290); status += elim_test_equality(1290, 1710); status += elim_test_equality(1290, 1710); status += elim_test_equality(1290, 1290); status += elim_test_equality(1000, 210); if 
(status == 0) { printf("All tests passed.\n"); return 0; } else { return -1; } } libm4ri-20130416/testsuite/test_invert.c000066400000000000000000000040501212302366200200550ustar00rootroot00000000000000#include #include #include /** * Check that inversion works. * * \param m Number of rows of A * \param l Number of columns of A/number of rows of B * \param n Number of columns of B * \param k Parameter k of M4RM algorithm, may be 0 for automatic choice. * \param cutoff Cut off parameter at which dimension to switch from * Strassen to M4RM */ int invert_test(rci_t n, int k) { int ret = 0; printf("invert: n: %4d, k: %2d", n, k); mzd_t *I2 = mzd_init(n,n); mzd_set_ui(I2,1); mzd_t *U = mzd_init(n,n); mzd_randomize(U); for(rci_t i=0; i * * Distributed under the terms of the GNU General Public License (GPL) * version 2 or higher. * * This code is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. 
* * The full text of the GPL is available at: * * http://www.gnu.org/licenses/ * ********************************************************************/ #include #include #include int test_kernel_left_pluq(rci_t m, rci_t n) { mzd_t* A = mzd_init(m, n); mzd_randomize(A); mzd_t *Acopy = mzd_copy(NULL, A); rci_t r = mzd_echelonize_m4ri(A, 0, 0); printf("kernel_left m: %4d, n: %4d, r: %4d ", m, n, r); mzd_free(Acopy); Acopy = mzd_copy(NULL, A); mzd_t *X = mzd_kernel_left_pluq(A, 0); if (X == NULL) { printf("passed\n"); mzd_free(A); mzd_free(Acopy); return 0; } mzd_t *Z = mzd_mul(NULL, Acopy, X, 0); int status = 1 - mzd_is_zero(Z); if (!status) printf("passed\n"); else printf("FAILED\n"); mzd_free(A); mzd_free(Acopy); mzd_free(X); mzd_free(Z); return status; } int main() { int status = 0; srandom(17); status += test_kernel_left_pluq( 2, 4); status += test_kernel_left_pluq( 4, 1); status += test_kernel_left_pluq( 10, 20); status += test_kernel_left_pluq( 20, 1); status += test_kernel_left_pluq( 20, 20); status += test_kernel_left_pluq( 30, 1); status += test_kernel_left_pluq( 30, 30); status += test_kernel_left_pluq( 80, 1); status += test_kernel_left_pluq( 80, 20); status += test_kernel_left_pluq( 80, 80); status += test_kernel_left_pluq( 4, 2); status += test_kernel_left_pluq( 1, 4); status += test_kernel_left_pluq(20, 10); status += test_kernel_left_pluq( 1, 20); status += test_kernel_left_pluq(20, 20); status += test_kernel_left_pluq( 1, 30); status += test_kernel_left_pluq(30, 30); status += test_kernel_left_pluq( 1, 80); status += test_kernel_left_pluq(20, 80); status += test_kernel_left_pluq(80, 80); status += test_kernel_left_pluq(10, 20); status += test_kernel_left_pluq(10, 80); status += test_kernel_left_pluq(10, 20); status += test_kernel_left_pluq(10, 80); status += test_kernel_left_pluq(70, 20); status += test_kernel_left_pluq(70, 80); status += test_kernel_left_pluq(70, 20); status += test_kernel_left_pluq(70, 80); status += test_kernel_left_pluq(770, 1600); 
status += test_kernel_left_pluq(1764, 1345); if (!status) { printf("All tests passed.\n"); } else { return 1; } return 0; } libm4ri-20130416/testsuite/test_misc.c000066400000000000000000000052411212302366200175040ustar00rootroot00000000000000/* * test_misc.c * * Testing small helper functions. * * Copyright (C) 2011 Martin Albrecht * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include #include #define b(n) (m4ri_one<<(n)) int test_spread_and_shrink(const word to, const int length, ...) { word from = 0xFF; rci_t *Q = (rci_t*)calloc(sizeof(rci_t),length); va_list l; va_start(l,length); for(size_t i=0; i #include #include /** * Check that the results of all implemented multiplication algorithms * match up. * * \param m Number of rows of A * \param l Number of columns of A/number of rows of B * \param n Number of columns of B * \param k Parameter k of M4RM algorithm, may be 0 for automatic choice. 
* \param cutoff Cut off parameter at which dimension to switch from
* Strassen to M4RM
*/
int mul_test_equality(rci_t m, rci_t l, rci_t n, int k, int cutoff)
{
  printf(" mul: m: %4d, l: %4d, n: %4d, k: %2d, cutoff: %4d", m, l, n, k, cutoff);

  /* Random operands: lhs is m x l, rhs is l x n. */
  mzd_t *lhs = mzd_init(m, l);
  mzd_t *rhs = mzd_init(l, n);
  mzd_randomize(lhs);
  mzd_randomize(rhs);

  /* The same product, computed three ways. */
  mzd_t *via_strassen = mzd_mul(NULL, lhs, rhs, cutoff); /* Strassen */
  mzd_t *via_m4rm = mzd_mul_m4rm(NULL, lhs, rhs, k);     /* M4RM */
  mzd_t *via_naive = mzd_mul_naive(NULL, lhs, rhs);      /* naive cubic */

  mzd_free(lhs);
  mzd_free(rhs);

  /* One is subtracted from the result for every pair that disagrees. */
  int verdict = 0;

  if (mzd_equal(via_strassen, via_m4rm) != TRUE) {
    printf(" Strassen != M4RM");
    --verdict;
  }
  if (mzd_equal(via_m4rm, via_naive) != TRUE) {
    printf(" M4RM != Naiv");
    --verdict;
  }
  if (mzd_equal(via_strassen, via_naive) != TRUE) {
    printf(" Strassen != Naiv");
    --verdict;
  }

  mzd_free(via_strassen);
  mzd_free(via_m4rm);
  mzd_free(via_naive);

  if (verdict != 0) {
    printf(" ... FAILED\n");
  } else {
    printf(" ... passed\n");
  }

  return verdict;
}

/**
 * Check that the results of all implemented squaring algorithms match
 * up.
 *
 * \param m Number of rows and columns of A
 * \param k Parameter k of M4RM algorithm, may be 0 for automatic choice.
* \param cutoff Cut off parameter at which dimension to switch from * Strassen to M4RM */ int sqr_test_equality(rci_t m, int k, int cutoff) { int ret = 0; mzd_t *A, *C, *D, *E; printf(" sqr: m: %4d, k: %2d, cutoff: %4d", m, k, cutoff); /* we create one random matrix */ A = mzd_init(m, m); mzd_randomize(A); /* C = A*A via Strassen */ C = mzd_mul(NULL, A, A, cutoff); /* D = A*A via M4RM, temporary buffers are managed internally */ D = mzd_mul_m4rm( NULL, A, A, k); /* E = A*A via naive cubic multiplication */ E = mzd_mul_naive( NULL, A, A); mzd_free(A); if (mzd_equal(C, D) != TRUE) { printf(" Strassen != M4RM"); ret -=1; } if (mzd_equal(D, E) != TRUE) { printf(" M4RM != Naiv"); ret -= 1; } if (mzd_equal(C, E) != TRUE) { printf(" Strassen != Naiv"); ret -= 1; } mzd_free(C); mzd_free(D); mzd_free(E); if(ret==0) { printf(" ... passed\n"); } else { printf(" ... FAILED\n"); } return ret; } int addmul_test_equality(rci_t m, rci_t l, rci_t n, int k, int cutoff) { int ret = 0; mzd_t *A, *B, *C, *D, *E, *F; printf("addmul: m: %4d, l: %4d, n: %4d, k: %2d, cutoff: %4d", m, l, n, k, cutoff); /* we create two random matrices */ A = mzd_init(m, l); B = mzd_init(l, n); C = mzd_init(m, n); mzd_randomize(A); mzd_randomize(B); mzd_randomize(C); /* D = C + A*B via M4RM, temporary buffers are managed internally */ D = mzd_copy(NULL, C); D = mzd_addmul_m4rm(D, A, B, k); /* E = C + A*B via naiv cubic multiplication */ E = mzd_mul_m4rm(NULL, A, B, k); mzd_add(E, E, C); /* F = C + A*B via naiv cubic multiplication */ F = mzd_copy(NULL, C); F = mzd_addmul(F, A, B, cutoff); mzd_free(A); mzd_free(B); mzd_free(C); if (mzd_equal(D, E) != TRUE) { printf(" M4RM != add,mul"); ret -=1; } if (mzd_equal(E, F) != TRUE) { printf(" add,mul = addmul"); ret -=1; } if (mzd_equal(F, D) != TRUE) { printf(" M4RM != addmul"); ret -=1; } if (ret==0) printf(" ... passed\n"); else printf(" ... 
FAILED\n"); mzd_free(D); mzd_free(E); mzd_free(F); return ret; } int addsqr_test_equality(rci_t m, int k, int cutoff) { int ret = 0; mzd_t *A, *C, *D, *E, *F; printf("addsqr: m: %4d, k: %2d, cutoff: %4d", m, k, cutoff); /* we create two random matrices */ A = mzd_init(m, m); C = mzd_init(m, m); mzd_randomize(A); mzd_randomize(C); /* D = C + A*B via M4RM, temporary buffers are managed internally */ D = mzd_copy(NULL, C); D = mzd_addmul_m4rm(D, A, A, k); /* E = C + A*B via naive cubic multiplication */ E = mzd_mul_m4rm(NULL, A, A, k); mzd_add(E, E, C); /* F = C + A*B via naive cubic multiplication */ F = mzd_copy(NULL, C); F = mzd_addmul(F, A, A, cutoff); mzd_free(A); mzd_free(C); if (mzd_equal(D, E) != TRUE) { printf(" M4RM != add,mul"); ret -=1; } if (mzd_equal(E, F) != TRUE) { printf(" add,mul = addmul"); ret -=1; } if (mzd_equal(F, D) != TRUE) { printf(" M4RM != addmul"); ret -=1; } if (ret==0) printf(" ... passed\n"); else printf(" ... FAILED\n"); mzd_free(D); mzd_free(E); mzd_free(F); return ret; } int main() { int status = 0; srandom(17); status += mul_test_equality(1, 1, 1, 0, 1024); status += mul_test_equality(1, 128, 128, 0, 0); status += mul_test_equality(3, 131, 257, 0, 0); status += mul_test_equality(64, 64, 64, 0, 64); status += mul_test_equality(128, 128, 128, 0, 64); status += mul_test_equality(21, 171, 31, 0, 63); status += mul_test_equality(21, 171, 31, 0, 131); status += mul_test_equality(193, 65, 65, 10, 64); status += mul_test_equality(1025, 1025, 1025, 3, 256); status += mul_test_equality(2048, 2048, 4096, 0, 1024); status += mul_test_equality(4096, 3528, 4096, 0, 1024); status += mul_test_equality(1024, 1025, 1, 0, 1024); status += mul_test_equality(1000,1000,1000, 0, 256); status += mul_test_equality(1000,10,20, 0, 64); status += mul_test_equality(1710,1290,1000, 0, 256); status += mul_test_equality(1290, 1710, 200, 0, 64); status += mul_test_equality(1290, 1710, 2000, 0, 256); status += mul_test_equality(1290, 1290, 2000, 0, 64); status += 
mul_test_equality(1000, 210, 200, 0, 64); status += addmul_test_equality(1, 128, 128, 0, 0); status += addmul_test_equality(3, 131, 257, 0, 0); status += addmul_test_equality(64, 64, 64, 0, 64); status += addmul_test_equality(128, 128, 128, 0, 64); status += addmul_test_equality(21, 171, 31, 0, 63); status += addmul_test_equality(21, 171, 31, 0, 131); status += addmul_test_equality(193, 65, 65, 10, 64); status += addmul_test_equality(1025, 1025, 1025, 3, 256); status += addmul_test_equality(4096, 4096, 4096, 0, 2048); status += addmul_test_equality(1000, 1000, 1000, 0, 256); status += addmul_test_equality(1000, 10, 20, 0, 64); status += addmul_test_equality(1710, 1290, 1000, 0, 256); status += addmul_test_equality(1290, 1710, 200, 0, 64); status += addmul_test_equality(1290, 1710, 2000, 0, 256); status += addmul_test_equality(1290, 1290, 2000, 0, 64); status += addmul_test_equality(1000, 210, 200, 0, 64); status += sqr_test_equality(1, 0, 1024); status += sqr_test_equality(128, 0, 0); status += sqr_test_equality(131, 0, 0); status += sqr_test_equality(64, 0, 64); status += sqr_test_equality(128, 0, 64); status += sqr_test_equality(171, 0, 63); status += sqr_test_equality(171, 0, 131); status += sqr_test_equality(193, 8, 64); status += sqr_test_equality(1025, 3, 256); status += sqr_test_equality(2048, 0, 1024); status += sqr_test_equality(3528, 0, 1024); status += sqr_test_equality(1000, 0, 256); status += sqr_test_equality(1000, 0, 64); status += sqr_test_equality(1710, 0, 256); status += sqr_test_equality(1290, 0, 64); status += sqr_test_equality(2000, 0, 256); status += sqr_test_equality(2000, 0, 64); status += sqr_test_equality(210, 0, 64); status += addsqr_test_equality(1, 0, 0); status += addsqr_test_equality(131, 0, 0); status += addsqr_test_equality(64, 0, 64); status += addsqr_test_equality(128, 0, 64); status += addsqr_test_equality(171, 0, 63); status += addsqr_test_equality(171, 0, 131); status += addsqr_test_equality(193, 8, 64); status += 
addsqr_test_equality(1025, 3, 256); status += addsqr_test_equality(4096, 0, 2048); status += addsqr_test_equality(1000, 0, 256); status += addsqr_test_equality(1000, 0, 64); status += addsqr_test_equality(1710, 0, 256); status += addsqr_test_equality(1290, 0, 64); status += addsqr_test_equality(2000, 0, 256); status += addsqr_test_equality(2000, 0, 64); status += addsqr_test_equality(210, 0, 64); if (status == 0) { printf("All tests passed.\n"); return 0; } else { return -1; } } libm4ri-20130416/testsuite/test_ple.c000066400000000000000000000202551212302366200173330ustar00rootroot00000000000000#include #include #include int test_pluq_full_rank (rci_t m, rci_t n){ printf("pluq: testing full rank m: %5d, n: %5d", m, n); mzd_t* U = mzd_init (m,n); mzd_t* L = mzd_init (m,m); mzd_t* U2 = mzd_init (m,n); mzd_t* L2 = mzd_init (m,m); mzd_t* A = mzd_init (m,n); mzd_randomize (U); mzd_randomize (L); for (rci_t i = 0; i < m; ++i){ for (rci_t j = 0; j < i && j < n;++j) mzd_write_bit(U,i,j, 0); for (rci_t j = i + 1; j < m; ++j) mzd_write_bit(L,i,j, 0); if(i * RSA-1024 0x624ACAD5 1997-01-26 Sign & Encrypt * Fingerprint16 = 32 EC A7 B6 AC DB 65 A6 F6 F6 55 DD 1C DC FF 61 * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #include #include #include #include int test_random(rci_t m, rci_t n) { mzd_t *A = mzd_init(m + 3, n + 64); mzd_t *W = mzd_init_window(A, 1, 13, m + 1, n + 13); mzd_t *M = mzd_init(m, n); printf("randomize m: %4d, n: %4d ", m, n); srandom(17); mzd_randomize(M); srandom(17); mzd_randomize(W); int failure = !mzd_equal(M, W); if (failure) { #if 1 printf("FAILED\n"); #else printf("FAILURE: M != W:\n"); printf("M %dx%d:\n", m, n); mzd_print(M); printf("W %dx%d:\n", m, n); mzd_print(W); #endif } else printf("passed\n"); mzd_free(M); mzd_free(A); return failure; } int main() { int status = 0; srandom(17); for (rci_t n = 0; n < 3 * m4ri_radix; n += m4ri_radix) { status += test_random(20, n + 1); status += test_random(20, n + 2); status += test_random(20, n + 32); status += test_random(20, n + 50); status += test_random(20, n + 51); status += test_random(20, n + 52); status += test_random(20, n + 63); status += test_random(20, n + 64); status += test_random(20, n + 65); } if (!status) { printf("All tests passed.\n"); } else { printf("TEST FAILED!\n"); return 1; } return 0; } libm4ri-20130416/testsuite/test_smallops.c000066400000000000000000000075241212302366200204110ustar00rootroot00000000000000#include #include #include #include "testing.h" #include //#define ABORT_ON_FAIL 1 int smallops_test_add(rci_t M, rci_t N, rci_t m, rci_t n, rci_t offset, word pattern) { int ret = 0; printf(" mzd_add: M: %4d, N: %4d, m: %4d, n: %4d, offset: %4d, pattern: 0x%" PRIx64 " ", M, N, m, n, offset, pattern); mzd_t *AA; mzd_t *A = mzd_init_test_matrix_random(M, N, m, n, offset, pattern, &AA); mzd_t *BB; mzd_t *B = mzd_init_test_matrix_random(M, N, m, n, offset, pattern, &BB); mzd_t *CC; mzd_t *C = mzd_init_test_matrix_random(M, N, m, n, offset, pattern, &CC); mzd_t *DD; mzd_t *D = mzd_init_test_matrix_random(M, N, m, n, offset, pattern, &DD); /* Creation went okay? 
*/ ret += mzd_check_pattern(AA, m, n, offset, pattern); ret += mzd_check_pattern(BB, m, n, offset, pattern); ret += mzd_check_pattern(CC, m, n, offset, pattern); ret += mzd_check_pattern(DD, m, n, offset, pattern); /* Testing equality A+A == 0 */ mzd_add(C, A, A); if(!mzd_is_zero(C)) { ret +=1; } ret += mzd_check_pattern(AA, m, n, offset, pattern); ret += mzd_check_pattern(BB, m, n, offset, pattern); ret += mzd_check_pattern(CC, m, n, offset, pattern); /* Testing equality A+A == 0 but this time C is already zero */ mzd_add(C, B, B); if(!mzd_is_zero(C)) { ret +=1; } ret += mzd_check_pattern(AA, m, n, offset, pattern); ret += mzd_check_pattern(BB, m, n, offset, pattern); ret += mzd_check_pattern(CC, m, n, offset, pattern); /* Testing in place add. C is zero, so afterwards C == A */ mzd_add(C, C, A); if(!mzd_equal(C,A)) { ret +=1; } ret += mzd_check_pattern(AA, m, n, offset, pattern); ret += mzd_check_pattern(BB, m, n, offset, pattern); ret += mzd_check_pattern(CC, m, n, offset, pattern); /* Testing equality C (== A) + A == 0 */ mzd_add(B, C, A); if(!mzd_is_zero(B)) { ret +=1; } if(m == n) { /* Testing equality (A + B)^2 == A^2 + BA + AB + B^2 */ mzd_randomize(A); mzd_randomize(B); mzd_add(C,A,B); mzd_mul(D,C,C, 0); // (A+B)^2 mzd_mul(C,A,A, 0); mzd_addmul(C, B, A, 0); mzd_addmul(C, A, B, 0); mzd_addmul(C, B, B, 0); if(!mzd_equal(C,D)) { ret += 1; } ret += mzd_check_pattern(AA, m, n, offset, pattern); ret += mzd_check_pattern(BB, m, n, offset, pattern); ret += mzd_check_pattern(CC, m, n, offset, pattern); ret += mzd_check_pattern(DD, m, n, offset, pattern); } mzd_free_test_matrix_random(AA, A); mzd_free_test_matrix_random(BB, B); mzd_free_test_matrix_random(CC, C); mzd_free_test_matrix_random(DD, D); if(ret == 0) { printf(" ... passed\n"); } else { printf(" ... 
FAILED\n"); } #ifdef ABORT_ON_FAIL if (ret) abort(); #endif return ret; } int main() { int status = 0; srandom(17); status += smallops_test_add(64, 64, 10, 10, 10, 0x03030303030303llu); status += smallops_test_add(100, 100, 64, 64, 1, 0x03030303030303llu); status += smallops_test_add(100, 100, 64, 64, 1, 0x03030303030303llu); status += smallops_test_add(1024, 1024, 513, 511, 10, 0x03030303030303llu); status += smallops_test_add(1024, 1024, 513, 511, 63, 0x03030303030303llu); status += smallops_test_add(1024, 1024, 513, 511, 64, 0x03030303030303llu); status += smallops_test_add(1024, 1024, 513, 511, 65, 0x03030303030303llu); status += smallops_test_add(1024, 1024, 512, 768+30, 0, 0x03030303030303llu); status += smallops_test_add(2048, 2048, 1024, 1024, 0, 0x03030303030303llu); status += smallops_test_add(2048, 2048, 1024, 1024, 63, 0x03030303030303llu); status += smallops_test_add(2048, 2048, 1024, 1024, 64, 0x03030303030303llu); status += smallops_test_add(2048, 2048, 1024, 1024, 65, 0x03030303030303llu); if (status == 0) { printf("All tests passed.\n"); return 0; } else { return -1; } } libm4ri-20130416/testsuite/test_solve.c000066400000000000000000000037321212302366200177040ustar00rootroot00000000000000#ifndef _GNU_SOURCE #define _GNU_SOURCE #endif #include #include #include int test_pluq_solve_left(rci_t m, rci_t n, int offsetA, int offsetB){ mzd_t* Abase = mzd_init(2048, 2048); mzd_t* Bbase = mzd_init(2048, 2048); mzd_randomize(Abase); mzd_randomize(Bbase); mzd_t* A = mzd_init_window(Abase, 0, offsetA, m, m + offsetA); mzd_t* B = mzd_init_window(Bbase, 0, offsetB, m, n + offsetB); // copy B mzd_t* Bcopy = mzd_init(B->nrows, B->ncols); for (rci_t i = 0; i < B->nrows; ++i) for (rci_t j = 0; j < B->ncols; ++j) mzd_write_bit(Bcopy,i,j, mzd_read_bit (B,i,j)); for (rci_t i = 0; i < m; ++i) { mzd_write_bit(A,i,i, 1); } mzd_t *Acopy = mzd_copy(NULL, A); rci_t r = mzd_echelonize(Acopy,1); printf("solve_left m: %4d, n: %4d, r: %4d da: %4d db: %4d ", m, n, r, offsetA, 
offsetB); mzd_free(Acopy); Acopy = mzd_copy(NULL, A); int consistency = mzd_solve_left(A, B, 0, 1); //copy B mzd_t *X = mzd_init(B->nrows,B->ncols); for (rci_t i = 0; i < B->nrows; ++i) for (rci_t j = 0; j < B->ncols; ++j) mzd_write_bit(X,i,j, mzd_read_bit (B,i,j)); mzd_t *B1 = mzd_mul(NULL, Acopy, X, 0); mzd_t *Z = mzd_add(NULL, Bcopy, B1); int status = 0; if(consistency == 0) { status = 1 - mzd_is_zero(Z); if (status == 0) { printf("passed\n"); } else { printf("FAILED\n"); } } else { printf("skipped (no solution)\n"); } mzd_free(Bcopy); mzd_free(B1); mzd_free(Z); mzd_free_window(A); mzd_free_window(B); mzd_free(Acopy); mzd_free(Abase); mzd_free(Bbase); mzd_free(X); return status; } int main() { int status = 0; srandom(17); for(size_t i=0; i<100; i++) { size_t m = random() & 511; size_t n = random() & 1023; m = m ? (m) : 1; n = n ? (n) : 1; status += test_pluq_solve_left( m, n, 0, 0); } if (!status) { printf("All tests passed.\n"); } else { return 1; } return 0; } libm4ri-20130416/testsuite/test_transpose.c000066400000000000000000000054211212302366200205670ustar00rootroot00000000000000/* * test_transpose.c * * Application to test functionality of mzd_transpose. * * Copyright (C) 2011 Carlo Wood * RSA-1024 0x624ACAD5 1997-01-26 Sign & Encrypt * Fingerprint16 = 32 EC A7 B6 AC DB 65 A6 F6 F6 55 DD 1C DC FF 61 * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #include int test_size[18] = { 1, 3, 4, 7, 8, 11, 16, 17, 32, 40, 64, 80, 128, 160, 192, 240, 256, 512 }; int test_transpose(int i) { int failure = 0; rci_t m = test_size[i]; printf("transpose m: %4d, n: ", m); for (int j = 0; j < 18 && !failure; ++j) { rci_t n = test_size[j]; printf("%d", n); if (j != 17) printf(","); int size = m * n; int loop_size = MAX(64 * 64 / size, 2); for (int i = 0; i < loop_size && !failure; ++i) { mzd_t* A = mzd_init(m, n); mzd_t* B = mzd_init(m, n); mzd_randomize(A); mzd_randomize(B); mzd_t* C = mzd_add(NULL, A, B); mzd_t* AT = mzd_init(n, m); mzd_randomize(AT); mzd_transpose(AT, A); mzd_t* BT = mzd_transpose(NULL, B); mzd_t* CT = mzd_add(NULL, AT, BT); mzd_t* CTT = mzd_transpose(NULL, CT); if (!mzd_equal(C, CTT)) ++failure; mzd_free(A); mzd_free(B); mzd_free(C); mzd_free(AT); mzd_free(BT); mzd_free(CT); mzd_free(CTT); } } printf(" "); if (failure) { printf("FAILED\n"); } else printf("passed\n"); return failure; } int main() { int status = 0; int m=3; int n=64; mzd_t* A = mzd_init(m, n); mzd_t* B = mzd_init(m, n); mzd_randomize(A); mzd_randomize(B); mzd_t* C = mzd_add(NULL, A, B); mzd_t* AT = mzd_init(n, m); mzd_randomize(AT); mzd_transpose(AT, A); mzd_t* BT = mzd_transpose(NULL, B); mzd_t* CT = mzd_add(NULL, AT, BT); mzd_t* CTT = mzd_transpose(NULL, CT); if (!mzd_equal(C, CTT)) ++status; mzd_free(A); mzd_free(B); mzd_free(C); mzd_free(AT); mzd_free(BT); mzd_free(CT); mzd_free(CTT); /* for (int i = 0; i < 18; ++i) { */ /* status += test_transpose(i); */ /* } */ if (!status) { printf("All tests passed.\n"); } else { printf("TEST FAILED!\n"); return 1; } return 0; } libm4ri-20130416/testsuite/test_trsm.c000066400000000000000000000303101212302366200175310ustar00rootroot00000000000000#include #include #include //#define RANDOMIZE int test_trsm_upper_right (rci_t m, rci_t n, int offset, const char* description){ printf("upper_right: %s m: %4d n: %4d offset: %4d ... 
", description, m, n, offset); mzd_t* Ubase = mzd_init (2048, 2048); mzd_t* Bbase = mzd_init (2048, 2048); mzd_randomize(Ubase); mzd_randomize(Bbase); mzd_t* Bbasecopy = mzd_copy (NULL, Bbase); mzd_t* U = mzd_init_window (Ubase, 0, offset, n, n + offset); mzd_t* B = mzd_init_window (Bbase, 0, offset, m, n + offset); mzd_t* W = mzd_copy (NULL, B); for (rci_t i = 0; i < n; ++i){ for (rci_t j = 0; j < i; ++j) mzd_write_bit(U,i,j, 0); mzd_write_bit(U,i,i, 1); } mzd_trsm_upper_right (U, B, 2048); mzd_addmul(W, B, U, 2048); int status = 0; for (rci_t i = 0; i < m; ++i) for (rci_t j = 0; j < n; ++j){ if (mzd_read_bit (W,i,j)){ status = 1; } } // Verifiying that nothing has been changed around the submatrices mzd_addmul(W, B, U, 2048); mzd_copy (B, W); for (rci_t i = 0; i < 2048; ++i) for (wi_t j = 0; j < 2048 / m4ri_radix; ++j){ if (Bbase->rows[i][j] != Bbasecopy->rows[i][j]){ status = 1; } } mzd_free_window (U); mzd_free_window (B); mzd_free (W); mzd_free(Ubase); mzd_free(Bbase); mzd_free(Bbasecopy); if (!status) printf("passed\n"); else printf("FAILED\n"); return status; } int test_trsm_lower_right (rci_t m, rci_t n, int offset, const char *description){ printf("lower_right: %s m: %4d n: %4d offset: %4d ... 
", description, m, n, offset); mzd_t* Lbase = mzd_init (2048, 2048); mzd_t* Bbase = mzd_init (2048, 2048); mzd_randomize (Lbase); mzd_randomize (Bbase); mzd_t* Bbasecopy = mzd_copy (NULL, Bbase); mzd_t* L = mzd_init_window(Lbase, 0, offset, n, n + offset); mzd_t* B = mzd_init_window (Bbase, 0, offset, m, n + offset); mzd_t* W = mzd_copy (NULL, B); for (rci_t i = 0; i < n; ++i){ for (rci_t j = i + 1; j < n; ++j) mzd_write_bit(L,i,j, 0); mzd_write_bit(L,i,i, 1); } mzd_trsm_lower_right (L, B, 2048); mzd_addmul(W, B, L, 2048); int status = 0; for (rci_t i = 0; i < m; ++i) for (rci_t j = 0; j < n; ++j){ if (mzd_read_bit (W,i,j)){ status = 1; } } // Verifiying that nothing has been changed around the submatrices mzd_addmul(W, B, L, 2048); mzd_copy (B, W); for (rci_t i = 0; i < 2048; ++i) for (wi_t j = 0; j < 2048 / m4ri_radix; ++j){ if (Bbase->rows[i][j] != Bbasecopy->rows[i][j]){ status = 1; } } mzd_free_window (L); mzd_free_window (B); mzd_free (W); mzd_free(Lbase); mzd_free(Bbase); mzd_free(Bbasecopy); if (!status) printf("passed\n"); else printf("FAILED\n"); return status; } int test_trsm_lower_left (rci_t m, rci_t n, int offsetL, int offsetB, const char *description){ printf("lower_left: %s m: %4d n: %4d offset L: %4d offset B: %4d ... 
", description, m, n, offsetL, offsetB); mzd_t* Lbase = mzd_init (2048, 2048); mzd_t* Bbase = mzd_init (2048, 2048); mzd_randomize (Lbase); mzd_randomize (Bbase); mzd_t* Bbasecopy = mzd_copy (NULL, Bbase); mzd_t* L = mzd_init_window (Lbase, 0, offsetL, m, m + offsetL); mzd_t* B = mzd_init_window (Bbase, 0, offsetB, m, n + offsetB); mzd_t* W = mzd_copy (NULL, B); for (rci_t i = 0; i < m; ++i){ for (rci_t j = i + 1; j < m; ++j) mzd_write_bit(L,i,j, 0); mzd_write_bit(L,i,i, 1); } mzd_trsm_lower_left(L, B, 2048); mzd_addmul(W, L, B, 2048); int status = 0; for (rci_t i = 0; i < m; ++i) for (rci_t j = 0; j < n; ++j){ if (mzd_read_bit (W,i,j)){ status = 1; } } // Verifiying that nothing has been changed around the submatrices mzd_addmul(W, L, B, 2048); mzd_copy (B, W); for (rci_t i = 0; i < 2048; ++i) for (wi_t j = 0; j < 2048 / m4ri_radix; ++j){ if (Bbase->rows[i][j] != Bbasecopy->rows[i][j]){ status = 1; } } mzd_free_window (L); mzd_free_window (B); mzd_free_window (W); mzd_free(Lbase); mzd_free(Bbase); mzd_free(Bbasecopy); if (!status) printf(" ... passed\n"); else printf(" ... FAILED\n"); return status; } int test_trsm_upper_left (rci_t m, rci_t n, int offsetU, int offsetB, const char *description) { printf("upper_left: %s m: %4d n: %4d offset U: %4d offset B: %4d ... 
", description, m, n, offsetU, offsetB); mzd_t* Ubase = mzd_init (2048, 2048); mzd_t* Bbase = mzd_init (2048, 2048); mzd_randomize (Ubase); mzd_randomize (Bbase); mzd_t* Bbasecopy = mzd_copy (NULL, Bbase); mzd_t* U = mzd_init_window (Ubase, 0, offsetU, m, m + offsetU); mzd_t* B = mzd_init_window (Bbase, 0, offsetB, m, n + offsetB); mzd_t* W = mzd_copy (NULL, B); for (rci_t i = 0; i < m; ++i){ for (rci_t j = 0; j < i; ++j) mzd_write_bit(U,i,j, 0); mzd_write_bit(U,i,i, 1); } mzd_trsm_upper_left(U, B, 2048); mzd_addmul(W, U, B, 2048); int status = 0; for (rci_t i = 0; i < m; ++i) for (rci_t j = 0; j < n; ++j){ if (mzd_read_bit (W,i,j)){ status = 1; } } // Verifiying that nothing has been changed around the submatrices mzd_addmul(W, U, B, 2048); mzd_copy (B, W); for (rci_t i = 0; i < 2048; ++i) for (wi_t j = 0; j < 2048 / m4ri_radix; ++j){ if (Bbase->rows[i][j] != Bbasecopy->rows[i][j]){ status = 1; } } mzd_free_window (U); mzd_free_window (B); mzd_free_window (W); mzd_free(Ubase); mzd_free(Bbase); mzd_free(Bbasecopy); if (!status) printf("passed\n"); else printf("FAILED\n"); return status; } int main() { int status = 0; srandom(17); status += test_trsm_upper_right( 63, 63, 0, " word boundaries, even"); status += test_trsm_upper_right( 64, 64, 0, " word boundaries, even"); status += test_trsm_upper_right( 65, 65, 0, " word boundaries, even"); status += test_trsm_upper_right( 53, 53, 10, "word boundaries, offset"); status += test_trsm_upper_right( 54, 54, 10, "word boundaries, offset"); status += test_trsm_upper_right( 55, 55, 10, "word boundaries, offset"); status += test_trsm_upper_right( 57, 10, 0, " small, even placed"); status += test_trsm_upper_right( 57, 150, 0, " large, even placed"); status += test_trsm_upper_right( 57, 3, 4, " small, odd placed"); status += test_trsm_upper_right( 57, 4, 62, " medium, odd placed"); status += test_trsm_upper_right( 57, 80, 60, " large, odd placed"); status += test_trsm_upper_right(1577, 1802, 189, " larger, odd placed"); #ifdef 
RANDOMIZE for(size_t i=0; i<256; i++) { status += test_trsm_upper_right(random() & 2047, random() & 2047, random() & 63, "randomized"); } #endif printf("\n"); status += test_trsm_lower_right( 63, 63, 0," word boundaries, even"); status += test_trsm_lower_right( 64, 64, 0," word boundaries, even"); status += test_trsm_lower_right( 65, 65, 0," word boundaries, even"); status += test_trsm_lower_right( 53, 53, 10,"word boundaries, offset"); status += test_trsm_lower_right( 54, 54, 10,"word boundaries, offset"); status += test_trsm_lower_right( 55, 55, 10,"word boundaries, offset"); status += test_trsm_lower_right( 57, 10, 0," small, even placed"); status += test_trsm_lower_right( 57, 150, 0," large, even placed"); status += test_trsm_lower_right( 57, 3, 4," small, odd placed"); status += test_trsm_lower_right( 57, 4, 62," medium, odd placed"); status += test_trsm_lower_right( 57, 80, 60," large, odd placed"); status += test_trsm_lower_right(1577, 1802,189," larger, odd placed"); #ifdef RANDOMIZE for(size_t i=0; i<256; i++) { status += test_trsm_lower_right(random() & 2047, random() & 2047, random() & 63, "randomized"); } #endif printf("\n"); status += test_trsm_lower_left( 63, 63, 0, 0, " word boundaries, even"); status += test_trsm_lower_left( 64, 64, 0, 0, " word boundaries, even"); status += test_trsm_lower_left( 65, 65, 0, 0, " word boundaries, even"); status += test_trsm_lower_left( 53, 53, 10, 10, " word boundaries, offset"); status += test_trsm_lower_left( 54, 54, 10, 10, " word boundaries, offset"); status += test_trsm_lower_left( 55, 55, 10, 10, " word boundaries, offset"); status += test_trsm_lower_left( 10, 20, 0, 0, " small L even, small B even"); status += test_trsm_lower_left( 10, 80, 0, 0, " small L even, large B even"); status += test_trsm_lower_left( 10, 20, 0, 15, " small L even, small B odd"); status += test_trsm_lower_left( 10, 80, 0, 15, " small L even, large B odd"); status += test_trsm_lower_left( 10, 20, 15, 0, " small L odd, small B even"); 
status += test_trsm_lower_left( 10, 80, 15, 0, " small L odd, large B even"); status += test_trsm_lower_left( 10, 20, 15, 20, " small L odd, small B odd"); status += test_trsm_lower_left( 10, 80, 15, 20, " small L odd, large B odd"); status += test_trsm_lower_left( 70, 20, 0, 0, " large L even, small B even"); status += test_trsm_lower_left( 70, 80, 0, 0, " large L even, large B even"); status += test_trsm_lower_left( 70, 10, 0, 15, " large L even, large B odd"); status += test_trsm_lower_left( 70, 80, 0, 15, " large L even, large B odd"); status += test_trsm_lower_left( 70, 20, 15, 0, " large L odd, small B even"); status += test_trsm_lower_left( 70, 80, 15, 0, " large L odd, large B even"); status += test_trsm_lower_left( 70, 20, 15, 20, " large L odd, small B odd"); status += test_trsm_lower_left( 70, 80, 15, 20, " large L odd, large B odd"); status += test_trsm_lower_left( 770, 1600, 75, 89, " larger L odd, larger B odd"); status += test_trsm_lower_left(1764, 1345, 198, 123, " larger L odd, larger B odd"); #ifdef RANDOMIZE for(size_t i=0; i<256; i++) { status += test_trsm_lower_left(random() & 2047, random() & 2047, random() & 63, random() & 63, "randomized"); } #endif printf("\n"); status += test_trsm_upper_left( 63, 63, 0, 0," word boundaries, even"); status += test_trsm_upper_left( 64, 64, 0, 0," word boundaries, even"); status += test_trsm_upper_left( 65, 65, 0, 0," word boundaries, even"); status += test_trsm_upper_left( 53, 53, 10, 10," word boundaries, offset"); status += test_trsm_upper_left( 54, 54, 10, 10," word boundaries, offset"); status += test_trsm_upper_left( 55, 55, 10, 10," word boundaries, offset"); status += test_trsm_upper_left( 10, 20, 0, 0,"small even, small B even"); status += test_trsm_upper_left( 10, 80, 0, 0,"small even, large B even"); status += test_trsm_upper_left( 10, 20, 0, 15," small even, small B odd"); status += test_trsm_upper_left( 10, 80, 0, 15," small even, large B odd"); status += test_trsm_upper_left( 10, 20, 15, 0," 
small odd, small B even"); status += test_trsm_upper_left( 10, 80, 15, 0," small odd, large B even"); status += test_trsm_upper_left( 10, 20, 15, 20," small odd, small B odd"); status += test_trsm_upper_left( 10, 80, 15, 20," small odd, large B odd"); status += test_trsm_upper_left( 70, 20, 0, 0,"large even, small B even"); status += test_trsm_upper_left( 63, 1, 0, 0," "); status += test_trsm_upper_left( 70, 80, 0, 0,"large even, large B even"); status += test_trsm_upper_left( 70, 10, 0, 15," large even, small B odd"); status += test_trsm_upper_left( 70, 80, 0, 15," large even, large B odd"); status += test_trsm_upper_left( 70, 20, 15, 0," large odd, small B even"); status += test_trsm_upper_left( 70, 80, 15, 0," large odd, large B even"); status += test_trsm_upper_left( 70, 20, 15, 20," large odd, small B odd"); status += test_trsm_upper_left( 70, 80, 15, 20," large odd, large B odd"); status += test_trsm_upper_left( 770,1600, 75, 89,"larger odd, larger B odd"); status += test_trsm_upper_left(1764,1345,198,123,"larger odd, larger B odd"); #ifdef RANDOMIZE for(size_t i=0; i<256; i++) { status += test_trsm_upper_left(random() & 2047, random() & 2047, random() & 63, random() & 63, "randomized"); } #endif if (!status) { printf("All tests passed.\n"); return 0; } else { return -1; } } libm4ri-20130416/testsuite/testing.c000066400000000000000000000031541212302366200171700ustar00rootroot00000000000000#include "testing.h" mzd_t *mzd_init_test_matrix_random(rci_t M, rci_t N, rci_t m, rci_t n, rci_t offset, word pattern, mzd_t **A) { *A = mzd_init(M, N); for(rci_t i=0; iwidth; j++) { (*A)->rows[i][j] = pattern; } } mzd_t* a = mzd_init_window(*A, offset, offset, offset + m, offset + n); mzd_randomize(a); return a; } void mzd_free_test_matrix_random(mzd_t *A, mzd_t *a) { mzd_free(a); mzd_free(A); } int mzd_check_pattern(mzd_t *A, rci_t m, rci_t n, rci_t offset, word pattern) { for(rci_t i=0; inrows; i++) { if (i= m+offset) { for(rci_t j=0; jwidth; j++) if(A->rows[i][j] ^ 
pattern) { return 1; } } else { for(rci_t j=0; j < (offset/m4ri_radix); j++) if(A->rows[i][j] ^ pattern) { return 1; } if ( (offset/m4ri_radix) == (offset+n)/m4ri_radix ) { word const mask = ~__M4RI_MIDDLE_BITMASK(m, offset%m4ri_radix); if( (A->rows[i][offset/m4ri_radix] ^ pattern) & mask ) { return 1; } } else { word const mask_begin = ~__M4RI_RIGHT_BITMASK(m4ri_radix - offset%m4ri_radix); word const mask_end = ~__M4RI_LEFT_BITMASK((n + offset) % m4ri_radix); if( (A->rows[i][offset/m4ri_radix] ^ pattern) & mask_begin ) { return 1; } if( (A->rows[i][(offset+n)/m4ri_radix] ^ pattern) & mask_end ) { return 1; } } for(rci_t j=(offset+n)/m4ri_radix+1; jwidth; j++) if(A->rows[i][j] ^ pattern) { return 1; } } } return 0; } libm4ri-20130416/testsuite/testing.h000066400000000000000000000005331212302366200171730ustar00rootroot00000000000000#ifndef TESTING_H #define TESTING_H #include #include mzd_t *mzd_init_test_matrix_random(rci_t M, rci_t N, rci_t m, rci_t n, rci_t offset, word pattern, mzd_t **A); void mzd_free_test_matrix_random(mzd_t *A, mzd_t *a); int mzd_check_pattern(mzd_t *A, rci_t m, rci_t n, rci_t offset, word pattern); #endif //TESTING_H