pax_global_header00006660000000000000000000000064151642363540014523gustar00rootroot0000000000000052 comment=c86b8155aa81d4598aaf29442a58c6ccaf493e52 kanzi-cpp-2.5.2/000077500000000000000000000000001516423635400134255ustar00rootroot00000000000000kanzi-cpp-2.5.2/.github/000077500000000000000000000000001516423635400147655ustar00rootroot00000000000000kanzi-cpp-2.5.2/.github/workflows/000077500000000000000000000000001516423635400170225ustar00rootroot00000000000000kanzi-cpp-2.5.2/.github/workflows/c-cpp.yml000066400000000000000000000005421516423635400205500ustar00rootroot00000000000000name: C/C++ CI on: push: branches: [ master ] pull_request: branches: [ master ] jobs: build: strategy: matrix: os: [ ubuntu-latest, macos-latest ] compiler: [ clang, gcc ] runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@v6.0.1 - name: make run: cd src && make clean && make all kanzi-cpp-2.5.2/.github/workflows/codeql.yml000066400000000000000000000056211516423635400210200ustar00rootroot00000000000000# For most projects, this workflow file will not need changing; you simply need # to commit it to your repository. # # You may wish to alter this file to override the set of languages analyzed, # or to provide custom queries or build logic. # # ******** NOTE ******** # We have attempted to detect the languages in your repository. Please check # the `language` matrix defined below to confirm you have the correct set of # supported CodeQL languages. 
# name: "CodeQL" on: push: branches: [ "master" ] pull_request: # The branches below must be a subset of the branches above branches: [ "master" ] schedule: - cron: '18 0 * * 5' jobs: analyze: name: Analyze runs-on: ubuntu-latest permissions: actions: read contents: read security-events: write strategy: fail-fast: false matrix: language: [ 'cpp' ] # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ] # Use only 'java' to analyze code written in Java, Kotlin or both # Use only 'javascript' to analyze code written in JavaScript, TypeScript or both # Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support steps: - name: Checkout repository uses: actions/checkout@v6.0.1 # Initializes the CodeQL tools for scanning. - name: Initialize CodeQL uses: github/codeql-action/init@v4.31.9 with: languages: ${{ matrix.language }} # If you wish to specify custom queries, you can do so here or in a config file. # By default, queries listed here will override any specified in a config file. # Prefix the list here with "+" to use these queries and those in the config file. # Details on CodeQL's query packs refer to : https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs # queries: security-extended,security-and-quality # Autobuild attempts to build any compiled languages (C/C++, C#, Go, or Java). # If this step fails, then you should remove it and run the build manually (see below) - name: Autobuild uses: github/codeql-action/autobuild@v4.31.9 # ℹ️ Command-line programs to run using the OS shell. # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun # If the Autobuild fails above, remove it and uncomment the following three lines. # modify them (or add more) to build your code if your project, please refer to the EXAMPLE below for guidance. 
# - run: | # echo "Run, Build Application using script" # ./location_of_script_within_repo/buildscript.sh - name: Perform CodeQL Analysis uses: github/codeql-action/analyze@v4.31.9 with: category: "/language:${{matrix.language}}" kanzi-cpp-2.5.2/CMakeLists.txt000066400000000000000000000320071516423635400161670ustar00rootroot00000000000000cmake_minimum_required(VERSION 3.10) set(KANZI_PROJECT_VERSION "2.5.1") set(KANZI_ABI_VERSION "1.0.0") project(kanzi VERSION ${KANZI_PROJECT_VERSION} LANGUAGES C CXX) include(GNUInstallDirs) include(CMakePackageConfigHelpers) string(REPLACE "." ";" KANZI_ABI_VERSION_PARTS "${KANZI_ABI_VERSION}") list(GET KANZI_ABI_VERSION_PARTS 0 KANZI_SOVERSION) option(KANZI_ENABLE_NATIVE_OPTIMIZATIONS "Enable CPU-specific optimizations such as -march=native" ON) set(KANZI_INSTALL_RELATIVE_RPATH_DEFAULT ON) if(CMAKE_INSTALL_PREFIX STREQUAL "/usr") set(KANZI_INSTALL_RELATIVE_RPATH_DEFAULT OFF) endif() option(KANZI_INSTALL_RELATIVE_RPATH "Install the CLI with a relative runtime search path for the shared library" ${KANZI_INSTALL_RELATIVE_RPATH_DEFAULT} ) set(KANZI_CMAKE_INSTALL_CMAKEDIR "${CMAKE_INSTALL_LIBDIR}/cmake/Kanzi") # Set C++ standard set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_EXTENSIONS OFF) #set(CMAKE_CXX_COMPILER "clang++") # Set C standard (for TestAPI.c) set(CMAKE_C_STANDARD 11) #set(CMAKE_C_STANDARD_REQUIRED True) #set(CMAKE_C_COMPILER "clang") set(CMAKE_THREAD_PREFER_PTHREAD TRUE) find_package(Threads REQUIRED) # ---------------------------------------------- if(CONCURRENCY_DISABLED) add_definitions(-DCONCURRENCY_DISABLED) endif() if(MSVC) set(COMMON_FLAGS "/W4 /O2 /DNDEBUG") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${COMMON_FLAGS} /GR-") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${COMMON_FLAGS}") else() set(COMMON_FLAGS "-Wall -Wextra -O3 -fomit-frame-pointer -fPIC -DNDEBUG -pedantic") if(KANZI_ENABLE_NATIVE_OPTIMIZATIONS) string(APPEND COMMON_FLAGS " -march=native") endif() set(CMAKE_CXX_FLAGS 
"${CMAKE_CXX_FLAGS} ${COMMON_FLAGS} -fno-rtti") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${COMMON_FLAGS}") endif() set(SRC_DIR "${CMAKE_CURRENT_SOURCE_DIR}/src") # Source files set(LIB_COMMON_SOURCES ${SRC_DIR}/Global.cpp ${SRC_DIR}/Event.cpp ${SRC_DIR}/util/WallTimer.cpp ${SRC_DIR}/entropy/EntropyUtils.cpp ${SRC_DIR}/entropy/HuffmanCommon.cpp ${SRC_DIR}/entropy/CMPredictor.cpp ${SRC_DIR}/entropy/TPAQPredictor.cpp ${SRC_DIR}/transform/AliasCodec.cpp ${SRC_DIR}/transform/BWT.cpp ${SRC_DIR}/transform/BWTS.cpp ${SRC_DIR}/transform/DivSufSort.cpp ${SRC_DIR}/transform/SBRT.cpp ${SRC_DIR}/transform/BWTBlockCodec.cpp ${SRC_DIR}/transform/LZCodec.cpp ${SRC_DIR}/transform/FSDCodec.cpp ${SRC_DIR}/transform/ROLZCodec.cpp ${SRC_DIR}/transform/RLT.cpp ${SRC_DIR}/transform/SRT.cpp ${SRC_DIR}/transform/TextCodec.cpp ${SRC_DIR}/transform/UTFCodec.cpp ${SRC_DIR}/transform/EXECodec.cpp ${SRC_DIR}/transform/ZRLT.cpp ) set(LIB_COMP_SOURCES ${SRC_DIR}/api/Compressor.cpp ${SRC_DIR}/bitstream/DebugOutputBitStream.cpp ${SRC_DIR}/bitstream/DefaultOutputBitStream.cpp ${SRC_DIR}/io/CompressedOutputStream.cpp ${SRC_DIR}/entropy/ANSRangeEncoder.cpp ${SRC_DIR}/entropy/BinaryEntropyEncoder.cpp ${SRC_DIR}/entropy/ExpGolombEncoder.cpp ${SRC_DIR}/entropy/FPAQEncoder.cpp ${SRC_DIR}/entropy/HuffmanEncoder.cpp ${SRC_DIR}/entropy/RangeEncoder.cpp ) set(LIB_DECOMP_SOURCES ${SRC_DIR}/api/Decompressor.cpp ${SRC_DIR}/bitstream/DebugInputBitStream.cpp ${SRC_DIR}/bitstream/DefaultInputBitStream.cpp ${SRC_DIR}/io/CompressedInputStream.cpp ${SRC_DIR}/entropy/ANSRangeDecoder.cpp ${SRC_DIR}/entropy/BinaryEntropyDecoder.cpp ${SRC_DIR}/entropy/ExpGolombDecoder.cpp ${SRC_DIR}/entropy/FPAQDecoder.cpp ${SRC_DIR}/entropy/HuffmanDecoder.cpp ${SRC_DIR}/entropy/RangeDecoder.cpp ) set(TEST_SOURCES ${SRC_DIR}/test/TestEntropyCodec.cpp ${SRC_DIR}/test/TestBWT.cpp ${SRC_DIR}/test/TestCompressedStream.cpp ${SRC_DIR}/test/TestDefaultBitStream.cpp ${SRC_DIR}/test/TestTransforms.cpp ${SRC_DIR}/test/TestAPI.cpp ) set(APP_SOURCES 
${SRC_DIR}/app/Kanzi.cpp ${SRC_DIR}/app/InfoPrinter.cpp ${SRC_DIR}/app/BlockCompressor.cpp ${SRC_DIR}/app/BlockDecompressor.cpp ) # Libraries add_library(libkanzi STATIC ${LIB_COMMON_SOURCES} ${LIB_COMP_SOURCES} ${LIB_DECOMP_SOURCES}) add_library(libkanzi_shared SHARED ${LIB_COMMON_SOURCES} ${LIB_COMP_SOURCES} ${LIB_DECOMP_SOURCES}) # This ensures -lpthread or -pthread is added to any executable linking these libs target_link_libraries(libkanzi PUBLIC Threads::Threads) target_link_libraries(libkanzi_shared PUBLIC Threads::Threads) target_include_directories(libkanzi PUBLIC $ $ ) target_include_directories(libkanzi_shared PUBLIC $ $ ) set_target_properties(libkanzi PROPERTIES OUTPUT_NAME "kanzi") set_target_properties(libkanzi_shared PROPERTIES OUTPUT_NAME "kanzi" VERSION "${KANZI_ABI_VERSION}" SOVERSION "${KANZI_SOVERSION}" ) #add_library(libkanzi_comp STATIC ${LIB_COMP_SOURCES}) #add_library(libkanzi_decomp STATIC ${LIB_DECOMP_SOURCES}) #add_library(libkanzi_comp_shared SHARED ${LIB_COMP_SOURCES}) #add_library(libkanzi_decomp_shared SHARED ${LIB_DECOMP_SOURCES}) # Executable target for C++ add_executable(testBWT ${SRC_DIR}/test/TestBWT.cpp) target_link_libraries(testBWT libkanzi) add_executable(testTransforms ${SRC_DIR}/test/TestTransforms.cpp) target_link_libraries(testTransforms libkanzi) add_executable(testEntropyCodec ${SRC_DIR}/test/TestEntropyCodec.cpp) target_link_libraries(testEntropyCodec libkanzi) add_executable(testDefaultBitStream ${SRC_DIR}/test/TestDefaultBitStream.cpp) target_link_libraries(testDefaultBitStream libkanzi) add_executable(testFactories ${SRC_DIR}/test/TestFactories.cpp) target_link_libraries(testFactories libkanzi) add_executable(testMalformedStream ${SRC_DIR}/test/TestMalformedStream.cpp) target_link_libraries(testMalformedStream libkanzi) add_executable(testCompressedStream ${SRC_DIR}/test/TestCompressedStream.cpp) target_link_libraries(testCompressedStream libkanzi) # Executable target for C API test (TestAPI.c) 
add_executable(testAPI ${SRC_DIR}/test/TestAPI.c) target_link_libraries(testAPI libkanzi) # IMPORTANT: Force use of the C++ linker because testAPI (a C program) links against the C++ library 'libkanzi' set_target_properties(testAPI PROPERTIES LINKER_LANGUAGE CXX) # Main executable # Dynamically linked executable add_executable(kanzi ${APP_SOURCES}) target_link_libraries(kanzi libkanzi_shared) # Keep the CLI runnable from relocatable prefixes, but avoid embedding an # RPATH for standard system package installs. if(UNIX AND NOT APPLE AND KANZI_INSTALL_RELATIVE_RPATH) set_target_properties(kanzi PROPERTIES BUILD_WITH_INSTALL_RPATH TRUE INSTALL_RPATH "$ORIGIN/../${CMAKE_INSTALL_LIBDIR}" ) endif() # Statically linked executable add_executable(kanzi_static ${APP_SOURCES}) target_link_libraries(kanzi_static libkanzi) # Custom target to build all tests (Named to avoid conflict with CTest 'make test') add_custom_target(build_tests DEPENDS testBWT testTransforms testEntropyCodec testDefaultBitStream testFactories testMalformedStream testCompressedStream testAPI ) # --- CTest Configuration --- enable_testing() if(DEFINED ENV{TMPDIR}) set(SYSTEM_TEMP_DIR "$ENV{TMPDIR}") # Linux/macOS standard elseif(DEFINED ENV{TEMP}) set(SYSTEM_TEMP_DIR "$ENV{TEMP}") # Windows standard elseif(DEFINED ENV{TMP}) set(SYSTEM_TEMP_DIR "$ENV{TMP}") # Windows fallback else() set(SYSTEM_TEMP_DIR "/tmp") # Fallback default endif() file(TO_CMAKE_PATH "${SYSTEM_TEMP_DIR}" SYSTEM_TEMP_DIR) # Register executables as CTest tests # Syntax: add_test(NAME COMMAND ) add_test(NAME BWT COMMAND testBWT -noperf) add_test(NAME Transforms COMMAND testTransforms -type=all -noperf) add_test(NAME EntropyCodec COMMAND testEntropyCodec -type=all -noperf) add_test(NAME DefaultBitStream COMMAND testDefaultBitStream ${SYSTEM_TEMP_DIR}/testDefaultBitStream.tmp -noperf) add_test(NAME Factories COMMAND testFactories) add_test(NAME MalformedStream COMMAND testMalformedStream) add_test(NAME CompressedStream COMMAND testCompressedStream) add_test(NAME API
COMMAND testAPI) # Custom target to build static libraries add_custom_target(static_lib DEPENDS libkanzi #libkanzi_comp libkanzi_decomp ) # Custom target to build shared libraries add_custom_target(shared_lib DEPENDS libkanzi_shared #libkanzi_comp_shared libkanzi_decomp_shared ) # Custom target to build all libraries (static and shared) add_custom_target(lib DEPENDS static_lib shared_lib ) # Install the statically linked executable install(TARGETS kanzi_static RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}" COMPONENT Runtime ) # Install dynamically linked executable install(TARGETS kanzi RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}" COMPONENT Runtime ) # Install the man page (prefer pre-compressed source file). set(KANZI_MANPAGE_GZ "${CMAKE_CURRENT_SOURCE_DIR}/doc/kanzi.1.gz") set(KANZI_MANPAGE "${CMAKE_CURRENT_SOURCE_DIR}/doc/kanzi.1") set(KANZI_MANPAGE_TO_INSTALL "") if(EXISTS "${KANZI_MANPAGE_GZ}") set(KANZI_MANPAGE_TO_INSTALL "${KANZI_MANPAGE_GZ}") elseif(EXISTS "${KANZI_MANPAGE}") find_program(GZIP_EXECUTABLE gzip) if(GZIP_EXECUTABLE) set(KANZI_MANPAGE_GZ_BUILD "${CMAKE_CURRENT_BINARY_DIR}/kanzi.1.gz") execute_process( COMMAND "${GZIP_EXECUTABLE}" -n -c "${KANZI_MANPAGE}" OUTPUT_FILE "${KANZI_MANPAGE_GZ_BUILD}" RESULT_VARIABLE KANZI_GZIP_RESULT ) if(NOT KANZI_GZIP_RESULT EQUAL 0) message(FATAL_ERROR "Failed to gzip man page: ${KANZI_MANPAGE}") endif() set(KANZI_MANPAGE_TO_INSTALL "${KANZI_MANPAGE_GZ_BUILD}") else() message(WARNING "gzip not found. 
Installing uncompressed man page.") set(KANZI_MANPAGE_TO_INSTALL "${KANZI_MANPAGE}") endif() endif() if(KANZI_MANPAGE_TO_INSTALL) install(FILES "${KANZI_MANPAGE_TO_INSTALL}" DESTINATION "${CMAKE_INSTALL_MANDIR}/man1" COMPONENT Runtime ) endif() # Install the libraries (static and shared) install(TARGETS libkanzi libkanzi_shared EXPORT KanziTargets ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}" COMPONENT Development LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}" COMPONENT Runtime NAMELINK_COMPONENT Development RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}" COMPONENT Runtime INCLUDES DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/kanzi" ) # Dynamically discover and install public headers # We only install headers from the library tree, preserving paths relative to # src/ so includes like and # remain stable across platforms. set(PUBLIC_HEADER_BASE_DIRS ${SRC_DIR} ${SRC_DIR}/api ${SRC_DIR}/bitstream ${SRC_DIR}/entropy ${SRC_DIR}/io ${SRC_DIR}/transform ${SRC_DIR}/util ) # List to hold all found header files set(ALL_PUBLIC_HEADERS) # Iterate through the base directories and find headers foreach(dir IN LISTS PUBLIC_HEADER_BASE_DIRS) if(EXISTS "${dir}") file(GLOB_RECURSE CURRENT_DIR_HEADERS "${dir}/*.h" "${dir}/*.hpp") list(APPEND ALL_PUBLIC_HEADERS ${CURRENT_DIR_HEADERS}) endif() endforeach() list(REMOVE_DUPLICATES ALL_PUBLIC_HEADERS) # Install each header, preserving its relative path within the 'kanzi' include directory foreach(header_file IN LISTS ALL_PUBLIC_HEADERS) file(RELATIVE_PATH REL_PATH "${SRC_DIR}" "${header_file}") if(NOT REL_PATH MATCHES "^(app|test|obj|build)/") get_filename_component(REL_DIR "${REL_PATH}" DIRECTORY) if(REL_DIR STREQUAL ".") set(HEADER_DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/kanzi") else() set(HEADER_DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/kanzi/${REL_DIR}") endif() install(FILES "${header_file}" DESTINATION "${HEADER_DESTINATION}" COMPONENT Development ) endif() endforeach() if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE") install(FILES 
"${CMAKE_CURRENT_SOURCE_DIR}/LICENSE" DESTINATION "${CMAKE_INSTALL_DOCDIR}" COMPONENT Runtime ) endif() configure_package_config_file( "${CMAKE_CURRENT_SOURCE_DIR}/cmake/KanziConfig.cmake.in" "${CMAKE_CURRENT_BINARY_DIR}/KanziConfig.cmake" INSTALL_DESTINATION "${KANZI_CMAKE_INSTALL_CMAKEDIR}" ) write_basic_package_version_file( "${CMAKE_CURRENT_BINARY_DIR}/KanziConfigVersion.cmake" VERSION "${PROJECT_VERSION}" COMPATIBILITY SameMajorVersion ) install(EXPORT KanziTargets FILE KanziTargets.cmake NAMESPACE kanzi:: DESTINATION "${KANZI_CMAKE_INSTALL_CMAKEDIR}" COMPONENT Development ) install(FILES "${CMAKE_CURRENT_BINARY_DIR}/KanziConfig.cmake" "${CMAKE_CURRENT_BINARY_DIR}/KanziConfigVersion.cmake" DESTINATION "${KANZI_CMAKE_INSTALL_CMAKEDIR}" COMPONENT Development ) # Uninstall all files added during install configure_file( "${CMAKE_CURRENT_SOURCE_DIR}/cmake_uninstall.cmake.in" "${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake" @ONLY) add_custom_target(uninstall COMMAND ${CMAKE_COMMAND} -P ${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake COMMENT "Uninstalling..." ) kanzi-cpp-2.5.2/LICENSE000066400000000000000000000261221516423635400144350ustar00rootroot00000000000000 Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. 
For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. 
For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. 
If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. 
You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. 
In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "{}" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. 
Copyright 2022 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. kanzi-cpp-2.5.2/README.md000066400000000000000000000254331516423635400147130ustar00rootroot00000000000000# Kanzi Kanzi is a modern, modular, portable, and efficient lossless data compressor written in C++. * Modern: Kanzi implements state-of-the-art compression algorithms and is built to fully utilize multi-core CPUs via built-in multi-threading. * Modular: Entropy codecs and data transforms can be selected and combined at runtime to best suit the specific data being compressed. * Portable: Supports a wide range of operating systems, compilers, and C++ standards (details below). * Expandable: A clean, interface-driven design—with no external dependencies—makes Kanzi easy to integrate, extend, and customize. * Efficient: Carefully optimized to balance compression ratio and speed for practical, high-performance usage. Unlike most mainstream lossless compressors, Kanzi is not limited to a single compression paradigm. By combining multiple algorithms and techniques, it supports a broader range of compression ratios and adapts better to diverse data types. Most traditional compressors underutilize modern hardware by running single-threaded—even on machines with many cores. Kanzi, in contrast, is concurrent by design, compressing multiple blocks in parallel across threads for significant performance gains. However, it is not compatible with standard compression formats. 
It’s important to note that Kanzi is a data compressor, not an archiver. It includes optional checksums for verifying data integrity, but does not provide features like cross-file deduplication or data recovery mechanisms. That said, it produces a seekable bitstream, meaning one or more consecutive blocks can be decompressed independently, without needing to process the entire stream. For more details, see [Wiki](https://github.com/flanglet/kanzi-cpp/wiki), [Q&A](https://github.com/flanglet/kanzi-cpp/wiki/q&a) and [DeepWiki](https://deepwiki.com/flanglet/kanzi-cpp/1-overview) See how to reuse the C and C++ APIs: [here](https://github.com/flanglet/kanzi-cpp/wiki/Using-and-extending-the-code) There is a Java implementation available here: https://github.com/flanglet/kanzi There is a Go implementation available here: https://github.com/flanglet/kanzi-go ![Build Status](https://github.com/flanglet/kanzi-cpp/actions/workflows/c-cpp.yml/badge.svg) [![Quality Gate Status](https://sonarcloud.io/api/project_badges/measure?project=flanglet_kanzi-cpp&metric=alert_status)](https://sonarcloud.io/summary/new_code?id=flanglet_kanzi-cpp) [![Lines of Code](https://sonarcloud.io/api/project_badges/measure?project=flanglet_kanzi-cpp&metric=ncloc)](https://sonarcloud.io/summary/new_code?id=flanglet_kanzi-cpp) Coverity Scan Build Status [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](LICENSE) [![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/flanglet/kanzi-cpp) ## Why Kanzi While excellent open-source compressors like zstd and brotli exist, they are primarily based on Lempel-Ziv (LZ) algorithms. Zstd, in particular, is a fantastic general-purpose choice known for its speed. However, LZ-based tools have inherent limits regarding compression ratios. 
Kanzi offers a compelling alternative for specific high-performance scenarios: * Beyond LZ: By incorporating Burrows-Wheeler Transform (BWT) and Context Modeling (CM), Kanzi can achieve compression ratios that traditional LZ methods cannot. * Speed where it counts: While LZ is ideal for "compress once, decompress often" (like software distribution), it often slows down significantly at high compression settings. Kanzi leverages multi-core CPUs to maintain performance, making it highly effective for backups, real-time data generation, and one-off transfers. * Content-Aware: Kanzi features built-in, customizable transforms for specific data types (e.g., multimedia, DNA, UTF text), improving efficiency where generic compressors fail. * Extensible: The architecture is developer-friendly, making it straightforward to implement new transforms or entropy codecs for experimentation or niche data types. ## Benchmarks Kanzi version 2.5.0 C++ implementation _Note: The default block size at level 9 is 32MB. 
This limits the number of threads in use, especially with smaller files like enwik8, but all tests below are performed with default values._ ### silesia.tar Test machine: AMD Ryzen 9 9950X 16-Core Processor running Ubuntu 25.10 Download at http://sun.aei.polsl.pl/~sdeor/corpus/silesia.zip | Compressor | Encoding (ms) | Decoding (ms) | Size | |---------------------------------|-----------------|-----------------|------------------| |Original | | | 211,957,760 | |lz4 1.1.10 -T16 -4 | 18 | 13 | 79,910,851 | |**kanzi -l 1** | **72** | **42** | 79,331,051 | |zstd 1.5.8 -T16 -2 | 6 | 11 | 69,443,247 | |**kanzi -l 2** | **64** | **42** | 68,616,621 | |brotli 1.1.0 -2 | 880 | 333 | 68,040,160 | |gzip 1.13 -9 | 10328 | 704 | 67,651,076 | |**kanzi -l 3** | **109** | **58** | 63,966,794 | |zstd 1.5.8 -T16 -5 | 138 | 123 | 62,867,556 | |**kanzi -l 4** | **194** | **102** | 61,183,757 | |zstd 1.5.8 -T16 -9 | 320 | 114 | 59,233,481 | |brotli 1.1.0 -6 | 4039 | 299 | 58,511,709 | |zstd 1.5.8 -T16 -13 | 1820 | 112 | 57,843,283 | |brotli 1.1.0 -9 | 23030 | 293 | 56,407,229 | |bzip2 1.0.8 -9 | 8223 | 3453 | 54,588,597 | |**kanzi -l 5** | **529** | **255** | 53,853,702 | |zstd 1.5.8 -T16 -19 | 11290 | 130 | 52,830,213 | |**kanzi -l 6** | **919** | **532** | 49,472,084 | |xz 5.8.1 -9 | 43611 | 931 | 48,802,580 | |bsc 3.3.11 -T16 | 1201 | 698 | 47,900,848 | |**kanzi -l 7** | **1153** | **888** | 47,330,422 | |bzip3 1.5.1.r3-g428f422 -j 16 | 2348 | 2218 | 47,260,281 | |**kanzi -l 8** | **4473** | **4881** | 42,962,913 | |**kanzi -l 9** | **11618** | **12381** | 41,520,670 | ![Graph for Silesia on AMD Ryzen 9950X](doc/Plot_silesia.png) Round-trip graph for Silesia on AMD Ryzen 9950X (X = compTime + 2*decompTime, Y = comp size) ### enwik8 Test machine: Apple M3 24 GB macOS Sonoma 15.7.3 Download at https://mattmahoney.net/dc/enwik8.zip | Compressor | Encoding (ms) | Decoding (ms) | Size | |-----------------|----------------|----------------|--------------| |Original | | | 100,000,000 | 
|kanzi -l 1 | 139 | 85 | 42,870,183 | |kanzi -l 2 | 131 | 92 | 37,544,247 | |kanzi -l 3 | 215 | 123 | 32,551,405 | |kanzi -l 4 | 303 | 170 | 29,536,581 | |kanzi -l 5 | 670 | 372 | 26,528,254 | |kanzi -l 6 | 1009 | 727 | 24,076,765 | |kanzi -l 7 | 1607 | 1366 | 22,817,360 | |kanzi -l 8 | 6371 | 6752 | 21,181,992 | |kanzi -l 9 | 8260 | 8760 | 20,035,144 | ![Graph for enwik8 on AMD Ryzen 9950X](doc/Plot_enwik8.png) Round-trip graph for enwik8 on AMD Ryzen 9950X (X = compTime + 2*decompTime, Y = comp size) ### More benchmarks [Comprehensive lzbench benchmarks](https://github.com/flanglet/kanzi-cpp/wiki/Performance) [More round trip scores](https://github.com/flanglet/kanzi-cpp/wiki/Round%E2%80%90trips-scores) ## Build Kanzi * Platforms: Windows (Visual Studio), Linux, macOS, BSD * Dependencies: None. * Portability: Designed for easy porting to other OSs. * Multithreading: Supported by default. ### Visual Studio 2008 Unzip the file "Kanzi_VS2008.zip" in place. The solution generates a Windows 32 binary. Multithreading is not supported with this version. ### Visual Studio 2022 Unzip the file "Kanzi_VS2022.zip" in place. The solution generates a Windows 64 binary and library. ### mingw-w64 Go to the source directory and run 'make clean && mingw32-make.exe kanzi'. The Makefile contains all the necessary targets. Tested successfully on Win64 with mingw-w64 g++ 8.1.0. Multithreading is supported with g++ version 5.0.0 or newer. Builds successfully with C++11, C++14, C++17. ### Linux Go to the source directory and run 'make clean && make kanzi'. The Makefile contains all the necessary targets. Build successfully on Ubuntu with many versions of g++ and clang++. Multithreading is supported with g++ version 5.0.0 or newer. Builds successfully with C++98, C++11, C++14, C++17, C++20. ### macOS Go to the source directory and run 'make clean && make kanzi'. The Makefile contains all the necessary targets. Build successfully on MacOs with several versions of clang++. 
Builds successfully with C++98, C++11, C++14, C++17, C++20. ### BSD The makefile uses the gnu-make syntax. First, make sure gmake is present (or install it: 'pkg install gmake'). Go to the source directory and run 'gmake clean && gmake kanzi'. The Makefile contains all the necessary targets. Builds successfully with C++98, C++11, C++14, C++17, C++20. ### Makefile targets ``` clean: removes objects, libraries and binaries kanzi: builds the kanzi executable kanzi_static: builds a statically linked executable kanzi_dynamic: builds a dynamically linked executable lib: builds static and dynamic libraries test: builds test binaries all: kanzi + kanzi_static + kanzi_dynamic + lib + test install: installs libraries, headers and executable uninstall: removes installed libraries, headers and executable ``` For those who prefer cmake, run the following commands from the top directory: ``` mkdir build cd build cmake .. make ctest ``` By default, the cmake build generates a dynamically linked executable. Choose ```make kanzi_static``` to build a statically linked executable. Credits Matt Mahoney, Yann Collet, Jan Ondrus, Yuta Mori, Ilya Muravyov, Neal Burns, Fabian Giesen, Jarek Duda, Ilya Grebnov Disclaimer Use at your own risk. Always keep a copy of your original files. kanzi-cpp-2.5.2/SECURITY.md000066400000000000000000000016101516423635400152140ustar00rootroot00000000000000# Security Policy Security updates are applied only to the latest release. ## Vulnerability Definition A security vulnerability is a bug that, given a certain input, triggers a crash or an infinite loop. Compression and decompression failures do not belong in this category. ## Reporting a Vulnerability **Please do not report security vulnerabilities through public GitHub issues.** If you have discovered a security vulnerability in this project, report it privately. Please disclose it at [security advisory](https://github.com/flanglet/kanzi-cpp/security/advisories/new). 
Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: * Operating system * Hardware: CPU, memory * Kanzi version * Command line invoked * Error reported/crash data/log output If possible provide a minimal reproducer. kanzi-cpp-2.5.2/bin/000077500000000000000000000000001516423635400141755ustar00rootroot00000000000000kanzi-cpp-2.5.2/bin/.gitignore000066400000000000000000000001641516423635400161660ustar00rootroot00000000000000**/Debug/** **/Release/** **/*.obj **/*.o **/*.htm **/*.exe **/*.idb **/*.pdb **/*.ncb **/*.sln **/*.suo **/*vcproj*kanzi-cpp-2.5.2/cmake/000077500000000000000000000000001516423635400145055ustar00rootroot00000000000000kanzi-cpp-2.5.2/cmake/KanziConfig.cmake.in000066400000000000000000000002571516423635400203220ustar00rootroot00000000000000@PACKAGE_INIT@ include(CMakeFindDependencyMacro) find_dependency(Threads REQUIRED) include("${CMAKE_CURRENT_LIST_DIR}/KanziTargets.cmake") check_required_components(Kanzi) kanzi-cpp-2.5.2/cmake_uninstall.cmake.in000066400000000000000000000020651516423635400202100ustar00rootroot00000000000000# Used by smake to uninstall kanzi binary, libraries and header files if(NOT EXISTS "@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt") message(FATAL_ERROR "Cannot find install manifest") endif() file(READ "@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt" files) string(REPLACE "\n" ";" files "${files}") foreach(file ${files}) if(EXISTS "${file}") message(STATUS "Removing file: ${file}") file(REMOVE "${file}") endif() endforeach() set(install_dirs "") foreach(file ${files}) get_filename_component(dir "${file}" DIRECTORY) list(APPEND install_dirs "${dir}") endforeach() # Remove duplicates list(REMOVE_DUPLICATES install_dirs) # Sort deepest directories first list(SORT install_dirs) list(REVERSE install_dirs) foreach(dir ${install_dirs}) if(IS_DIRECTORY "${dir}") file(GLOB children "${dir}/*") list(LENGTH children num_children) 
if(num_children EQUAL 0) message(STATUS "Removing empty directory: ${dir}") file(REMOVE_RECURSE "${dir}") endif() endif() endforeach() kanzi-cpp-2.5.2/doc/000077500000000000000000000000001516423635400141725ustar00rootroot00000000000000kanzi-cpp-2.5.2/doc/Plot_enwik8.png000066400000000000000000002167201516423635400171130ustar00rootroot00000000000000PNG  IHDR 39tEXtSoftwareMatplotlib version3.8.3, https://matplotlib.org/XS pHYsaa?iIDATxyXT{ME@e51wK34K4Rs)_bj[&ZfT5w TAEA@cbde3纸s, sO($I@DDDDDDDD@.Q"""""""" ,""""""""j"""""""" ,""""""""j"""""""" ,""""""""j"""""""" ,""""""""j"""""""" ,""""""""j"""""""" ,""""""""j"""""""" ,""":qFWWW AAA/!'BDD 2۷'^~e8qB݊+P($'DE<쳏7++ }abbf͚^ӧ\""'H~GL0xwBb8x 6n(oӧ24.;;۷Gvv6y8<6l؀y7oT۶ 6u022̙37n믿GGG3gDQQ1-ZUbԩK,mllr5Uѣҥ LLLТE ̙3%%%Ӻuktҥb4k aaaU~}6T:lKcbbP(*qqqQ;o:fffݻ7Y[MH "00ưqe;$ M6ĉU:::HKKSow% .f̘XZZmgee022BDD$Cl`Hqq1!88NNNUgxгgODGGcغu+:v[nm{ 5 G͛W_}f|w}ׯׯ;?-[bݺuMлwomwy&M֭[1h ܸqm۶Ŷm0}txc̘1}׬Y &k׮ظq#6mڄ)S ''GaѢE/xTM_Æ }>UyQF o߾ׯ_ǨQ:+ҡC#iӦǾ沂pAQmgaСڵk/޽{ҥ Μ9SU믿ɓ'Gشi.\ӧOcǎj 'B~;wTwvR߹s'UǑ$ SNɓl2US=͛ݻw#;;ΝäIмysK|d@""""ٸq@z饗ٳg%҄ >|X }ᇪu]vHuoߖtuu%ccc)%%E>!!A }u3f̐HSLQ;ի%ҪUT뜝%]]]U_̤+W%ӧ%I7xCzkmJk.Ujz{{K{~yoݺ%O$iȐ!RӦMBI:+3k,@ \]]qIǏW.22R %&&Vx4EtI$իom{=N2dHj%ܹs_vM266}]պN-[&^*I$}'4`iԨQ$IRAAdjj;z"I3<#J ,--;w{Ҙ1cT;ݻ 6mTK?cjjZ7ݻ#GT[߶m[xyy6mll`kk@888{yy\RÆ S[2dTں?ݺuT?}ٳGUݻw1tPl޼Ȩmo˿Aeggmۖ{]/e5jʕ+Uݹs7oƈ#WYiӦիX|9^uaVQrrr3 //j$ҶmPTT#FеkW\m;;;;WU~'=z(;vgϞѣv@9?UNNjG}6~i9rw^nc7oك~ xy!""l`=$''kԩS^xAÕ׸qc 11J?j.$rؔz@^^^ԖШQr窨4W)t/cr  [[[kN|o6mڄnݺ,w^Y}5jTn;CC*5^}UjB~~Z*Q6mQFa8q[ݷeKVK/kӦMoF\mTV=ߡCʝ*gggaΝU dعs'ѱcǾWϟÇѷo_{~֭駟dL<O= ;v ##=.۷_q+RPP?Wݻw/333?~8Μ9ŋuDDDE#99ܾSSSm{u4nX5޸q͚5S- MݸqcO?eG5 FBNN݋3fg SSS̜93gDZZj4Vqܹ _W޽Hh׮ն{묎z z¦M[[J;v,vڅ-[ @aݺuծ=Z鶍7B}*iAE몢{ؼy3كرcv܉.]T:t /^{ r'$$P6ʲB˖-qԩ"""9aF$Yظq#郓'OUV_l+Q]e37oV*UXX[xVRGٳ裏4^ի.C\v-T(>,l777X[[W|۷/ 0p@>}\SiӦ9r$?#77&&&U.^~e̟?Cll,,YRUyФI& `bbȉ?cDFFbʕ^B׻woҥK4hP^p!!!UgŜ9s!CT\ѣ.]}077llmܸGg}V+055Exx8rrrrJxx=t8|i6ҥKBrr/SN֭[Y/1Xz5}С-Z & 88Ǐ KƎ:::۷/0m4899aʔ)oÆ CϞ=qiL6 Uj>̚5 ;v@ǎ1i$xxx //IIIزe /^ GGG3ԩq 
|簴T5ڵkg}ٳg/C6ꫯ/@xx8/=_Y_~K,Axx8ڴiKKK$''cٲe8}4O^Yǧ~R=ghh֭[fG}˗/O>FZZ9 :uرc1j(⩧)RSS?Ǐq~i( l߾]=zW^Q=a?~<T|Wͭ%DDDrV5ArW8† p=1J$""llm_|7n@__ovѢEpssO?~ӧ> V[6l@DD-ZBc6LʲGll,fϞ 077I]t+vZܹs7FΝ?I& h̛7h֬FQT~cǎ8p KKK:+3ƍزe -Z;w_0|J=}4֭S{III#`]vvvhӦ ƍWwbK,Ad,\%%%ppp@NM^U5B`` UK~a˖-߿?{9lذfff8t>S,^ɰA֭h"o߾FHNңn 6n܈~7 6 OV .effVnRݻ7n̜97oެH8Znb?vND޽T$&&ݺu+W^+PXXO>?3RRRШQ#t3g΄1bD&"""""""a4= XDDDDDDDDԠqw(笺~:͡P(DCDDDDDDD$+$޽{pppNSprr]]v ޏ ,ojjzz RC'''UT ZXX< Ç]v j!x!X x!5d5Q888.H1D1Db1D1$Gl`ɈsH$sH$3H$sHr9dhҤ2sH$sH$3HSRRe˗k>>V\ B?ZiBMrHD Rh0qd`ܮ9::"&&oF^иqc˗ٳgWGcƍ*E$'ٱp!4o0 DDD ;;_㧟~^ׯ_+^^^˫4do߾iR]搈9$: /Pm O_&x~L u^x+V~oAAAU?Tiii8wΝ;$IԴF5d%#G QR̙38q".\N;*׹ b?`eeUc;̓a$䐈49$:tY4S_o\TK*b lڴIm=Ν;#G`СHLLD^^8C$BCC!IB$$U@|"''aKFҝOJJB@$$$ !!̓-F >}:WWצP(:s ߿kkk( <ʕ+===( Xܾ-ڵkW:jJFD Rr $wU9 ΚWO*䈗HmnZ5jO4k ѣG1gΜZ7**J"&&>>>ؿ?rrr>>>ꫯօbԩxgiBiHH,fH<.H /yNIQgL8o!::Νí[p S; \p˗/ŋڵCHH^z%#""$!;;[m >YYY$ -Bnn.k׮߿?"""wEdd"""""""z%#-[zTؼ `,岱1tuu!I.]W^y/,,,˗W*uե.I0zh[o6f̘gܹǡH.3<̙3~:`lٲ{aiiY"&&isH$3H$4 ,ٶ5V>/I@j򱍍 гgO ;w  ??裏b xzz"$$k֬c{{{!...]ݻEEEx饗84 [$Qcb+..V+L$].έw*P]\>l"]:thܿ={ą pB4n| ѨQ*pe)Ω&m_hQ_DGGc̘18t 22cƌ[4r|"MysH$3H$^aa4N!I]p]`iiLXXX.έtPmCD( F) DѣhӦ2sH$3HCbb"\]]y[Ɂ2H =*p֓5xy(Ak$` P΁t"N"- [bE_VXOmY@%tPX ݭ]+HƊ.H1Db1$W׮]kon߾WrrrD@ql`=w*Zhu뀰zH+.H1Db1T׊ *Jg}|._?QQQx"/^]vCȨ"y8SXO:c{%zH=Q=΍n0Db1T6l\\n݀p..uiĉ000ѵkW4o}Ν;>r LBBvm۶ fffӧRKoHxyyXp깤$( ]022ªUp055l;,Skm2B0zWDVt DZ9$a0x0࢔$##۶mç~ ccc0l0op1vX3Fm\|_ÇcԩXz5Č3ߣuֈǘ1c`jjW^yEu{sEdd$ 1vX`޽055ř3g`ff}}}ͿDq֓NGp$C)8C'"gϞ]cb.oUypAA"D@C"A ݮ:5j={b….ͻqV^_| վgӦMѬY3\|-[Tquu}NNN7n6l؀~?#Jܺ}oJ$DZZsH$3HuKQy(r#??{޽{q5lݺ={Df駟\\\w^֭[U>~DD>s,Xϟɓ'oM<۶mCbb"^^^(,,%jl`L (?*]?_]]hժbcc_|nnn;v,u놃0k,$%% M4Ge˖aŊC׮]bŊǎ*..ĉ>} .k%jNGVV,-- \r` }d<$&&յs4mؠa ݝͫ0ID5׶X2rԩ K '" R] ݻ_UrrrD@qz ),*zoH-" w}%iXrR~l`L%#."Mt DZ9$$Pt D{K z!X x%%%K 86d$-=XDB.H1Db1D.H!9r$Xg駟ЫW:;XVVV刈NI&aaKFZn`# ]cbInF BB}}}hSNENNN7&& wޭjˡPHCՖB ,+>}:M>++ }<==add;;;6lM 6mڤsDbbƎYl`s,)S`سg~zT,X@ucʕC˖-ѻwodddΥ[W_ǎôip1 2/N<L6ܥsEHH1a?2ȑ#;w"556l܏zUd߾}j1`͚56lmoff6rH"::$Iׯڄ/`޽zj$^=`ee@qq1֮]aÆN<޽{#,, 'Noo`Ĉׯ 
"lݺK,ի]*J۶mq5\rO.)233ER+9l]IRϞ!J999K z!X Uҙ3g?X,I͚Opp~W{N|aQFҐ!C*ȑ#޽{$Iݻ%ҦMTdggKFFFҁ}פCwիWHҗ_~ZWTTvܮ]J$I$gggijۇK=wyGV-;;;KÇW-HҢE$ID vguއ_Eܹ#ݫZ&G{y ~պ[nIڵk%IH tE6?ԴiSI~Zm6@Ȑ$I^~eiرj޷oEFF(?^jڴ'<ҺnKH7n$S||@JLLTR3Obbb*œ?oM{//x'3gЦt#8y$ڵk' " RܸOxwddd.߻z*U(;2̙3CϞ=SPP~s\t ԩj>ڶmgϪݿ_ui\aÆᣏ>R-/={=ںN:a(..V*;JPNǩDeddZ' ={zzzj]5jbb777ղ6l:tׯWF~`mm 8v.^իWXRRDxyy?z;d`nnr[j}r4Z}cKNʆ ,"""""j[nXh}}}@NNz^zaժUhҤ ^޽{ %`M ͚5Sа:*kH&%ZlYs_V }(P(*2yw ]FP(pպ&MZ U^[CEm۶֬YcƍT=_RR_&M*w͛ݻ~:rrr`aautt*ݖݦ5*{ydU^ڤIj[[l`Hsg l` "" RƊJLMM+l;wnœ9sk!^6wY(jٲ% ? << ɓ'{T*?;pݫ>>8tE$D]9$$mҼys˗ٳg?v?sssL:SLʕ+q%~P5+P(͛Ά)Əwy[nř3g0f^SEUoc׮]={6Ο?+WWMX^066֭[ѻwr> NNNh׮~g9s.\lj =ƌǏѬYrR>ΰaO?V1qD$$$… ƛo w^~e۷/~W]{ߋlNNN_dU۷]tQ]JX׎E$DrrsH$3HڤI&Xb~wx{{cΜ9믫ٳ1}t|B޽ЬY3̜9>6mSݜ9s0h  ŋm6ը._ ]k֬/OYfUxwoŒ%KPi#qCu3[lQkY[[СC>|8>n]tATT+՝S###g}:t$IزeKUVhӦ N8`)ٳ.\@.]кukL6 z >>>/0n8r8}}}DEEܹs_|O>lj˜1cjUKM)ڶfe ++ |쵥 ّ{Ѷte׮@LzÇ9q-`!X U&//puuUB]nwm1dn|Rd篿;#'N@O^8KFZ=X`_H9$"!C"A"LLLD W_}ͻJWu , l` q9%i=H,fH<%쬚S4kȐ!BGز%#?X`H\%i=H,fH7" 6dĴ0I6pe"C"A"tuuE@ql`Ɉ[˖"Mt DZ9$}ꞡHKe'N.H?~\t DZ9$ʔ *((\R^c:U===ׇ͛Sԕ|aw#$Irss++:hi` ,"!7o." Re 푘+W.G֊9 aee;;:96XrPX,"!rHDB0Db1D1$Gl`Ɉ/!$Ht DZ9$$9$9RH;YYYDff&,,,DScm{pwWtIDZ j!X x!5Dpr`K z!X x!Xr X2boo`K z!X x!X2bbb`&&&K z!X x!X2r%6tsH$3H$sHrܰEDDDDDDD2E$sH$3H$sHr\~ ,"_." %#waH;w.H1Db1D1$Gl`Ɉ>XD.H1Db1D1$G Ib#++ ̄rjHI5EWCDDDDDDDT G`Ç9HÇ.H1Db1D1$Gl` XDDDDDDD$3l`HӦM4HM.H1Db1D1$Gl`Ɉ5E$? 0Db1D1$Gl`ȅ x !`.\]cbcI6Hf///6]cbcIt6KOO]cbcIt?Bɓݞ={ ###h/۷oE$۷E@C"A"C:z(.] Gn~K.LJ~I&aTiá`K z!X x!Ql`eggcذaammm/^͛cѣꫯ믿j6 ]cbcIdkĉxgУGn{AKm]޽ GVVڏ=z ,"=*" .ak֬A\\\w 4mTm]ӦMQTT[n޾>9fΜYn}ll,LMMg077+N8pvvFII ] ŋ SSS#>>]]]\r$dee>>>8vFFF|2ɸ{. 
#G`ff/Pa"-- oF^~>hҤ qy222pMM6Eqq15j[[[={ЪU+dee!-- Ю];špppӧnnnEjj*eԩS˃%7o'O\\\PTTd@PPΝ;\ Ǐ4opU@@@.]lqqq[OOIII???\z022/bcc011K>>>~:ܹ}}}êϓ.\zqm"$$GEII 4iwwwܹs7oބB@۶mq1M6U-[Dvv6nܸh۶-PPP+++8::ԩS-Z //ׯ_˃\\\>Ūu8wƍѸqc;wNTMZY3kcc;;;9sFQmڴ'+++899>(((@JJ3[wD!~Gddd@OO8r$Iw#dÇ7~G;BSoÇwDC{#dee5~GwDEPHRs5`"00ϯpwww5 |jѹsgή>W-gee 싪G/_FgΝ,,L%i˗/E j!X x!5DYYYqA:vZW\\{G~~~T]RCF*} ,"N>-" %#ĝH|0rHHҟ#GCe%&&O?ѣ1{l@BBBGDDDDDDDO6dYf Xp!Ə޽{cz-,X&Lݻ1zhbccakk0?`񈊊ʕ+akk رcƭ[p-dffVqtt4GU~DYfK z!X x!ɑHs WQ}JLLT@$"1C"A"C#^B('R1`} @IQ`˖-(**B^IP(PTT ]k> VXOOO`͚5J D`` pUL2ڵ_[ڵkadd[b…?YS^^wwwO>HJJz>DDDDDDD%V͂>C[].:t@tt4߿={… Xp!7nohԨ7UΝSMھh"m0`HHH@BBu`]H>//)))4i|4vXHHH>[KjOt DZ9$$9$9bKέ{@ ?Wkb௿$ | 5kM6EF3g ;wɓ#&&6l@>}о}{L8C$+Vj2򂎎#Is,wǍ222B~5}ڵkK z!X x!XOb` x?$%ŪS~:u*UV'Oĉ'0w\b֬Yѣxbc̘1ؼy3s_vލ_077GuVۈi:HDx!X x!XOls8uF͛7rNkׯG6m*KֱcG]vUa4habb09rv?jbH6l ATK z!X x!XO:c{%@j]JJ H'&& C w}#;;TmסCe˖Uz !!#<<۷oGII cnĈK#Go;vʪ5UѣGE@C"A"C#6tM-EO@ggٲey&L#00:uW_}KKKt999q`kkKرjXjcڴipttD||<ॗ^|||{{{=frr2~mܽ{ݺuH0"""""""C!Uz0ʂ%233aaa!;w, (G^@ D(br9DJ$W\2sH$3H$sH Qm{/%ŭ_D`b @br<*H,fH< ,ط8fO0q z(Fs\'AkהQݺtsH$3H$sHr >~lGDDDDDDDԐ%ߦ:UB@vvv3|#""0uT@tt4y*?жm[umD-" %]BQ| NG>p૯yt邸8?~?# jKT7n]cbcI]]`@ԛX i_N:QQQeΫܹs>iiix___,]0k,\~F`` Ԏb ," %7eBRœ?ʆ ЭE"˗/$%%AΝ+ܮGyn޼7駟XXX 33Zuz{{#!! {/͛7# Z#""""""s`ɈmX6-%E~ݺ#SNy&;Wo1n8}ڶm 4iFXZ5k!??FR_WcUsH$3H$sHr ӑ,XZZ"33˩ ̙|rŞ=SOUibH䊟W('& "##666 j!X x!5DBpBG`WyP׮)#]pAt DZ9$$9$9bKnjJMvDDDDDDDDxzzָeo툴sH$3H$sHrܺu .]s\ݽ,prRnGDusH$3H$sHrԦ ,X|pty|NN8x!X x!X2S͚wtT @Tׯ[n>NNNF???xxxo6ZttJ$sH$3H$sHrOiӦV ,@٤JJv~Ugb"Wwq>3'p)cݺu>.UM6mD@C"A"C#6dѣn`CCCY~=<==Ѻuk|'P(Ɩ-[122ʕ+ƍW(@N*۷/ ѫW//u2!G]cbcI41vX~!!! 
:u*V7Bc֭x7qڵǞM6Xv- p=lܸIIIzi%%%K z!X x!X2Ҹqc6C!((Z56wFDD/WxѣGZhΝ;c߾}={'''m @ǎ_WCQv .4 S\ @.ZJ÷s,ԩS5j/888Tcǎͅ!> ###̛7O9s]sP/ D1Db1D1$G%#Ν]VذpquÕ(חj߾=;/V\z.%%Ddd$|||y˗?s9ppa@VVrssXh~ZbHB{MO _FFlllDA՘C"A"Cjj{%2Xuh߾ʛWvM| /"##?O$TffsH$3H$sHrURSG}:u {W'IG_=E@C"A"C#6 :co툈jvڱUtQqU-.Kۑj׮sH$3H$sHrűUtu nb.ϟ܎W\\sH$3H$sHrU€uf;:*ׇB%i=H,fH>^t DZ9$$9$9jpp]_ Sa:ڵkSOUx?3g,>66 ٳgq}'N8;;׮]ŋ SSS,+W,ǎ888/_"99wޅqTyyy!-- w˗r /_5Ο?z222pMM6Eqq15j[[[={ЪU+dee!-- Ю];špppӧnnnEjj* $$NB^^,--Ѽys>>~:ܹ}}}M.\P}6tuuGM4 _sn޼ Bmرc(** 6mz[llܸqжm[$$$VVVpttĩS-Z@^^_ӧ }fUw֭qy -[DBB :::jDܻwR͚5Ukp]GU}fMMMUﷷ7nܸr﷭-,--U﷧'nݺ[n>wƍѸqcMZjLgΜQ}fsrrTw6mp NNNϬ+ VQ{iCpp09IФI~G;BvwÇwDC{#;w ;!=wDNNwDC{#;?LGpssÒ%K~UV|竖L4)))hv0xrŗ_#("-f͚.H1Db1D1eeeҲƽ9a$5'>>=CCCjEWW +" 5և~} Ú5k[>̟?...AAAVZc"_W\XDB]rvvv j!X x!Qk`_Fjj*,--[gϞT5PPPS"%%_~z bEDDDDDDD2D́Uj{fCq}o \gHܿƢ j!X x!5DAM$HRRG` Vzg"9$$9$9bKFXDTﲲD@C"A"C#6dH}G`ջr9$z %#>>>H0%i=H,fH< ,9vXD;vLt DZ9$$9$9bKn""""""""Aq$%%!77M4A֭ꪉS988VETD@C"A"CZ586mBAA`ll E;v,ƍsssML022,"8a&x!X x!Q/!|0x`4k ۶mý{pm$''#77.\]v;vdT˗/E$˗E@C"A"Cի~wT|-ТE +8}4_^"""""""""'VڼzzYSQE$sH$3H$sHr^v ɪ#G`Xt&OU`e H H,fH}pUgϞ077Ǘ_~<,^XǸx"jƱ2dH1n870rH7P=occk?/o||<>cڵ ʂ^?֭ok'/r?"C"A"C#z뭷;wXǮ]4q *ڲejDR`` rJܼyz1j(cعs'UM 6:tٳk]z@@Sc9w\pppXXXptzggghT^^^ɓV<կ_?j*̝;aaaXl\\\}v@FFlll0k,Xn ==cƌ/^3gΠEڵ+"""Uذa())Av矣I&uZ/i////%i=H,fH>8UApp0P` mu\\ ll2ܼySLQXFLL N:᫯%w`ܸqҥKѿtر‰׫kժUpttiӦ^z A@@`X4t Þ={?׺V.H1Db1D1$G :ׂUb޽رc.oQQ8PH$++ ̄rjȑ#HmAK}T*p: ,LDuDȑ#h۶2sH$3H$sH Qm{/խ[ Dn4q " o}Ӽ\ x0@SZhC"A"C#4$I}vw#KJrBr!*Jp9̉N&MD@C"A"CZߵh #Gb8q;v]TeUZDži1kkk%i=H,fH88EEE8UAVVSWI]VVsH$3H$sHrñhѢr.]aÆiTiiilջ4%i=H,fH<ƗBe˰}vop!\v #F}TueX """""""$lnݺU &++ ̄rj-IgCl=DDDDDDDj{ݻwtW#qqqrs{#]\\DA՘C"A"C#́E Caa%!Rp!X x!QXƍõkתoիWTTEրETשּׂE@C"A"C74i___-ѣG۷oŋƻヒ͛cdT@WGtݓP(RP(P(}hh( bbb#F дiS˸{nKTK z!X x!QXgƅ SOah߾=7o[[[xxx`Ĉ|2-[OuSN> (Fa5aQQQ-Z3tRhV§~ jӢK z!X x!Q|ǏۈΝ;XnzꥩZJ%K,:ww~amm={ٳ( iF5<<U:u* ::NpBxyy˫u}԰(  .4'77W*JIIĉѻwoDGGC۷1gS_|.\˗ŋv!$$/c>7x+WtU5.\ׯcضm"##1y2EEEӫ< G=T͸ T<7oիq!XZZ"==]~|rHD0Db1D1$GE⩚MzL -()tcccXYYСC8~8݄ p-,_۷oaBM4U 
IFuƌ3Tヒ;v`ǎ;wn۵mg}駟*믿FN6Bܹs>iiix___,]0k,\~F`` Ԏb |܈̙3t ?7oCBWW666۷/9~<:uꑣ1Db1D1$GuĉqM_~[ձ3|`[ps_%<9 *JgqS|.3LJJ?Ν;W]=T^ݼy7nO? x]+퍄$$$Fc֭<ٳQ!YHm۶L4 Fp2e nܸoooo߾xk|c=Z*U>q P^&* rHD0Db1D1$GĽرccϞ=><=L/!$zrHD0Db1D1$GYmP:tɚ8>/_) ?|XD¨rHD0Db1D1$Gi`Uuy G`x,?7#6QP54o\,6*D$ sH$3H$sHrTּy`nn?!M(; W^y$իWaooKR合abcIj@OaҰxb`ܹsj$DDDDDDD$#j`kҥKC6m{{{̛77oF~4R,=Z@@G` !  .U?3p?~ER\tI#Q合abcIjJIIAVT˻v A`ii @97ӧkW!UYvvG` !  ժedd𥳐:۫=6Q合abcIj /طoOtjW!U/!$FC"9$$9$9Ukڴi?>лwo9RV7nDNj]$UM\\G` !  .ݺuñcǰc^P{>00m۶UTEDDDDDDD2Rx{{ۻƎ[S58::*p00!X x!Q.!EO~dXl`+UHH,fH< ,IJJR>(;+UHH,fH< ,9%DDDDDDD$#l`ɈA=N➐k>rBJ_x18::wލ˗zzz~zU׆["$$h߾=?^EC"9$$9$9bKF^|X3n8$$$ !! ￯Z֭vZ7>>1mۆӧOСC}9ܹÇ_~'_`ذazD@0!X x!QgvBҶ5= UCffვ:Ɩ-[ϝ;%K_~6lRSSP(/ӧOGVVѾ}{,^6l~kkkׯ֯+ S:w\pppXXXT阗.]-]vŕ+WZLKC"9$$9$9qkǷo'|޽{Cb۶m6mZ122R>~NVܹse˖۷olJ`֬Y?n:@zz:ƌ_~e_#9s-Z@׮]g}}~Z͛7q!o7nDvv6ZQ合abcIjzWT Yf7P4i{ܹSL]T%Uj݈޽{annc޼yxѵkWݻ: xxxƎ{F5TEaa!;[B$ 0K, %֯_CΝ Հ*D$ sH$3y\UPdSw 3MK˥qg˨53^֌-dVXMKc4jeVbin&. .~>~s}я眯yʡؑG/k2d_}'.!nXv`r6oѣO]CzMbb"zbٓ":NӧOիWۘytMQ~} Ə?Xd{51̙3+IHH`ݺu|Ann.=>(wq'.!n޽{N'na<aUF+C1B91K1O9;Ho͚5iӦdee˕W^I>}x'=q qի s6 `2޽*^xZʡYʠyʡQOnݺ/aҭ[7ڴiKy\TAc-)lْ-[æM8~8"<<3~go΃>ĉ=q qî]r("F("f)")bG)`E[i۶-شi'.!eX"""""""b)`uVrrrXx1x{{{:?QKĈB9#CAC#FͭJNp8 4nپ}{O\BܐDCE(C1B91K1O9;HkҤI3_zx{{Ox↣G?Q,# PDPERESŎxD7|s'N; k6-br(b2(br(vXSL?JFbƍ!>>>z`Q("br(b2(br(v[oEtt4K.eҥ|\{<裞a͚5OT1PEP,eP*uAVVV|M∋#$$(e˖1c :wL:u?Yӹsg:vHll,IIIn'&&͛ m=z4]t!..={_uNr("F("f)")bGUnb'k=qz)HLLtV^ssF">>ǻG1y"nذ'|H233Sǽ|L0} z$-- QW(""""""RxoKdd$L6O?SKh۶mZ0pѢEOqqq2{lѫWrK͹馛~ 6~kX̙3trI R7CAC#ZhAbbb_|.RV +`-X110`i7̙3رk޽{[o^cΝdee@FFk޽{av [bV\Ibb"W.9r$K,!77.]J׮]wwF=sfggk.TZlYvJʡ yc1n8t:Ɵg{27ޠK.@޽/J=f___ZlɛoYSc[.IX5h8قpp@))Ew}4z!W @=+ॗ^"00kSNѵkWx:t(})v3gQQQ婧"** 60|p"##ر#]vI&`XqŁ8p [ ''QFѩS'vʃ>ȼyk)C1B91K1O9;r82_ٿ?M6eҤI(y>3]@={6/6l(&޽{ԩ;v,s妛nr뚙A@@@[^:ۮ]wn9s5M99VOWy{Tk"JʡYʠyʡTGTIvv6gfС$''ÇٿWC믧m۶mۖojժb7i֬ӦMCu]_KqBBBԯ̙oL9XQr DB9,AVVV1m4RSS]'Mģ> … k. D׮]ر#*6VH(")bG.`թS{ws̳Ըqc*0G}ᇜ:u޽{? 
2dk׮… s9233 -v`Lg1P+Qvvv/.`^zɭ5n~o{2W*UUExʠyʡQOW^lذ͛{tlڴ޽{sYK ENvv64iҤ1'Og-~ڵ4hЀݻm6Μ9CÆ iѢ7n;溆Jű{nhРm۶uy7t҅$233cǎɑ?ЩS'8'Wqqq;EDDݻСGaϞ=ңGlJ_^ׅL2xv8viiixyyѳgO֮]KNN5",,m۶ЦM2339r`9_ .Ldd$[lUV>}CTټy3gϞ%00f͚Cvv6~nս{wo'?{v/WR~}ڷo]w:uHJJs۷ |}}ԩk׮I&ԯ_={бcG<ǩ[.ݻwwMN@@k:ѣG&>>5k֐Khh(!!!رmrqp8\ve[lBBBw߭[&++Çċ? ؼy3-[ٳ999[nܹSNO֭]7a˫gv޽ k׮wcDFFҬY3>;Jn&NHJJ >>> :o?vʞ={wD- ǎN:уy}GTDwDu{#SN_-##Qyg#s`'ࡇG4hР۷'N0|}]/_^lm۶=?Ou=}СCDDD9ܹsc`ÌXN'ԭkM,գeŹb5bKj7p84i< ?3]v6l ::Ao[ZlɓIMMG̟?.>s:uXCx嗙5k9ͻd{ СCyٳg]vvj_HeӼ"f)")RUt,o Zp8p:8rrrtz&qϫ8+[EpU5󤦦RN5jT}||)x>{S>]1III8bbb*ap88R AӦ׫AHHB\y];EL+,nJ\\qqq7)%11 Qn2r("SESŎcj(ScccILL$11_#G7/d֭`"UI9 ycƏ5\?~kr8rh׮\s Wf 4C&k³E@\\_xW馛>>\q <^{b{Wٸq#]taL85|gŹ&,>6mо}{^x-Z_'"""""R}޼y1xz UXf |rKZ*:Xu7/$rrh2))-Zмys&L2dϧ~̟?|}}INNfʔ)Ջ  HOOgƌ\uU$$$zjyHMM孷b̘1vٳԭ[@Yl[n!))+WҦMٶm۷*R6pk^?y̛g:)PD2(br(QEk/5a??0uTNʕ+?KBpLǦ0n8… ˱G^'''uk?g}dL2aN<@w['HC8ePرc]=,YwTPפy%]wżyxGxgz&L`ҥ,]E'%%ï~+ڴiCJJ C oq"`WJDs>* Iϊ+ܪᆪ_~!(((K8l Cۊܹs4h櫯]v89v+W,nř8q"f͢}ҢE z7#{׿5QQQcǎ%--W^yx'R !qw?qO߇"RAC#~㏳n:.r/>, .,TWQgp ]:| ?_Plw,\_W 4ӧ3qDNʘ1chԨGu=+W}k;Kiݺubi'LDDD-RQMw{ PD2(br(vIܽ؋9T^2ի3nU:0xyWuEloծ"mNur8 * )"Q0"RAC$n-ձxe;i+W`2OD*7=6M+KHK-[™Cp;ODJNh߾=8mʡ yC{TR[l6c.O9T)22e˖1cرc׿f̘1n7~zmXK˪PEP,eP-[F۶miԨWԯm%8{/^}ILL$11G}:X/ :ԭٓO?'Ory>CT$"RCAC#L{1|~_ytRN111W4hX,f.ZlI߾}Yb#G`ٲeL4o r2QwśoV{+iРo)*CrʡYʠyʡؑGz`ЪU+OJ*`ƍW,` fԩArrrѕB(R͡T)P,ePm-"q̡T:P,ePɓ'9vÆ <|DFFI::曙0a}-tfl߾AsN:k+P,ePÆ ,{9sKK.|_ٳgsAbccL91K1O9;RFo^aaSS+1"%sX@B̝k,n]JJ 7x#3gΤcǎ{ߟ[o^{;wE@@z͆ عs'nYr%^ܯiȑ,Y\rrrXt)]v-w9zKî 4ꫯ.w;k3P,eP}3>cy"""""Y6I`` nN;vw t_{VIDjrX ,X7 ɫ̛ÆU}vPERES:hE=l[?)/K0rhs99{xr(b2(br(v3,0u deUnDj!rhs+V%ow:a~k?ʠ Uh"wdyv?lĭka p8Taep84iqq8$$$z^p6mZ4Vn4~"e 1ӽ{wreܹs+|ƍk_= rhsAT?amׯ&r(b2(br(vX6r…KTpa Xo^^^ۗ'NӪU+fРAl۶ ={dӮ];V^ Q5j#F`Ĉn8 4׿5r Çm۶:HYCW^_ܡ2i~"A91K1O9;RFܺMjXn΁5}t^ ’%K """xW^ycDzl2Bǭ]0 Ν;y{;w.g&엂Z>}.َ,IOO/aÆr5װ{n^'vŖa`C}Fw}wx[9% n6x5U~y%O91K1O9;RFnzꮅ# ެZ~xN>رcbƌDEE1tPΞ=[PW+;;]wżyxGxgz}&Lo 88}x޽{qF/ nMBjP,eP>?-Z\ 666gddpuq7۷mGs`HVJaph88a&\w{f…9sAk.ONƍ:u*QQQ4jv\jIx /Ѽ}ݬ[ѣGTKPD*r(b2(br(vp:y풙I`` nN%''Ӽy7!V?q0<+q8\Z9* 
JuTڋz`ÇKޘ,x/3DJ͡T P,ePt\;DDDDDDDFe2d_57.{%22z d<?ZrݦYf۷]g_>۷wM&E:u\cs;w̾}חN:vZ*z֯_={$ԭ[ݻzj `׮];55GM||hWTָq3g| 6>|3gθӟThiOصk[lOg&^1y6L,)wEcCAÚk (jj7o@ 9s&F )?G}ر#뫪‰'w X"S("f)")5/Syl?;v,w5kV&L`„ Ԣ^z;P,)wEcCAÚ)kn_\\\TKcʝCP,eP<'l~ X6w2+XV"R>Ρxr(b2(brXsy{+X/.b=6گQK (%""""""b̰a0o^&`̚7^U9"""wa%cnD*9QERESka :dyկ_yG,/y 8u 4\Dj PD쭷ˋ}r ^uZj 4m۶gϞ=`ڵkի5jU2j(F#׿ѣG3~x^{rO$ʡxr(b2(br(v?N^XnOνː!CXd AAADDD+0vX-[]wU踵kưaعs'?8{/seلЧOK#++t(=--}ï~+4h@ƍy7z"eQG("f)")bG*`و 7~= NkJJ ƍc,\rQ^xnyrrrX~}cccc2e O<wW^r-|g2i$ƌsf?PBCClgٹsk裏fܸqlڴ) PD*L91K1O9;RFBCC+v\AAAZ~ ӧO3vXә1cK,LJgκu X_wuGgq&L`ҥ,]E-B~p:ڵ˭k9 SERESŎTy4tz?ܜw aѢEdgg3x`֯_pͱc裏8wۗ8q"f͢}*Mll,d%7n=o&_5\~nOΡTr(b2(br(vܹoM#*`p{{f…9sAk.ONƍ:u*QQQ4j&\jI=5OՓO><ҥKyGС99H PD%R.͡r(b2(br(v:ts'SK\|kٱc=X{w!CdW^aر,[qk׮%,,aÆsNq^Νٳ O>lGVV餧Q꾛6mbҥlْaÆKuH)"f)")bG*`Hf<{jZJIIaܸq < R~}= /w?ONNׯ/tlll,SL'`Ջ[n>T&MĘ1c.ٖJhh(7pC˫Dpdn;x`tB\\#11ѭ.\Xb/3fйsgԩ?"ϟOΝر#$%%uMwy<"RfʡYʠyʡQW\ҦM<{jZ#((UVO?piƎKzz:3f`ɒ%: @ݺutr]w1o<yy2a.]ҥK]#Ν;'NȬYh߾=|ᇮJ8p =z(q^{3vXW_+m۶ -ZD\\ke$%%ѸqcױI&9 ((8##ifo.v[׮]СC#*uSO=ENNxE̔CAC: 7cI8to_+nןNVQ$&&33gθ?0:s%$$Ăs5hЀc.e7~˖-۷/+Vp [l&MoaÆ=zLǿ{̞={EUp֭c8N~[o1vX]92SERESŎF8ooLKb H ׭{b`͌=O?5,,twlٲb `7|kbqռysn&_>Æ ,S)92QERESŎTvec] 8]NO?*dӲeK;vӮ];ϋqtЁN:ѥK"&99sZ?KI7c ۷/)))x̜9;z}>c5jDHHH͛X9r$K,!77.]J׮]}>Bh#ݻwI/.`s9T{9{9裏_ڵk᷿-O?gժUam`<(x(n+ x׈g%--zȵs=G.]:~tM9s///BCCϋU3gO<ǏO?^>[n >kұcG+=>/YPDD91K1O9;r8: effHFFSn6msΞ=%0dKp3ϰtRxg1bgΜaԨQlڴuΒbk`]oĄ85 kW*m[߾}4i,a 7ϫ͛W|'Oc|y7=vNOH("f)")RUX6riϟ466ƍnwǹ.}vZ?֭[I̓I14OA`8}Й~Kٺu+k׮gܹsq9wMc*ltZEno*o[&N|`)ŪH("f)")bGF<ťiSc񕏋wǹ]}vƎk螗7;Ї;B`O}k_^dggg[,Y;SwV(Mƍ᧟' &&s owPx]kH5`pʡYʠyʡؑ X6aut'??Cxx>Ysi0þ}a.sZV9CAC#lt*_))W0v,DFtcA۶Ц oN2vulCD91K1O9;XgM7ºu'ֲukի!ҩu7DJPr(R )"f)")Ri,qYv&T-//7زv)SફNΟ kWk7? 
7[0u9CAC:Eh*RM7_z:̟o-`MUҧ5Vݺf^4itFkq:aNX*j-_'5л5VaUH  e#7݄v '6nku Xa-y7YݻCnҸqUCCAC#ldϞ=4VQҼP=d~ɺUwoc?_UխDG[3CCAC#DBTp}ֺ#GbV^Ak8}qX…!6:v,Y3DDDDDDDt:aZEoX]deeo ۶ ֲ~=$&ZŻ:tYm@K``6]r(br(b2(br(QEk/*`aΝ;i۶fP뭢ƍukᇗ_jZj "#bRc(")"f)")RU!6rqM]bSVp-Oas^뎈KK~(NkV \ jxbKJJE4oޜB*3_|r-[Fxgپ};^^^tؑɓ'ӿJk }(b2(br(vԭ[tAZ :}v={`nص{7lHJҮ4iRy:u!ùs3gp-pp ^z%F=JP,eP}3uT!,,{^z1j(fϞ]wy'fbĈ >mpP^=ȱcx饗 { RYʠyʡؑ X6zjzeRCR' !8xZ.5|V{m )Z -~iX M>3dϟϦMחdL]w:nڵ\} 63f㏓˵^yHMMO>lGVVgϞq |Gvmh,XwCzCAC/NaCZJe iip{z(/r~B2evŌ3ؽ{7z">>ÇʤI3f%2~xWϭB/\s=ǹsxw⡇ncxyyUT*`Hxx&I] ۴)}  =RSVjjuYY]?c{\*yެZ~x;v,^^^̘1('44ȟ ;;&h뮻7o<<[m0a~;E'&&vZty֮]*ߛ R Yʠyʡؑ X6RPJ-T5{&L~Qϒ';oF6/f[7m3qppʘi+!k˻]CBB;w. b|Wp8رc\sι}'2k,ڷoO||<~!-ZdXbccKޢE իǖ-[2e YYYҨQ#Y!P,eP1K1O9;RF4Hx{[GEAlM 4 ҭ^]9v c`֢ۂbVVRq&%"߇")")bG ́%"唛%ρ5</o8}ں]{we~(תo~1WְIL{] Xk֬gϞ!R3mw!\b ^XZJH!}(b2(br(Ց&q\Mf݅e5U00$V ^=?o͹{7ٓ {¹sŜ0ϰti%MԼ[RKYʠyʡؑ X6j "5[`pH[g_g tS9WڶZsk..pefÇn ,\*XԼ[b[}(b2(br(v!g' 2 ZRrtѣ Z \Ăŭ-E ??Ͼ*߇"f)")Riر^znHV)9t8qck)YY \Vﮋ=k-;&CEV%՜~ %"RC.r"C챊^g΃nWU\qeKk袈i!BxqM7CVQ9͵Ty{%o#Gw[͚Y0JTr(bCʠyʡTGB(.1F 媫n?uʺkbV{9~֭Y\qE םjTElH1O9;RFhٲfjaб\鄴E/~\طZ//7(\֩%nUEj eP?}|[ךH!M@uNCAC4dggnHza  .+"Wi_x{CdU*DG? 
<\՜r(b2(br(vnHVBC%>vӚc^[ \'Ok߿pA"WTxJ("f)")bG*`Hxx&za5p@FR\yN(WJK+YYm$""WXzqU"P,ePnĺ󢈈ͩe#[6ZO9%hZw/~= ]%Li/iShРbQERESŎT,5jd"r(.AL$3RRyK))KtW\BV&V~6i}ՊsD-h޼9IIIUv]/9KYx|r-[Fٶm?0?nW_}\і C#l4ot3Dj5P$ Z:t(y3g\ܒql\z{BBJ/r.r̝;Ǟ;w뮻 /g|MN'g`+:PESŎTU+k)IvU*Wϗ~cǬe˖ .n- nU୷{O>|L:K||<*:t !!ONO ޣW^5HA;d֬Y1Ç[n%99]裏rY|M̙ëJ``G /l.3ZO9#ɟ$N AkC_;,B~[K7\=VM9>}:ǏgȐ!̟?}.َ,[nTxx8^^^ݻ~;w˞={^\rR#զ TWʡؑ X6Hn /"UB9jᰆ@N%t‰ Z%Μ)gZ''_}>>%..|Z-9LIIaܸq 2 Rn]= /C\_бL2]v1c vrG^zIMMeҤI3m?~UW]EBBB푑<*Nkr&M.ovcBWjUJKYtOPc{mr٨Q{^{/ṱe&!th[ݛ _A_3}t&NԩS3f 5ѣn5aʕVSowM~w^ty233i޼9/2?p+Տ~9lkÇӳgO8q"6mb֭4hРcKNwرc;w.7t%I`` x%UիW{/"R~ʡHr:bVz:o:ɓۦ _ygYKË`蝐 h|1"Hu߅")RURz`-^3gƺu+=7ߤYfL6 :vZ^~e X"""R9`-ZVYIi?{/l,n=NOux_pEW<g֜%'jWXFF`P~Z7dqvײeKMCJ͡/4mj-:spA+_.y(ZVnCP|0օMzz:M.sܹs&4l C"RC999[nܹSNO֭ILL :://Bٽ{rIСnڴ)c޽{8qtš5k\ X>̱cNJaaaۓNzz3~7nܘƍ}vg6##"ِ"""غu3{)ݳgO6nȹs "::mѢϟ'%%-;ٳx{{WcǎQN}G;[n;ukyEq0Zp9~޶M[l;:=:[C5;ԩSV}GTwŝʪMO>oE{Wҭ[7^y׺?[oӧOBX\hM."Wr#t`1́%5Bˠ )RUtwJhS8NƏς o.YK.-nɒ%;;ѣ&zʡy5.^tnwϻ UePĆCjW7ns>aÆ>|ÇsDӟ;\yٶm3f_>`L^[1G91FmUV*s"5D̠(bGn7xZ?sLFC\3h"z!^u"##yW馛Ղ&1O91m0gu3 S+qjlElD9;v,w5kVuW]uk·.C!Ej2PļC/oo"R3(bʡQB(c "r(br(b2(br(vZSDQESERESŎTjM,i޼&zʡyʡYʠyʡؑ X6c "r(br(b2(br(v8ptDj=PEGGn}>>"Rk)")"f)")bW]DDDDDDDD5jM,TjM,>}:-Zח=zb MoCO>)t2i$"##lٲLcEljٓ ƍ7Ȏ; ,T7x.]@@@{/pmWEɓq8cbcc]8VE*ׇ~YfMm](v 8BϝNgu"RuI3~x6nw}Wd(RڵkGbb"'N`y,_ܵ]<`ɒ%r(v!5Xƍ.*55H]D*_DD2)RE>.\ȲeˈrWEFzhݺ5L<]+("U`ݺuңGԩC:uX|9*uqeM9;QW=z`ҥ/]>}jHբE """ e,_\ Y`| -Z(]Y1tr9eP \s5lڴDmFbb"-[Tv4{~G||<{f߾}s=&bKYYY޽|޽$&&BfxyiӦ mڴ矧~9`Eeܸq||4lˁp8EJ?뮻hN<ɇ~HBB/VE@Æ ]s?iР5rWnT~rQ{9:DNXh͛774[Zv- p=;5k&L̙3;ǏӫW/,YBÆ M5Yvx _h̙35j(RɎ9~;:D`` ]ta 4PEPt:!""""""""Ŕ%""""""""՚ X""""""""R%""""""""՚ X""""""""R%""""""""՚ X""""""""R%""""""""՚ X""""""""R%"""0m4mHJJphR;v ""'OVRSS %%%ү%""Je˖СCM7͈5k0f̘J;Qp8.:tN:UZ;*ɓٳ' 6$,,o;vx:w_>>>4mڔC`_6&Nȸqhذa_+,,w<3~-tnHUJJJ+ ((g}.]pK~monE\punFeddp&M0sLZ׺M+VLL f͢n2|pzIvv6'NdӦMlݺ x]m۶c0j(~m]:r8p-[?U%ܴi]v$88J)""5z`H3vX?#7|3m۶cǎ<ìZʵ߾}' [o#GO48f̘Af{%''_|V7x뮻???Zh_D}ח9s0sL:t耯/۷gΟ?iҤ 
0yBm֬>>>DFFr]<>|xÏp-AAA]<*!!_~Inꫯ&55/:#8}ZN_|-[G׮]7o^Xx1Fcǎtڕ3go>֭[WNrג'NYfL8kկ_˙2e o_}k~LFHJJ*t3fбcG|||hҤ Ǐwm|?#vZxСC AtؑEoݺ믿~Gzzk{nn.SLuЬYBΝ;\jDDD\"""ѣGϗ_nn[nξ}:׮]\j{Ϋʵ3Nooo֭[Nӹw^'qΟ??;SRRoI&uw8g͚t:Η^zoIII+V8?tu8-ZLNNv^oԼys?2aÆ97mo?n9Ώ?к׽at.[ 8/rwD]qp"x!$PEe &T(BS ]Ĝ\(:#F&*.Yr6/1E_>_$ <ۍ>}/(tRinn9s{y~m7o|b4壏>GGG9y䄮mx>N81 9wܘ1xxxHqq$''Kdd_c+7nj7!֭WJPP]Vzի%$$DEDDXڤr˗KFFE۳>+h4JUU:uJDD.\ uV1 ,'RVV&w߉VcժU+LhΈX"")~ƍTZZZ466MJ^%Fшf-$$D npacZVnȇڵK,Y"""YYYOx+cGFsss%**j*1@F믋F?S̙3NKK ]eٲes?\e֭Rm X"C9$""\ggg9v예ʶmF=j?,m ,Qo.K.h&hU͛%..n""a|Zvvv jj-,, 0 JŦ E[ww,Ybyy 22RyӃ.Ey޽FF?BBB+WxWq c?!<<\y J~[[[qu$&&Zh222,;;;d66l^g}vؕ+WbŊ(,,޽{<>F#"J^755Ϙ1ׯ_hDww7.\zjs1Vɢ-;;wFtt4v ^|Ԅ'NX1o<h`@5G""L_ѝ;;;  cƍ,~vvv nokZDEEY988"""/ѣŪU/jmmm8~8jkk~zٳNDwf;~~~qcQPP-[(PTTd1׾7֬,|hhhf׮]CSS~fcѢEGEyyUY, LaOdhKOOFAuu5jjj"kVVl2Yϟ.]YfM(+hJ1c4 }\z?ЊNtuu)ߵ Y Ο?@ܹs8777$''Cբ_~%.]`h?ɓ8{,Ν;gd}2V4rE͍,bM???6lNzxrrr`ooGbcd2ᥗ^0TiooW`` OWWWn1V/\VjtɁV0_---CPP̯a?3.\x{&")+h)))c=ŋQZZ c͚5(..֯_XGnÇ|dgg IIIǏ?Ʉ7x탏yø2fDEEARO>3x~&{[l͛188̙3pqqAjjO?puuŋ TWW?ٳgz k׮ŋN}aݺuY{QPPٳg:={6󑑑///$%%ʕ+8}4S,5 a6m6mBRRa2P__233j\xzzV|M{Ett4zzz҂4[}ޙ!"="")gܹhnnF||]yiii}saǎ Ν q^{5;w.%Xbڷo XZZ3g_Ŵi)vvv ͛5\Qm׉'еkW}JJJ*û'00]vhWh޼9 PY[n@*|BX6 wwwy~Gt氰@>}pԩj^*BXʕ+x ccc4k ={DttOeSΝN:VVV ڵk-J5 | H{ٟ7|'|077G۶mpBISbccn:ɹRRR0|pOfQ DDDX7n/r'NGDD3gmݺubۭ[}}}THNNۣW_}%}᭷ޒ\kӦMaƍb/}_{5Bz?ϟA&M$B~ J]v 7L*8pҩIuؿ ⶝ i z5 & 1{lܺu oEN?&M1bj5$⩧zk) XA_?ӦMCJJ/b=M6krvv֘NgggF ??_cGGGɶ4iqtM/044M]+++Xn^:uŸ)뫯{g]vG //j\LmWe򌍍kT,3f >o޼YM4k Gʕ+qzIIIpqqye}_.Yϩܸq͛7q-bva(ѣGcCC\\`nnsbܹy8_~ygggO> CN4֎}ƓO>Pڵ ppprcسgڷo/y>l۶}'NH=<DDDD%<RsN Æ %āҵ_ĶgϞΝ;DDDfΜ9;w.dm-""""2BX PHMMvMx8pwn&K.񈎎1l01/F`` m۶۷xܺu'Q8>ѣFȑ#~z?w}d4i;wܹsE8KJJ#FTXp""""""""HуEDDDDDDDDEQfaii J%wwt {.WT,`~:\]]NKLLKc %қhee%soj$RH9G"`Y$9eeeU?8mJ+ Xԩ "0DJ<)H ").E܉HXrway$RHEf,`@D0D<)H "i3\{{{AD`y$R**))Aaax$]r&!,`Q(,,D||t] {G"`yz8<M&7pk裏믿bb{aa!>#>}vB|| [[[_t_%^k8z(____رcxW>T?j<Qay$R}}H{)n ѣGѿ<Jn޼UrJhK."229ݖU\\\Sv|HJ*osqwų>s⩧ۧOӧ_WE۶m{7SСC{ػw/ڷo_^)-ӽ{wL6 
=\DT呈H#2Bn=ōz'~N>}G"44T֧ODFFAd;vIW@T8;vTubѢEȦNTTBK,Uݻw{x燀7|Jש?DDDDDDD=7Cff&|||ZcȐ!Us 4kL֬Y3#==NNN PPP ngeehd[hSKG^=hSQe 9#G 5͛/{HVK, 2VqyDDDɓ5.PZ$#j,呈<)H &&&rw)?bƍhL:,T*vY1vO+  sss!&&yyyΜ9t]$&&(}bߥK sssx{{ԩSիv!!!YYY011Xqvv \hӦ p! @\tiiiȀzu |xwYK_G۶m/@zz:LLLЦML4 C= ___߰ٳѿSŋȲUPPbcc xyyE[n!%%FFFN<˗/ٳATTx Z۶mq5dffl*pe?_۷oAAAPZ ŋNQMMMŭ[8q%%%?7n߾4T*t'ODqq1ЬY3ZllܸqбcGDGG666pqqs8>>HOOGzz:СC~7mM6Ell,UVDjj*SNBQQ .#:̙3((( \]]qYs[XXd3[w1닛7oJ#?A`oo[[[qk֭4~GFFBVI&ppp?ZBVVn޼qmmmϋ;77)));wF-]ӧOZhv}|2affG;BRu֊P#(**?2#]AAArrr```Z-Θ@NNA W%%%CII add$W̐/kll\H-((ZLLL$T*q ) }MMMtt===ɾEEE(..J9}_e}MLL{xò} ']Vvae=./yMLLP\\\600A.>y+}fQPPӧOk|G?Tžӧ7>clܸQ|I/v܉#77WI ruuEff&556o*,UAnޅJ5 }aff۷#99&&&z*,XN:Exx8zxwu֡[nGDDQXXwyXjƏ/+egg~CCCX[[{8x 8ݻ>6mBXXFU[GT*#Ƀy$R摀@2XԠ,X[[?tEq#rss']K__%%%Uӹsg/?!!!>&a~@do $v-ާ[n>( pE[N3:uBHH^~ebΜ9?~|4i6lb1H[UG"H#2T6CH[($o!!!;wĒ%K>3f# &ի[k׮Ŵix Cm]>m'ST6;&ĉőUQe@,z(]5vXl۶ o6>wş?/1DJUYH#r0([~=lllҩlDJqe˖aРA8q"|}}1m4k裏}RRRĵ y{Axx8G᫯x l=@f觇/*ba=(..Fhh( T*[JZVg֬YX~=|||-[<z^z!88}fqCa͚5\f"HDcy$]W_}022|MܺuKK.([LjH)eiiKի˗ u QQQ(((@||<&L=Zb7PΥysc3:cC޽qE,_M6Œ%K&MԸGbȐ!2dVXQ/q7n; øq㐞^/'/jZ.=#r0Tj e ׆–&-..$&K8m**n׍86 ))kcu Y"w%lRnG"%a Ew|`LK|%0`@=u< Ν;87n܀F>2e &NHHH~G,_ǎÊ+SOaҤIPXXwww,Z}}s}jP"TK%j8Y½e@U+ wH9G;vHN.m߶aXطoϟ/)^# /^D@@ƏqIzzz>|8MM6[|SNaܸq077ȑ#{Xx1`llǣ9.\ V3"mK; ?jeifcD>&H!G"`ґWuN&/^ u___ܾ}jd"\^^^J>o<>/ƀ{8\pVN*׮]Ѷm[K^^ \DJKnW{iÇ'@bb~ݻ7Z]033WT@ZZ@|ő[Ű'$$D=e?ЫW/ 8ڵ{"R"-NdVjJ.=#r0TW)55ݯ6Zl J .Tzll,lmmѴi*q=J%JJJN#Ν;'>Īd{رr ^y={!!!XlYWXv]mQ`{eTX]WD,@D0DmYz*Z_CQPP>}СCHLL޽{ѻwo4oСCHNNFzzz?g|/g",, K,yqSNž}(UEXOo[v(-\x٪g=$"""""zx_~l{_LV ///KѣG=zvvvy!!!^^^ǎ5k`h۶-uW;KV7ހ/~in˗/{%R* TGKVV +++n vO+"""""R|bڱitwu--^Ux8A9k#t@TTT/z@ RxE$1D$;H9G/ ?3>ūɑ D@P݉*#=ɂ RH9GO@rB;qi3T,`HbHv#r0D``1,XΥp<G"`P.=4tK8d!摈d<)H yyyrw衱K8t X:˫N!$G"H#2-Sd'摈d<)H %Db,`途_8HvbHv#r0DPTT$w XSHF^xοvZ6uacc#nϙ3i0eʔ=,`逐_8HvbHv#r0FJJCCCxzzbڴii놇CRΝ;u>WݡR`ii)Jw PTX~}\g| i¬Y8::W^رc-tѩ"J]v۵}]!>>Y,`sΕ)DHDcy$mO#%%W\˗cڴiu.ApTqHII 4+,,l~X[[WYpݿ۷]vܹ.]Ì3Cᥗ^»ヒ̇ˣXr,gK)DHDcy$mgll GGGbС6l8eƍ %1tPǖڷoBBB`llÇC,\055Em6zJ¨Q22e 
`bb'x'N=?ƨQ`mmq(-Xx|O0fXZZEXz 00* ݻw9AA2[l?/i9s&#G7naaa}6F[[[gŋtڷo|}}aaa!6`߾}01117etM>r|ISLG}w\wFEGGCR!!!yy:aKB"ىy$"1D<155D/,,G}ӧOc׮]MOvGXXVX㭷qAb)))/l߾6l@TTZl>} ##~Wھh"i'O|'ObxqY̙3|Կŋ#$$Nĉ#66pq_رk?>s|8t]&իlll?7Pغu+ 8{,̙3?`ҤI#Fo߾6lw^Z 6myﹱuzj_\ !33S dffݕSZ-Ky;EHDcy$A<… B^^^ycp 4o?5ȑ#B&MWݻwA]vdgg &&&‘#G$Ǿ!C$ݾ}[ri&Ppvv.\X{֭`hh(?A777^?tPwޒwyG܄Ç%%%b A!>>^ :uJr6۷¡CĶ7o %K<ظ8m邩uVA!,,L \tIo5k&nO2Exꩧ} FFFBFF +Ǐ\ Exׅf͚ ^֯e?;w PԩS!>>^<~(o^rIxxx4 uΞ=N:qy$"1Dҥ O%4[jUejj t4ZccKEDDDDD:LJn=b "447n=]>}h,^q XY~C%Wُ 2 T[SU}|B% +eT*j6)tM4J۷6{{{V[ܫ콕Wߊv^^^زe ^uܹaaa%%%x0ekhBСCב++_OO-[vTȚ*joo_c,^WZSd!#Ɋy$R摪)wjܼAll,gYccc\vMwEFFFJQ*ӲeKСC!"##1uj$mGwhդ]PSFFFÅ  hK/LJ~VNNbDDDK.[n!..СCi&@OO>ZPPΟ?bӑ#GpB/>}:&O 6Ԫ)+8#¢k}suSmqw yhٓ9Hr<*<)HE022²ep޽}QYZZbڴix뭷a\|N7|#+ܠR믿"-- 077믿wy{Ņ 0n8W_U4o㣏>B\\6l؀Z\&`jj{͛|P}(}'puuENw… x"֭[dggUV߿?ƍOѼys6l0| 4H2ѣGo ::/^ݻ1ydݻw+`xg?`֭駟j}/ײeKbΜ9odɚ8|0v*N%lL,`逤,`BG"H#*{{{_?g?ѱ}fϞO?ӧ~xxx7osbh֬> + \t G<[b˖-hӦ fϞyUtŪ૯ªU\e!PƍÞ={$2[[[;v Ç@t7oƢEħ!88=:w Agiiժ:t3gΈO,Ӯ];NNN7|OX`&L:ghh͛7#66۷ǂ <7oƸqԗjZaYYYFfffsK(""|9N_D"IHV#r0dT 5J%F`` f̘!wWtowygΜA+R=(up *B"YIHDby$R333-ZE: ,,UCcKƖop !$y$"Y1D<)C~~]hpnnnRT,hZt@nnnF, %$D$+H9G"e(@b,`HN!$+)H#2*B"YIHDby1q otӧ78HV<G"` (SXX(sO]K 3fhhXgxj8BHDDDDD d``333zzO d{6AT4tU~ru@-78HV<G"` T*WݝGZ:XԠlll fKp !)Zji2IMM ehhؠRtkT)D䑈d<)HDnO2޾4BH$+IHV#r0D,6S\g}///t988Ʀ{\;N!$$D$+H9G"e`I)n aEظq#ƌUũq z4R֭[BH$+IHV#r0D,6Svڅ;w`ԨQUիW#88ѳgO'YYYF_)D䑈d<)H "i3 (ѧO_~qJݻ+}}Μ9;wFann //3gnnn())Abb" .]Bvv6SN\\\Wڵkdee8y$&&&r M6HJJ;w`dd?~ \t @<7o"##>vwUE\\u@ZZСCDFFBVI&ppp@LL UV͛7:uBTT`kk ggg?\BBBp9-Zٳg._\\$@PPbcc xyy-Z]h߾=._lQQQ600@BBm۶v233abb6m 22@i /_}6 @f`ee%>u'N۷J;ɓ(..5k&-[";;7nt(,, \\\p9'qu@pp0Ο?|XYY]U D\\rrr`aa-[":: ===g6>>wޅ)|}}ݼys!>>^߉sѮ];8qB̚7n@FFvppx}||t3[v6mMmhժ233񙵳#.\ ~fsrrݡC9s@aa!Ϭ~G 88Ǐ w#~Gw#~G4wܐPSY[z*<==c߿VΟ?7nzF`>Mۉ'СCҍaÀ~(%K=$y$"Y1D<)HrʂC^;0,, xgk}ST066KZWBH$=G#r0D,6Sdaaa9r$ ]1cw.] 
wwwo߾۷o벐HB"IHDby$Rf" XTBHDDDDDDD:, )Dl(Ƀy$RHEf,`۷oop !$y$"Y1D<)Hڌ,V)D䑈d<)H "i3tAN!$$D$+H9G"e`I:vX)D䑈d<)H "i3tɓ'78HV<G"`Y$m(...B"YIHDby$Rf X:ή|Sd%#Ɋy$RHEf,`f͚op !$y$"Y1D<)Hڌ,S)D䑈d<)H "i3t aKlٲ|Sd%#Ɋy$RHEf,` N!$$D$+H9G"e`InܸQ)D䑈d<)H "i3t aKtر|Sd%#Ɋy$RHEf,` N!$$D$+H9G"e`I 78HV<G"`Y$m)B"YIHDby$Rf X:ť|Sd%#Ɋy$RHEf,`sΕop !$y$"Y1D<)Hڌ,])DDDDDDDcXBH$+IHV#r0D,6cKop !$y$"Y1D<)Hڌ,p N!$$D$+H9G"e`Ik8t X: 88|Sd%#Ɋy$RHEf,`ϗop !$y$"Y1D<)Hڌ,P"BH0&r0D<)Hڌ,`eeU)D䑈d<)H "i3t{JG"H#20XgΜ),"YIHDby$Rf Xk`aKop !$y$"Y1D<)Hڌ,V78HV<G"`Y$mHJJ*B"YIHDby$Rf XSHǰ78HV<G"`Y$m+B"YIHDby$Rf X: ''|Sd%#Ɋy$RHEf,` N!$$D$+H9G"e`IZlY)D䑈d<)H "i3t@tttJG"H#20X5BHDDDDDDD:,Z)D䑈d<)H "i3t^ŢJG"H#20W^-B"YIHDby$Rf XSHǰڵkW)D䑈d<)H "i3t@|||JG"H#20Xw-B"YIHDby$Rf X:Դ|Sd%#Ɋy$RHEf,`__ N!$$D$+H9G"e`I78HV<G"`Y$mB"""""""1,`͛op !$y$"Y1D<)Hڌ,`ddT)D䑈d<)H "i3t@|||JG"H#20X5BHDDDDDDD:,жm N!$$D$+H9G"e`I78HV<G"`Y$msNJG"H#20XBH$+IHV#r0D,6cKk׮|#d%#Ɋy$RHEf,`'Nop ,"YIHDby$Rf XSHǰ78HV<G"`Y$m077/B"YIHDby$Rf X:BH$+IHV#r0D,6cKp !t_JG"H#20X7n(B"YIHDby$Rf XPT?oF}k׮ڷo˗/#;;fffywĵkא i'''~GwDNN233ĿG;"E}G(wDCGܾ}P#QgjCS###/UѣGcƌbۿ'x)))ptt8vVV\]]v a||<<<\xo^ݗG"H#20Y*))AXXF  3f`Ĉ pUCLL ֭[kbڴime8١Νѣ?@iA0:::uѡCjck3K.@Nj|0p@  Vw߉ Ç9v؁fff"""""""j|B_ڵk3fk)))"{@g{[ooꫯ0p첬|||74l]풓 / ,, b{||<7o駟aeeLqΝ;W_E\\f͚*wȑ:ǡCb5jA'|Rc#FH dl޼lll䑈d<)H "i3E xmu QQQ(((@||<&LH=U ;pGzC=z@֬Y4[∥ݻw#<<?E={DNNڷo &WF~ХKqFয়~| /2ɩ~wA=<:<G"`Y$m6/( a?RX۶>F<G"`Y$9յXT;ej+|9SK#Q~0D<)Hڌ^СC@u=T*@bb TpwwoPTPU|Bb5wJ%NmVZ===<sxyyݻ7bbbPTСF [[[nQF#~F2d 1>>رcG|D ,D$?H9G"e`I6m pi.@ʿT*ikxѧO#|M|嗘8q"8cJ  0qqqx:6oތ 6ХKjt#33}w իW&jley$"1D<)Hڌ,P%|4XeO͞]d ݻaffu>3kOV%9,Xӧ.]ԩ^|E/HMMŜ9s0~j?i$r?Aocݺu0`}jM"`y$Rf X: 66V}a#Y mӥۦcpi@nn.&Nt[/9=P\\Z4cǎŶmo?Q}]?xJ)((/%KW_֭[_5y$"y1D<)H@P=*Qi1N3L@oT:P_HH(7oF޽ [JBqq1222pfバlٲԩ󃟟 šC^z᧟~9Wv,ЪU+ v0h 8觖* ܄KWܹ3vލ</^ѴiS,Y...hҤIqUEWXQ/СC'OjJ.=#r0D,6S eee;z 8p@?M7o|9GL||<<<<y$RHES]k/J԰m_0obB"jprway$Rf X.0 heiWDDDDDDDD,mܮf#WC 0D<)Hڌ,mgVEEE"y$RHEf,`i;(i^`{eTX]WwEBvvvϳtR9s`ڴiݻwwީyfΜ ___o;v]"y$RHEf,`i;=}>iʞB8{9}{H|VE?<-ZTtQQQ8}4[ 8/QEvvvrway$Rf X:"ds<͛vJŋ{1cn޼C۶mѦM^0o<\~ B@@%_~= 
T<3055mjF(wH9G"e`I.\PZژ\8t7oPZw^L<kǢE0eٳG={6m6DGG# ^///u@D0D<)Hڌ,]4D1Ԃ=0vX'x >|X|m̘1x  7Hߏsb˖- r~"""""""tAuo1L-T wYXXTZe۵q @,1zh/hݺCFH6#r0D,6cKH 3KÁ͛KVWu OT_^uҰsNh䑈d<)H "i3t7 eFc ZOwGhh(-[WWJꫯpk=zYбcG)S0zJqW_}pه>PIuѣG:'''GF۶mѺukL>B#Iw=jy$R2HEf*,X[[#33VVVrw"""ЩS - \i 9Ыl߶m vy"E#ȬYpu[x0vXrwH#20$^8KtASj7+DYԩNH4eo?JBvv6#T  6l؀4mT<^RaΜ9x͛7躧O3<JCCCoIm#.aY$m8s挴> $%U =J4ERSS1~x/8u }ב6m|}}1!* /݋ɓ'#11kw[nEaa!޽;w"!!=9Dy$Rf X:@O!LItFȱcVZFρ0go~*Zѣ'kʩO }pI\t ad  n:@BB<9rшPcnn. >>+VoHbY$m8{_~)_TsȣʞzuɥX͚5ʕ+ϢK.Ɂ!̰f᭷r߽{w466?P,[FQr /@@@@9H(cY$mg%(Gi< 7;v>XߙH)juHR//6߽{W\*,, k׮?SkT*ܽ{W<\GGH "ɩ<<< 0̀@կCȣjK۾+O(..F+QCPRu#20X҆FBX~=4(yZY0k֬&()D:HEf\K$''K)DT9<ʈO GHcY$m.j)D\|Z( t@PPQ9<ʈO GHcY$m6p !l4(mۀͥ..|Z(2Q<)Hڌ뀼-EJ#ѣy$RfiF#D$H9G"e`IΜ9#mB"h䑈d<)H "i3taKI8H6y$"0D<)Hڌ,PrH+N!$FH6#r0D,6cK$&&JʦE4HDay$Rf XH+ XDDDDDDDX҆,%jTy$"0D<)Hڌ,p%iC),`5*<lG"`Y$mΖ6p !l4HDay$Rf X:\)D#Ɇy$RHEf,`oooiF#D$H9G"e`IN:%mB"h䑈d<)H "i3taKH8H6y$"0D<)Hڌ,/mB"h䑈d<)H "i3tիW BH$<lG"`Y$m.B"""""""!,`vI8H6y$"0D<)Hڌ, mB"h䑈d<)H "i3t@VVSdG" H#20X&&&N!$FH6#r0D,6cKK8H6y$"0D<)Hڌ,pIiF#D$H9G"e`I8tA}ǏGBBrssaoo@xxxE$<lG"`Y$mV֑#Gl2ڵ )222PPPOOO?&Le}Ɇ c)H#20z a1h 4oݻwq-$%%!77/^ᅬg}*r労SdG" H#20zVhh(~'U'<==1rH?ׯ_NR-q ! 
‘kkkdffJZNNF֯/W~=4HDay$RfTK<011III1uT^>NOըxp !4HDay$RfYnܸ޽{9s&͛Ws玴SdG" H#20ꥀu9tuViG?eS٨hCƧɦu1D<)Hڬ^ XEEE066_x>>>HII1|p4ifffɓ'?<<*J'66ސ 6p !l4HDay$RfYrJ>|'~iѤIZxahh.\ŋaccSRRRğVZ=:Ǐ6p !l4HDay$RfA}d?,Z#GDw\ ktC ]:SHݻ#==Xn>~x\Vڽ{7BBB/oktl`` гgOqQGSdG" H#20ꥀ ɓXj޽ t833Zʕ+XbZj}a„ 2e *qrrի}vر[FϞ=qСJ/((@VVGYXXH8H6y$"0D<)HL%up\zO?4]SNE~~~Fa!$$GۦL'Nѣ5>O~R{n̙sj߿ BLL `ii 9s憒$&&(]ҥKΆ9q) qU@v퐐,_\&&&r M6HJJ;w`ddq#,,,p%@qq1=m`SnHKK:tHj4ij YYYy&SNBQQlmm+.s!??hѢΞ= tJhqq1AAAEnn.,,,ӧOZhv}|2affDEE mڵk̄ ڴiHEO333\|@:nׯ_۷ahh DDD5k+++\xTܺu  'PRR{{{x{{HKKJBǎqI͚5w˖-7n:vh...8w~: 88ϟG~~>.̪j~"..999@˖- puu3wn޼9/Dܹsh׮N8!~f7n ##C~;88Z>>>ϲlnڴ)6m*>UVDjjgpwpgHNN?U}GUV͛7˿#qq{{{;:`E}G(wDc|Gdee{P##~Gwܐ +++V^xXZZbڵhҤ N> OOOCbժUظq#hѢz!Qh䑈d<)H "iz_C~~> www$''c>s=g"??1117n׋#rwťK >|)^}uȇBj*';w7 E޽áRСC=hݺ0ިQđVe?F 2C y:g :={jDQC#Ɇy$RHEfuvvFtt4lق'O* SS=@ppS+Z|9&M>}`033#|MիX`ƎW<.22O= uCxx8^u<(,,;#TtBںz)>-gϞ5kV&j,y$"0D<)Hڬ^F`:t=z4k,_cǎ!:T({So 44w֭[kO>ZiZ ,@|fN/_~Ajj*̙W>&M{{{ۣU|rܹ}VFH6#r0D,6X=z@JJ $홙ѣG S&X)DZcpi 77'N֭[OU@h)(..3vXl۶ o6>w}ÇV_ǎ-Z}vZL:F jhy$"0D<)Hڬ^ X Tѭ[`nn^(+$T @))@Ocyf믿кukT*###GAAAA3k,_>>> -[N:=8???UsW^011u۷q߈FH6#r0D,6Sk .mll,Vq̙}Du# 8 >a߹sg޽>,zc5k,YǣI&uVsȑҮƊ 9Vu,--3g"??8q">:>=h 5.H9G"e`I:!=z4`Æ #F;S a@g;5 `q"j$<G"`Y$mV/_|cƌAͫ%%%q9O&M @ @mQ##Ɇy$RHEfRATU&NAڠXZYY?Dh4HDay$RfYvZi&&&011A6mf͚:==@LLfU?GFUiH#r0D,6X_|&OΝ;=z 2T@RG`S esiӦXl "i߼y3&O^Aeee*E222`gg'm8z"^jDTJHD`y$RfTKL!T hFqqq}\ QV߽[׋֭[JBvvv\\\\`ݺuh۶- _k}vmCBBBZ?c#JHD`y$RfY+Vh^Æ KܼySb5I:&L@tt4cv=[bСǞ:u >ۇرcZp۶m^z:UG"H#20z^?wJ5k?c=8v1bĈjj{̙3XZ }ŰaÐJ`,XgFVVcaʕرcfΜ [[[۷o߾=@OOƺxbpvvIDDDDDDD5SNI/_ϟC&:uXSMo߾biƍXx1 5kïm۶RSS1n89r[… U… Dnݐ{ś~6lJJJЩS'|駰oң<,G"`Y$mg?$m|)9sp!XZZ_|~ݺuC>}*=رc B֭Ǐ{P&pIݻ ǪU0qj=tZh"9r$`}GOy$"Y0D<)Hڬ^"yi6b !;wG?,Nܹ3ѩS'l߾:tZ86ҥ Twssajj 333 0Ǐk h`hhSTG"H#205a$&&h6mzKQ5lmm5RC'̱y3TRBrr2^x_le˖!..ٰBfQ];wƩSXfM}=rpoСCjfVE#Fk=z4rrrp͛7#00ATJHD`y$RfCO!G6mХK< 3LLLpm\plق͛colĔ5v&Db 廮Yiiix뭷Ķy֭[Xd Vh"X[[gϞѾ}{t+Wիѯ_?4i {ڸq#O۷og}_~x 'ĤI=͛71p@jOOO|wu+QE摈d<)H "i3P_IMMڵke;wN򚥥%z#44mHYYYFffV>.""BcJގOcм(-hRs6iGey$"y0D<)HrkG`f̘3fΝ;z*дiSxyyAUV-FVo(^ 
Nj>""""""""ԩU llltT ^^^ÇT* 1tsDHDay$RfBrss%))5;QݟG"H#20X)Ujv\M#?D$H9G"e`Iv\\PR*Z)B۟BVj;v>TG"H#20$^mVqq1/Z w\~u ¹s4 ODs$K]\X"jH呈<)H "izy իWOڵk(((@޽aii "??+WP+m<qXRN*t Np#5>H9G"e`I7|!!!}6L;8pAD6EAqgqA}I˥,[Ji-YVͶ[6iuͶ뒶K5wEwEQD#"cl=:ulqXAɥ FrռlQD(bʣ1(2gԪU͛7'))^5kV O'KGrʣq("Ơ,3\rrr =Hzq )ۋ^Ѱarjj#RGrʣq("Ơ,3KkРAL2d"##W^yCRWRKDDDDDDD].!0`dff2j(O ={=N!% )z/!JWlE)"ơ<(. &Mٳ#77?O{&uʑ] ]B(RͣT9Q8GcPř٥PNxx{RJ)11M^KE\y*<(b ʢ83́΀HKK+ɓ'qww)<<<̴CU"WbEJ)"ơ<(.#>L[Zn͖-[8t=N!%غuk+ӥ>3pssO>={?VZQN4hw 66D׮];v,~~~k׎0f証ǘ1c9r$#G,kΓرc8q"5*qDȻdp4nܸc0Y,Kyw;v,5kO?ܹsDDDI`` vbѢE :nWٌx{{;2ãc `՝`: /}ApLT(R}Grʣq("Ơ,#URXWf߿+ٿ?[n婧wީ) .,w}Gq q\""""""""ed֫s=7oo-?ϫZ%%%q}Q~}<==d%o-[d}9N'00]JHZDRQD(bʣ1(=ƌCXX~! ,bի޽{uz̀ Ç:t(oXz5ǏaÆy|uyBB4ʣT Q8GcPř٥н{w g_؞ RӧӬY3۝شinh`}֮] <7d6m˗ ml.pyu@FétK\?e[[o-ujѢE|[;v|rj֬YzEDDDDDD.#~7n&z?x ضm9+:'|SO=76lc=Fڵ=zt8qFy͙+5jDvv6}뭷kӦMԭ[.]{n.^HzhѢ۶myrʝ"##9pԭ[m۲epwwȕQǎIHHl6ADDmn&MCh߾=={ZjɆ hܸ1^^^8pVIKKFDEEa, 6Ϗ}Ѡ^'Ob9~Vak׮lڴׯO@@wM6fN< XGqehҤA٪U+.\@rr2رL|||h֬۷o#겳ILLK.ٳ .EVغu+͚5ѣtԉ'5jpeY8z(xxxо}{]7DDDpqΜ9C͚5ҥ ׯ}ٿ?`ᗒӧqww'::7KÆ g޽mۖ3gp)L&ݺucdggO< 8Ο?[&>>7|~={>{ܹsԩS0ݴiSjժÇmc8{,kצcǎlܸ[ĉzߡ]v 4Az۴iCzzwgߟƍk.g8q]m6%88mѢ.]"))-;" Ǐ;",,'O^;]vq)}G4j~nݚ g[ns%|}} bǎlْL?@TT;w$33oooBBB c{KͭgÆW0#Q/_6w}GTwDvv6ׯ7wGTt6bT@Ϟ=뮻xꩧW[ne˖lܸo4jժEtt4k֬=cqF֮][>m۶eر}HNNq"++l&88tK]Q۷m^{#?.[WnarM&I&+p!uƖ-[d2o {-[[oBTTϧ[n?о}{%f~(U#ߟ'ߓѣy*=p5ΣT:Q8GcPőf3>>>۷swzaÆ>}L $<#?d۶mtؑ /Э[7:OرcMXO>$'N <<(n"/:"RGPEAYgfI:t8rssܹ3mڴס+UE's\W!ӻ75woxݻm/4IxJJ aaaYvg}s$(cƌ!::G}Tt-ol[naҤIxtZnO=Xp!111̟?kY `pc7yhqΣT:Q8GcPő 1{-[2|pNΟ? 
HFJa:m).!\r%&M^zу%K0a/^Lݺuoݺut҅v0nܸWN/_f,YիWvZf̘q|||?>=QQQ^sr 7 '(rBq~ΣT:Q8GcPř٥O :tBpp08,˜-:f'#vb);v`ر,Zvi^Ϟ={̟?]S2W^DFFV_7oΝwI:u$&&ȉǿ+e_|#66͛7Ǐ.),U 1 ;fN\3qUʣq("Ơ,3Kk޼y{:Ğ={x'xq )Kݢo3 :ZqYm&%%q_a{xyyqwGo>222 M ޳gOl¾}ϋi͚5WZQFtRrssaٲeEN<>zhۤcǎ(p_믿֭["nY96"RGPEAYgfVjj*7}ݴmۖ?Ol߾ ZmA]Ǔ D%D4O" 4>sN:œO>ixbbcc"22޽{; 7ԩ=| 6^z9zY}7^z l#hҤ tԉ"*K/Dhh([ȑ#hDDDDDDD.7oޜ>n-Z0m4nvI>} ?On6K;'>'͓p'z{O5=X6Tsƍի]6Jxx8˖-:'99R{m{"R9"RiGPEAYgfX5k,r'x{SKDDDDDDD]F`|Ӈ&MpL¢Eu )F۶m˶CfW[}9"RiGPEAYgf'|SO=M7ٳgm2e{BJp̙P.MާX"vU|Ν;zvڜ?LͅD#R+"R)GPEAYgfV-/?Lxx=N!%֭[wr;wnۘL&222]?}t"##ߟ +Wd̙tЁ5jϧCDDDNB){L&;v(nرtؑHvʊ+JuLqrQD*(bʣ1(rgyGyL, 6l`ټ[|8`DEEm'm`}>C=C=1cGgܹ[ݲe /"+VI&fj(]ĆĉӧOu|58SN_O \yJ<(b ʢ834ƎKvv6'N… 5M2uTFaSH ˾4~'~߳g3f`С{$''c2bɼ˘f"##ѣӧOgoϏCVN:Vx{DŽ hҤ ޥ>n~]׼8{WWڶm[ ֬YS4j(/^̘1cX,,[ȹFѣKul>L6mذa)))lٲ8 4{1s& łd*rtO֭˷cHlUYzN8aSH .]TAr4m0o?2 8غ*wEGPEAYgfVYjUwQ|}}˿sep=);Lj]c+)S48 QDJy1QEqfv[ogeuW_}ŋV+((;MkmbhaYFxAAULL^U("bWʣq("Ơ,33Y,KEV\&f3>>>rlt޽|;? u_aVs^W*G+Q8GcPő*{\{FDӉXX-[,j` QDJy1QEqfvabccIII)4"߷i֑#EPEĮGPEAYgfw^|EڵkGF0]u9o7'vwq˷s%Raʣؕ(bʣ1(r ԩS9s&w&66+W=N! 4.>ZDDDҘL&222*|)Sb}ҤI<,^gyTٿ? SNDDDµK͍޽{PRQQQ;@uǎTH("vS^{5?É$>>g͚KU֭[:t(5SN”GcPře~oVZN͚5 _`=N#ŨD|m/8aa;H51ECy,d2zj:Dnӧm.,bcc{ e…Edd$/23gd޼yo\tڕozfZha)<(.#׿rJڶmK)]ؿb*G屰|FO>Zʶ-/_GybbbXb]k5kk׮%22gyGPqʣ1(2뫯b|8QHHHpuKpEnkn^^^Ů+صkFw|4oޜmnF\s)"Ơ,3,ZjeCI9l۶bB QD즺1'bcalϜⷝ9s& ӧhԩS,\믿}zzzj '>>xۼ'Ob/hr.y1:eQ]X&MW^… 8T5.kXBB`53$|Qj׮M޽ʼnT3vɣ؅q>z]7oT}]"Eq<8 eQ́u0RN9%]+PZmXXp0V"Ր]("vyɁ/ܼs&<vWyy"rE(. W^yrJLLiӦ;'rWK\G WUXzn׿%R,WΣ3Qř٥g޽Dxx8;w2i"wlDDDD. 
FAll,X,0`s̡aÆ8.«G`%Rnj܋+10оۉT6WΣ3Qř._1ܹ4Μ9Î;0#{BJo]F(RNvˣTXuɣ;#GZy%FT](btʢ83܅0888-[ƞ={X,3p@{^X"""""""b@vi`4h !s 5D*ny SECy1eQY.!_ l6ZNDDV)r%(l݁X"bUطtI2[E”GPEAYgV֭[]?x`6o\SHU˻HJrl-""""""""TuIj֬Y5jpԩBJcǎ;Xhh;H5a(ڵd2{n>crrrxUN:;XXXX"ef>>yGE|rsYp0$&> &-G)7Q8GcPő*{%͛7租~"55׳n:RSS駟y {Ϝ[="Rnʣq("Ơ,3%Wk׮:8Rx8,]j]޵ 5rl="""""""RUx8^ӦM{@M.Rnvϣ(bʣ1(rjղ)7QDMy1QEqfj`ÇH="Rnʣq("Ơ,33\kҤILƍ}lllM&{ª]?;v@nT)"bH&Roq p233i۶-x$$$<ӴoߞP?qҥR(""""  99ؾ}5ٻwo}ڴiSC?SSI_EUJ4iʕ+Kqػw/r-7T_b۶mű{nNZ;Q8GcPřUF 7nl{4lݽ *5cǎ;/of㋸JɣTw}޽{Ӷm[fϞ O?Ddd_~IBB 4k24iR=<<:t(& =zpСRճuVHZ0L :ίPECy1eQ!XI&hт#F/;w&00nBl.pfgϞA̗bXUJҘL&V^͒%K_ʱc:t(OFLLL?aÆ]h"Ν;ǥK3gN/?”GPEAYgVQꫯh۶-'O7ޠW^ܹ>00O?(믹_~E㭷W_-M[.]ta\xzѢE ]ԼysrssmɁȠnݺmۖ-[;Gcǎ$$$`6 ""͛7KS<<cVZe_("ơ<(d6)wܺuҡC[4zQkۻÙuNțNζ9D\PQ$'bcalϜⷝ9s&`mNӇ$nv [,̞=e˖ۼ7or듙iڟoĉKGPEAYgfK(++ݻwӷoRe˖ M` YYp:"-X?WFSBQXծ]޽{s)>#yW9uO>m^{Wܢ0a-[d򆮗$==뮻wwwrrrx'J=+3%O?4Æ Yfoo}v7o?ORR_}SL!$$.]7|ot*:ю9B__.ϝ wesJˣ\ӂ0|8O+7d޼Xe28w^^^;ؕ(bʣ1(HnVbb"#G$55 ңG֭[g Yrrm=K.OD:udСz UnݺsDuX"PiyXG^O2pm^剃("ơ<(p 9s攸~֬Y~8qb4h)ʝ#DdG)ѪU/#n׿}iRQ8GcPř~wqț|ʭ8ED]X. <}8{,1ZN:1h v @ll,&]2vXh׮ׯ`̘1FI3#G2r ̺ur-p]w1bڶm[IUyyдi烂8.qQ8GcPř*uPva].Ckڴi< 2KKƍyǙ:u*Ǐgʕ<۴iİo>}Y~afϞ͗_~I@@zfJzzz=zz 7JZE{ r%֟yU])"ơ<(L ,7JRԨ;Z$%%#0x`/^'O/ !..L<{)ݻs]wߓ¤I7n5kyGiذ! 6n+r;Yfs=믿FwUQJ#?u`<(b ʢ83́%֥ l@ а/uݝuֱuVpǏ͍3gİa,oÆ X,|A͛DŽ xWJUĉ+rܶܳgO[mV㋈HS]RWi:M:7  /O{ mٳ4hfkDvv6iiiYR /0k,BCCfΜ9hтݻ_xxxMOO禛no' ЧOR&rJϣ(bʣ1(t 񩼃&?i9f>"ٓŋsE 6m 4'((뗺5kXٳ6i'|RWvm_͛;v,_~e-SQDDy1QEqf&ꛍWOfvt9e~kD*ڬ*SbFx7i^N(RTZE̔GPEAYGhE#xV/4/fv""""""""D ,Z9 g͙+ۉPy2SECy1eQX. 
55r\'q_>+ۉPy2SECy1eQX.ҾmTz VER bʣq("Ơ,3SUF7wxy4p˄iwTZE̔GPEAYgw!t{['l?^tՉ.ƍ+7i⟡W"E:t ""pJUc=FHH&;vXqFzMǎ_-19JE\(b ʢ8.@*ʾ!, v킭[!3<<"NUR?w]c} }4 @hh(xbdFͣHu<(L ,)u:t(OFLL | !!!l߾m۶{kg餤?EvZj׮]ڵGruѹsg^z%rrr*t ШQ#ϟٷo_/MQiӦNvuk͚;<ʕ+4i?#գG,Y &xb֭[~֭K.kqUj/_f,YիWvZf̘Q.Z?.]0m4C͚5P(R("Ơ,3SW:t//ᅢRuqU+rr 6f϶,nҎ;;v,-]׳gO޽;ϧk׮Et!z"22W7˨yyԩSOOObbbذaC+e_|5۱cG~g/9~8Sy)"Ơ,3STjԀ=ǏÑ#Uwn'Py,0F >$nv """l>|///n>#GFFӳgOl¾}ϋi͚5dz5KKNN˖-SN=zȱc^'N-gԭ[믿uUuExʣ1(;yj!R-XÇCbb瓒_9uO>mŋ%**Hz;#7pϟSNHGcPř7n\'z"TUo_W?n2Apu;!>HGcPřvU'՝ETUytwSlb>e&p!>HGcPř% ͚Yׯ]K-Rbb`Z#GV-HKu]H0\E1Q8GcPő*{,pGPɔ? %uh9"UpyƔGPEAYg pt ]};*fg]>pvrl="UyGPEAYg pwwwt eR:"Ռ(bʣ1(rGqt ŻsT"UyfGPEAYgTf͠Kpc رK(՗j8QQECy1eQX&Md2x4nܸ}~7e˖L>5GP/#?aeTQQECy1eQX$''۷o/vÇ3tP˖-[c=j(1͎.dЮuy*HJrl="yFGPEAYgfV5hܸѰab>}:͚5cʔ)VaŎJf2e#R GjDy1QEqfl`߿&MТE FCvڵ ĉՋӧO'hԨQ5jDvv6El.pf7ovt ֮]7m[O fϞR6يg1f._o+#s999$&&йsgu[Ç9wu!,,+ 6mJZ8|0`c8{,kצcǎlܸ[. <<'NF͚5ҥ ׯ Jjj*ƍͥA4hЀ={>餤Ӹqcve̞?~wڕm۶/ZhKH2[Iy]ɓ' ~mЬY3= @N8x xzzjQF :tѣAٴixzz>?~3g^;"::mذ!ݻmrN:dw#Ο?Ozz#}G;*#f3/_6w}GTwę3gX~#}G;"000Y=6h Zn'|Rh]~ܹ3SN=pBn.\@͚5 퓕EVVwLpp0x{{W΋Dǎ#88e\[R4k[hѢ͛7gĉ< 2IJJÃ#G0ydwիeL8TfΜu]Gll,ׯ\tgyf̘qlϢAff&5kLJorJ"""v4oޜ}Q~}^u~ab8NGj@y1QEq$ٌO{/ؽ{w={lٲ-]"WkۻÙ9D|MB|e j$yŋӼ/;999}p&Os=`wޝ&MĸqYˣ>JÆ iذ!v[y#;w.<#%MS\QPECy1eQXO?4f >?ѣm?C9rzݻw3sLO;%T&71cg*.uחuن^pFV-]ڵkFGɻe^33;;z7o&LW^)U-'Ndٲe,[{zxǸ뮻۷/6LU\SQ)"ơ<(p DFIv툉VZ[͛lF׋O믿·~ȝw騗 %6u 'c!7]駟faX0Ldggܹs \z-/f"44h̙c.$ 8U6<ӧOgŊxyyѣGR'""""""" 79sJ\?Q;]w]KǪ;DÎ. "~oxy[n={d| 4+V0m4^x}ƍG[Y={&m޽{_/loeѢEtԉz&MTb|NG<(b ʢ83ĽUt"1Gۻw/ڵst[>8\3́)% *".Ny1QEq$]ٳ.trsW\%\N(btNGj@y1QEqfj`Zj99 ~1n^`]jUUV%bWNGj@y1QEqfj`HGP:59se;'4yGPEAYXd tؑ=zuVG6lJNu]NI9MEQ8GcPZΜ9}_Ͷmۘ}:lڴ֭[k]xmڴԩS[ ABBB%k⼼]B ?c/fW~@ ZD*ȩ(GPEAYtM#Gdʕ 2+VРA~a4iK/tc0|{9Ν;G>}f͚U~i 8pK({4صemjHՉTQą)"ơ<蚲ٱc$%%X믓Ӌ]f_Я_?bccټy3?8~BG,q 7whߺ|uw{抈X !0"fyh׮_~% **֭[W_woQFѣG3z%''h믿_O֭dX,.f3>>>rl6;e6 ywĘ?bb[H99}E\(bʣ1(cxqHL.(N-?=6lӓ9s0yd֮]k5c>>>~ΦgϞ|GV5Uڿ?mڴqtp-pضֲDE\(bʣ1(a>:jy*͛c>*{X. 
--%TСгuy.=۱KQE("ơ<h|99֑WE 5{'U7j`eu*&fﯼ/;rr<Q8GcPoժ ǎYnrQQQ.> n.:3g:rp<Q8GcP/9پ۹5\]գ^23WH9TE(bʣ1(wvΕ\j[IIG\*"NNy1QEz d`vՍX.aÆ.|KX`\."NLy1QEswSlb>euF ,}{xa 0qbc2ȰcaDFF-[w^Ӯ]; ?m4h߾=;v$SKˣSECy1e9yдi烂8.GS۷%kAٳ y^{x㉎?ޮ];n~'WhE߲n:v˩Y^؛KQI)"ơ<3f-u+L}Xj:ubϞ=??K~GFMzzzi~R%Ue(℔GPEAYg 8uꔣK<{Xϝ d2ѲeKvō7իi߾=gΜ)gϞM~wf͚1rH禛nbÆ >KQ("ơ<(L ,{vֹccKk̙$$$ӧO1Lz뭼X,;ưaꫯ8q.\(tm %5%KoFN}<1ϣQECy1eQ>.k׮.r´i,bՀ`N|:(rڵkӻwoG}Dpp0۷oW^tؑ.]tؑ~/2x`:uu]Wx뭷 bڵ3 |IN8Axx8QQQtM%yq"ʣq("Ơ,33Y,Ef|||HOO٦MvtnQ7 ڰHq'|nI7q۽^PQ("ơ<(TދF`s99Wv;Dx9HUyqʣq("Ơ,3Sԯ_%TU 5=OYJHHd2b"d2a2J}1Lj.!R(,GPEAYg pt .9p&듓={6g.׾ 6?{l}Qufd2QdE9~8 (նcǎDFFҷo_Kŋyg\7sL:t@5?YhСPsyqʣq("Ơ,3S޽%T@8Nb[ϻYaY̘1777ٳgiժuϏAXL&]veرѮ];֯_1c 3f #Gdȑe/[.#F= PlȚ4iʕ+KܹsٶmL0xTz뭼;Ebܹ5к-[//sN֭[W?CE(bʣ1(з/{9|dsq?۷lǝ6m?0C aҥҸqcqNYr%>`6mD@@111۷g}~ٳg_zfJzzzn߾e˖ѲeKbbb+ɻK޽i۶mO?Ddd_~IBB 4k24iR|}}m饾Y>|x:uDXXXz0aMXxzz""""""Ryj86m8Jm=I}I im'>8 ?sF:ueݺulݺ .0~xRSS9s&K.vdffطaÆWvv6|y1a^yR'l2-[fkP̞=3vR*Pkٲ%}aU\I&?R^2W_q1xJdl޼%Kzj֮]ˌ3tGg<(b ʢ835\ɓ']B^/L{+19O2> #!|ey?cO?b`2&--s璕U^xf͚Ehh(̙36?VI8p $**>#.] }i[wرcYh￟+W9kKZ_Z͛7;N:xzzÆ tGSECy1eQXtݡ9? 瞁@]4sp}6zŋx" bL6 T[ͮY={&mO*8<3fvʔ0{gN٘9s& Ӈ$nv """J7oWFbҥ撓òeԩS'""""""aX,Ghfvt9RV/}qG&#I2LU-X?ԩyM&'OfѢE:uW_}#GꫯӢE ۶;v$::?vwy'/^͍ yg͚ŏ?B{9Μ9CZ[.=;w&77~wwwԩSmHTK..qNZ/L+a`JhԿjjrb o+ĪĹs1p!OncVϣ("ơ<(Tދ.!t/_vt u1\c3WXG^{'J!{衇rjGQECy1eQX.%8V@[HFjUbcǬًb7|]vRP(b ʣq("Ơ,3Sn.a_m(&pD:vKr)vQ@GPEAYg عsKp,7wxy4p˄d3t>XPj*0оUG>"<(b ʢ8355 ?c۠?ƕ? &|c@n._Ц <ZG߾ַTh6 ۉT݅ ajj* 4ptƐVY'lhݺCxmHO7xQpLwB(8{e݅("ơ<(b ʢ8B(\p%;4!#?WW„ P3ggm[5طث11&UӦ R4GPECy1eQX. 
Y3jMoТc`X j&&`J?V4GPECy1eQXR}5onqu+|s;w­BްbEmdC0r +''wu*gu >u뚹\JEy1Q8GcPő4cG5k`|߬ kW8Q8GPEAYg tt dNm|ub<˗C^0t(lДGPECy1eQX.%ww5:֬Yвe]ۭg\Ey1Q8GcPř5k\W;}W׋Adu+QEy1Q8GcPřo\_͚oLM[ᦛ`j(<(bʣ1()ڵ2-Y}@tOP8O>=z߫U("ơ<(b ʢ835\@vvK<<ࡇ`~dWߵpV{#P<ٺT Q8GPEAYg %b^=Xqqng}.Ͼ}s  K)"ơ<(b ʢ835\K1sc͘Wп\GNh|<(bʣ1( z뭷0L<nd*سgO`.An]+Ax%4npww*E;  g['t1ڰCAp0odf:\ zG8p`ܹ3p \m0,((%3NY/!,,˰hu4jG?@VJ62Q8GPEAYgVќ9scƍ>00O?(믹_~E[oꫯz~ӦMԭ[.]{n.^HzhѢ۶myr1"##9pԭ[m۲eΑ#Gر# Uk>>4k֌۷Bvv6t҅={phժ[nYf=zN:qA222$446 6((5jpe:pQ}lڴF6""Ǐsj֬I.]X~=5ۛFJJ Oݝh6nHnn. 6ߟ{жm[Μ9éS0Lt֍͛7?5߭[&##'NЭ[7tcZlIff&Ǐ **;w7!!!>999s۷E֭ 88c92`y3̞MS!1&릛8ҽ;]iߥ ǎٳԮ];rܸqc֭k{9qiii|||lwhh(>yw hРmλ6mڐNJJJϬ?7f׮]mw׮]ٶmYYYl̶hтK.d ɓ'|G>|sQNlwӦMUUQvm aĿG;BUaX0w}GTwđ#GHLL4wGR&8;v,]N:п"##2eJ3l0L&/.r}VVYW1w^#_ݻ; qU99  o ww 75kVu<(bʣ1(Hfr^ u ͛III!**5jPF ~7>CjԨANNNӣG[G(kۻCD¬Yp$,^ os,|tuά?z;0sن;v,<쳴o߾T>|8iii믥ھ]@Gp.CL ε622 oSu0tuSECy1QEq$U^=ڷo_Qn]ׯok^==ڶϔ)Sؿ?;wg<裎zU.m*~ ))`amZ9z{ a](bʣq("Ơ,33$ג\ t.]駟&)):u?СCXeJ/jN"Tuw |9,\O[ΆK{wvfhL&{QECy1QEqfQ­[&1lw BqӬupֆRECy1QEq^X999; Y,omd}\ml!uXXy +7vʣq("ơ<(Rs`Ilڴ%\;kmpLAZ]?h̰ys{~V~i)"ơ<(b ʢ835D1ZΉjOƍ n};L 7ֻNvYGusIܥ@G R1O [XGa#lܘߨp~h>nzCSECy1QEqf +55 8 ʑba ߶KfV^Yuu^<(bʣ1(HK8xK<0j̚IIK { )|¸8xmZЧvB<(bʣ1(t 8 ڷ>z 23aY^l]z5&Ԭ ݻCGϞ%""""""eKqK322rt"wnǾ}o{uCwokCǧ%("ơ<(b ʢ8RE{/j` }Ѷm[G!b<Ǐ7bcaջw#$|("ơ<(b ʢ8RE{/9s%S&F+WZZokXھO>XiUQ8GPEAYg y+6-JLϟ+kV>9ͳ>:gVt4th֬(-Q8GPEAYgKqKENΝZY+I@յk^'9Xׯ{.CX/)\&uȵk*e [D|1QEq$́%"RYݭͧH?ɓqaVpٳk tΝU+qjj`F`Өr52WcN,_^SIfV.FYVe/GĕGPEAYg pE\d]ժɅ4<Ճ;`&ز`6xǏ[?DD@СCπ}]".@>(b ʢ835\uA7*Ov6ۗڲ8{22ɯ__ 6ڷ6ի$磈q("Ơ,3SKD*ԨǽZX !!mu֡C?u ~ZaRZhAIHH)}Toʕ+߿?=*{͍z-_U8X. ,,%eʣ-ZX11gd]; a};>Ƒ#ǒ%- -fr6WO_F/^䮻˼={wyQFqq;V磈q("Ơ,3L. K[7ӟ`rO+`TxAѣKϞu`,x9vks0xIck|;3fF>}8{,1ZN:1h v @ll,&]2vXh׮\9fL&Sǘ1c9r$#G,W999L&jժb磈q("Ơ,3,piZn2DJc@\b'`ϞG #;>BBҷnm}-l  bڴi< 2Iƍyȑ#L<|իW۴i_=111̜9g}X~anF.]3 (yllW$ҥE'$h޼f͠NyU ))Gy!Cxbj^Ӽ$''۶+oxx8'Of̜9н{w1b)))L4q]G}/뮻/_kFVV9<{ャ_77 Σ?ECy1eQRd]lLJtVTD\_n.?nlAϫϝ+q5*Ժz98Vd [M^^=ݱX,,_h.\7nnn̘1 FVVX `k2QF8q8 pJa}3E6zŋ4h+V`ڴi 7s}5kXٳ6aֵ4hЀ3i$^uׯo. 
ϣ QEAYgKqK:D˖-]< ǎnl%&?of]j??/7fFx7i.'S(b ʣ1(HPtty6mX,IIZl.,9)좖4o)T IQDAex FG.Y3|?ϳw5q}wfjjK/]үNNE-w ^EDDDDI ?Vii) "hpO҄ḿ G@ rptcwh{$+Яcy$HE2$΁E"́ yB@*^ } xZϣ7(pTTܻo ]ۻS,,BVsam..~G0D<Hƌ,P\\;I(.&l/t`T u07=BJ55@q4zKWCý_rphV֤Gy1D<Hƌ,0_> (g?5W':FjWZEiKzZQV&-wߝ-J( ]ݞxEH0D<Hƌs`"""&PUv؝llZ\\Zokn7^DDDDtoÇDzB!lmWί=wU{FXEãyF1D<Hƌ,Р\*D+GKKCZ!P^޲U\,^_JJQ_ NBHNG"y`ɘe "jϟxm۶FAAT*"##qFȑ#7n"## nnn裏9s((͞=III9s&`ƌcff&.]!Cí[{?ƍ?Lȑ# "j<G̣"-HA[j |/,*s4S(\䲱}2dp Xp!bbbcX[[K,J¥K~z̝;?v<Ø6mlق+Wȑ#?>&M:<(** ܸqC.;vLٳIgϞ}>_Dcx뭷lٲ.?BtEF(++a'NM":H>G2)Ҽbw:u_XZ6KKi;X߶cn.Z37o{io>ʹxwv';{U`mH6!uXDDDDDrgfv{XW2zU[+ͺa))(T(T_ߵ3Ws3Kֲx/A6/'$" 0t H$#Q'YZbibv !M\̪ZA*j55kl\6BCU3^떱F2f,`AD<H$zQ=U7ѺF[Ժ^[ghH5wvݝ"Wg((4;61c "0Dr<ɇ:y׎〒vqtWB'")-ɓnm1uLo w!߅LG$#|0D"`y$摺,x n5W9\f#E2f,`cT1D<Hp6t/Hƌ???Cw0D<H$"3L@FF@DMG"`y$f XDDDDDDDD$k,` DԄy$H>G"y`ɘe4@DMG"`y$f X&ʕ+5ay$HE2f,`)Н0r888 N "0Dr<H$"RWk/e~7Cw0D<H$"3L@UU@DMG"`y$f X&] &#|0D<Hƌ,h.QH>G"`Y$c HOO7t H$#|0D,1u+>> K.mwd >* xzDDDDDDDDd[:u6oތvx"}Q3?3^|E,^;v襞@DMG"`y$f, X5k|}}a`ܹx[oRo LD}H$#|0D,1{ ǏǏĉubbbꢬ\t] &#|0D<HۑSN}aa!uЀׯӳ>~_^^޵NQUXd J~ B{!D+hOII """/_ :ϟGee%lll0h oookEyy9T*TT*.\ Õ+WpMXYYaС8y$8< ΝCII ,,,0|p CCC\ٳgQ]] [[[w}_<!CJX[[#88iiiyyy(++JBXXRRRFNN`(((@ii),--'N 8w $$EEEqSNpvvFvv6`РA(--Eqq1 FT444;00(,,9騫#믿joݺqܺu 9g56l~7TUUڻL眽x"***V}+++\xQ~_|7oބRDxxBx}qu\~]{6C~pY9[VV笳3<<<=g#Z8::G{s3"$$׮]g?#gd!G3aiizY}FgDOFh48qBvr?#3Fꫯ6FB333zÆ ?Omۮ]O-,޾^YDhhAD`y$HE2r;S{GyD[l__ʕ+[`ԨQotߏVWT*T*VQQa.QH>G"`Y$c&tlllm>0oG"y`ɘ.ze@^tϞ=8rW_}7nw DԄy$H>G"y`ɘj,Cuv DEEDHNG"`Y$C29 W^^ntNUU0D<H$"RqT,`Dv>>> 骨C%QPP;;;( CwC˗/Dy$H>G"y`Є:>%;G`033%" H>G"`Y$CȫfFwB""""""""[X"""""""""Yc)J^J]!G"`y$f'q'""""""""Y,""""""""5HX"""""""""Yc%$$* Ç?`.GbBW_}^5k j8sa:Kd1b SNEvv6#Qشiaoo{{{5 {ծg #>> K.ն1dX2b}.]UVƘ1c>ހ=#; vڅS入KbʕZc_bb!99=Hd`x7O3DHHHkCbÆ HF#T]]RSS1qD'رc]x:T*;v,IH4#h4l߾UUU5jHdqqqx0~xv摌;@suh4봻@"K.KD}˖-ѣy$mOƨQp-b׮] R,۷#-- Nj?ɘe BmDM޵pBdddwu#Q Bzz:n޼;v`HNN֮gz˗d߿*G2FH-F[Qf-ZwÇmgzx 2?E^" >@rr26n mG2F,`)+++ >i?pxᡓͺ:$''3DL bΝﯳy$2,!jkkE^#HOO.5k<%FlٲexꩧQFaüy 5"VYYkx" ___,]֭1p@[x' 
k"O>_5MvppZB`zɋ/X۷oǑ#Gw1DN;d3hۙG2V,`ӧƍXv-^0ٳ 0t׈LZJJ ƍ~ٲeٳg#)) +V@MM ,XRDEEa3TLҦM:[nŜ9sy$%׮]SO=W0a"0dBaNs`EDDDDDDDD XDDDDDDDD$k,`EDDDDDDDD XDDDDDDDD$k,`a!77 =c?g(v\zaaa=֏#Fnnn:u*8KT + <<طo6oތg-贲2h֭[1i$m!*???$%%!::Z'M3f`ĈhhhUpidffƦAaڵG~~>vڅwys͛Xrg]rp{嘧Oȑ#QPP''^9&7"">gP(8y$q 4ƲeO?iÔ)S`kk {{{<vv5k0tPlٲhonnns|BM6!66j//% J?uV@R!88 pBxzzBR:R/^]w%T_󃃃f̘G.v%TGB}0l0j<(**޽{{{{̜9c !o jC _~ ;̙3Ɛ!CuV!55?~<&M%޼yXjUDz|||ׯᅬDO'''`ʔ)y-[`P*… x?ǐ!CtW.]ɓ <{Ѯģ> [[[㩧׵~zBTW\]v7"""-ADDԇܸqC( nݺvkllÆ G)))⧟~bرmV^-lmm?.Μ9#v-DLLXh8{زe ?pqq";;[o"33S!ŋ'v!.\ ͛mǎY$%% !x78z?OB|^ٳG\tI8qBl޼Yۧwyï}ڴiѣCz;vik~?BÇ w-D``;v8qHKKG..._>ϋ/(w}'rrr֭[RGѫoÇnΝӧʕ+IlذA!EddkscNJ%Khh4I̟?_!DUU8pxEFFO> B!J6lɓ:x?e1o<{LL0Addd7߈d!_~^YYY"--ML0A7N+HJJϟ?HLL9OҾO7oyyymѦ "##~=iӦ!>>?0h !NMMagg3nݺs`@NNP[[LV\HDD Cw7 8 YYY:ujY h Emǻs9 ""/^޽{qA<?~3ر%%%7ȑ#8~8N>X={Rhh(J%ZOwζ߿!pBܹοI{/_333ݻ7n ~!JKK?T9w\jxذalyaΝX|9Hי3g86668p juW_Æ ڋ&">#IHH<#GbڵGCC8M6!++ ǏGxx8f͚ 6 ,رcu.m/=z4mۆ'O>hw5k`ŰGll,jkkR,[ <==1tP/F(X[[_j5 8={{9Ecc#Fr;v ={v7..| k١t9Z>~-lقǏ#""ٳg###NNNmhhh@~~>v܉wyǸqf›o)S`ڵF^^v܉Xf ͛777Ƣ?#-Z󿏝111;w.4vDҥKAD̜9?[+`LJ~m۶!44}ΝERR~;III(X~="##1bbϞ=033#"<<7|^}UˈGHHbbb7=*36mڄ2DGGSg5kXz50o޼vOOOw}L|gHHHncmmGӦMCHH~i0{llذ +#""Kg}йbO(**?3gqȴEDDDDYXX`ժUr,777XWEDDqw""""""""5HX"""""""""Ycd,""""""""5HX"""""""""Ycd,""""""""5HX"""""""""Ycd,""""""""5HJ2IENDB`kanzi-cpp-2.5.2/doc/kanzi.1000066400000000000000000000235521516423635400153770ustar00rootroot00000000000000.TH "KANZI" "1" "Feb 2026" "kanzi 2.5" "User Commands" .SH "NAME" \fBkanzi\fR \- Compress and decompress \.knz files .SH "SYNOPSIS" \fBkanzi\fR [\fIOPTIONS\fR] [\-i \fIINPUT\-FILE\fR] [\-o \fIOUTPUT\-FILE\fR] .SH "DESCRIPTION" \fBKanzi\fR is a modern, modular, portable and efficient lossless data compressor\. Modern algorithms are implemented and multi-core CPUs can take advantage of the built-in multi-threading. An entropy codec and a combination of transforms can be provided at runtime to best match the kind of data to compress. 
The code is optimized for efficiency (trade-off between compression ratio and speed)\. Unlike the most common lossless data compressors, \fBKanzi\fR uses a variety of different compression algorithms and supports a wider range of compression ratios as a result\. \fBKanzi\fR is multithreaded by design and uses several threads by default to compress or decompress blocks concurrently\. It is not compatible with standard compression formats such as zip, gz, zstd, br, lz4, xz\. \fBKanzi\fR is a lossless data compressor, not an archiver\. It uses checksums (optional but recommended) to validate data integrity but does not have a mechanism for data recovery. It also lacks data deduplication across files\. \fBKanzi\fR generates a bitstream that is seekable (one or several consecutive blocks can be decompressed without the need for the whole bitstream to be decompressed)\. .SH "OPTIONS" .SS "Operation Mode" Help\. \fB-h, --help\fR display this message Compression mode\. \fB-i, --input=<inputName>\fR Mandatory name of the input file or directory or 'stdin' When the source is a directory, all files in it will be processed. Provide /. at the end of the directory name to avoid recursion (e.g., myDir/. => no recursion) \fB-o, --output=<outputName>\fR Optional name of the output file or directory (defaults to <inputName.knz>) or 'none' or 'stdout'. 'stdout' is not valid when the number of jobs is greater than 1 \fB-b, --block=<size>\fR Size of blocks (default 4|8|16|32 MB based on level, max 1 GB, min 1 KB) 'auto' means that the compressor derives the best value based on input size (when available) and number of jobs \fB-l, --level=<compression>\fR Set the compression level [0..9] Providing this option forces the entropy codec and transform. See the definitions of the transforms and entropy codecs in the last section.
0 = NONE&NONE (store) 1 = LZX&NONE 2 = DNA+LZ&HUFFMAN 3 = TEXT+UTF+PACK+MM+LZX&HUFFMAN 4 = TEXT+UTF+EXE+PACK+MM+ROLZ&NONE 5 = TEXT+UTF+BWT+RANK+ZRLT&ANS0 6 = TEXT+UTF+BWT+SRT+ZRLT&FPAQ 7 = LZP+TEXT+UTF+BWT+LZP&CM 8 = EXE+RLT+TEXT+UTF+DNA&TPAQ 9 = EXE+RLT+TEXT+UTF+DNA&TPAQX \fB-e, --entropy=<codec>\fR entropy codec [None|Huffman|ANS0|ANS1|Range|FPAQ|TPAQ|TPAQX|CM] \fB-t, --transform=<codec>\fR transform [None|BWT|BWTS|LZ|LZX|LZP|ROLZ|ROLZX|RLT|ZRLT] [MTFT|RANK|SRT|TEXT|MM|EXE|UTF|PACK] e.g., BWT+RANK or BWTS+MTFT (default is BWT+RANK+ZRLT) \fB-x, -x32, -x64, --checksum=<size>\fR Enable block checksum (32 or 64 bits). During decompression data is verified against the checksum in each block. -x is equivalent to -x32. \fB-s, --skip\fR copy blocks with high entropy instead of compressing them \fB--rm\fR Remove the input file after successful (de)compression. If the input is a directory, all processed files under the directory are removed. Decompression mode\. \fB-i, --input=<inputName>\fR Mandatory name of the input file or directory or 'stdin' When the source is a directory, all files in it will be processed. Provide /. at the end of the directory name to avoid recursion (e.g., myDir/. => no recursion) \fB-o, --output=<outputName>\fR Optional name of the output file or directory (defaults to <inputName.bak>) or 'none' or 'stdout'. 'stdout' is not valid when the number of jobs is greater than 1. \fB--from=blockId\fR Decompress starting from the provided block (included). The first block ID is 1. \fB--to=blockId\fR Decompress ending at the provided block (excluded). \fB--rm\fR Remove the input file after successful (de)compression. If the input is a directory, all processed files under the directory are removed. Info mode\. \fB-i, --input=<inputName>\fR Mandatory name of the compressed input file. When the source is a directory, all files in it will be processed. Provide /. at the end of the directory name to avoid recursion (e.g., myDir/. => no recursion) Operation modifiers\.
\fB-j, --jobs=<jobs>\fR Maximum number of jobs the program may start concurrently (default is half the available cores, maximum is 64) \fB-v, --verbose=<level>\fR 0=silent, 1=default, 2=display details, 3=display configuration, 4=display block size and timings, 5=display extra information Verbosity is reduced to 1 when files are processed concurrently Verbosity is reduced to 0 when the output is 'stdout' \fB-f, --force\fR Overwrite the output file if it already exists \fB--skip-links\fR Skip symbolic links \fB--skip-dot-files\fR Skip dotfiles .SS "Examples" Compress recursively all files under 'dir' in test mode (no output file) using a 4 MB block, compression level 4, and extra verbosity. kanzi -c -i dir -o none -b 4m -l 4 -v 3 Compress foo.txt to foo.txt.knz (overwrite it if it already exists) using the BWT, MTFT, and ZRLT transforms, the FPAQ entropy codec, and 4 threads; generate a checksum for each 4 MB block. kanzi -c -i foo.txt -f -t BWT+MTFT+ZRLT -b 4m -e FPAQ -j 4 -x Compress from stdin (--input option is omitted) to foo.knz using compression level 2, 64 KB blocks, and the default number of threads. cat foo.txt | kanzi -c -o foo.knz -l 2 -b 64k Decompress foo.txt.knz to foo.txt.knz.bak using 2 threads. kanzi -d -i foo.txt.knz -j 2 Decompress foo.txt.knz to stdout and delete the compressed file. kanzi -d -i foo.txt.knz -o stdout --rm Decompress foo.txt.knz to foo.txt (overwrite it if it already exists) from block 5 to block 11, using 8 threads and extra verbosity. kanzi -d -i foo.txt.knz -o foo.txt -f -j 8 --from=5 --to=11 -v 4 .SS "Transforms" BWT: Burrows-Wheeler Transform is a transform that reorders symbols in a reversible way that is more amenable to entropy coding. This implementation uses a linear time forward transform and parallel inverse transform. BWTS: Burrows-Wheeler Transform by Scott is a bijective variant of the BWT. LZ: Lempel-Ziv implementation of the dictionary-based LZ77 transform that removes redundancy in the data. LZX: Lempel-Ziv Extra.
Same as above with a bigger hash table and more match searches. LZP: Lempel-Ziv Prediction can be described as an LZ implementation with only one possible match (no offset is emitted). RLT: Run-Length Transform is a simple transform that replaces runs of similar symbols with a compact representation. ZRLT: Zero Run-Length Transform. Similar to RLT but only processes runs of 0. Usually used post-BWT. MTFT: Move-To-Front Transform is a transform that reduces entropy by assigning shorter symbols to recent data (like an LRU cache). Usually used post-BWT. RANK: Rank Transform is a transform that reduces entropy by assigning shorter symbols based on symbol frequency ranks. Usually used post-BWT. EXE: A transform that reduces the entropy of executable files (X86 & ARM64) by replacing relative jump addresses with absolute ones. TEXT: A text transform that uses a dictionary to replace common words with their dictionary index. ROLZ: Reduced Offset Lempel-Ziv is an implementation of LZ that replaces match offsets with indexes, creating a more compact output with slower decoding speeds. ROLZX: Extended ROLZ with more match searches and a more compact encoding. SRT: Sorted Rank Transform is a transform that reduces entropy by assigning shorter symbols based on symbol frequency ranks. Usually used post-BWT. MM: Multimedia transform is a fast transform that removes redundancy in correlated channels in some multimedia files (e.g., wav, pnm). UTF: A fast transform replacing UTF-8 codewords with aliases based on frequencies. PACK: A fast transform replacing unused symbols with aliases based on frequencies. DNA: Same as PACK but triggered only when DNA data is detected. .SS "Entropy codecs" Huffman: A fast implementation of canonical Huffman. Both encoder and decoder use code tables and multiple streams to improve performance. RANGE: A fast implementation of a static range codec. 
ANS: Based on Range Asymmetric Numeral Systems by Jarek Duda (specifically an implementation by Fabian Giesen). Works in a similar fashion to the Range codec but uses only one state instead of two, and encodes in reverse byte order. FPAQ: A binary arithmetic codec based on FPAQ1 by Matt Mahoney. Uses a simple adaptive order 0 predictor based on frequencies. CM: A binary arithmetic codec derived from BCM by Ilya Muravyov. Uses context mixing of counters to generate a prediction of the next bit value. TPAQ: A binary arithmetic codec based initially on Tangelo 2.4 (itself derived from FPAQ8). Uses context mixing of predictions produced by one-layer neural networks. The initial code has been heavily tuned to improve compression ratio and speed. Slow but usually excellent compression ratio. TPAQX: Extended TPAQ with more predictions and more memory usage. Slowest but usually the best compression ratio. .SH BUGS Report bugs at: https://github.com/flanglet/kanzi-cpp/issues .SH AUTHOR Frederic Langlet .SH REPORTING BUGS https://github.com/flanglet/kanzi-cpp kanzi-cpp-2.5.2/lib/000077500000000000000000000000001516423635400141735ustar00rootroot00000000000000kanzi-cpp-2.5.2/lib/.gitignore000066400000000000000000000001641516423635400161640ustar00rootroot00000000000000**/Debug/** **/Release/** **/*.obj **/*.o **/*.htm **/*.exe **/*.idb **/*.pdb **/*.ncb **/*.sln **/*.suo **/*vcproj*kanzi-cpp-2.5.2/src/000077500000000000000000000000001516423635400142145ustar00rootroot00000000000000kanzi-cpp-2.5.2/src/BitStreamException.hpp000066400000000000000000000030121516423635400204720ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #pragma once #ifndef knz_BitStreamException #define knz_BitStreamException #include #include #include "types.hpp" namespace kanzi { class BitStreamException : public std::runtime_error { private: int _code; public: enum BitStreamStatus { UNDEFINED = 0, INPUT_OUTPUT = 1, END_OF_STREAM = 2, INVALID_STREAM = 3, STREAM_CLOSED = 4 }; BitStreamException(const std::string& msg) : std::runtime_error(msg) { _code = UNDEFINED; } BitStreamException(const std::string& msg, int code) : std::runtime_error(msg), _code(code) { } #if __cplusplus >= 201103L BitStreamException(const BitStreamException&) = default; BitStreamException& operator=(const BitStreamException&) = default; #endif int error() const { return _code; } ~BitStreamException() NOEXCEPT {} }; } #endif kanzi-cpp-2.5.2/src/Context.hpp000066400000000000000000000104661516423635400163600ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once #ifndef knz_Context #define knz_Context #include #include #if __cplusplus >= 201703L #include #endif #include "concurrent.hpp" // definition of CONCURRENCY_ENABLED namespace kanzi { #if __cplusplus >= 201703L // C++17+ version using std::variant typedef std::variant ContextVal; #else // C++98 / C++03 / C++11 / C++14 struct ContextVal { int64 lVal; std::string sVal; bool isString; ContextVal() : lVal(0), isString(false) {} ContextVal(int64 v) : lVal(v), isString(false) {} ContextVal(const std::string& s) : lVal(0), sVal(s), isString(true) {} }; #endif class Context { public: #ifdef CONCURRENCY_ENABLED Context(ThreadPool* p = nullptr) : _pool(p) {} Context(const Context& c) : _map(c._map), _pool(c._pool) {} Context(const Context& c, ThreadPool* p) : _map(c._map), _pool(p) {} Context& operator=(const Context& c) = default; #else Context() {} Context(const Context& c) : _map(c._map) {} Context& operator=(const Context& c) { _map = c._map; return *this; } #endif ~Context() {} bool has(const std::string& key) const; int getInt(const std::string& key, int defValue = 0) const; int64 getLong(const std::string& key, int64 defValue = 0) const; std::string getString(const std::string& key, const std::string& defValue = "") const; void putInt(const std::string& key, int value); void putLong(const std::string& key, int64 value); void putString(const std::string& key, const std::string& value); #ifdef CONCURRENCY_ENABLED ThreadPool* getPool() const { return _pool; } #endif private: std::map _map; #ifdef CONCURRENCY_ENABLED ThreadPool* _pool; #endif }; inline bool Context::has(const std::string& key) const { return _map.find(key) != _map.end(); } inline int Context::getInt(const std::string& key, int defValue) const { return int(getLong(key, defValue)); } inline int64 Context::getLong(const std::string& key, int64 defValue) const { const std::map::const_iterator it = _map.find(key); if (it == _map.end()) return defValue; #if __cplusplus >= 201703L if 
(std::holds_alternative(it->second)) return std::get(it->second); return defValue; #else return it->second.isString ? defValue : it->second.lVal; #endif } inline std::string Context::getString(const std::string& key, const std::string& defValue) const { const std::map::const_iterator it = _map.find(key); if (it == _map.end()) return defValue; #if __cplusplus >= 201703L if (std::holds_alternative(it->second)) return std::get(it->second); return defValue; #else return it->second.isString ? it->second.sVal : defValue; #endif } inline void Context::putInt(const std::string& key, int value) { #if __cplusplus >= 201703L _map[key] = int64(value); #else _map[key] = ContextVal((int64)value); #endif } inline void Context::putLong(const std::string& key, int64 value) { #if __cplusplus >= 201703L _map[key] = value; #else _map[key] = ContextVal(value); #endif } inline void Context::putString(const std::string& key, const std::string& value) { #if __cplusplus >= 201703L _map[key] = value; #else _map[key] = ContextVal(value); #endif } } #endif kanzi-cpp-2.5.2/src/EntropyDecoder.hpp000066400000000000000000000024421516423635400176550ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #pragma once #ifndef knz_EntropyDecoder #define knz_EntropyDecoder #include "InputBitStream.hpp" namespace kanzi { // EntropyDecoder entropy decodes data from a bitstream class EntropyDecoder { public: // Decode the array provided from the bitstream. 
Return the number of bytes // read from the bitstream virtual int decode(byte block[], uint blkptr, uint len) = 0; // Return the underlying bitstream virtual InputBitStream& getBitStream() const = 0; // Must be called before getting rid of the entropy decoder. // Trying to decode after a call to dispose gives undefined behavior virtual void dispose() = 0; virtual ~EntropyDecoder(){} }; } #endif kanzi-cpp-2.5.2/src/EntropyEncoder.hpp000066400000000000000000000024511516423635400176670ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #pragma once #ifndef knz_EntropyEncoder #define knz_EntropyEncoder #include "OutputBitStream.hpp" namespace kanzi { // EntropyEncoder entropy encodes data to a bitstream class EntropyEncoder { public: // Encode the array provided into the bitstream. Return the number of bytes // written to the bitstream virtual int encode(const byte block[], uint blkptr, uint len) = 0; // Return the underlying bitstream virtual OutputBitStream& getBitStream() const = 0; // Must be called before getting rid of the entropy encoder. 
// Error is a plain scope for the numeric status codes.
// The values are spelled out explicitly, so adding a code never renumbers
// the existing ones.
struct Error
{
    enum ErrorCode {
        // Parameter / configuration problems
        ERR_MISSING_PARAM       = 1,
        ERR_BLOCK_SIZE          = 2,
        ERR_INVALID_CODEC       = 3,

        // Object creation and file system failures
        ERR_CREATE_COMPRESSOR   = 4,
        ERR_CREATE_DECOMPRESSOR = 5,
        ERR_OUTPUT_IS_DIR       = 6,
        ERR_OVERWRITE_FILE      = 7,
        ERR_CREATE_FILE         = 8,
        ERR_CREATE_BITSTREAM    = 9,
        ERR_OPEN_FILE           = 10,
        ERR_READ_FILE           = 11,
        ERR_WRITE_FILE          = 12,

        // Processing failures
        ERR_PROCESS_BLOCK       = 13,
        ERR_CREATE_CODEC        = 14,
        ERR_INVALID_FILE        = 15,
        ERR_STREAM_VERSION      = 16,
        ERR_CREATE_STREAM       = 17,
        ERR_INVALID_PARAM       = 18,
        ERR_CRC_CHECK           = 19,
        ERR_RESERVED_NAME       = 20,

        // Catch-all
        ERR_UNKNOWN             = 127
    };
};
See the License for the specific language governing permissions and limitations under the License. */ #include #include #include #include "Event.hpp" #include "util/strings.hpp" using namespace kanzi; Event::Event(Event::Type type, int id, const std::string& msg, WallTimer::TimeData evtTime) : _type(type) , _time(evtTime) , _msg(msg) , _id(id) , _size(0) , _offset(-1) , _hash(0) , _hashType(NO_HASH) , _skipFlags(0) , _info(nullptr) { } Event::Event(Event::Type type, int id, const HeaderInfo& info, WallTimer::TimeData evtTime) : _type(type) , _time(evtTime) , _msg("") , _id(id) , _size(0) , _offset(-1) , _hash(0) , _hashType(NO_HASH) , _skipFlags(0) { _info = new HeaderInfo(); _info->inputName = info.inputName; _info->bsVersion = info.bsVersion; _info->checksumSize = info.checksumSize; _info->blockSize = info.blockSize; _info->entropyType = info.entropyType; _info->transformType = info.transformType; _info->originalSize = info.originalSize; _info->fileSize = info.fileSize; } Event::Event(Event::Type type, int id, int64 size, WallTimer::TimeData evtTime, uint64 hash, HashType hashType, int64 offset, uint8 skipFlags) : _type(type) , _time(evtTime) , _msg() , _id(id) , _size(size) , _offset(offset) , _hash(hash) , _hashType(hashType) , _skipFlags(skipFlags) , _info(nullptr) { } Event::Event(const Event& other) : _type(other._type) , _time(other._time) , _msg(other._msg) , _id(other._id) , _size(other._size) , _offset(other._offset) , _hash(other._hash) , _hashType(other._hashType) , _skipFlags(other._skipFlags) , _info(nullptr) { if (other._info != nullptr) { _info = new HeaderInfo(); _info->inputName = other._info->inputName; _info->bsVersion = other._info->bsVersion; _info->checksumSize = other._info->checksumSize; _info->blockSize = other._info->blockSize; _info->entropyType = other._info->entropyType; _info->transformType = other._info->transformType; _info->originalSize = other._info->originalSize; _info->fileSize = other._info->fileSize; } } Event& 
Event::operator=(const Event& other) { if (this != &other) { _type = other._type; _time = other._time; _msg = other._msg; _id = other._id; _size = other._size; _offset = other._offset; _hash = other._hash; _hashType = other._hashType; _skipFlags = other._skipFlags; if (_info != nullptr) { delete _info; _info = nullptr; } if (other._info != nullptr) { _info = new HeaderInfo(); _info->inputName = other._info->inputName; _info->bsVersion = other._info->bsVersion; _info->checksumSize = other._info->checksumSize; _info->blockSize = other._info->blockSize; _info->entropyType = other._info->entropyType; _info->transformType = other._info->transformType; _info->originalSize = other._info->originalSize; _info->fileSize = other._info->fileSize; } } return *this; } #if defined(__cplusplus) && (__cplusplus >= 201103L) Event::Event(Event&& other) noexcept : _type(other._type) , _time(other._time) , _msg(std::move(other._msg)) , _id(other._id) , _size(other._size) , _offset(other._offset) , _hash(other._hash) , _hashType(other._hashType) , _skipFlags(other._skipFlags) , _info(other._info) { other._info = nullptr; } Event& Event::operator=(Event&& other) noexcept { if (this != &other) { _type = other._type; _time = other._time; _msg = std::move(other._msg); _id = other._id; _size = other._size; _offset = other._offset; _hash = other._hash; _hashType = other._hashType; _skipFlags = other._skipFlags; if (_info != nullptr) delete _info; _info = other._info; other._info = nullptr; } return *this; } #endif std::string Event::toString() const { if (_msg != "") return _msg; std::stringstream ss; ss << "{ \"type\":\"" << getTypeAsString() << "\""; if (_id >= 0) ss << ", \"id\":" << getId(); if (_info != nullptr) { ss << ", \"inputName\":\"" << escapeJSONString(_info->inputName) << "\""; ss << ", \"bsVersion\":" << _info->bsVersion; ss << ", \"checksum\":" << _info->checksumSize; ss << ", \"blockSize\":" << _info->blockSize; ss << ", \"entropy\":\"" << _info->entropyType << "\""; ss << ", 
\"transform\":\"" << _info->transformType << "\""; if (_info->fileSize >= 0) ss << ", \"compressed\":" << _info->fileSize; if (_info->originalSize >= 0) ss << ", \"original\":" << _info->originalSize; } else { ss << ", \"size\":" << getSize(); if (getType() != BLOCK_INFO) ss << ", \"time\":" << getTime().to_ms(); if (_hashType != NO_HASH) { ss << ", \"hash\":\""; ss << std::uppercase << std::setfill('0'); if (_hashType == SIZE_32) ss << std::setw(8) << std::hex << getHash() << "\""; else ss << std::setw(16) << std::hex << getHash() << "\""; ss << std::dec; } if (getType() == BLOCK_INFO) { ss << ", \"offset\":" << getOffset(); ss << ", \"skipFlags\":"; for (int i = 128; i >= 1; i >>= 1) ss << ((_skipFlags & i) == 0 ? "0" : "1"); } } ss << " }"; return ss.str(); } std::string Event::getTypeAsString() const { switch (_type) { case AFTER_HEADER_DECODING: return "AFTER_HEADER_DECODING"; case COMPRESSION_END: return "COMPRESSION_END"; case BEFORE_TRANSFORM: return "BEFORE_TRANSFORM"; case AFTER_TRANSFORM: return "AFTER_TRANSFORM"; case BEFORE_ENTROPY: return "BEFORE_ENTROPY"; case AFTER_ENTROPY: return "AFTER_ENTROPY"; case DECOMPRESSION_START: return "DECOMPRESSION_START"; case DECOMPRESSION_END: return "DECOMPRESSION_END"; case COMPRESSION_START: return "COMPRESSION_START"; case BLOCK_INFO: return "BLOCK_INFO"; default: return "Unknown Type"; } } kanzi-cpp-2.5.2/src/Event.hpp000066400000000000000000000060161516423635400160110ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
// Event describes one notification: a message, the decoded bitstream header,
// or size/hash/offset information.
// An Event owns the HeaderInfo it points to (deleted in the destructor).
class Event
{
public:
    // Kind of notification
    enum Type {
        COMPRESSION_START,
        COMPRESSION_END,
        BEFORE_TRANSFORM,
        AFTER_TRANSFORM,
        BEFORE_ENTROPY,
        AFTER_ENTROPY,
        DECOMPRESSION_START,
        DECOMPRESSION_END,
        AFTER_HEADER_DECODING,
        BLOCK_INFO
    };

    // Width of the hash attached to the event, NO_HASH when there is none
    enum HashType {
        NO_HASH,
        SIZE_32,
        SIZE_64
    };

    // Header fields reported by header events
    typedef struct HeaderInfo {
        std::string inputName;
        int bsVersion;
        int checksumSize;
        int blockSize;
        std::string entropyType;
        std::string transformType;
        int64 originalSize;
        int64 fileSize;
    } HeaderInfo;

    // Event carrying a message
    Event(Type type, int id, const std::string& msg, WallTimer::TimeData evtTime);

    // Event carrying a size and, optionally, a hash, an offset and skip flags
    Event(Type type, int id, int64 size, WallTimer::TimeData evtTime,
        uint64 hash = 0, HashType hashType = NO_HASH, int64 offset = -1, uint8 skipFlags = 0);

    // Event carrying header information
    Event(Type type, int id, const HeaderInfo& info, WallTimer::TimeData evtTime);

    Event(const Event& other);

    Event& operator=(const Event& other);

#if defined(__cplusplus) && (__cplusplus >= 201103L)
    Event(Event&& other) noexcept;

    Event& operator=(Event&& other) noexcept;
#endif

    // Releases the owned header information, if any
    virtual ~Event() { if (_info != nullptr) delete _info; }

    int getId() const { return _id; }

    int64 getSize() const { return _size; }

    Event::Type getType() const { return _type; }

    WallTimer::TimeData getTime() const { return _time; }

    // The stored hash is only reported when a hash type is set, otherwise 0
    uint64 getHash() const { return _hashType != NO_HASH ? _hash : 0; }

    int64 getOffset() const { return _offset; }

    HashType getHashType() const { return _hashType; }

    // Returns the owned header information (may be null). The pointer is
    // deleted with the event, so the caller must not delete it.
    HeaderInfo* getInfo() const { return _info; }

    std::string toString() const;

    std::string getTypeAsString() const;

private:
    Event::Type _type;
    WallTimer::TimeData _time;
    std::string _msg;
    int _id;
    int64 _size;
    int64 _offset;
    uint64 _hash;
    HashType _hashType;
    uint8 _skipFlags;
    HeaderInfo* _info; // owned, null when the event has no header information
};
*/ #include #include #include #include "Global.hpp" using namespace kanzi; using namespace std; // int(Math.log2(x-1)) const int Global::LOG2[256] = { 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, }; // 4096*Math.log2(x) const int Global::LOG2_4096[257] = { 0, 0, 4096, 6492, 8192, 9511, 10588, 11499, 12288, 12984, 13607, 14170, 14684, 15157, 15595, 16003, 16384, 16742, 17080, 17400, 17703, 17991, 18266, 18529, 18780, 19021, 19253, 19476, 19691, 19898, 20099, 20292, 20480, 20662, 20838, 21010, 21176, 21338, 21496, 21649, 21799, 21945, 22087, 22226, 22362, 22495, 22625, 22752, 22876, 22998, 23117, 23234, 23349, 23462, 23572, 23680, 23787, 23892, 23994, 24095, 24195, 24292, 24388, 24483, 24576, 24668, 24758, 24847, 24934, 25021, 25106, 25189, 25272, 25354, 25434, 25513, 25592, 25669, 25745, 25820, 25895, 25968, 26041, 26112, 26183, 26253, 26322, 26390, 26458, 26525, 26591, 26656, 26721, 26784, 26848, 26910, 26972, 27033, 27094, 27154, 27213, 27272, 27330, 27388, 27445, 27502, 27558, 27613, 27668, 27722, 27776, 27830, 27883, 27935, 27988, 28039, 28090, 28141, 28191, 28241, 28291, 28340, 28388, 28437, 28484, 28532, 28579, 28626, 28672, 28718, 28764, 28809, 28854, 28898, 28943, 28987, 29030, 29074, 29117, 29159, 29202, 29244, 29285, 29327, 29368, 29409, 29450, 
29490, 29530, 29570, 29609, 29649, 29688, 29726, 29765, 29803, 29841, 29879, 29916, 29954, 29991, 30027, 30064, 30100, 30137, 30172, 30208, 30244, 30279, 30314, 30349, 30384, 30418, 30452, 30486, 30520, 30554, 30587, 30621, 30654, 30687, 30719, 30752, 30784, 30817, 30849, 30880, 30912, 30944, 30975, 31006, 31037, 31068, 31099, 31129, 31160, 31190, 31220, 31250, 31280, 31309, 31339, 31368, 31397, 31426, 31455, 31484, 31513, 31541, 31569, 31598, 31626, 31654, 31681, 31709, 31737, 31764, 31791, 31818, 31846, 31872, 31899, 31926, 31952, 31979, 32005, 32031, 32058, 32084, 32109, 32135, 32161, 32186, 32212, 32237, 32262, 32287, 32312, 32337, 32362, 32387, 32411, 32436, 32460, 32484, 32508, 32533, 32557, 32580, 32604, 32628, 32651, 32675, 32698, 32722, 32745, 32768 }; char Global::BASE64_SYMBOLS[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; char Global::NUMERIC_SYMBOLS[] = "0123456789+-*/=,.:; "; char Global::DNA_SYMBOLS[] = "acgntuACGNTU"; // either T or U and N for unknown const Global Global::_singleton; int Global::SQUASH[4096]; int Global::STRETCH[4096]; Global::Global() { // 65536 /(1 + exp(-alpha*x)) with alpha ~= 0.54 const int INV_EXP[33] = { 0, 8, 22, 47, 88, 160, 283, 492, 848, 1451, 2459, 4117, 6766, 10819, 16608, 24127, 32768, 41409, 48928, 54717, 58770, 61419, 63077, 64085, 64688, 65044, 65253, 65376, 65448, 65489, 65514, 65528, 65536 }; for (int x = 1; x < 4096; x++) { const int w = x & 127; const int y = x >> 7; SQUASH[x - 1] = (INV_EXP[y] * (128 - w) + INV_EXP[y + 1] * w) >> 11; } SQUASH[4095] = 4095; int n = 0; for (int x = -2047; x <= 2047; x++) { const int sq = squash(x); while (n <= sq) STRETCH[n++] = x; if (n >= 4096) break; } STRETCH[4095] = 2047; #if defined(WIN32) || defined(_WIN32) || defined(_WIN64) static const string reserved[27] = { "AUX", "COM0", "COM1", "COM2", "COM3", "COM4", "COM5", "COM6", "COM7", "COM8", "COM9", "COM¹", "COM²", "COM³", "CON", "LPT0", "LPT1", "LPT2", "LPT3", "LPT4", "LPT5", "LPT6", "LPT7", 
"LPT8", "LPT9", "NUL", "PRN" }; for (int i = 0; i < 27; i++) _reservedNames.insert(reserved[i]); #endif } // Return 1024 * log2(x). Max error is around 0.1% int Global::log2_1024(uint32 x) { if (x == 0) throw std::invalid_argument("Cannot calculate log of a negative or null value"); if (x < 256) return (Global::LOG2_4096[x] + 2) >> 2; const int log = _log2(x); if ((x & (x - 1)) == 0) return log << 10; return ((log - 7) * 1024) + ((LOG2_4096[x >> (log - 7)] + 2) >> 2); } int Global::log2(uint32 x) { if (x == 0) throw std::invalid_argument("Cannot calculate log of a negative or null value"); return _log2(x); } int Global::log2(uint64 x) { if (x == 0) throw std::invalid_argument("Cannot calculate log of a negative or null value"); return _log2(x); } // If withTotal is true, the last spot in each frequencies order 0 array is for the total void Global::computeHistogram(const kanzi::byte block[], int length, uint freqs[], bool isOrder0, bool withTotal) { const uint8* p = reinterpret_cast(&block[0]); if (isOrder0 == true) { if (withTotal == true) freqs[256] = uint(length); uint f0[256] = { 0 }; uint f1[256] = { 0 }; uint f2[256] = { 0 }; uint f3[256] = { 0 }; const uint8* end16 = reinterpret_cast(&block[length & -16]); uint64 q; while (p < end16) { memcpy(&q, &p[0], 8); f0[uint8(q>>56)]++; f1[uint8(q>>48)]++; f2[uint8(q>>40)]++; f3[uint8(q>>32)]++; f0[uint8(q>>24)]++; f1[uint8(q>>16)]++; f2[uint8(q>>8)]++; f3[uint8(q)]++; memcpy(&q, &p[8], 8); f0[uint8(q>>56)]++; f1[uint8(q>>48)]++; f2[uint8(q>>40)]++; f3[uint8(q>>32)]++; f0[uint8(q>>24)]++; f1[uint8(q>>16)]++; f2[uint8(q>>8)]++; f3[uint8(q)]++; p += 16; } const uint8* end = reinterpret_cast(&block[length]); while (p < end) freqs[*p++]++; for (int i = 0; i < 256; i++) freqs[i] += (f0[i] + f1[i] + f2[i] + f3[i]); } else { // Order 1 const int quarter = length >> 2; int n0 = 0 * quarter; int n1 = 1 * quarter; int n2 = 2 * quarter; int n3 = 3 * quarter; if (withTotal == true) { if (length < 32) { uint prv = 0; for (int i = 
0; i < length; i++) { freqs[prv + uint(p[i])]++; freqs[prv + 256]++; prv = 257 * uint(p[i]); } } else { uint prv0 = 0; uint prv1 = 257 * uint(p[n1 - 1]); uint prv2 = 257 * uint(p[n2 - 1]); uint prv3 = 257 * uint(p[n3 - 1]); for (; n0 < quarter; n0++, n1++, n2++, n3++) { const uint cur0 = uint(p[n0]); const uint cur1 = uint(p[n1]); const uint cur2 = uint(p[n2]); const uint cur3 = uint(p[n3]); freqs[prv0 + cur0]++; freqs[prv0 + 256]++; freqs[prv1 + cur1]++; freqs[prv1 + 256]++; freqs[prv2 + cur2]++; freqs[prv2 + 256]++; freqs[prv3 + cur3]++; freqs[prv3 + 256]++; prv0 = 257 * cur0; prv1 = 257 * cur1; prv2 = 257 * cur2; prv3 = 257 * cur3; } for (; n3 < length; n3++) { freqs[prv3 + uint(p[n3])]++; freqs[prv3 + 256]++; prv3 = 257 * uint(p[n3]); } } } else { // order 1, no total if (length < 32) { uint prv = 0; for (int i = 0; i < length; i++) { freqs[prv + uint(p[i])]++; prv = 256 * uint(p[i]); } } else { uint prv0 = 0; uint prv1 = 256 * uint(p[n1 - 1]); uint prv2 = 256 * uint(p[n2 - 1]); uint prv3 = 256 * uint(p[n3 - 1]); for (; n0 < quarter; n0++, n1++, n2++, n3++) { const uint cur0 = uint(p[n0]); const uint cur1 = uint(p[n1]); const uint cur2 = uint(p[n2]); const uint cur3 = uint(p[n3]); freqs[prv0 + cur0]++; freqs[prv1 + cur1]++; freqs[prv2 + cur2]++; freqs[prv3 + cur3]++; prv0 = cur0 << 8; prv1 = cur1 << 8; prv2 = cur2 << 8; prv3 = cur3 << 8; } for (; n3 < length; n3++) { freqs[prv3 + uint(p[n3])]++; prv3 = uint(p[n3]) << 8; } } } } } // Return the zero order entropy scaled to the [0..1024] range // Incoming array size must be 256 int Global::computeFirstOrderEntropy1024(int blockLen, const uint histo[]) { if (blockLen == 0) return 0; uint64 sum = 0; const int logLength1024 = Global::log2_1024(uint32(blockLen)); for (int i = 0; i < 256; i++) { if (histo[i] == 0) continue; sum += ((uint64(histo[i]) * uint64(logLength1024 - Global::log2_1024(histo[i]))) >> 3); } return int(sum / uint64(blockLen)); } void Global::computeJobsPerTask(int jobsPerTask[], int jobs, int 
tasks) { if (jobs <= 0) throw std::invalid_argument("Invalid number of jobs provided"); if (tasks <= 0) throw std::invalid_argument("Invalid number of tasks provided"); int q = (jobs <= tasks) ? 1 : jobs / tasks; int r = (jobs <= tasks) ? 0 : jobs - q * tasks; for (int i = 0; i < tasks; i++) jobsPerTask[i] = q; int n = 0; while (r != 0) { jobsPerTask[n]++; r--; n++; } } Global::DataType Global::detectSimpleType(int count, const uint freqs0[]) { int sum = 0; for (int i = 0; i < 12; i++) sum += freqs0[int(DNA_SYMBOLS[i])]; if (sum > (count - count / 12)) return DNA; sum = 0; for (int i = 0; i < 20; i++) sum += freqs0[int(NUMERIC_SYMBOLS[i])]; if (sum == count) return NUMERIC; // Last symbol with padding '=' sum = (freqs0[0x3D] == 1) ? 1 : 0; for (int i = 0; i < 64; i++) sum += freqs0[int(BASE64_SYMBOLS[i])]; if (sum == count) return BASE64; sum = 0; for (int i = 0; i < 256; i += 8) { sum += (freqs0[i+0] > 0) ? 1 : 0; sum += (freqs0[i+1] > 0) ? 1 : 0; sum += (freqs0[i+2] > 0) ? 1 : 0; sum += (freqs0[i+3] > 0) ? 1 : 0; sum += (freqs0[i+4] > 0) ? 1 : 0; sum += (freqs0[i+5] > 0) ? 1 : 0; sum += (freqs0[i+6] > 0) ? 1 : 0; sum += (freqs0[i+7] > 0) ? 1 : 0; } if (sum == 256) return BIN; return (sum <= 4) ? SMALL_ALPHABET : UNDEFINED; } #if defined(WIN32) || defined(_WIN32) || defined(_WIN64) bool Global::isReservedName(string fileName) { transform(fileName.begin(), fileName.end(), fileName.begin(), ::toupper); return _singleton._reservedNames.find(fileName) != _singleton._reservedNames.end(); } #else bool Global::isReservedName(string) { return false; } #endif kanzi-cpp-2.5.2/src/Global.hpp000066400000000000000000000107461516423635400161350ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #pragma once #ifndef knz_Global #define knz_Global #include #include #include "types.hpp" namespace kanzi { class Global { public: enum DataType { UNDEFINED, TEXT, MULTIMEDIA, EXE, NUMERIC, BASE64, DNA, BIN, UTF8, SMALL_ALPHABET }; static int stretch(int d); // ln(x / (1 - x)) static int squash(int d); // 1 / (1 + e-x) (inverse of stretch) static int log2(uint32 x); // fast, integer rounded static int log2(uint64 x); // fast, integer rounded static int _log2(uint32 x); // same as log2 minus check on input value static int _log2(uint64 x); // same as log2 minus check on input value static int trailingZeros(uint32 x); static int trailingZeros(uint64 x); static int log2_1024(uint32 x); // slow, accurate to 1/1024th static void computeJobsPerTask(int jobsPerTask[], int jobs, int tasks); static int computeFirstOrderEntropy1024(int blockLen, const uint histo[]); static void computeHistogram(const byte block[], int end, uint freqs[], bool isOrder0=true, bool withTotal=false); static DataType detectSimpleType(int count, const uint histo[]); static bool isReservedName(std::string fileName); private: Global(); ~Global() {} static const Global _singleton; static const int LOG2_4096[257]; // 4096*Math.log2(x) static const int LOG2[256]; // int(Math.log2(x-1)) static int STRETCH[4096]; static int SQUASH[4096]; static char BASE64_SYMBOLS[]; static char DNA_SYMBOLS[]; static char NUMERIC_SYMBOLS[]; std::set _reservedNames; }; // return p = 1/(1 + exp(-d)), d scaled by 8 bits, p scaled by 12 bits inline int Global::squash(int d) { if (d >= 2048) return 4095; return (d <= -2048) ? 
0 : SQUASH[d + 2047]; } inline int Global::stretch(int d) { return STRETCH[d]; } // x cannot be 0 inline int Global::_log2(uint32 x) { #if defined(_MSC_VER) unsigned long res; _BitScanReverse(&res, x); return int(res); #elif defined(__GNUG__) || defined(__clang__) return 31 ^ __builtin_clz(x); #else int res = 0; if (x >= 1 << 16) { x >>= 16; res = 16; } if (x >= 1 << 8) { x >>= 8; res += 8; } return res + Global::LOG2[x - 1]; #endif } // x cannot be 0 inline int Global::_log2(uint64 x) { #if defined(_MSC_VER) && defined(_M_AMD64) unsigned long res; _BitScanReverse64(&res, x); return int(res); #elif defined(__GNUG__) || defined(__clang__) return 63 ^ __builtin_clzll(x); #else int res = 0; if (x >= uint64(1) << 32) { x >>= 32; res = 32; } if (x >= uint64(1) << 16) { x >>= 16; res += 16; } if (x >= uint64(1) << 8) { x >>= 8; res += 8; } return res + Global::LOG2[x - 1]; #endif } // x cannot be 0 inline int Global::trailingZeros(uint32 x) { #if defined(_MSC_VER) unsigned long res; _BitScanForward(&res, x); return int(res); #elif defined(__GNUG__) || defined(__clang__) return __builtin_ctz(x); #else return _log2(x & (~x + 1)); #endif } // x cannot be 0 inline int Global::trailingZeros(uint64 x) { #if defined(_MSC_VER) && defined(_M_AMD64) unsigned long res; _BitScanForward64(&res, x); return int(res); #elif defined(__GNUG__) || defined(__clang__) return __builtin_ctzll(x); #else return _log2(x & (~x + 1)); #endif } } #endif kanzi-cpp-2.5.2/src/InputBitStream.hpp000066400000000000000000000027411516423635400176430ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
// InputBitStream is the abstract interface used to read bits.
// It holds no state: concrete streams implement every operation.
class InputBitStream
{
public:
    // Returns 1 or 0
    virtual int readBit() = 0;

    // Length is the number of bits in [1..64]. Return the bits read as a long
    // Throws if the stream is closed.
    virtual uint64 readBits(uint length) = 0;

    // Read bits and put them in the byte array. Length is the number of bits
    // Return the number of bits read.
    // Throws if the stream is closed.
    virtual uint readBits(byte bits[], uint length) = 0;

    // Close the stream (see readBits and hasMoreToRead for the effect on
    // later calls)
    virtual void close() = 0;

    // Number of bits read
    virtual uint64 read() const = 0;

    // Return false when the bitstream is closed or the End-Of-Stream has been reached
    virtual bool hasMoreToRead() = 0;

    InputBitStream(){}

    // Virtual so that a stream can be destroyed through this interface.
    virtual ~InputBitStream(){}
};
// Listener is the abstract callback interface for events of type T.
// Implementations override processEvent(), which receives each event by
// const reference.
template <class T>
class Listener
{
public:
    Listener() {}

    // Called with the event to handle.
    virtual void processEvent(const T& evt) = 0;

    // Virtual so that a listener can be destroyed through this interface.
    virtual ~Listener() {}
};
*/ #pragma once #ifndef knz_Magic #define knz_Magic #include "Memory.hpp" namespace kanzi { struct Magic { static const uint NO_MAGIC = 0; static const uint JPG_MAGIC = 0xFFD8FFE0; static const uint GIF_MAGIC = 0x47494638; static const uint PDF_MAGIC = 0x25504446; static const uint ZIP_MAGIC = 0x504B0304; // Works for jar & office docs static const uint LZMA_MAGIC = 0x377ABCAF; // Works for 7z 37 7A BC AF 27 1C static const uint RAR_MAGIC = 0x52617221; // 52 61 72 21 1A 07 00 static const uint PNG_MAGIC = 0x89504E47; static const uint ELF_MAGIC = 0x7F454C46; static const uint MAC_MAGIC32 = 0xFEEDFACE; static const uint MAC_CIGAM32 = 0xCEFAEDFE; static const uint MAC_MAGIC64 = 0xFEEDFACF; static const uint MAC_CIGAM64 = 0xCFFAEDFE; static const uint ZSTD_MAGIC = 0x28B52FFD; static const uint BROTLI_MAGIC = 0x81CFB2CE; static const uint RIFF_MAGIC = 0x52494646; // WAV, AVI, WEBP static const uint CAB_MAGIC = 0x4D534346; static const uint FLAC_MAGIC = 0x664C6143; static const uint XZ_MAGIC = 0xFD377A58; // FD 37 7A 58 5A 00 static const uint KNZ_MAGIC = 0x4B414E5A; static const uint BZIP2_MAGIC = 0x425A68; static const uint MP3_ID3_MAGIC = 0x494433; static const uint GZIP_MAGIC = 0x1F8B; static const uint BMP_MAGIC = 0x424D; static const uint WIN_MAGIC = 0x4D5A; static const uint PBM_MAGIC = 0x5034; // bin only static const uint PGM_MAGIC = 0x5035; // bin only static const uint PPM_MAGIC = 0x5036; // bin only static uint getType(const byte src[]); static bool isCompressed(uint magic); static bool isMultimedia(uint magic); static bool isExecutable(uint magic); }; // 4 bytes must be readable in src inline uint Magic::getType(const byte src[]) { static const uint KEYS32[18] = { GIF_MAGIC, PDF_MAGIC, ZIP_MAGIC, LZMA_MAGIC, PNG_MAGIC, ELF_MAGIC, MAC_MAGIC32, MAC_CIGAM32, MAC_MAGIC64, MAC_CIGAM64, ZSTD_MAGIC, BROTLI_MAGIC, CAB_MAGIC, RIFF_MAGIC, FLAC_MAGIC, XZ_MAGIC, KNZ_MAGIC, RAR_MAGIC }; static const uint KEYS16[3] = { GZIP_MAGIC, BMP_MAGIC, WIN_MAGIC }; const uint key = 
uint(BigEndian::readInt32(&src[0])); if ((key & ~0x0F) == JPG_MAGIC) return key; if (((key >> 8) == BZIP2_MAGIC) || ((key >> 8) == MP3_ID3_MAGIC)) return key >> 8; const int n = sizeof(KEYS32) / sizeof(uint); for (int i = 0; i < n; i++) { if (key == KEYS32[i]) return key; } const uint key16 = key >> 16; for (int i = 0; i < 3; i++) { if (key16 == KEYS16[i]) return key16; } if ((key16 == PBM_MAGIC) || (key16 == PGM_MAGIC) || (key16 == PPM_MAGIC)) { const uint subkey = (key >> 8) & 0xFF; if ((subkey == 0x07) || (subkey == 0x0A) || (subkey == 0x0D) || (subkey == 0x20)) return key16; } return NO_MAGIC; } inline bool Magic::isCompressed(uint magic) { switch (magic) { case JPG_MAGIC: case GIF_MAGIC: case PNG_MAGIC: //case RIFF_MAGIC: may or may not be case LZMA_MAGIC: case ZSTD_MAGIC: case BROTLI_MAGIC: case CAB_MAGIC: case ZIP_MAGIC: case GZIP_MAGIC: case BZIP2_MAGIC: case FLAC_MAGIC: case MP3_ID3_MAGIC: case XZ_MAGIC: case KNZ_MAGIC: case RAR_MAGIC: return true; default: return false; } } inline bool Magic::isMultimedia(uint magic) { switch (magic) { case JPG_MAGIC: case GIF_MAGIC: case PNG_MAGIC: case RIFF_MAGIC: case FLAC_MAGIC: case MP3_ID3_MAGIC: case BMP_MAGIC: case PBM_MAGIC: case PGM_MAGIC: case PPM_MAGIC: return true; default: return false; } } inline bool Magic::isExecutable(uint magic) { switch (magic) { case ELF_MAGIC: case WIN_MAGIC: case MAC_MAGIC32: case MAC_CIGAM32: case MAC_MAGIC64: case MAC_CIGAM64: return true; default: return false; } } } #endif kanzi-cpp-2.5.2/src/Makefile000066400000000000000000000245741516423635400156700ustar00rootroot00000000000000#CC=clang #CXX=clang++ OBJ_DIR = obj TEST_OBJ_DIR=$(OBJ_DIR)/test APP=kanzi APP_STATIC=$(APP)_static APP_DYNAMIC=$(APP)_dynamic ifeq ($(TCMALLOC_ENABLED), 1) LDFLAGS += -lpthread -ltcmalloc_minimal else LDFLAGS += -lpthread endif ifeq ($(CONCURRENCY_DISABLED), 1) CONCURRENCY_FLAG = -DCONCURRENCY_DISABLED endif ifndef CXX_STD CXX_STD = c++17 endif ifndef C_STD C_STD = c11 endif ifeq ($(OS),Windows_NT) 
DETECTED_OS := Windows else DETECTED_OS := $(shell uname -s) endif CFLAGS += -c -std=$(C_STD) -Wall -Wextra -O3 -fPIC -pedantic -march=native ifeq ($(DETECTED_OS),Windows) CXXFLAGS += -c -std=$(CXX_STD) -Wall -Wextra -O3 -fomit-frame-pointer -fPIC -DNDEBUG -pedantic -march=native -fno-rtti $(CONCURRENCY_FLAG) else ARCH ?= $(shell uname -m) # Check for both x86_64 (Linux) and amd64 (FreeBSD) ifneq ($(filter x86_64 amd64,$(ARCH)),) #CXXFLAGS += -c -std=$(CXX_STD) -fsanitize=undefined -ftrapv -D_FORTIFY_SOURCE=3 -D_LIBCPP_HARDENING_MODE=_LIBCPP_HARDENING_MODE_FAST -fstrict-aliasing -Wall -Wextra -O3 -fomit-frame-pointer -fPIC -DNDEBUG -pedantic -march=native -fno-rtti $(CONCURRENCY_FLAG) CXXFLAGS += -c -std=$(CXX_STD) -fstrict-aliasing -Wall -Wextra -O3 -fomit-frame-pointer -fPIC -DNDEBUG -pedantic -march=native -fno-rtti $(CONCURRENCY_FLAG) else #CXXFLAGS += -c -std=$(CXX_STD) -fsanitize=signed-integer-overflow -ftrapv -D_FORTIFY_SOURCE=3 -Wall -Wextra -O3 -fPIC -DNDEBUG -pedantic -fno-rtti $(CONCURRENCY_FLAG) CXXFLAGS += -c -std=$(CXX_STD) -fstrict-aliasing -Wall -Wextra -Wpedantic -Wdeprecated -O3 -fPIC -DNDEBUG -pedantic -fno-rtti $(CONCURRENCY_FLAG) endif endif ifeq ($(DETECTED_OS),FreeBSD) STATIC_LINK_FLAGS := -static else ifeq ($(DETECTED_OS),Darwin) STATIC_LINK_FLAGS := else STATIC_LINK_FLAGS := -static-libstdc++ -static-libgcc endif LIB_COMMON_SOURCES=Global.cpp \ Event.cpp \ util/WallTimer.cpp \ entropy/EntropyUtils.cpp \ entropy/HuffmanCommon.cpp \ entropy/CMPredictor.cpp \ entropy/TPAQPredictor.cpp \ transform/AliasCodec.cpp \ transform/BWT.cpp \ transform/BWTS.cpp \ transform/DivSufSort.cpp \ transform/SBRT.cpp \ transform/BWTBlockCodec.cpp \ transform/LZCodec.cpp \ transform/FSDCodec.cpp \ transform/ROLZCodec.cpp \ transform/RLT.cpp \ transform/SRT.cpp \ transform/TextCodec.cpp \ transform/UTFCodec.cpp \ transform/EXECodec.cpp \ transform/ZRLT.cpp LIB_COMP_SOURCES=api/Compressor.cpp \ bitstream/DebugOutputBitStream.cpp \ 
bitstream/DefaultOutputBitStream.cpp \ io/CompressedOutputStream.cpp \ entropy/ANSRangeEncoder.cpp \ entropy/BinaryEntropyEncoder.cpp \ entropy/ExpGolombEncoder.cpp \ entropy/FPAQEncoder.cpp \ entropy/HuffmanEncoder.cpp \ entropy/RangeEncoder.cpp LIB_DECOMP_SOURCES=api/Decompressor.cpp \ bitstream/DebugInputBitStream.cpp \ bitstream/DefaultInputBitStream.cpp \ io/CompressedInputStream.cpp \ entropy/ANSRangeDecoder.cpp \ entropy/BinaryEntropyDecoder.cpp \ entropy/ExpGolombDecoder.cpp \ entropy/FPAQDecoder.cpp \ entropy/HuffmanDecoder.cpp \ entropy/RangeDecoder.cpp LIB_SOURCES=$(LIB_COMMON_SOURCES) $(LIB_COMP_SOURCES) $(LIB_DECOMP_SOURCES) # Define library object files LIB_OBJECTS=$(filter-out $(OBJ_DIR)/test/%.o,$(LIB_COMMON_OBJECTS) $(LIB_COMP_OBJECTS) $(LIB_DECOMP_OBJECTS)) TEST_SOURCES=test/TestEntropyCodec.cpp \ test/TestBWT.cpp \ test/TestCompressedStream.cpp \ test/TestDefaultBitStream.cpp \ test/TestFactories.cpp \ test/TestMalformedStream.cpp \ test/TestTransforms.cpp \ test/TestAPI.c APP_SOURCES=app/Kanzi.cpp \ app/InfoPrinter.cpp \ app/BlockCompressor.cpp \ app/BlockDecompressor.cpp SOURCES=$(LIB_SOURCES) $(APP_SOURCES) # Function to create object file paths, preserving the directory structure OBJ = $(OBJ_DIR)/$(patsubst %.cpp,%.o,$(1)) LIB_COMMON_OBJECTS=$(foreach src,$(LIB_COMMON_SOURCES),$(call OBJ,$(src))) LIB_COMP_OBJECTS=$(foreach src,$(LIB_COMP_SOURCES),$(call OBJ,$(src))) LIB_DECOMP_OBJECTS=$(foreach src,$(LIB_DECOMP_SOURCES),$(call OBJ,$(src))) LIB_OBJECTS=$(LIB_COMMON_OBJECTS) $(LIB_COMP_OBJECTS) $(LIB_DECOMP_OBJECTS) #TEST_OBJECTS=$(foreach src,$(TEST_SOURCES),$(call OBJ,$(src))) APP_OBJECTS=$(foreach src,$(APP_SOURCES),$(call OBJ,$(src))) OBJECTS=$(LIB_OBJECTS) $(APP_OBJECTS) $(TEST_OBJECTS) RPTS=$(SOURCES:.cpp=.optrpt) STATIC_LIB_SUFFIX := .a SHARED_LIB_SUFFIX := .so PROG_SUFFIX := # Default to -shared (Linux/GNU) SHARED_OPTION := -shared # Detect Windows or macOS ifeq ($(DETECTED_OS),Windows) STATIC_LIB_SUFFIX := .lib SHARED_LIB_SUFFIX := 
.dll PROG_SUFFIX := .exe SHARED_OPTION := -shared else ifeq ($(DETECTED_OS),Darwin) # macOS Specifics SHARED_LIB_SUFFIX := .dylib SHARED_OPTION := -dynamiclib endif STATIC_LIB := lib$(APP)$(STATIC_LIB_SUFFIX) SHARED_LIB := lib$(APP)$(SHARED_LIB_SUFFIX) STATIC_COMP_LIB := lib$(APP)comp$(STATIC_LIB_SUFFIX) STATIC_DECOMP_LIB := lib$(APP)decomp$(STATIC_LIB_SUFFIX) SHARED_COMP_LIB := lib$(APP)comp$(SHARED_LIB_SUFFIX) SHARED_DECOMP_LIB := lib$(APP)decomp$(SHARED_LIB_SUFFIX) all: lib test $(APP_STATIC) $(APP_DYNAMIC) libcomp: $(STATIC_COMP_LIB) $(SHARED_COMP_LIB) libdecomp: $(STATIC_DECOMP_LIB) $(SHARED_DECOMP_LIB) lib: $(STATIC_LIB) $(SHARED_LIB) # Create static libraries $(STATIC_LIB):$(LIB_OBJECTS) $(AR) cr ../lib/$@ $+ $(STATIC_COMP_LIB):$(LIB_COMP_OBJECTS) $(AR) cr ../lib/$@ $+ $(STATIC_DECOMP_LIB):$(LIB_DECOMP_OBJECTS) $(AR) cr ../lib/$@ $+ # Create shared libraries $(SHARED_LIB):$(LIB_OBJECTS) $(CXX) -o ../lib/$@ $(LDFLAGS) $(SHARED_OPTION) $+ $(SHARED_COMP_LIB):$(LIB_COMP_OBJECTS) $(CXX) -o ../lib/$@ $(LDFLAGS) $(SHARED_OPTION) $+ $(SHARED_DECOMP_LIB):$(LIB_DECOMP_OBJECTS) $(CXX) -o ../lib/$@ $(LDFLAGS) $(SHARED_OPTION) $+ # Targets for test executables testAPI: $(LIB_OBJECTS) $(TEST_OBJ_DIR)/TestAPI.o $(CXX) $^ -o ../bin/$@ $(LDFLAGS) testBWT: $(LIB_OBJECTS) $(TEST_OBJ_DIR)/TestBWT.o $(CXX) $^ -o ../bin/$@ $(LDFLAGS) testTransforms: $(LIB_OBJECTS) $(TEST_OBJ_DIR)/TestTransforms.o $(CXX) $^ -o ../bin/$@ $(LDFLAGS) testEntropyCodec: $(LIB_OBJECTS) $(TEST_OBJ_DIR)/TestEntropyCodec.o $(CXX) $^ -o ../bin/$@ $(LDFLAGS) testDefaultBitStream: $(LIB_OBJECTS) $(TEST_OBJ_DIR)/TestDefaultBitStream.o $(CXX) $^ -o ../bin/$@ $(LDFLAGS) testFactories: $(LIB_OBJECTS) $(TEST_OBJ_DIR)/TestFactories.o $(CXX) $^ -o ../bin/$@ $(LDFLAGS) testMalformedStream: $(LIB_OBJECTS) $(TEST_OBJ_DIR)/TestMalformedStream.o $(CXX) $^ -o ../bin/$@ $(LDFLAGS) testCompressedStream: $(LIB_OBJECTS) $(TEST_OBJ_DIR)/TestCompressedStream.o $(CXX) $^ -o ../bin/$@ $(LDFLAGS) test: testAPI testBWT 
testTransforms testEntropyCodec testDefaultBitStream testFactories testMalformedStream testCompressedStream # Default executable target kanzi: $(LIB_OBJECTS) $(APP_OBJECTS) $(CXX) $^ -o ../bin/$@ $(LDFLAGS) # Statically linked executable ifeq ($(DETECTED_OS),Darwin) # macOS does not support -static => disable static target kanzi_static: @echo "Static linking is not supported on macOS" else # Only add the flags if we aren't on macOS kanzi_static: LDFLAGS += $(STATIC_LINK_FLAGS) kanzi_static: $(LIB_OBJECTS) $(APP_OBJECTS) $(CXX) $^ -o ../bin/kanzi_static$(PROG_SUFFIX) $(LDFLAGS) endif # Dynamically linked executable kanzi_dynamic: LDFLAGS := $(filter-out -static,$(LDFLAGS)) kanzi_dynamic: $(LIB_OBJECTS) $(APP_OBJECTS) $(CXX) $^ -o ../bin/kanzi_dynamic$(PROG_SUFFIX) $(LDFLAGS) # Install / uninstall (may require sudo or admin rights) ifeq ($(DETECTED_OS),Windows) INSTALL_DIR=C:/Program\ Files else INSTALL_DIR=/usr/local MAN_DIR=$(INSTALL_DIR)/share/man/man1 endif print-vars: @echo "OS=$(DETECTED_OS)" @echo "APP=$(APP)" @echo "CXX=$(CXX)" @echo "CXXFLAGS=$(CXXFLAGS)" @echo "LDFLAGS=$(LDFLAGS)" @echo "INSTALL_DIR=$(INSTALL_DIR)" install: $(STATIC_LIB) $(SHARED_LIB) $(APP) ifeq ($(DETECTED_OS),Windows) copy /Y ..\bin\$(APP)$(PROG_SUFFIX) $(INSTALL_DIR) else install -d $(INSTALL_DIR)/lib install -m 644 ../lib/$(STATIC_LIB) $(INSTALL_DIR)/lib install -m 644 ../lib/$(SHARED_LIB) $(INSTALL_DIR)/lib install -d $(INSTALL_DIR)/include/kanzi install -m 644 ./*.hpp $(INSTALL_DIR)/include/kanzi install -d $(INSTALL_DIR)/include/kanzi/api install -m 644 ./api/*.hpp $(INSTALL_DIR)/include/kanzi/api install -d $(INSTALL_DIR)/include/kanzi/io install -m 644 ./io/*.hpp $(INSTALL_DIR)/include/kanzi/io install -d $(INSTALL_DIR)/include/kanzi/entropy install -m 644 ./entropy/*.hpp $(INSTALL_DIR)/include/kanzi/entropy install -d $(INSTALL_DIR)/include/kanzi/bitstream install -m 644 ./bitstream/*.hpp $(INSTALL_DIR)/include/kanzi/bitstream install -d $(INSTALL_DIR)/include/kanzi/transform 
install -m 644 ./transform/*.hpp $(INSTALL_DIR)/include/kanzi/transform install -d $(INSTALL_DIR)/include/kanzi/util install -m 644 ./util/*.hpp $(INSTALL_DIR)/include/kanzi/util install -d $(INSTALL_DIR)/bin install -m577 ../bin/$(APP)$(PROG_SUFFIX) $(INSTALL_DIR)/bin install -d $(MAN_DIR) if [ -f ../doc/kanzi.1.gz ]; then \ install -m 644 ../doc/kanzi.1.gz $(MAN_DIR)/$(APP).1.gz; \ elif [ -f ../doc/kanzi.1 ]; then \ gzip -n -c ../doc/kanzi.1 > $(MAN_DIR)/$(APP).1.gz; \ chmod 644 $(MAN_DIR)/$(APP).1.gz; \ else \ echo "Error: missing ../doc/kanzi.1.gz or ../doc/kanzi.1"; \ exit 1; \ fi endif # Uninstall may require sudo or admin rights uninstall: ifeq ($(DETECTED_OS),Windows) del /Q $(INSTALL_DIR)\$(APP)$(PROG_SUFFIX) else rm -f -r $(INSTALL_DIR)/include/$(APP) rm -f $(INSTALL_DIR)/lib/$(STATIC_LIB) rm -f $(INSTALL_DIR)/lib/$(SHARED_LIB) rm -f $(INSTALL_DIR)/bin/$(APP_STATIC)$(PROG_SUFFIX) rm -f $(INSTALL_DIR)/bin/$(APP_DYNAMIC)$(PROG_SUFFIX) rm -f $(INSTALL_DIR)/bin/$(APP)$(PROG_SUFFIX) rm -f $(MAN_DIR)/$(APP).1.gz endif clean: ifeq ($(DETECTED_OS),Windows) del /S $(OBJ_DIR)\*.o ..\bin\$(APP)$(PROG_SUFFIX) \ ..\bin\$(APP_STATIC)$(PROG_SUFFIX) \ ..\bin\$(APP_DYNAMIC)$(PROG_SUFFIX) ..\bin\test*$(PROG_SUFFIX) \ ..\lib\$(STATIC_LIB) ..\lib\$(SHARED_LIB) \ ..\lib\$(STATIC_COMP_LIB) ..\lib\$(STATIC_DECOMP_LIB) \ ..\lib\$(SHARED_COMP_LIB) ..\lib\$(SHARED_DECOMP_LIB) else rm -f ../bin/test*$(PROG_SUFFIX) $(OBJECTS) $(RPTS) \ ../bin/$(APP)$(PROG_SUFFIX) \ ../bin/$(APP_STATIC)$(PROG_SUFFIX) \ ../bin/$(APP_DYNAMIC)$(PROG_SUFFIX) \ ../lib/$(STATIC_LIB) ../lib/$(SHARED_LIB) \ ../lib/$(STATIC_COMP_LIB) ../lib/$(STATIC_DECOMP_LIB) \ ../lib/$(SHARED_COMP_LIB) ../lib/$(SHARED_DECOMP_LIB) rm -rf $(OBJ_DIR) endif ifeq ($(DETECTED_OS),Windows) MKDIR = if not exist $(subst /,\,$(dir $@)) mkdir $(subst /,\,$(dir $@)) else MKDIR = mkdir -p $(dir $@) endif $(OBJ_DIR)/%.o: %.cpp @$(MKDIR) $(CXX) $(CXXFLAGS) $< -o $@ $(OBJ_DIR)/%.o: %.c @$(MKDIR) $(CC) $(CFLAGS) $< -o $@ 
kanzi-cpp-2.5.2/src/Memory.hpp000066400000000000000000000125721516423635400162040ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #pragma once #ifndef knz_Memory #define knz_Memory #include #include "types.hpp" namespace kanzi { // Prefetch helpers static KANZI_ALWAYS_INLINE void prefetchRead(const void* ptr) { #if defined(__GNUG__) || defined(__clang__) __builtin_prefetch(ptr, 0, 1); #elif defined(__x86_64__) || defined(_M_AMD64) _mm_prefetch((const char*)ptr, _MM_HINT_T0); #elif defined(_M_ARM) __prefetch(ptr); #elif defined(_M_ARM64) __prefetch2(ptr, 1); #endif } static KANZI_ALWAYS_INLINE void prefetchWrite(const void* ptr) { #if defined(__GNUG__) || defined(__clang__) __builtin_prefetch(ptr, 1, 1); #elif defined(__x86_64__) || defined(_M_AMD64) _mm_prefetch((const char*)ptr, _MM_HINT_T0); #elif defined(_M_ARM) __prefetchw(ptr); #elif defined(_M_ARM64) __prefetch2(ptr, 17); #endif } // Byte-swap helpers static KANZI_ALWAYS_INLINE uint16 knz_bswap16(uint16 x) { #if defined(__clang__) || (defined(__GNUC__) && __GNUC__ >= 5) return __builtin_bswap16(x); #elif defined(_MSC_VER) return _byteswap_ushort(x); #else return (uint16)((x >> 8) | (x << 8)); #endif } static KANZI_ALWAYS_INLINE uint32 knz_bswap32(uint32 x) { #if defined(__clang__) || (defined(__GNUC__) && __GNUC__ >= 5) return __builtin_bswap32(x); #elif defined(_MSC_VER) return _byteswap_ulong(x); #else return ((x >> 24) | ((x >> 8) & 0xFF00) | ((x << 8) & 0xFF0000) | (x << 
24)); #endif } static KANZI_ALWAYS_INLINE uint64 knz_bswap64(uint64 x) { #if defined(__clang__) || (defined(__GNUC__) && __GNUC__ >= 5) return __builtin_bswap64(x); #elif defined(_MSC_VER) return _byteswap_uint64(x); #else x = ((x & 0xFFFFFFFF00000000ull) >> 32) | ((x & 0xFFFFFFFFull) << 32); x = ((x & 0xFFFF0000FFFF0000ull) >> 16) | ((x & 0xFFFF0000FFFFull) << 16); x = ((x & 0xFF00FF00FF00FF00ull) >> 8) | ((x & 0xFF00FF00FF00FFull) << 8); return x; #endif } #ifdef AGGRESSIVE_OPTIMIZATION // There be dragons! // User assumes responsibility for alignment and aliasing constraints. #define KANZI_MEM_EQ4(x, y) (*(const uint32*)(x) == *(const uint32*)(y)) #define KANZI_MEM_EQ8(x, y) (*(const uint64*)(x) == *(const uint64*)(y)) #else #define KANZI_MEM_EQ4(x, y) (std::memcmp((x), (y), 4) == 0) #define KANZI_MEM_EQ8(x, y) (std::memcmp((x), (y), 8) == 0) #endif // Detect host endianness #ifndef HOST_IS_LITTLE #if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) || defined(__BIG_ENDIAN__) #define HOST_IS_LITTLE 0 #else #define HOST_IS_LITTLE 1 #endif #endif template static KANZI_ALWAYS_INLINE T readEndian(const byte* p) { T val; #ifdef AGGRESSIVE_OPTIMIZATION val = *reinterpret_cast(p); // may be unaligned #else memcpy(&val, p, sizeof(T)); #endif // Swap if host and source endianness differ #if HOST_IS_LITTLE if (SourceIsBigEndian) { #else if (!SourceIsBigEndian) { #endif if (sizeof(T) == 2) val = (T)knz_bswap16((uint16)val); else if (sizeof(T) == 4) val = (T)knz_bswap32((uint32)val); else if (sizeof(T) == 8) val = (T)knz_bswap64((uint64)val); } return val; } template static KANZI_ALWAYS_INLINE void writeEndian(byte* p, T val) { #if HOST_IS_LITTLE if (TargetIsBigEndian) { #else if (!TargetIsBigEndian) { #endif if (sizeof(T) == 2) val = (T)knz_bswap16((uint16)val); else if (sizeof(T) == 4) val = (T)knz_bswap32((uint32)val); else if (sizeof(T) == 8) val = (T)knz_bswap64((uint64)val); } #ifdef AGGRESSIVE_OPTIMIZATION *reinterpret_cast(p) = val; #else 
memcpy(p, &val, sizeof(T)); #endif } class BigEndian { public: static int64 readLong64(const byte* p) { return readEndian(p); } static int32 readInt32(const byte* p) { return readEndian(p); } static int16 readInt16(const byte* p) { return readEndian(p); } static void writeLong64(byte* p, int64 v) { writeEndian(p, v); } static void writeInt32(byte* p, int32 v) { writeEndian(p, v); } static void writeInt16(byte* p, int16 v) { writeEndian(p, v); } }; class LittleEndian { public: static int64 readLong64(const byte* p) { return readEndian(p); } static int32 readInt32(const byte* p) { return readEndian(p); } static int16 readInt16(const byte* p) { return readEndian(p); } static void writeLong64(byte* p, int64 v) { writeEndian(p, v); } static void writeInt32(byte* p, int32 v) { writeEndian(p, v); } static void writeInt16(byte* p, int16 v) { writeEndian(p, v); } }; } // namespace kanzi #endif kanzi-cpp-2.5.2/src/OutputBitStream.hpp000066400000000000000000000027241516423635400200450ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #pragma once #ifndef knz_OutputBitStream #define knz_OutputBitStream #include "types.hpp" namespace kanzi { class OutputBitStream { public: // Write the least significant bit of the input integer // Throws if the stream is closed. virtual void writeBit(int bit) = 0; // Length is the number of bits in [1..64]. Return the number of bits written. // Throws if the stream is closed. 
virtual uint writeBits(uint64 bits, uint length) = 0; // Write bits ouf of the byte array. Length is the number of bits. // Return the number of bits written. // Throws if the stream is closed. virtual uint writeBits(const byte bits[], uint length) = 0; virtual void close() = 0; // Number of bits written virtual uint64 written() const = 0; OutputBitStream(){} virtual ~OutputBitStream(){} }; } #endif kanzi-cpp-2.5.2/src/OutputStream.hpp000066400000000000000000000013601516423635400174010ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #pragma once #ifndef knz_OutputStream #define knz_OutputStream #include namespace kanzi { // Maps to ostream typedef std::ostream OutputStream; } #endif kanzi-cpp-2.5.2/src/Predictor.hpp000066400000000000000000000022031516423635400166550ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once #ifndef knz_Predictor #define knz_Predictor namespace kanzi { // Predictor predicts the probability of the next bit being 1. class Predictor { public: Predictor(){} // Updates the internal probability model based on the observed bit virtual void update(int bit) = 0; // Returns the value representing the probability of the next bit being 1 // in the [0..4095] range. // E.G. 410 represents roughly a probability of 10% for 1 virtual int get() = 0; virtual ~Predictor(){} }; } #endif kanzi-cpp-2.5.2/src/Seekable.hpp000066400000000000000000000016751516423635400164510ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #pragma once #ifndef knz_Seekable #define knz_Seekable #include "types.hpp" namespace kanzi { class Seekable { public: Seekable(){} // return position in bits virtual int64 tell() = 0; // position in bits // return success or failure virtual bool seek(int64 position) = 0; virtual ~Seekable(){} }; } #endif kanzi-cpp-2.5.2/src/SliceArray.hpp000066400000000000000000000034551516423635400167720ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #pragma once #ifndef knz_SliceArray #define knz_SliceArray namespace kanzi { template class SliceArray { public: T* _array; int _length; // buffer length (a.k.a capacity) int _index; SliceArray(T* arr, int len, int index = 0) : _array(arr), _length(len), _index(index) {} #if __cplusplus < 201103L SliceArray(const SliceArray& sa) { _array = sa._array; _length = sa._length; _index = sa._index; } SliceArray& operator=(const SliceArray& sa); ~SliceArray(){} // does not deallocate buffer memory #else SliceArray(SliceArray&& sa) noexcept = default; SliceArray& operator=(SliceArray&& sa) noexcept = default; ~SliceArray() = default; #endif // Utility methods static bool isValid(const SliceArray& sa); }; template inline bool SliceArray::isValid(const SliceArray& sa) { return ((sa._array != nullptr) && (sa._index >= 0) && (sa._length >= 0) && (sa._index <= sa._length)); } #if __cplusplus < 201103L template inline SliceArray& SliceArray::operator=(const SliceArray& sa) { _array = sa._array; _length = sa._length; _index = sa._index; return *this; } #endif } #endif kanzi-cpp-2.5.2/src/Transform.hpp000066400000000000000000000026051516423635400167030ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #pragma once #ifndef knz_Transform #define knz_Transform #include "SliceArray.hpp" namespace kanzi { // Transform is a class used to transform an input byte array and write // the result to an output byte array. The result may have a different size. // The transform must be stateless to ensure that the compression results // are the same regardless of the number of jobs (ie no information is retained // between to invocations of forward or inverse). template class Transform { public: Transform(){} virtual bool forward(SliceArray& src, SliceArray& dst, int length) = 0; virtual bool inverse(SliceArray& src, SliceArray& dst, int length) = 0; virtual int getMaxEncodedLength(int srcLen) const = 0; virtual ~Transform(){} }; } #endif kanzi-cpp-2.5.2/src/api/000077500000000000000000000000001516423635400147655ustar00rootroot00000000000000kanzi-cpp-2.5.2/src/api/Compressor.cpp000066400000000000000000000232401516423635400176260ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #include #include "Compressor.hpp" #include "../types.hpp" #include "../Error.hpp" #include "../io/IOException.hpp" #include "../io/CompressedOutputStream.hpp" #include "../transform/TransformFactory.hpp" #include "../entropy/EntropyEncoderFactory.hpp" // Note stat64/lstat64 are deprecated on MacOS/Linux // Use _FILE_OFFSET_BITS and stat/lstat instead #ifdef _WIN32 #define FSTAT _fstat64 #define STAT _stat64 #else #define _FILE_OFFSET_BITS 64 #define FSTAT fstat #define STAT stat #endif #ifdef _MSC_VER #include #define FILENO(f) _fileno(f) #define READ(fd, buf, n) _read(fd, buf, uint(n)) #define WRITE(fd, buf, n) _write(fd, buf, uint(n)) #else #include #define FILENO(f) fileno(f) #define READ(fd, buf, n) read(fd, buf, n) #define WRITE(fd, buf, n) write(fd, buf, n) #endif using namespace std; using namespace kanzi; struct cContext { kanzi::CompressedOutputStream* pCos; size_t blockSize; void* fos; }; namespace kanzi { // Utility classes to map C FILEs to C++ streams class ofstreambuf FINAL : public streambuf { public: ofstreambuf(int fd) : _fd(fd), _buffer(65536) { // Initialize put pointers to the beginning of the buffer setp(&_buffer[0], &_buffer[0] + _buffer.size()); } virtual ~ofstreambuf() { // Call the non-virtual implementation directly instead of the virtual sync() flush(); } protected: // Called when the buffer is full virtual int_type overflow(int_type c) { if (flush() == EOF) return EOF; if (c != EOF) { *pptr() = char(c); pbump(1); } return c; } // Called for explicit sync/flush virtual int sync() { return (flush() == EOF) ? 
-1 : 0; } // Optimized block write virtual streamsize xsputn(const char* s, streamsize n) { streamsize remaining = n; const char* src = s; while (remaining > 0) { streamsize avail = epptr() - pptr(); if (avail >= remaining) { // Fits in current buffer memcpy(pptr(), src, remaining); pbump(int(remaining)); return n; } if (avail > 0) { // Fill the rest of the buffer memcpy(pptr(), src, avail); pbump(int(avail)); src += avail; remaining -= avail; } // Flush full buffer if (flush() == EOF) return n - remaining; // If the remaining chunk is large, write directly to FD to avoid double copy if (remaining >= streamsize(_buffer.size())) { streamsize toWrite = remaining; while (toWrite > 0) { const ptrdiff_t written = ptrdiff_t(WRITE(_fd, src, toWrite)); if (written <= 0) return n - remaining; // Error src += written; toWrite -= streamsize(written); } remaining = 0; } } return n; } private: int _fd; std::vector _buffer; int flush() { ptrdiff_t n = pptr() - pbase(); if (n > 0) { char* dst = pbase(); ptrdiff_t remaining = n; while (remaining > 0) { const ptrdiff_t written = ptrdiff_t(WRITE(_fd, dst, remaining)); if (written <= 0) return EOF; dst += written; remaining -= written; } pbump(-int(n)); // Reset pbump by subtracting the amount written } return 0; } }; class FileOutputStream FINAL : public ostream { private: ofstreambuf _buf; public: FileOutputStream(int fd) : ostream(nullptr), _buf(fd) { rdbuf(&_buf); } }; } // Create internal cContext and CompressedOutputStream KANZI_API int CDECL initCompressor(struct cData* pData, FILE* dst, struct cContext** pCtx) KANZI_NOEXCEPT { if ((pData == nullptr) || (pCtx == nullptr) || (dst == nullptr)) return Error::ERR_INVALID_PARAM; FileOutputStream* fos = nullptr; cContext* cctx = nullptr; try { // Process params const int fd = FILENO(dst); if (fd == -1) return Error::ERR_CREATE_COMPRESSOR; string transform = TransformFactory::getName(TransformFactory::getType(pData->transform)); string entropy = 
EntropyEncoderFactory::getName(EntropyEncoderFactory::getType(pData->entropy)); if ((transform.length() >= sizeof(pData->transform)) || (entropy.length() >= sizeof(pData->entropy))) { return Error::ERR_INVALID_PARAM; } memset(pData->transform, 0, sizeof(pData->transform)); strncpy(pData->transform, transform.c_str(), sizeof(pData->transform) - 1); memset(pData->entropy, 0, sizeof(pData->entropy)); strncpy(pData->entropy, entropy.c_str(), sizeof(pData->entropy) - 1); pData->blockSize = (pData->blockSize + 15) & -16; *pCtx = nullptr; size_t fileSize = 0; struct STAT sbuf; if (FSTAT(fd, &sbuf) == 0) { fileSize = size_t(sbuf.st_size); } // Create compression stream and update context fos = new FileOutputStream(fd); cctx = new cContext(); cctx->pCos = new CompressedOutputStream(*fos, pData->jobs, pData->entropy, pData->transform, int(pData->blockSize), pData->checksum, uint64(fileSize), #ifdef CONCURRENCY_ENABLED nullptr, #endif pData->headerless != 0); cctx->blockSize = pData->blockSize; cctx->fos = fos; *pCtx = cctx; } catch (const exception&) { if (fos != nullptr) delete fos; if (cctx != nullptr) delete cctx; return Error::ERR_CREATE_COMPRESSOR; } return 0; } KANZI_API int CDECL compress(struct cContext* pCtx, const unsigned char* src, size_t inSize, size_t* outSize) KANZI_NOEXCEPT { if ((pCtx == nullptr) || (outSize == nullptr)) { return Error::ERR_INVALID_PARAM; } if ((src == nullptr) && (inSize != 0)) { return Error::ERR_INVALID_PARAM; } if (inSize > size_t(pCtx->blockSize)) { return Error::ERR_INVALID_PARAM; } *outSize = 0; int res = 0; CompressedOutputStream* pCos = pCtx->pCos; if (pCos == nullptr) { return Error::ERR_INVALID_PARAM; } try { const uint64 w = pCos->getWritten(); pCos->write((const char*)src, streamsize(inSize)); res = pCos->good() ? 
0 : Error::ERR_WRITE_FILE; *outSize = int(pCos->getWritten() - w); } catch (const IOException& ioe) { return ioe.error(); } catch (const exception&) { return Error::ERR_UNKNOWN; } return res; } // Cleanup allocated internal data structures KANZI_API int CDECL disposeCompressor(struct cContext** ppCtx, size_t* outSize) KANZI_NOEXCEPT { if ((ppCtx == nullptr) || (*ppCtx == nullptr) || (outSize == nullptr)) return Error::ERR_INVALID_PARAM; *outSize = 0; cContext* pCtx = *ppCtx; CompressedOutputStream* pCos = pCtx->pCos; try { if (pCos != nullptr) { const uint64 w = pCos->getWritten(); pCos->close(); *outSize = int(pCos->getWritten() - w); delete pCos; pCos = nullptr; pCtx->pCos = nullptr; } if (pCtx->fos != nullptr) delete static_cast(pCtx->fos); pCtx->fos = nullptr; delete pCtx; *ppCtx = nullptr; } catch (const IOException& ioe) { if (pCos != nullptr) { delete pCos; pCos = nullptr; pCtx->pCos = nullptr; } if (pCtx->fos != nullptr) delete static_cast(pCtx->fos); delete pCtx; *ppCtx = nullptr; return ioe.error(); } catch (const exception&) { if (pCos != nullptr) { delete pCos; pCos = nullptr; pCtx->pCos = nullptr; } if (pCtx->fos != nullptr) delete static_cast(pCtx->fos); delete pCtx; *ppCtx = nullptr; return Error::ERR_UNKNOWN; } return 0; } KANZI_API unsigned int CDECL getCompressorVersion(void) KANZI_NOEXCEPT { return (KANZI_COMP_VERSION_MAJOR << 16) | (KANZI_COMP_VERSION_MINOR << 8) | KANZI_COMP_VERSION_PATCH; } kanzi-cpp-2.5.2/src/api/Compressor.hpp000066400000000000000000000076361516423635400176460ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #pragma once #ifndef knz_Compressor #define knz_Compressor #ifdef _WIN32 #define CDECL __cdecl #ifdef KANZI_EXPORTS #define KANZI_API __declspec(dllexport) #else #define KANZI_API #endif #else #define CDECL #define KANZI_API #endif #include #ifdef __cplusplus #if __cplusplus >= 201103L // C++ 11 or higher #define KANZI_NOEXCEPT noexcept #else #define KANZI_NOEXCEPT #endif #else #define KANZI_NOEXCEPT #endif #define KANZI_COMP_VERSION_MAJOR 1 #define KANZI_COMP_VERSION_MINOR 0 #define KANZI_COMP_VERSION_PATCH 0 #ifdef __cplusplus extern "C" { #endif /** * Compression context: encapsulates compressor state (opaque: could change in future versions) */ struct cContext; /** * Compression parameters */ struct cData { char transform[64]; /* name of transforms [None|PACK|BWT|BWTS|LZ|LZX|LZP|ROLZ|ROLZX] [RLT|ZRLT|MTFT|RANK|SRT|TEXT|MM|EXE|UTF|DNA] */ char entropy[16]; /* name of entropy codec [None|Huffman|ANS0|ANS1|Range|FPAQ|TPAQ|TPAQX|CM] */ size_t blockSize; /* size of block in bytes */ unsigned int jobs; /* max number of concurrent tasks */ int checksum; /* 0, 32 or 64 to indicate size of block checksum */ int headerless; /* bool to indicate if the bitstream has a header (usually set to 0) */ }; /** * @return the version number of the library. * Useful for checking for compatibility at runtime. */ KANZI_API unsigned int CDECL getCompressorVersion(void) KANZI_NOEXCEPT; /** * Initialize the compressor internal states. 
* * @param cParam [IN|OUT] - the compression parameters, transform and enropy are validated and rewritten * @param dst [IN] - the destination stream of compressed data * @param ctx [IN|OUT] - pointer to the compression context created by the call * * @return 0 in case of success, else see error code in Error.hpp */ KANZI_API int CDECL initCompressor(struct cData* cParam, FILE* dst, struct cContext** ctx) KANZI_NOEXCEPT; /** * Compress a block of data. The compressor must have been initialized. * * @param ctx [IN] - the compression context created during initialization * @param src [IN] - the source block of data to compress * @param inSize [IN] - the size of the source block to compress. * @param outSize [IN|OUT] - the size of the compressed data Updated to reflect the number bytes written to the destination. * * @return 0 in case of success, else see error code in Error.hpp */ KANZI_API int CDECL compress(struct cContext* ctx, const unsigned char* src, size_t inSize, size_t* outSize) KANZI_NOEXCEPT; /** * Dispose the compressor and cleanup memory resources. * * @param ctx [IN] - the compression context created during initialization * @param outSize [IN|OUT] - the number of bytes written to the destination * (the compressor may flush internal data) * * @return 0 in case of success, else see error code in Error.hpp */ KANZI_API int CDECL disposeCompressor(struct cContext** ctx, size_t* outSize) KANZI_NOEXCEPT; #ifdef __cplusplus } #endif #endif kanzi-cpp-2.5.2/src/api/Decompressor.cpp000066400000000000000000000206771516423635400201520ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "Decompressor.hpp" #include "../types.hpp" #include "../Error.hpp" #include "../io/IOException.hpp" #include "../io/CompressedInputStream.hpp" #include "../transform/TransformFactory.hpp" #include "../entropy/EntropyDecoderFactory.hpp" #ifdef _MSC_VER #include #define FILENO(f) _fileno(f) #define READ(fd, buf, n) _read(fd, buf, uint(n)) #define WRITE(fd, buf, n) _write(fd, buf, uint(n)) #else #include #define FILENO(f) fileno(f) #define READ(fd, buf, n) read(fd, buf, n) #define WRITE(fd, buf, n) write(fd, buf, n) #endif using namespace std; using namespace kanzi; /** * Decompression context: encapsulates decompressor state (opaque: could change in future versions) */ struct dContext { kanzi::CompressedInputStream* pCis; size_t bufferSize; void* fis; }; namespace kanzi { class ifstreambuf FINAL : public streambuf { public: ifstreambuf(int fd) : _fd(fd) { // gptr() = egptr() initially forces underflow() on first read setg(_buffer + 4, _buffer + 4, _buffer + 4); } private: static const int BUF_SIZE = 1024 + 4; int _fd; char _buffer[BUF_SIZE]; virtual int_type underflow() { if (gptr() < egptr()) return traits_type::to_int_type(*gptr()); // Preserve up to 4 characters for putback int putback = int(gptr() - eback()); if (putback > 4) putback = 4; // Only move putback if > 0 if (putback > 0) { std::memmove(_buffer + (4 - putback), gptr() - putback, putback); } // Read new data const int n = int(READ(_fd, _buffer + 4, BUF_SIZE - 4)); if (n <= 0) return EOF; // Reset get pointers: // eback = start of buffer (including putback area) // gptr = first new kanzi::byte // egptr = end of 
new data setg(_buffer + (4 - putback), _buffer + 4, _buffer + 4 + n); return traits_type::to_int_type(*gptr()); } }; class FileInputStream FINAL : public istream { private: ifstreambuf _buf; public: FileInputStream(int fd) : istream(nullptr), _buf(fd) { rdbuf(&_buf); } }; } // Create internal dContext and CompressedInputStream KANZI_API int CDECL initDecompressor(struct dData* pData, FILE* src, struct dContext** pCtx) KANZI_NOEXCEPT { if ((pData == nullptr) || (pCtx == nullptr) || (src == nullptr)) return Error::ERR_INVALID_PARAM; // Validate buffer size (sanity check against huge allocations, e.g., > 2GB) if (pData->bufferSize > size_t(2) * 1024 * 1024 * 1024) return Error::ERR_INVALID_PARAM; dContext* dctx = nullptr; FileInputStream* fis = nullptr; try { const int fd = FILENO(src); if (fd == -1) return Error::ERR_CREATE_DECOMPRESSOR; // Create decompression stream and context *pCtx = nullptr; fis = new FileInputStream(fd); dctx = new dContext(); dctx->pCis = nullptr; dctx->fis = nullptr; if (pData->headerless != 0) { // Headerless mode: process params string transform = TransformFactory::getName(TransformFactory::getType(pData->transform)); string entropy = EntropyDecoderFactory::getName(EntropyDecoderFactory::getType(pData->entropy)); // Validate sizes if ((transform.length() >= sizeof(pData->transform)) || (entropy.length() >= sizeof(pData->entropy))) { delete fis; delete dctx; return Error::ERR_INVALID_PARAM; } memset(pData->transform, 0, sizeof(pData->transform)); strncpy(pData->transform, transform.c_str(), sizeof(pData->transform) - 1); memset(pData->entropy, 0, sizeof(pData->entropy)); strncpy(pData->entropy, entropy.c_str(), sizeof(pData->entropy) - 1); pData->blockSize = (pData->blockSize + 15) & -16; dctx->pCis = new CompressedInputStream(*fis, pData->jobs, pData->entropy, pData->transform, pData->blockSize, pData->checksum, pData->originalSize, #ifdef CONCURRENCY_ENABLED nullptr, #endif true, pData->bsVersion); } else { dctx->pCis = new 
CompressedInputStream(*fis, pData->jobs); } dctx->bufferSize = pData->bufferSize; dctx->fis = fis; *pCtx = dctx; } catch (const exception&) { if (dctx != nullptr) { // pCis is managed by dctx, but might not be assigned yet if (dctx->pCis) delete dctx->pCis; delete dctx; } // fis is usually owned by pCis, but if pCis wasn't created, we delete it if (fis != nullptr && (dctx == nullptr || dctx->pCis == nullptr)) delete fis; return Error::ERR_CREATE_DECOMPRESSOR; } return 0; } KANZI_API int CDECL decompress(struct dContext* pCtx, unsigned char* dst, size_t* inSize, size_t* outSize) KANZI_NOEXCEPT { if ((pCtx == nullptr) || (outSize == nullptr)) { return Error::ERR_INVALID_PARAM; } if (*outSize > pCtx->bufferSize) { return Error::ERR_INVALID_PARAM; } if (*outSize == 0) return 0; if (dst == nullptr) { return Error::ERR_INVALID_PARAM; } if (inSize) *inSize = 0; CompressedInputStream* pCis = pCtx->pCis; if (pCis == nullptr) { *outSize = 0; return Error::ERR_INVALID_PARAM; } try { const uint64 r = pCis->getRead(); pCis->read((char*)dst, std::streamsize(*outSize)); if (!pCis->good() && !pCis->eof()) return Error::ERR_READ_FILE; if (inSize) *inSize = size_t(pCis->getRead() - r); *outSize = size_t(pCis->gcount()); } catch (const IOException& ioe) { *outSize = 0; return ioe.error(); } catch (const exception&) { *outSize = 0; return Error::ERR_UNKNOWN; } return 0; } // Cleanup allocated internal data structures KANZI_API int CDECL disposeDecompressor(struct dContext** ppCtx) KANZI_NOEXCEPT { if ((ppCtx == nullptr) || (*ppCtx == nullptr)) return Error::ERR_INVALID_PARAM; dContext* pCtx = *ppCtx; CompressedInputStream* pCis = static_cast(pCtx->pCis); try { if (pCis != nullptr) { pCis->close(); delete pCis; pCis = nullptr; pCtx->pCis = nullptr; } if (pCtx->fis != nullptr) delete (FileInputStream*)pCtx->fis; pCtx->fis = nullptr; delete pCtx; *ppCtx = nullptr; } catch (const IOException& ioe) { if (pCis != nullptr) { delete pCis; pCis = nullptr; pCtx->pCis = nullptr; } if (pCtx->fis 
!= nullptr) delete (FileInputStream*)pCtx->fis; pCtx->fis = nullptr; delete pCtx; *ppCtx = nullptr; return ioe.error(); } catch (const exception&) { if (pCis != nullptr) { delete pCis; pCtx->pCis = nullptr; } if (pCtx->fis != nullptr) delete (FileInputStream*)pCtx->fis; pCtx->fis = nullptr; delete pCtx; *ppCtx = nullptr; return Error::ERR_UNKNOWN; } return 0; } KANZI_API unsigned int CDECL getDecompressorVersion(void) KANZI_NOEXCEPT { return (KANZI_DECOMP_VERSION_MAJOR << 16) | (KANZI_DECOMP_VERSION_MINOR << 8) | KANZI_DECOMP_VERSION_PATCH; } kanzi-cpp-2.5.2/src/api/Decompressor.hpp000066400000000000000000000101521516423635400201420ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once #ifndef knz_Decompressor #define knz_Decompressor #ifdef _WIN32 #define CDECL __cdecl #ifdef KANZI_EXPORTS #define KANZI_API __declspec(dllexport) #else #define KANZI_API #endif #else #define CDECL #define KANZI_API #endif #include #ifdef __cplusplus #if __cplusplus >= 201103L // C++ 11 or higher #define KANZI_NOEXCEPT noexcept #else #define KANZI_NOEXCEPT #endif #else #define KANZI_NOEXCEPT #endif #define KANZI_DECOMP_VERSION_MAJOR 1 #define KANZI_DECOMP_VERSION_MINOR 0 #define KANZI_DECOMP_VERSION_PATCH 0 #ifdef __cplusplus extern "C" { #endif /** * Decompression context: encapsulates decompressor state (opaque: could change in future versions) */ struct dContext; /** * Decompression parameters */ struct dData { // Required fields size_t bufferSize; /* read buffer size (at least block size) */ unsigned int jobs; /* max number of concurrent tasks */ int headerless; /* bool to indicate if the bitstream has a header (usually set to 0) */ // Optional fields: only required if headerless is true char transform[64]; /* name of transforms [None|PACK|BWT|BWTS|LZ|LZX|LZP|ROLZ|ROLZX] [RLT|ZRLT|MTFT|RANK|SRT|TEXT|MM|EXE|UTF|DNA] */ char entropy[16]; /* name of entropy codec [None|Huffman|ANS0|ANS1|Range|FPAQ|TPAQ|TPAQX|CM] */ unsigned int blockSize; /* size of block in bytes */ size_t originalSize; /* size of original file in bytes */ int checksum; /* 0, 32 or 64 to indicate size of block checksum */ int bsVersion; /* version of the bitstream */ }; /** * @return the version number of the library. * Useful for checking for compatibility at runtime. */ KANZI_API unsigned int CDECL getDecompressorVersion(void) KANZI_NOEXCEPT; /** * Initialize the decompressor internal states. * * @param dParam [IN|OUT] - the decompression parameters. Transform and entropy are * validated and rewritten. 
* @param src [IN] - the source stream of compressed data * @param ctx [IN|OUT] - a pointer to the decompression context created by the call * * @return 0 in case of success, else see error code in Error.hpp */ KANZI_API int CDECL initDecompressor(struct dData* dParam, FILE* src, struct dContext** ctx) KANZI_NOEXCEPT; /** * Decompress a block of data. The decompressor must have been initialized. * * @param ctx [IN] - the decompression context created during initialization * @param dst [IN] - the destination block of decompressed data * @param inSize [OUT] - the number of bytes read from source. * @param outSize [IN|OUT] - the size of the block to decompress. * Updated to reflect the number of decompressed bytes * * @return 0 in case of success, else see error code in Error.hpp */ KANZI_API int CDECL decompress(struct dContext* ctx, unsigned char* dst, size_t* inSize, size_t* outSize) KANZI_NOEXCEPT; /** * Dispose the decompressor and cleanup memory resources. * * @param ctx [IN] - the compression context created during initialization * * @return 0 in case of success, else see error code in Error.hpp */ KANZI_API int CDECL disposeDecompressor(struct dContext** ctx) KANZI_NOEXCEPT; #ifdef __cplusplus } #endif #endif kanzi-cpp-2.5.2/src/api/kanzi.py000066400000000000000000000075061516423635400164630ustar00rootroot00000000000000import ctypes from kanzi_c_api import _lib, _libc, cContext_p, dContext_p, cData, dData class KanziError(RuntimeError): pass def _check(rc, msg): if rc != 0: raise KanziError(f"{msg} (error code {rc})") # ----------------------------------------------------------------------------- # Compressor # ----------------------------------------------------------------------------- class Compressor: def __init__( self, dst_path, transform=b"LZ", entropy=b"Huffman", block_size=1 << 20, jobs=1, checksum=0, headerless=0, ): self._file = _libc.fopen(dst_path.encode(), b"wb") if not self._file: raise OSError("fopen failed") self._ctx = cContext_p() params = 
cData() params.transform = transform params.entropy = entropy params.blockSize = block_size params.jobs = jobs params.checksum = checksum params.headerless = headerless rc = _lib.initCompressor( ctypes.byref(params), self._file, ctypes.byref(self._ctx), ) _check(rc, "initCompressor failed") def compress(self, data: bytes) -> int: src = (ctypes.c_ubyte * len(data)).from_buffer_copy(data) out_size = ctypes.c_size_t(0) rc = _lib.compress( self._ctx, src, len(data), ctypes.byref(out_size), ) _check(rc, "compress failed") return out_size.value def close(self) -> int: out_size = ctypes.c_size_t(0) rc = _lib.disposeCompressor( ctypes.byref(self._ctx), ctypes.byref(out_size), ) _check(rc, "disposeCompressor failed") _libc.fclose(self._file) self._file = None return out_size.value def __enter__(self): return self def __exit__(self, exc_type, exc, tb): self.close() # ----------------------------------------------------------------------------- # Decompressor # ----------------------------------------------------------------------------- class Decompressor: def __init__( self, src_path, buffer_size, jobs=1, headerless=0, **headerless_params, ): self._file = _libc.fopen(src_path.encode(), b"rb") if not self._file: raise OSError("fopen failed") self._ctx = dContext_p() params = dData() params.bufferSize = buffer_size params.jobs = jobs params.headerless = headerless if headerless: params.transform = headerless_params["transform"] params.entropy = headerless_params["entropy"] params.blockSize = headerless_params["blockSize"] params.originalSize = headerless_params["originalSize"] params.checksum = headerless_params["checksum"] params.bsVersion = headerless_params["bsVersion"] rc = _lib.initDecompressor( ctypes.byref(params), self._file, ctypes.byref(self._ctx), ) _check(rc, "initDecompressor failed") def decompress_block(self, max_output: int) -> bytes: dst = (ctypes.c_ubyte * max_output)() in_size = ctypes.c_size_t(0) out_size = ctypes.c_size_t(max_output) rc = _lib.decompress( 
self._ctx, dst, ctypes.byref(in_size), ctypes.byref(out_size), ) _check(rc, "decompress failed") return bytes(dst[: out_size.value]) def close(self): rc = _lib.disposeDecompressor(ctypes.byref(self._ctx)) _check(rc, "disposeDecompressor failed") _libc.fclose(self._file) def __enter__(self): return self def __exit__(self, exc_type, exc, tb): self.close() kanzi-cpp-2.5.2/src/api/kanzi_c_api.py000066400000000000000000000111201516423635400176010ustar00rootroot00000000000000import ctypes import sys import os # ----------------------------------------------------------------------------- # Platform detection # ----------------------------------------------------------------------------- if sys.platform.startswith("win"): KANZI_LIB_NAME = "kanzi.dll" LIBC_NAME = "msvcrt.dll" elif sys.platform == "darwin": KANZI_LIB_NAME = "libkanzi.dylib" LIBC_NAME = "libc.dylib" else: KANZI_LIB_NAME = "libkanzi.so" LIBC_NAME = "libc.so.6" # ----------------------------------------------------------------------------- # Load shared libraries # ----------------------------------------------------------------------------- _lib = ctypes.CDLL(KANZI_LIB_NAME) _libc = ctypes.CDLL(LIBC_NAME) # ----------------------------------------------------------------------------- # libc FILE* # ----------------------------------------------------------------------------- FILE_p = ctypes.c_void_p _libc.fopen.argtypes = [ctypes.c_char_p, ctypes.c_char_p] _libc.fopen.restype = FILE_p _libc.fclose.argtypes = [FILE_p] _libc.fclose.restype = ctypes.c_int # ----------------------------------------------------------------------------- # Opaque contexts # ----------------------------------------------------------------------------- class cContext(ctypes.Structure): pass class dContext(ctypes.Structure): pass cContext_p = ctypes.POINTER(cContext) dContext_p = ctypes.POINTER(dContext) # ----------------------------------------------------------------------------- # Compression parameters # 
----------------------------------------------------------------------------- class cData(ctypes.Structure): _fields_ = [ ("transform", ctypes.c_char * 64), ("entropy", ctypes.c_char * 16), ("blockSize", ctypes.c_size_t), ("jobs", ctypes.c_uint), ("checksum", ctypes.c_int), ("headerless", ctypes.c_int), ] class dData(ctypes.Structure): _fields_ = [ # Required ("bufferSize", ctypes.c_size_t), ("jobs", ctypes.c_uint), ("headerless", ctypes.c_int), # Headerless-only ("transform", ctypes.c_char * 64), ("entropy", ctypes.c_char * 16), ("blockSize", ctypes.c_uint), ("originalSize", ctypes.c_size_t), ("checksum", ctypes.c_int), ("bsVersion", ctypes.c_int), ] # ----------------------------------------------------------------------------- # Function prototypes - Compressor # ----------------------------------------------------------------------------- _lib.getCompressorVersion.argtypes = [] _lib.getCompressorVersion.restype = ctypes.c_uint _lib.initCompressor.argtypes = [ ctypes.POINTER(cData), FILE_p, ctypes.POINTER(cContext_p), ] _lib.initCompressor.restype = ctypes.c_int _lib.compress.argtypes = [ cContext_p, ctypes.POINTER(ctypes.c_ubyte), ctypes.c_size_t, ctypes.POINTER(ctypes.c_size_t), ] _lib.compress.restype = ctypes.c_int _lib.disposeCompressor.argtypes = [ ctypes.POINTER(cContext_p), ctypes.POINTER(ctypes.c_size_t), ] _lib.disposeCompressor.restype = ctypes.c_int # ----------------------------------------------------------------------------- # Function prototypes - Decompressor # ----------------------------------------------------------------------------- _lib.getDecompressorVersion.argtypes = [] _lib.getDecompressorVersion.restype = ctypes.c_uint _lib.initDecompressor.argtypes = [ ctypes.POINTER(dData), FILE_p, ctypes.POINTER(dContext_p), ] _lib.initDecompressor.restype = ctypes.c_int _lib.decompress.argtypes = [ dContext_p, ctypes.POINTER(ctypes.c_ubyte), ctypes.POINTER(ctypes.c_size_t), ctypes.POINTER(ctypes.c_size_t), ] _lib.decompress.restype = ctypes.c_int 
_lib.disposeDecompressor.argtypes = [ ctypes.POINTER(dContext_p), ] _lib.disposeDecompressor.restype = ctypes.c_int # ----------------------------------------------------------------------------- # Optional helpers (recommended for kanzi.py) # ----------------------------------------------------------------------------- def fopen(path: bytes, mode: bytes) -> FILE_p: return _libc.fopen(path, mode) def fclose(fp: FILE_p) -> int: return _libc.fclose(fp) # ----------------------------------------------------------------------------- # Public exports # ----------------------------------------------------------------------------- __all__ = [ # libraries "_lib", "_libc", # FILE* "FILE_p", "fopen", "fclose", # contexts "cContext", "dContext", "cContext_p", "dContext_p", # params "cData", "dData", # functions (via _lib) ] kanzi-cpp-2.5.2/src/app/000077500000000000000000000000001516423635400147745ustar00rootroot00000000000000kanzi-cpp-2.5.2/src/app/BlockCompressor.cpp000066400000000000000000000754701516423635400206240ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #include #include #include #include #include "BlockCompressor.hpp" #include "InfoPrinter.hpp" #include "../SliceArray.hpp" #include "../transform/TransformFactory.hpp" #include "../io/IOException.hpp" #include "../io/IOUtil.hpp" #include "../io/NullOutputStream.hpp" #include "../util/Clock.hpp" #include "../util/Printer.hpp" #ifdef CONCURRENCY_ENABLED #include #endif using namespace kanzi; using namespace std; const int BlockCompressor::DEFAULT_BLOCK_SIZE = 4 * 1024 * 1024; const int BlockCompressor::MIN_BLOCK_SIZE = 1024; const int BlockCompressor::MAX_BLOCK_SIZE = 1024 * 1024 * 1024; BlockCompressor::BlockCompressor(const Context& ctx) : _ctx(ctx) { int level = -1; if (_ctx.has("level") == true) { level = _ctx.getInt("level"); if ((level < 0) || (level > 9)) throw invalid_argument("Invalid compression level"); string tranformAndCodec[2]; getTransformAndCodec(level, tranformAndCodec); _transform = tranformAndCodec[0]; _codec = tranformAndCodec[1]; } else { if ((_ctx.has("transform") == false) && (_ctx.has("entropy") == false)) { // Default to level 3 string tranformAndCodec[2]; getTransformAndCodec(3, tranformAndCodec); _transform = tranformAndCodec[0]; _codec = tranformAndCodec[1]; } else { _codec = _ctx.getString("entropy", "NONE"); string strTransf = _ctx.getString("transform", "NONE"); // Extract transform names. Curate input (EG. NONE+NONE+xxxx => xxxx) _transform = TransformFactory::getName(TransformFactory::getType(strTransf.c_str())); } } _checksum = _ctx.getInt("checksum", 0); if ((_checksum != 0) && (_checksum != 32) && (_checksum != 64)) throw invalid_argument("Invalid block checksum size"); _ctx.putInt("checksum", _checksum); _ctx.putString("entropy", _codec); _ctx.putString("transform", _transform); _overwrite = _ctx.getInt("overwrite", 0) != 0; _ctx.putInt("overwrite", _overwrite ? 1 : 0); _skipBlocks = _ctx.getInt("skipBlocks", 0) != 0; _ctx.putInt("skipBlocks", _skipBlocks ? 
1 : 0); _verbosity = _ctx.getInt("verbosity", 1); _ctx.putInt("verbosity", _verbosity); _jobs = _ctx.getInt("jobs", 1); _ctx.putInt("jobs", _jobs); _noDotFiles = _ctx.getInt("noDotFiles", 0) != 0; _ctx.putInt("noDotFiles", _noDotFiles ? 1 : 0); _noLinks = _ctx.getInt("noLinks", 0) != 0; _ctx.putInt("noLinks", _noLinks ? 1 : 0); _autoBlockSize = _ctx.getInt("autoBlock", 0) != 0; _ctx.putInt("autoBlock", _autoBlockSize ? 1 : 0); _reorderFiles = _ctx.getInt("fileReorder", 0) != 0; _ctx.putInt("fileReorder", _reorderFiles ? 1 : 0); bool remove = _ctx.getInt("remove", 0) != 0; _ctx.putInt("remove", remove ? 1 : 0); if (_ctx.has("inputName") == false) throw invalid_argument("Missing input name"); _inputName = _ctx.getString("inputName") == "" ? "STDIN" : _ctx.getString("inputName"); if (Global::isReservedName(_inputName)) throw invalid_argument("'" + _inputName + "' is a reserved name"); if (_ctx.has("outputName") == false) throw invalid_argument("Missing output name"); string str = _ctx.getString("outputName"); _outputName = (str == "") && (_inputName == "STDIN") ? "STDOUT" : str; if (Global::isReservedName(_outputName)) throw invalid_argument("'" + _outputName + "' is a reserved name"); if (_ctx.has("blockSize") == false) { switch (level) { case 6: _blockSize = 2 * DEFAULT_BLOCK_SIZE; break; case 7: _blockSize = 4 * DEFAULT_BLOCK_SIZE; break; case 8: _blockSize = 4 * DEFAULT_BLOCK_SIZE; break; case 9: _blockSize = 8 * DEFAULT_BLOCK_SIZE; break; default: _blockSize = DEFAULT_BLOCK_SIZE; } _ctx.putInt("blockSize", _blockSize); } else { uint64 bl = _ctx.getLong("blockSize", DEFAULT_BLOCK_SIZE); if (bl < MIN_BLOCK_SIZE) { stringstream sserr; sserr << "Minimum block size is " << (MIN_BLOCK_SIZE / 1024) << " KiB ("; sserr << MIN_BLOCK_SIZE << " bytes), got " << bl; sserr << (bl > 1 ? 
" bytes" : " byte"); throw invalid_argument(sserr.str()); } if (bl > MAX_BLOCK_SIZE) { stringstream sserr; sserr << "Maximum block size is " << (MAX_BLOCK_SIZE / (1024 * 1024 * 1024)) << " GiB ("; sserr << MAX_BLOCK_SIZE << " bytes), got " << bl << " bytes"; throw invalid_argument(sserr.str()); } bl = (bl + 15) & ~uint64(15); _blockSize = int(min(bl, uint64(MAX_BLOCK_SIZE))); } } BlockCompressor::~BlockCompressor() { dispose(); _listeners.clear(); } int BlockCompressor::compress(uint64& outputSize) { vector files; Clock stopClock; int nbFiles = 1; Printer log(cout); stringstream ss; string upperInputName = _inputName; transform(upperInputName.begin(), upperInputName.end(), upperInputName.begin(), safeToUpper); bool isStdIn = upperInputName == "STDIN"; if (isStdIn == false) { vector errors; bool isRecursive = (_inputName.length() < 2) || (_inputName[_inputName.length() - 2] != PATH_SEPARATOR) || (_inputName[_inputName.length() - 1] != '.'); FileListConfig cfg = { isRecursive, _noLinks, false, _noDotFiles }; createFileList(_inputName, files, cfg, errors); if (errors.size() > 0) { for (size_t i = 0; i < errors.size(); i++) cerr << errors[i] << endl; return Error::ERR_OPEN_FILE; } if (files.size() == 0) { cerr << "Cannot find any file to compress" << endl; return Error::ERR_OPEN_FILE; } nbFiles = int(files.size()); string strFiles = nbFiles > 1 ? 
" files" : " file"; ss << nbFiles << strFiles << " to compress\n"; log.println(ss.str(), _verbosity > 0); ss.str(string()); } string upperOutputName = _outputName; transform(upperOutputName.begin(), upperOutputName.end(), upperOutputName.begin(), safeToUpper); bool isStdOut = upperOutputName == "STDOUT"; // Limit verbosity level when output is stdout // Logic is duplicated here to avoid dependency to Kanzi.cpp if (isStdOut == true) _verbosity = 0; // Limit verbosity level when files are processed concurrently if ((_verbosity > 1) && (_jobs > 1) && (nbFiles > 1)) { log.println("Warning: limiting verbosity to 1 due to concurrent processing of input files.\n", true); _verbosity = 1; } if (_verbosity > 2) { if (_autoBlockSize == true) ss << "Block size: 'auto'" << endl; else ss << "Block size: " << _blockSize << " bytes" << endl; ss << "Verbosity: " << _verbosity << endl; ss << "Overwrite: " << (_overwrite ? "true" : "false") << endl; string ckSize = "NONE"; if (_checksum == 32) ckSize = "32 bits"; else if (_checksum == 64) ckSize = "64 bits"; ss << "Block checksum: " << ckSize << endl; string etransform = _transform; transform(etransform.begin(), etransform.end(), etransform.begin(), safeToUpper); ss << "Using " << (etransform == "NONE" ? "no" : _transform) << " transform (stage 1)" << endl; string ecodec = _codec; transform(ecodec.begin(), ecodec.end(), ecodec.begin(), safeToUpper); ss << "Using " << (ecodec == "NONE" ? "no" : _codec) << " entropy codec (stage 2)" << endl; ss << "Using " << _jobs << " job" << (_jobs > 1 ? 
"s" : "") << endl; log.print(ss.str(), true); ss.str(string()); } InfoPrinter listener(_verbosity, InfoPrinter::COMPRESSION, cout); if (_verbosity > 2) addListener(listener); int res = 0; uint64 read = 0; uint64 written = 0; bool inputIsDir = false; string formattedOutName = _outputName; string formattedInName = _inputName; bool specialOutput = (isStdOut == true) || (upperOutputName == "NONE"); // Need to strip path separator at the end to make 'stat()' happy if ((formattedOutName.size() > 1) && (formattedOutName[formattedOutName.size() - 1] == PATH_SEPARATOR)) { formattedOutName.resize(formattedOutName.size() - 1); } if (isStdIn == false) { struct STAT buffer; if ((formattedInName.size() > 1) && (formattedInName[formattedInName.size() - 1] == PATH_SEPARATOR)) { formattedInName.resize(formattedInName.size() - 1); } if (STAT(formattedInName.c_str(), &buffer) != 0) { cerr << "Cannot access input file '" << formattedInName << "'" << endl; return Error::ERR_OPEN_FILE; } if ((buffer.st_mode & S_IFDIR) != 0) { inputIsDir = true; if ((formattedInName.size() != 0) && (formattedInName[formattedInName.size() - 1] == '.')) { formattedInName.resize(formattedInName.size() - 1); } if ((formattedInName.size() != 0) && (formattedInName[formattedInName.size() - 1] != PATH_SEPARATOR)) { formattedInName += PATH_SEPARATOR; } if ((formattedOutName.size() != 0) && (specialOutput == false)) { if (STAT(formattedOutName.c_str(), &buffer) != 0) { cerr << "Output must be an existing directory (or 'NONE')" << endl; return Error::ERR_OPEN_FILE; } if ((buffer.st_mode & S_IFDIR) == 0) { cerr << "Output must be a directory (or 'NONE')" << endl; return Error::ERR_CREATE_FILE; } formattedOutName += PATH_SEPARATOR; } } else { if ((formattedOutName.size() != 0) && (specialOutput == false)) { if ((STAT(formattedOutName.c_str(), &buffer) == 0) && ((buffer.st_mode & S_IFDIR) != 0)) { cerr << "Output must be a file (or 'NONE')" << endl; return Error::ERR_CREATE_FILE; } } } } #ifdef CONCURRENCY_ENABLED 
ThreadPool pool(_jobs + 1); // +1 to avoid deadlock due to thread exhaustion #endif _ctx.putInt("verbosity", _verbosity); // Run the task(s) if (nbFiles == 1) { string oName = formattedOutName; string iName = "STDIN"; if (isStdIn == true) { if (oName.length() == 0) { oName = "STDOUT"; } } else { iName = files[0].fullPath(); _ctx.putLong("fileSize", files[0]._size); // Set the block size to optimize compression ratio when possible if ((_autoBlockSize == true) && (_jobs > 0)) { const int64 bl = files[0]._size / _jobs; _blockSize = int(max(min((bl + 63) & ~63, int64(MAX_BLOCK_SIZE)), int64(MIN_BLOCK_SIZE))); _ctx.putInt("blockSize", _blockSize); } if (oName.length() == 0) { oName = iName + ".knz"; } else if ((inputIsDir == true) && (specialOutput == false)) { oName = formattedOutName + iName.substr(formattedInName.size()) + ".knz"; } } _ctx.putString("inputName", iName); _ctx.putString("outputName", oName); FileCompressTask task(_ctx, _listeners); FileCompressResult fcr = task.run(); res = fcr._code; read = fcr._read; written = fcr._written; if (res != 0) { cerr << fcr._errMsg << endl; } } else { vector*> tasks; #ifdef CONCURRENCY_ENABLED vector jobsPerTask(nbFiles); Global::computeJobsPerTask(jobsPerTask.data(), _jobs, nbFiles); #endif if (_reorderFiles == true) sortFilesByPathAndSize(files, true); // Create one task per file for (int i = 0; i < nbFiles; i++) { string oName = formattedOutName; string iName = files[i].fullPath(); if (oName.length() == 0) { oName = iName + ".knz"; } else if ((inputIsDir == true) && (specialOutput == false)) { oName = formattedOutName + iName.substr(formattedInName.size()) + ".knz"; } int blockSize = _blockSize; // Set the block size to optimize compression ratio when possible if ((_autoBlockSize == true) && (_jobs > 0)) { const int64 bl = files[i]._size / _jobs; blockSize = int(max(min((bl + 63) & ~63, int64(MAX_BLOCK_SIZE)), int64(MIN_BLOCK_SIZE))); } #ifdef CONCURRENCY_ENABLED Context taskCtx(_ctx, &pool); taskCtx.putInt("jobs", 
jobsPerTask[i]); #else Context taskCtx(_ctx); taskCtx.putInt("jobs", 1); #endif taskCtx.putLong("fileSize", files[i]._size); taskCtx.putString("inputName", iName); taskCtx.putString("outputName", oName); taskCtx.putInt("blockSize", blockSize); FileCompressTask* task = new FileCompressTask(taskCtx, _listeners); tasks.push_back(task); } bool doConcurrent = _jobs > 1; #ifdef CONCURRENCY_ENABLED if (doConcurrent) { vector*> workers; vector > results; BoundedConcurrentQueue queue(nbFiles, &tasks[0]); // !tasks.empty() // Create one worker per job and run it. A worker calls several tasks sequentially. for (int i = 0; i < _jobs; i++) { workers.push_back(new FileCompressWorker(&queue)); if (_ctx.getPool() == nullptr) results.push_back(async(launch::async, &FileCompressWorker::run, workers[i])); else results.push_back(_ctx.getPool()->schedule(&FileCompressWorker::run, workers[i])); } // Wait for results for (int i = 0; i < _jobs; i++) { FileCompressResult fcr = results[i].get(); res = fcr._code; read += fcr._read; written += fcr._written; if (res != 0) { cerr << fcr._errMsg << endl; // Exit early by telling the workers that the queue is empty queue.clear(); } } for (int i = 0; i < _jobs; i++) delete workers[i]; } #endif if (!doConcurrent) { for (uint i = 0; i < tasks.size(); i++) { FileCompressResult fcr = tasks[i]->run(); res = fcr._code; read += fcr._read; written += fcr._written; if (res != 0) { cerr << fcr._errMsg << endl; break; } } } for (int i = 0; i < nbFiles; i++) delete tasks[i]; } stopClock.stop(); if (nbFiles > 1) { if (_verbosity > 0) { double delta = stopClock.elapsed(); log.println("", true); ss.str(string()); ss << "Total compression time: "; if (delta >= 1e5) { ss.precision(1); ss.setf(ios::fixed); ss << (delta / 1000) << " s" << endl; } else { ss << int(delta) << " ms" << endl; } ss << "Total output size: " << written << (written > 1 ? 
" bytes" : " byte") << endl; if (read > 0) { ss.precision(6); ss << "Compression ratio: " << (float(written) / float(read)) << endl; } log.print(ss.str(), true); ss.str(string()); } } if (_verbosity > 2) removeListener(listener); outputSize += written; return res; } bool BlockCompressor::addListener(Listener& bl) { _listeners.push_back(&bl); return true; } bool BlockCompressor::removeListener(Listener& bl) { std::vector*>::iterator it = find(_listeners.begin(), _listeners.end(), &bl); if (it == _listeners.end()) return false; _listeners.erase(it); return true; } void BlockCompressor::notifyListeners(vector*>& listeners, const Event& evt) { for (size_t i = 0; i < listeners.size(); i++) listeners[i]->processEvent(evt); } void BlockCompressor::getTransformAndCodec(int level, string tranformAndCodec[2]) { switch (level) { case 0: tranformAndCodec[0] = "NONE"; tranformAndCodec[1] = "NONE"; break; case 1: tranformAndCodec[0] = "LZX"; tranformAndCodec[1] = "NONE"; break; case 2: tranformAndCodec[0] = "DNA+LZ"; tranformAndCodec[1] = "HUFFMAN"; break; case 3: tranformAndCodec[0] = "TEXT+UTF+PACK+MM+LZX"; tranformAndCodec[1] = "HUFFMAN"; break; case 4: tranformAndCodec[0] = "TEXT+UTF+EXE+PACK+MM+ROLZ"; tranformAndCodec[1] = "NONE"; break; case 5: tranformAndCodec[0] = "TEXT+UTF+BWT+RANK+ZRLT"; tranformAndCodec[1] = "ANS0"; break; case 6: tranformAndCodec[0] = "TEXT+UTF+BWT+SRT+ZRLT"; tranformAndCodec[1] = "FPAQ"; break; case 7: tranformAndCodec[0] = "LZP+TEXT+UTF+BWT+LZP"; tranformAndCodec[1] = "CM"; break; case 8: tranformAndCodec[0] = "EXE+RLT+TEXT+UTF+DNA"; tranformAndCodec[1] = "TPAQ"; break; case 9: tranformAndCodec[0] = "EXE+RLT+TEXT+UTF+DNA"; tranformAndCodec[1] = "TPAQX"; break; default: tranformAndCodec[0] = "Unknown"; tranformAndCodec[1] = "Unknown"; } } template FileCompressTask::FileCompressTask(const Context& ctx, vector*>& listeners) : _ctx(ctx) , _listeners(listeners) { _is = nullptr; _cos = nullptr; } template T FileCompressTask::run() { Printer log(cout); 
int verbosity = _ctx.getInt("verbosity"); string inputName = _ctx.getString("inputName"); string outputName = _ctx.getString("outputName"); stringstream ss; if (verbosity > 2) { ss << "Input file name: '" << inputName << "'" << endl; ss << "Output file name: '" << outputName << "'" << endl; log.print(ss.str(), true); ss.str(string()); } bool overwrite = _ctx.getInt("overwrite") != 0; OutputStream* os = nullptr; #define CLEANUP_COMP_OS if ((os != nullptr) && (os != &cout)) {\ delete os; \ os = nullptr; \ } try { string str = outputName; transform(str.begin(), str.end(), str.begin(), safeToUpper); if (str == "NONE") { os = new NullOutputStream(); } else if (str == "STDOUT") { os = &cout; } else { if (samePaths(inputName, outputName)) { stringstream sserr; sserr << "The input and output files must be different" << endl; return T(Error::ERR_CREATE_FILE, 0, 0, sserr.str()); } struct STAT buffer; string path = outputName; replace(path.begin(), path.end(), '\\', '/'); if (STAT(outputName.c_str(), &buffer) == 0) { if ((buffer.st_mode & S_IFDIR) != 0) { return T(Error::ERR_OUTPUT_IS_DIR, 0, 0, "The output file is a directory"); } if (overwrite == false) { stringstream sserr; sserr << "File '" << outputName << "' exists and the 'force' command " << "line option has not been provided"; return T(Error::ERR_OVERWRITE_FILE, 0, 0, sserr.str()); } // Delete output file to ensure consistent performance remove(outputName.c_str()); } os = new ofstream(outputName.c_str(), ofstream::out | ofstream::binary); if (!*os) { string errMsg; if (overwrite == true) { // Attempt to create the full folder hierarchy to file string parentDir = outputName; size_t idx = outputName.find_last_of(PATH_SEPARATOR); if (idx != string::npos) parentDir.resize(idx); int rmkd = mkdirAll(parentDir); if ((rmkd == 0) || (rmkd == EEXIST)) { delete os; os = new ofstream(outputName.c_str(), ofstream::out | ofstream::binary); } else { errMsg = strerror(rmkd); } } if (!*os) { delete os; stringstream sserr; sserr << 
"Cannot open output file '" << outputName << "' for writing"; if (errMsg != "") sserr << ": " << errMsg; return T(Error::ERR_CREATE_FILE, 0, 0, sserr.str()); } } } try { _cos = new CompressedOutputStream(*os, _ctx); for (uint i = 0; i < _listeners.size(); i++) _cos->addListener(*_listeners[i]); } catch (const invalid_argument& e) { CLEANUP_COMP_OS stringstream sserr; sserr << "Cannot create compressed stream: " << e.what(); return T(Error::ERR_CREATE_COMPRESSOR, 0, 0, sserr.str()); } } catch (const exception& e) { CLEANUP_COMP_OS stringstream sserr; sserr << "Cannot open output file '" << outputName << "' for writing: " << e.what(); return T(Error::ERR_CREATE_FILE, 0, 0, sserr.str()); } #define CLEANUP_COMP_IS if ((_is != nullptr) && (_is != &cin)) {\ delete _is; \ _is = nullptr; \ } try { string str = inputName; transform(str.begin(), str.end(), str.begin(), safeToUpper); if (str == "STDIN") { _is = &cin; } else { ifstream* ifs = new ifstream(inputName.c_str(), ifstream::in | ifstream::binary); if (!*ifs) { delete ifs; delete _cos; _cos = nullptr; stringstream sserr; sserr << "Cannot open input file '" << inputName << "'"; return T(Error::ERR_OPEN_FILE, 0, 0, sserr.str()); } _is = ifs; } } catch (const exception& e) { delete _cos; _cos = nullptr; CLEANUP_COMP_IS CLEANUP_COMP_OS stringstream sserr; sserr << "Cannot open input file '" << inputName << "': " << e.what(); return T(Error::ERR_OPEN_FILE, 0, 0, sserr.str()); } // Compress ss << "\nCompressing " << inputName << " ..."; log.println(ss.str(), verbosity > 1); log.println("\n", verbosity > 3); int64 read = 0; kanzi::byte* buf = new kanzi::byte[DEFAULT_BUFFER_SIZE]; SliceArray sa(buf, DEFAULT_BUFFER_SIZE, 0); WallTimer timer; if (_listeners.size() > 0) { int64 inputSize = _ctx.getLong("fileSize", -1); Event evt(Event::COMPRESSION_START, 0, inputSize, timer.getCurrentTime()); BlockCompressor::notifyListeners(_listeners, evt); } Clock stopClock; try { while (true) { int len; try { 
_is->read(reinterpret_cast(&sa._array[0]), sa._length); len = *_is ? sa._length : int(_is->gcount()); } catch (const exception& e) { CLEANUP_COMP_IS const uint64 w = _cos->getWritten(); delete[] buf; delete _cos; _cos = nullptr; CLEANUP_COMP_OS stringstream sserr; sserr << "Failed to read block from file '" << inputName << "': "; sserr << e.what() << endl; return T(Error::ERR_READ_FILE, read, w, sserr.str().c_str()); } if (len <= 0) break; // Just write block to the compressed output stream ! read += len; _cos->write(reinterpret_cast(&sa._array[0]), len); } } catch (const IOException& ioe) { const uint64 w = _cos->getWritten(); delete _cos; _cos = nullptr; CLEANUP_COMP_IS CLEANUP_COMP_OS delete[] buf; return T(ioe.error(), read, w, ioe.what()); } catch (const exception& e) { const uint64 w = _cos->getWritten(); delete _cos; _cos = nullptr; CLEANUP_COMP_IS CLEANUP_COMP_OS delete[] buf; stringstream sserr; sserr << "An unexpected condition happened. Exiting ..." << endl << e.what(); return T(Error::ERR_UNKNOWN, read, w, sserr.str()); } // Close compressed stream to ensure all data are flushed try { if (_cos != nullptr) _cos->close(); } catch (const IOException& ioe) { const uint64 w = (_cos != nullptr) ? _cos->getWritten() : 0; if (_cos != nullptr) { delete _cos; _cos = nullptr; } CLEANUP_COMP_OS CLEANUP_COMP_IS delete[] buf; return T(ioe.error(), read, w, ioe.what()); } catch (const exception& e) { const uint64 w = (_cos != nullptr) ? _cos->getWritten() : 0; if (_cos != nullptr) { delete _cos; _cos = nullptr; } CLEANUP_COMP_OS CLEANUP_COMP_IS delete[] buf; stringstream sserr; sserr << "Compression failure: " << e.what(); return T(Error::ERR_UNKNOWN, read, w, sserr.str()); } uint64 encoded = _cos->getWritten(); // Clean up resources at the end of the method as the task may be // recycled in a threadpool and the destructor not called. 
delete _cos; _cos = nullptr; // os destructor will call close if ofstream CLEANUP_COMP_OS CLEANUP_COMP_IS stopClock.stop(); double delta = stopClock.elapsed(); if (verbosity >= 1) { log.println("", (verbosity > 1) && (read > 0)); ss.str(string()); if (verbosity > 1) { if (delta >= 1e5) { ss.precision(1); ss.setf(ios::fixed); ss << "Compression time: " << (delta / 1000) << " s" << endl; } else { ss << "Compression time: " << int(delta) << " ms" << endl; } ss << "Input size: " << read << endl; ss << "Output size: " << encoded << endl; if (read > 0) { ss.precision(6); ss << "Compression ratio: " << (double(encoded) / double(read)) << endl; } log.print(ss.str(), true); ss.str(string()); } else { ss << "Compressed " << inputName << ": " << read << " => " << encoded; ss.precision(2); ss.setf(ios::fixed); if (read > 0) { const double r = double(encoded) / double(read); ss << " (" << (100 * r) << "%)"; } if (delta >= 1e5) { ss.precision(1); ss << " in " << (delta / 1000) << " s"; } else { ss << " in " << int(delta) << " ms"; } log.println(ss.str(), true); ss.str(string()); } if ((verbosity > 1) && (delta > 0) && (read > 0)) { double b2KiB = double(1000) / double(1024); ss << "Throughput (KiB/s): " << uint(double(read) * b2KiB / delta); log.println(ss.str(), true); ss.str(string()); } log.println("", verbosity > 1); } if (_listeners.size() > 0) { Event evt(Event::COMPRESSION_END, 0, int64(encoded), timer.getCurrentTime()); BlockCompressor::notifyListeners(_listeners, evt); } if (_ctx.getInt("remove", 0) != 0) { // Delete input file if (inputName == "STDIN") { log.println("Warning: ignoring remove option with STDIN", verbosity > 0); } else if (remove(inputName.c_str()) != 0) { log.println("Warning: input file could not be deleted", verbosity > 0); } } delete[] buf; return T(0, read, encoded, ""); } template FileCompressTask::~FileCompressTask() { dispose(); if (_cos != nullptr) { delete _cos; _cos = nullptr; } try { if ((_is != nullptr) && (_is != &cin)) { delete _is; } _is 
#ifdef CONCURRENCY_ENABLED
// Drain the shared queue: run tasks one after the other until the queue
// returns no task or a task reports a non-zero code.
// Returns the last task code, the accumulated read/written byte counts and
// the error message of the failing task (if any).
template <class T, class R>
R FileCompressWorker<T, R>::run()
{
    int code = 0;
    uint64 totalRead = 0;
    uint64 totalWritten = 0;
    string failure;

    do {
        T* next = _queue->get();

        // Nothing left to process (or the queue was cleared to stop early)
        if (next == nullptr)
            break;

        R outcome = (*next)->run();
        code = outcome._code;
        totalRead += outcome._read;
        totalWritten += outcome._written;

        if (code != 0)
            failure += outcome._errMsg;
    } while (code == 0);

    return R(code, totalRead, totalWritten, failure);
}
#endif
*/ #pragma once #ifndef knz_BlockCompressor #define knz_BlockCompressor #include #include #include "../InputStream.hpp" #include "../io/CompressedOutputStream.hpp" namespace kanzi { class FileCompressResult { public: int _code; uint64 _read; uint64 _written; std::string _errMsg; FileCompressResult() : _code(0) , _read(0) , _written(0) , _errMsg() { } FileCompressResult(int code, uint64 read, uint64 written, const std::string& errMsg) : _code(code) , _read(read) , _written(written) , _errMsg(errMsg) { } #if __cplusplus < 201103L FileCompressResult(const FileCompressResult& fcr) : _code(fcr._code) , _read(fcr._read) , _written(fcr._written) , _errMsg(fcr._errMsg) { } FileCompressResult& operator=(const FileCompressResult& fcr) { _errMsg = fcr._errMsg; _code = fcr._code; _read = fcr._read; _written = fcr._written; return *this; } ~FileCompressResult() {} #else FileCompressResult(const FileCompressResult& fdr) = delete; FileCompressResult& operator=(const FileCompressResult& fdr) = delete; FileCompressResult(FileCompressResult&& fdr) = default; FileCompressResult& operator=(FileCompressResult&& fdr) = default; ~FileCompressResult() = default; #endif }; #ifdef CONCURRENCY_ENABLED template class FileCompressWorker FINAL : public Task { public: FileCompressWorker(BoundedConcurrentQueue* queue) : _queue(queue) { } ~FileCompressWorker() {} R run(); private: BoundedConcurrentQueue* _queue; }; #endif template class FileCompressTask FINAL : public Task { public: static const int DEFAULT_BUFFER_SIZE = 65536; FileCompressTask(const Context& ctx, std::vector*>& listeners); ~FileCompressTask(); T run(); void dispose(); private: Context _ctx; InputStream* _is; CompressedOutputStream* _cos; std::vector*> _listeners; }; typedef FileCompressTask FCTask; class BlockCompressor { friend class FileCompressTask; public: BlockCompressor(const Context& ctx); ~BlockCompressor(); int compress(uint64& written); // Register a copy of the listener // Not thread safe bool addListener(Listener& 
bl); // Not thread safe bool removeListener(Listener& bl); void dispose() const {}; private: static const int DEFAULT_BLOCK_SIZE; static const int MIN_BLOCK_SIZE; static const int MAX_BLOCK_SIZE; int _verbosity; int _checksum; bool _overwrite; bool _skipBlocks; std::string _inputName; std::string _outputName; std::string _codec; std::string _transform; int _blockSize; bool _autoBlockSize; // derive block size from input size and jobs int _jobs; std::vector*> _listeners; bool _reorderFiles; bool _noDotFiles; bool _noLinks; Context _ctx; static void notifyListeners(std::vector*>& listeners, const Event& evt); static void getTransformAndCodec(int level, std::string tranformAndCodec[2]); }; } #endif kanzi-cpp-2.5.2/src/app/BlockDecompressor.cpp000066400000000000000000000661241516423635400211310ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include #include #include #include #include "BlockDecompressor.hpp" #include "InfoPrinter.hpp" #include "../Global.hpp" #include "../SliceArray.hpp" #include "../io/IOException.hpp" #include "../io/IOUtil.hpp" #include "../io/NullOutputStream.hpp" #include "../util/Clock.hpp" #include "../util/Printer.hpp" #ifdef CONCURRENCY_ENABLED #include #endif using namespace kanzi; using namespace std; BlockDecompressor::BlockDecompressor(const Context& ctx) : _ctx(ctx) { _blockSize = 0; _overwrite = _ctx.getInt("overwrite", 0) != 0; _ctx.putInt("overwrite", _overwrite ? 
1 : 0); _verbosity = _ctx.getInt("verbosity", 1); _ctx.putInt("verbosity", _verbosity); _jobs = _ctx.getInt("jobs", 1); _ctx.putInt("jobs", _jobs); bool remove = _ctx.getInt("remove", 0) != 0; _ctx.putInt("remove", remove ? 1 : 0); _noDotFiles = _ctx.getInt("noDotFiles", 0) != 0; _ctx.putInt("noDotFiles", _noDotFiles ? 1 : 0); _noLinks = _ctx.getInt("noLinks", 0) != 0; _ctx.putInt("noLinks", _noLinks ? 1 : 0); if (_ctx.has("inputName") == false) throw invalid_argument("Missing input name"); _inputName = _ctx.getString("inputName") == "" ? "STDIN" : _ctx.getString("inputName"); if (Global::isReservedName(_inputName)) throw invalid_argument("'" + _inputName + "' is a reserved name"); if (_ctx.has("outputName") == false) throw invalid_argument("Missing output name"); string str = _ctx.getString("outputName"); _outputName = (str == "") && (_inputName == "STDIN") ? "STDOUT" : str; if (Global::isReservedName(_outputName)) throw invalid_argument("'" + _outputName + "' is a reserved name"); } BlockDecompressor::~BlockDecompressor() { dispose(); _listeners.clear(); } int BlockDecompressor::decompress(uint64& inputSize) { vector files; uint64 read = 0; Clock stopClock; int nbFiles = 1; Printer log(cout); stringstream ss; string upperInputName = _inputName; transform(upperInputName.begin(), upperInputName.end(), upperInputName.begin(), safeToUpper); bool isStdIn = upperInputName == "STDIN"; // In mode "info", we want to display the information in the stream header only. 
// We can reuse the existing code but we need to: // create an InfoPrinter with a dedicated INFO type // disable logging outside of this printer (=> _verbosity=0) // decompress no block (=> _outputName = NONE and --from=1 and --to=1) // disable threading for proper display (=> _jobs=1) bool isInfo = _ctx.getString("mode") == "y"; int vl = _verbosity; if (isInfo) { _verbosity = 0; _jobs = 1; _outputName = "NONE"; _ctx.putString("outputName", _outputName); _ctx.putInt("from", 1); _ctx.putInt("to", 1); _ctx.putInt("jobs", 1); if (isStdIn == true) { cerr << "STDIN not allowed in 'info' mode" << endl; return Error::ERR_OPEN_FILE; } } if (isStdIn == false) { vector errors; bool isRecursive = (_inputName.length() < 2) || (_inputName[_inputName.length() - 2] != PATH_SEPARATOR) || (_inputName[_inputName.length() - 1] != '.'); FileListConfig cfg = { isRecursive, _noLinks, false, _noDotFiles }; createFileList(_inputName, files, cfg, errors); if (errors.size() > 0) { for (size_t i = 0; i < errors.size(); i++) cerr << errors[i] << endl; return Error::ERR_OPEN_FILE; } if (files.size() == 0) { cerr << "Cannot find any file to decompress" << endl; return Error::ERR_OPEN_FILE; } nbFiles = int(files.size()); string strFiles = nbFiles > 1 ? 
" files" : " file"; ss << nbFiles << strFiles << " to decompress\n"; log.println(ss.str(), _verbosity > 0); ss.str(string()); } string upperOutputName = _outputName; transform(upperOutputName.begin(), upperOutputName.end(), upperOutputName.begin(), safeToUpper); bool isStdOut = upperOutputName == "STDOUT"; // Limit verbosity level when output is stdout // Logic is duplicated here to avoid dependency to Kanzi.cpp if (isStdOut == true) _verbosity = 0; // Limit verbosity level when files are processed concurrently if ((_verbosity > 1) && (_jobs > 1) && (nbFiles > 1)) { log.println("Warning: limiting verbosity to 1 due to concurrent processing of input files.\n", true); _verbosity = 1; } if (_verbosity > 2) { ss << "Verbosity: " << _verbosity << endl; ss << "Overwrite: " << (_overwrite ? "true" : "false") << endl; ss << "Using " << _jobs << " job" << (_jobs > 1 ? "s" : "") << endl; log.print(ss.str(), true); ss.str(string()); } const int from = _ctx.getInt("from", 1); InfoPrinter::Type ipt = isInfo ? 
InfoPrinter::INFO : InfoPrinter::DECOMPRESSION; InfoPrinter listener(vl, ipt, cout, max(from, 1)); if ((vl > 2) || ((isInfo == true) && (vl > 0))) addListener(listener); int res = 0; bool inputIsDir = false; string formattedOutName = _outputName; string formattedInName = _inputName; bool specialOutput = (isStdOut == true) || (upperOutputName == "NONE"); // Need to strip path separator at the end to make 'stat()' happy if ((formattedOutName.size() > 1) && (formattedOutName[formattedOutName.size() - 1] == PATH_SEPARATOR)) { formattedOutName.resize(formattedOutName.size() - 1); } if (isStdIn == false) { struct STAT buffer; if ((formattedInName.size() > 1) && (formattedInName[formattedInName.size() - 1] == PATH_SEPARATOR)) { formattedInName.resize(formattedInName.size() - 1); } if (STAT(formattedInName.c_str(), &buffer) != 0) { cerr << "Cannot access input file '" << formattedInName << "'" << endl; return Error::ERR_OPEN_FILE; } if ((buffer.st_mode & S_IFDIR) != 0) { inputIsDir = true; if ((formattedInName.size() != 0) && (formattedInName[formattedInName.size() - 1] == '.')) { formattedInName.resize(formattedInName.size() - 1); } if ((formattedInName.size() != 0) && (formattedInName[formattedInName.size() - 1] != PATH_SEPARATOR)) { formattedInName += PATH_SEPARATOR; } if ((formattedOutName.size() != 0) && (specialOutput == false)) { if (STAT(formattedOutName.c_str(), &buffer) != 0) { cerr << "Output must be an existing directory (or 'NONE')" << endl; return Error::ERR_OPEN_FILE; } if ((buffer.st_mode & S_IFDIR) == 0) { cerr << "Output must be a directory (or 'NONE')" << endl; return Error::ERR_CREATE_FILE; } formattedOutName += PATH_SEPARATOR; } } else { inputIsDir = false; if ((formattedOutName.size() != 0) && (specialOutput == false)) { if ((STAT(formattedOutName.c_str(), &buffer) == 0) && ((buffer.st_mode & S_IFDIR) != 0)) { cerr << "Output must be a file (or 'NONE')" << endl; return Error::ERR_CREATE_FILE; } } } } _ctx.putInt("verbosity", _verbosity); #ifdef 
CONCURRENCY_ENABLED ThreadPool pool(_jobs + 1); // +1 to avoid deadlock due to thread exhaustion #endif // Run the task(s) if (nbFiles == 1) { string oName = formattedOutName; string iName = "STDIN"; if (isStdIn == true) { if (oName.length() == 0) { oName = "STDOUT"; } } else { iName = files[0].fullPath(); _ctx.putLong("fileSize", files[0]._size); string upperIName = iName; transform(upperIName.begin(), upperIName.end(), upperIName.begin(), safeToUpper); if (oName.length() == 0) { oName = iName; if ((upperIName.length() >= 4) && (upperIName.substr(upperIName.length() - 4) == ".KNZ")) oName.resize(oName.length() - 4); else oName = oName + ".bak"; } else if ((inputIsDir == true) && (specialOutput == false)) { oName = formattedOutName + iName.substr(formattedInName.size()); if ((upperIName.length() >= 4) && (upperIName.substr(upperIName.length() - 4) == ".KNZ")) oName.resize(oName.length() - 4); else oName = oName + ".bak"; } } _ctx.putString("inputName", iName); _ctx.putString("outputName", oName); FileDecompressTask task(_ctx, _listeners); FileDecompressResult fdr = task.run(); res = fdr._code; read = fdr._read; if (res != 0) { cerr << fdr._errMsg << endl; } } else { vector*> tasks; #ifdef CONCURRENCY_ENABLED vector jobsPerTask(nbFiles); Global::computeJobsPerTask(jobsPerTask.data(), _jobs, nbFiles); #endif sortFilesByPathAndSize(files, true); // Create one task per file for (int i = 0; i < nbFiles; i++) { string oName = formattedOutName; string iName = files[i].fullPath(); upperInputName = iName; transform(upperInputName.begin(), upperInputName.end(), upperInputName.begin(), safeToUpper); if (oName.length() == 0) { oName = iName; if ((upperInputName.length() >= 4) && (upperInputName.substr(upperInputName.length() - 4) == ".KNZ")) oName.resize(oName.length() - 4); else oName = oName + ".bak"; } else if ((inputIsDir == true) && (specialOutput == false)) { oName = formattedOutName + iName.substr(formattedInName.size()); if ((upperInputName.length() >= 4) && 
(upperInputName.substr(upperInputName.length() - 4) == ".KNZ")) oName.resize(oName.length() - 4); else oName = oName + ".bak"; } #ifdef CONCURRENCY_ENABLED Context taskCtx(_ctx, &pool); taskCtx.putInt("jobs", jobsPerTask[i]); #else Context taskCtx(_ctx); taskCtx.putInt("jobs", 1); #endif taskCtx.putLong("fileSize", files[i]._size); taskCtx.putString("inputName", iName); taskCtx.putString("outputName", oName); FileDecompressTask* task = new FileDecompressTask(taskCtx, _listeners); tasks.push_back(task); } bool doConcurrent = _jobs > 1; #ifdef CONCURRENCY_ENABLED if (doConcurrent) { vector*> workers; vector > results; BoundedConcurrentQueue queue(nbFiles, &tasks[0]); // Create one worker per job and run it. A worker calls several tasks sequentially. for (int i = 0; i < _jobs; i++) { workers.push_back(new FileDecompressWorker*, FileDecompressResult>(&queue)); if (_ctx.getPool() == nullptr) results.push_back(async(launch::async, &FileDecompressWorker::run, workers[i])); else results.push_back(_ctx.getPool()->schedule(&FileDecompressWorker::run, workers[i])); } // Wait for results for (int i = 0; i < _jobs; i++) { FileDecompressResult fdr = results[i].get(); res = fdr._code; read += fdr._read; if (res != 0) { cerr << fdr._errMsg << endl; // Exit early by telling the workers that the queue is empty queue.clear(); } } for (int i = 0; i < _jobs; i++) delete workers[i]; } #endif if (!doConcurrent) { for (uint i = 0; i < tasks.size(); i++) { FileDecompressResult fdr = tasks[i]->run(); res = fdr._code; read += fdr._read; if (res != 0) { cerr << fdr._errMsg << endl; break; } } } for (int i = 0; i < nbFiles; i++) delete tasks[i]; } stopClock.stop(); if ((nbFiles > 1) && (_verbosity > 0)) { double delta = stopClock.elapsed(); log.println("", true); ss << "Total decompression time: "; if (delta >= 1e5) { ss.precision(1); ss.setf(ios::fixed); ss << (delta / 1000) << " s" << endl; } else { ss << int(delta) << " ms" << endl; } ss << "Total output size: " << read << (read > 1 ? 
" bytes" : " byte") << endl; log.print(ss.str(), true); ss.str(string()); } if (_verbosity > 2) removeListener(listener); inputSize += read; return res; } bool BlockDecompressor::addListener(Listener& bl) { _listeners.push_back(&bl); return true; } bool BlockDecompressor::removeListener(Listener& bl) { std::vector*>::iterator it = find(_listeners.begin(), _listeners.end(), &bl); if (it == _listeners.end()) return false; _listeners.erase(it); return true; } void BlockDecompressor::notifyListeners(vector*>& listeners, const Event& evt) { for (size_t i = 0; i < listeners.size(); i++) listeners[i]->processEvent(evt); } template FileDecompressTask::FileDecompressTask(const Context& ctx, vector*>& listeners) : _ctx(ctx) , _listeners(listeners) { _os = nullptr; _cis = nullptr; } template FileDecompressTask::~FileDecompressTask() { dispose(); if (_cis != nullptr) { delete _cis; _cis = nullptr; } try { if ((_os != nullptr) && (_os != &cout)) { delete _os; } _os = nullptr; } catch (const exception&) { // Ignore: best effort } } template T FileDecompressTask::run() { Printer log(cout); int verbosity = _ctx.getInt("verbosity"); string inputName = _ctx.getString("inputName"); string outputName = _ctx.getString("outputName"); stringstream ss; if (verbosity > 2) { ss << "Input file name: '" << inputName << "'" << endl; ss << "Output file name: '" << outputName << "'" << endl; log.print(ss.str(), true); ss.str(string()); } bool overwrite = _ctx.getInt("overwrite") != 0; int64 read = 0; ss << "\nDecompressing " << inputName << " ..."; log.println(ss.str(), verbosity > 1); log.println("\n", verbosity > 3); WallTimer timer; if (_listeners.size() > 0) { Event evt(Event::DECOMPRESSION_START, 0, int64(0), timer.getCurrentTime()); BlockDecompressor::notifyListeners(_listeners, evt); } string str = outputName; transform(str.begin(), str.end(), str.begin(), safeToUpper); #if defined(WIN32) || defined(_WIN32) || defined(_WIN64) bool checkOutputSize = str != "NUL"; #else bool checkOutputSize 
= str != "/DEV/NULL"; #endif #define CLEANUP_DECOMP_OS if ((_os != nullptr) && (_os != &cout)) { \ delete _os; \ _os = nullptr; \ } if (str == "NONE") { _os = new NullOutputStream(); checkOutputSize = false; } else if (str == "STDOUT") { _os = &cout; checkOutputSize = false; } else { try { if (samePaths(inputName, outputName)) { stringstream sserr; sserr << "The input and output files must be different"; return T(Error::ERR_CREATE_FILE, 0, sserr.str()); } struct STAT buffer; if (STAT(outputName.c_str(), &buffer) == 0) { if ((buffer.st_mode & S_IFDIR) != 0) { stringstream sserr; sserr << "The output file is a directory"; return T(Error::ERR_OUTPUT_IS_DIR, 0, sserr.str()); } if (overwrite == false) { stringstream sserr; sserr << "File '" << outputName << "' exists and the 'force' command " << "line option has not been provided"; return T(Error::ERR_OVERWRITE_FILE, 0, sserr.str()); } // Delete output file to ensure consistent performance remove(outputName.c_str()); } ofstream* ofs = new ofstream(outputName.c_str(), ofstream::out | ofstream::binary); if (!*ofs) { string errMsg; if (overwrite == true) { // Attempt to create the full folder hierarchy to file string parentDir = outputName; size_t idx = outputName.find_last_of(PATH_SEPARATOR); if (idx != string::npos) parentDir.resize(idx); int rmkd = mkdirAll(parentDir); if ((rmkd == 0) || (rmkd == EEXIST)) { delete ofs; ofs = new ofstream(outputName.c_str(), ofstream::binary); } else { errMsg = strerror(rmkd); } } if (!*ofs) { delete ofs; stringstream sserr; sserr << "Cannot open output file '" << outputName << "' for writing"; if (errMsg != "") sserr << ": " << errMsg; return T(Error::ERR_CREATE_FILE, 0, sserr.str()); } } _os = ofs; } catch (const exception& e) { stringstream sserr; sserr << "Cannot open output file '" << outputName << "' for writing: " << e.what(); return T(Error::ERR_CREATE_FILE, 0, sserr.str()); } } InputStream* is = nullptr; #define CLEANUP_DECOMP_IS if ((is != nullptr) && (is != &cin)) { \ delete 
is; \ is = nullptr; \ } try { str = inputName; transform(str.begin(), str.end(), str.begin(), safeToUpper); if (str == "STDIN") { is = &cin; } else { ifstream* ifs = new ifstream(inputName.c_str(), ifstream::in | ifstream::binary); if (!*ifs) { delete ifs; CLEANUP_DECOMP_OS stringstream sserr; sserr << "Cannot open input file '" << inputName << "'"; return T(Error::ERR_OPEN_FILE, 0, sserr.str()); } is = ifs; } try { _cis = new CompressedInputStream(*is, _ctx); for (uint i = 0; i < _listeners.size(); i++) _cis->addListener(*_listeners[i]); } catch (const invalid_argument& e) { CLEANUP_DECOMP_IS CLEANUP_DECOMP_OS stringstream sserr; sserr << "Cannot create compressed stream: " << e.what(); return T(Error::ERR_CREATE_DECOMPRESSOR, 0, sserr.str()); } } catch (const exception& e) { CLEANUP_DECOMP_IS CLEANUP_DECOMP_OS stringstream sserr; sserr << "Cannot open input file '" << inputName << "': " << e.what(); const uint64 readSoFar = (_cis != nullptr) ? _cis->getRead() : 0; return T(Error::ERR_OPEN_FILE, readSoFar, sserr.str()); } Clock stopClock; kanzi::byte* buf = new kanzi::byte[DEFAULT_BUFFER_SIZE]; try { SliceArray sa(buf, DEFAULT_BUFFER_SIZE, 0); int decoded = 0; // Decode next block do { _cis->read(reinterpret_cast(&sa._array[0]), sa._length); decoded = int(_cis->gcount()); if (decoded < 0) { dispose(); const uint64 d = _cis->getRead(); delete _cis; _cis = nullptr; CLEANUP_DECOMP_IS CLEANUP_DECOMP_OS delete[] buf; stringstream sserr; sserr << "Reached end of stream"; return T(Error::ERR_READ_FILE, d, sserr.str()); } try { if (decoded > 0) { _os->write(reinterpret_cast(&sa._array[0]), decoded); read += decoded; } } catch (const exception& e) { dispose(); const uint64 d = _cis->getRead(); delete _cis; _cis = nullptr; CLEANUP_DECOMP_IS CLEANUP_DECOMP_OS delete[] buf; stringstream sserr; sserr << "Failed to write decompressed block to file '" << outputName << "': " << e.what(); return T(Error::ERR_WRITE_FILE, d, sserr.str()); } } while (_cis->eof() == 0); } catch (const 
IOException& e) { dispose(); const uint64 d = _cis->getRead(); bool isEOF = _cis->eof(); delete _cis; _cis = nullptr; CLEANUP_DECOMP_IS CLEANUP_DECOMP_OS delete[] buf; if (isEOF == true) return T(Error::ERR_READ_FILE, d, "Reached end of stream"); stringstream sserr; sserr << e.what(); return T(e.error(), d, sserr.str()); } catch (const exception& e) { dispose(); const uint64 d = _cis->getRead(); bool isEOF = _cis->eof(); delete _cis; _cis = nullptr; CLEANUP_DECOMP_IS CLEANUP_DECOMP_OS delete[] buf; if (isEOF == true) return T(Error::ERR_READ_FILE, d, "Reached end of stream"); stringstream sserr; sserr << "An unexpected condition happened. Exiting ..." << endl << e.what(); return T(Error::ERR_UNKNOWN, d, sserr.str()); } // Close streams to ensure all data are flushed dispose(); const uint64 decoded = _cis->getRead(); const uint64 written = (checkOutputSize == true) ? uint64(_os->tellp()) : 0; // Clean up resources at the end of the method as the task may be // recycled in a threadpool and the destructor not called. delete _cis; _cis = nullptr; // is destructor will call close if ifstream CLEANUP_DECOMP_IS CLEANUP_DECOMP_OS // If the whole input stream has been decoded and the original data size is present, // check that the output size matches the original data size. 
if ((checkOutputSize == true) && (_ctx.has("to") == false) && (_ctx.has("from") == false)) { const uint64 outputSize = _ctx.getLong("outputSize", 0); if ((outputSize != 0) && (written != outputSize)) { delete[] buf; stringstream sserr; sserr << "Corrupted bitstream: invalid output size (expected " << outputSize; sserr << ", got " << written << ")"; return T(Error::ERR_INVALID_FILE, decoded, sserr.str()); } } stopClock.stop(); double delta = stopClock.elapsed(); if (verbosity >= 1) { log.println("", verbosity > 1); ss.str(string()); if (verbosity > 1) { if (delta >= 1e5) { ss.precision(1); ss.setf(ios::fixed); ss << "Decompression time: " << (delta / 1000) << " s" << endl; } else { ss << "Decompression time: " << int(delta) << " ms" << endl; } ss << "Input size: " << decoded << endl; ss << "Output size: " << read << endl; log.print(ss.str(), true); ss.str(string()); } if (verbosity == 1) { ss << "Decompressed " << inputName << ": " << decoded << " => " << read; if (delta >= 1e5) { ss.precision(1); ss.setf(ios::fixed); ss << " bytes in " << (delta / 1000) << " s"; } else { ss << " bytes in " << int(delta) << " ms"; } log.println(ss.str(), true); ss.str(string()); } if ((verbosity > 1) && (delta > 0)) { double b2KiB = double(1000) / double(1024); ss << "Throughput (KiB/s): " << uint(double(read) * b2KiB / delta); log.println(ss.str(), true); ss.str(string()); } log.println("", verbosity > 1); } if (_listeners.size() > 0) { Event evt(Event::DECOMPRESSION_END, 0, int64(decoded), timer.getCurrentTime()); BlockDecompressor::notifyListeners(_listeners, evt); } if (_ctx.getInt("remove", 0) != 0) { // Delete input file if (inputName == "STDIN") { log.println("Warning: ignoring remove option with STDIN", verbosity > 0); } else if (remove(inputName.c_str()) != 0) { log.println("Warning: input file could not be deleted", verbosity > 0); } } delete[] buf; return T(0, read, ""); } // Close and flush streams. Do not deallocate resources. Idempotent. 
/* Closes the compressed input stream when _cis is not null. An exception thrown by close() is caught and reported on cerr rather than propagated. Nothing is deleted here. */ template void FileDecompressTask::dispose() { try { if (_cis != nullptr) { _cis->close(); } } catch (const exception& e) { cerr << "Decompression failure: " << e.what() << endl; } // _os destructor will call close if ofstream } #ifdef CONCURRENCY_ENABLED /* Worker loop: takes tasks from _queue and runs them one after another, adding up the _read counters. It stops when get() returns nullptr or when a task reports a non-zero _code; in the latter case the message of that task is appended to errMsg. Returns R(last code, total read, errMsg). */ template R FileDecompressWorker::run() { int res = 0; uint64 read = 0; string errMsg; while (res == 0) { T* task = _queue->get(); if (task == nullptr) break; /* *task is dereferenced again with ->, so T is expected to be a pointer-like type */ R result = (*task)->run(); res = result._code; read += result._read; if (res != 0) { errMsg += result._errMsg; } } return R(res, read, errMsg); } #endif /* NOTE(review): the text up to the license comment looks like an archive member header for BlockDecompressor.hpp, not C++ source; left untouched. */ kanzi-cpp-2.5.2/src/app/BlockDecompressor.hpp000066400000000000000000000071201516423635400211250ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
*/ #pragma once #ifndef knz_BlockDecompressor #define knz_BlockDecompressor #include #include #include "../OutputStream.hpp" #include "../io/CompressedInputStream.hpp" namespace kanzi { /* Result of decompressing one file: a status code (0 after default construction), a byte counter and an error message. */ class FileDecompressResult { public: int _code; uint64 _read; std::string _errMsg; FileDecompressResult() : _code(0) , _read(0) , _errMsg() { } FileDecompressResult(int code, uint64 read, const std::string& errMsg) : _code(code) , _read(read) , _errMsg(errMsg) { } /* Before C++11 the class is copyable; from C++11 on the copy operations are deleted and only moves are allowed. */ #if __cplusplus < 201103L FileDecompressResult(const FileDecompressResult& fdr) : _code(fdr._code) , _read(fdr._read) , _errMsg(fdr._errMsg) { } FileDecompressResult& operator=(const FileDecompressResult& fdr) { _errMsg = fdr._errMsg; _code = fdr._code; _read = fdr._read; return *this; } ~FileDecompressResult() {} #else FileDecompressResult(const FileDecompressResult& fcr) = delete; FileDecompressResult& operator=(const FileDecompressResult& fcr) = delete; FileDecompressResult(FileDecompressResult&& fcr) = default; FileDecompressResult& operator=(FileDecompressResult&& fcr) = default; ~FileDecompressResult() = default; #endif }; #ifdef CONCURRENCY_ENABLED /* Task holding a pointer to a BoundedConcurrentQueue; run() is only declared here. */ template class FileDecompressWorker FINAL : public Task { public: FileDecompressWorker(BoundedConcurrentQueue* queue) : _queue(queue) { } ~FileDecompressWorker() {} R run(); private: BoundedConcurrentQueue* _queue; }; #endif /* Decompression task built from a Context and a list of listeners. run() and dispose() are only declared here. */ template class FileDecompressTask FINAL : public Task { public: static const int DEFAULT_BUFFER_SIZE = 65536; FileDecompressTask(const Context& ctx, std::vector*>& listeners); ~FileDecompressTask(); T run(); void dispose(); private: Context _ctx; OutputStream* _os; CompressedInputStream* _cis; std::vector*> _listeners; }; typedef FileDecompressTask FDTask; /* Decompression front end. NOTE(review): decompress presumably returns an error code and reports a byte count through read; confirm against the implementation, which is not part of this header. */ class BlockDecompressor { friend class FileDecompressTask; public: BlockDecompressor(const Context& ctx); ~BlockDecompressor(); int decompress(uint64& read); // Register a copy of the listener // Not thread safe bool addListener(Listener& bl); // Not thread safe bool removeListener(Listener& bl);
/* Inline no-op: this dispose has an empty body. */ void dispose() const {}; private: int _verbosity; bool _overwrite; std::string _inputName; std::string _outputName; int _blockSize; int _jobs; std::vector*> _listeners; bool _noDotFiles; bool _noLinks; Context _ctx; static void notifyListeners(std::vector*>& listeners, const Event& evt); }; } #endif kanzi-cpp-2.5.2/src/app/InfoPrinter.cpp000066400000000000000000000244431516423635400177460ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include #include #include #include #include "InfoPrinter.hpp" #include "../util/strings.hpp" using namespace kanzi; using namespace std; /* Stores the settings, seeds _nextBlockId with firstBlockId and fills _thresholds with six event types in the order used by the chosen mode: transform events before entropy events for COMPRESSION, entropy events before transform events otherwise. */ InfoPrinter::InfoPrinter(int infoLevel, InfoPrinter::Type type, OutputStream& os, int firstBlockId) : _os(os) , _type(type) , _level(infoLevel) , _headerInfo(0) { STORE_ATOMIC(_nextBlockId, firstBlockId); if (type == InfoPrinter::COMPRESSION) { _thresholds[0] = Event::COMPRESSION_START; _thresholds[1] = Event::BEFORE_TRANSFORM; _thresholds[2] = Event::AFTER_TRANSFORM; _thresholds[3] = Event::BEFORE_ENTROPY; _thresholds[4] = Event::AFTER_ENTROPY; _thresholds[5] = Event::COMPRESSION_END; } else { _thresholds[0] = Event::DECOMPRESSION_START; _thresholds[1] = Event::BEFORE_ENTROPY; _thresholds[2] = Event::AFTER_ENTROPY; _thresholds[3] = Event::BEFORE_TRANSFORM; _thresholds[4] = Event::AFTER_TRANSFORM; _thresholds[5] = Event::DECOMPRESSION_END; } } /* Entry point for events. In INFO mode only processHeaderInfo is called. With CONCURRENCY_ENABLED the four per-block transform and entropy event types go through processBlockEventOrdered; every other event goes straight to processEventOrdered. */ void InfoPrinter::processEvent(const Event& evt) { if (_type == InfoPrinter::INFO) { processHeaderInfo(evt); return; } #ifdef
CONCURRENCY_ENABLED Event::Type t = evt.getType(); if (t == Event::BEFORE_TRANSFORM || t == Event::AFTER_TRANSFORM || t == Event::BEFORE_ENTROPY || t == Event::AFTER_ENTROPY) { processBlockEventOrdered(evt); return; } #endif processEventOrdered(evt); } #ifdef CONCURRENCY_ENABLED /* Buffers the events of each block in _pendingBlocks and hands them to processEventOrdered block by block, in increasing id order starting at _nextBlockId. A block is released only once its final event has been recorded: AFTER_ENTROPY when compressing, AFTER_TRANSFORM when decompressing. */ void InfoPrinter::processBlockEventOrdered(const Event& evt) { const int blockId = evt.getId(); const Event::Type type = evt.getType(); bool blockComplete = false; // Determine completion condition if (_type == InfoPrinter::COMPRESSION) { blockComplete = (type == Event::AFTER_ENTROPY); } else if (_type == InfoPrinter::DECOMPRESSION) { blockComplete = (type == Event::AFTER_TRANSFORM); } /* NOTE(review): this function is already compiled only under CONCURRENCY_ENABLED, so the nested guards around the lock_guard lines are redundant. */ { #ifdef CONCURRENCY_ENABLED std::lock_guard lock(_mutex); #endif _pendingBlocks[blockId].push_back(evt); // Do not attempt to release unless this block is complete if (blockComplete == false) return; } // Try to release completed blocks in strict blockId order for (;;) { std::vector events; { #ifdef CONCURRENCY_ENABLED std::lock_guard lock(_mutex); #endif int expectedId = LOAD_ATOMIC(_nextBlockId); std::map >::iterator it = _pendingBlocks.find(expectedId); if (it == _pendingBlocks.end()) return; // The block must be complete before release bool complete = false; const std::vector& evts = it->second; for (size_t i = 0; i < evts.size(); i++) { if ((_type == InfoPrinter::COMPRESSION) && (evts[i].getType() == Event::AFTER_ENTROPY)) { complete = true; break; } if ((_type == InfoPrinter::DECOMPRESSION) && (evts[i].getType() == Event::AFTER_TRANSFORM)) { complete = true; break; } } if (complete == false) return; // Release block events.swap(it->second); _pendingBlocks.erase(it); STORE_ATOMIC(_nextBlockId, expectedId + 1); } // Process all events for this block in arrival order #ifdef CONCURRENCY_ENABLED std::lock_guard lock(_mutex); #endif for (size_t i = 0; i < events.size(); i++) { processEventOrdered(events[i]); } } } #endif /* Handles one event: keeps per-block sizes and time stamps in _blocks keyed by block id and, depending on _level, builds the text that is written to _os. */ void InfoPrinter::processEventOrdered(const Event& evt) { const int blockId =
evt.getId(); const Event::Type type = evt.getType(); string msg; /* threshold 1: allocate the record for this block */ if (type == _thresholds[1]) { BlockInfo* bi = new BlockInfo(); bi->_timeStamp1 = evt.getTime(); bi->_stage0Size = evt.getSize(); /* NOTE(review): an existing entry for blockId would be overwritten here without being deleted; confirm ids are never reused before the threshold 4 event. */ _blocks[blockId] = bi; if (_level >= 5) { msg = evt.toString(); } } /* threshold 2: second time stamp; events for unknown blocks are ignored */ else if (type == _thresholds[2]) { std::map::iterator it = _blocks.find(blockId); if (it == _blocks.end()) return; BlockInfo& bi = *it->second; bi._timeStamp2 = evt.getTime(); if (_level >= 5) { double elapsed = WallTimer::calculateDifference(bi._timeStamp1, bi._timeStamp2); std::stringstream ss; ss << evt.toString() << " [" << int64(elapsed) << " ms]"; msg = ss.str(); } } /* threshold 3: third time stamp and intermediate size */ else if (type == _thresholds[3]) { std::map::iterator it = _blocks.find(blockId); if (it == _blocks.end()) return; BlockInfo& bi = *it->second; bi._timeStamp3 = evt.getTime(); bi._stage1Size = evt.getSize(); if (_level >= 5) msg = evt.toString(); } /* threshold 4: build the block summary when _level is at least 4, then delete and erase the record */ else if (type == _thresholds[4]) { std::map::iterator it = _blocks.find(blockId); if (it == _blocks.end()) return; const BlockInfo& bi = *it->second; stringstream ss; if (_level >= 5) { ss << evt.toString() << endl; } // Display block info if (_level >= 4) { double elapsed1 = WallTimer::calculateDifference(bi._timeStamp1, bi._timeStamp2); double elapsed2 = WallTimer::calculateDifference(bi._timeStamp3, evt.getTime()); ss << "Block " << blockId << ": " << bi._stage0Size << " => " << bi._stage1Size << " [" << int64(elapsed1) << " ms] => " << evt.getSize() << " [" << int64(elapsed2) << " ms]"; // Add compression ratio for encoding if ((_type == InfoPrinter::COMPRESSION) && (bi._stage0Size != 0)) { ss << " (" << uint(double(evt.getSize()) * 100.0 / double(bi._stage0Size)) << "%)"; } // Optionally add hash if (evt.getHash() != 0) { ss << std::uppercase << std::hex << " [" << evt.getHash() << "]"; } msg = ss.str(); } delete it->second; _blocks.erase(it); } else if ((evt.getType() == Event::AFTER_HEADER_DECODING) && (_level >= 3)) { Event::HeaderInfo* info = evt.getInfo(); if (info == nullptr) return;
if (_level >= 5) { // JSON output msg = evt.toString(); } else { // Raw text output stringstream ss; ss << "Bitstream version: " << info->bsVersion << endl; string strCk = "NONE"; if (info->checksumSize == 32) strCk = "32 bits"; else if (info->checksumSize == 64) strCk = "64 bits"; ss << "Block checksum: " << strCk << endl; ss << "Block size: " << info->blockSize << " bytes" << endl; string strE = info->entropyType == "NONE" ? "no" : info->entropyType; ss << "Using " << strE << " entropy codec (stage 1)" << endl; string strF = info->transformType == "NONE" ? "no" : info->transformType; ss << "Using " << strF << " transform (stage 2)" << endl; if (info->originalSize >= 0) ss << "Original size: " << info->originalSize << " byte(s)" << endl; msg = ss.str(); } } else if (_level >= 5) { msg = evt.toString(); } if (msg.size() > 0) { _os << msg << endl; _os.flush(); } } /* INFO mode output: writes one table row to _os for each AFTER_HEADER_DECODING event that carries header info, and nothing when _level is 0. The column header row is written before the first row only; the entropy and transform columns are added when _level is at least 4. Long file names and transform lists are truncated and end with two dots. */ void InfoPrinter::processHeaderInfo(const Event& evt) { if ((_level == 0) || (evt.getType() != Event::AFTER_HEADER_DECODING)) return; const Event::HeaderInfo* info = evt.getInfo(); if (info == nullptr) return; stringstream ss; if (_headerInfo++ == 0) { ss << endl; ss << "|" << " File Name "; ss << "|" << "Ver"; ss << "|" << "Check"; ss << "|" << "Block Size"; ss << "|" << " File Size "; ss << "|" << " Orig.
Size "; ss << "|" << " Ratio "; if (_level >= 4) { ss << "|" << " Entropy"; ss << "|" << " Transforms "; } ss << "|" << endl; } ss << "|"; string inputName = info->inputName; size_t idx = inputName.find_last_of(PATH_SEPARATOR); if (idx != string::npos) inputName.erase(0, idx + 1); if (inputName.length() > 20) inputName.replace(18, string::npos, ".."); ss << left << setw(20) << inputName << "|" << right; ss << setw(3) << info->bsVersion << "|"; ss << setw(5) << info->checksumSize << "|"; ss << setw(10) << info->blockSize << "|"; if (info->fileSize >= 0) ss << setw(12) << formatSize(double(info->fileSize)) << "|"; else ss << setw(12) << " N/A |"; if (info->originalSize >= 0) ss << setw(12) << formatSize(double(info->originalSize)) << "|"; else ss << setw(12) << " N/A |"; if ((info->originalSize >= 0) && (info->fileSize >= 0)) { double compSz = double(info->fileSize); double origSz = double(info->originalSize); if (origSz == 0.0) ss << setw(7) << " N/A |"; else ss << setw(7) << fixed << setprecision(3) << (compSz / origSz) << "|"; } else { ss << setw(7) << " N/A |"; } if (_level >= 4) { ss << setw(8) << info->entropyType << "|"; string t = info->transformType; if (t.length() > 26) t.replace(24, string::npos, ".."); ss << setw(26) << t << "|"; } _os << ss.str() << endl; } kanzi-cpp-2.5.2/src/app/InfoPrinter.hpp000066400000000000000000000042701516423635400177470ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
*/ #pragma once #ifndef knz_InfoPrinter #define knz_InfoPrinter #include #include #ifdef CONCURRENCY_ENABLED #include #endif #include #include #include "../concurrent.hpp" #include "../Event.hpp" #include "../Listener.hpp" #include "../OutputStream.hpp" #include "../util/Clock.hpp" namespace kanzi { /* Per-block record: two sizes, zero-initialised by the constructor, and three time stamps. */ class BlockInfo { public: int64 _stage0Size; int64 _stage1Size; WallTimer::TimeData _timeStamp1; WallTimer::TimeData _timeStamp2; WallTimer::TimeData _timeStamp3; BlockInfo() : _stage0Size(0), _stage1Size(0) {} }; /* Listener with three modes (COMPRESSION, DECOMPRESSION, INFO) that writes to an OutputStream. The constructor takes an info level, the mode, the stream and a first block id that defaults to 1. */ class InfoPrinter : public Listener { public: enum Type { COMPRESSION, DECOMPRESSION, INFO }; InfoPrinter(int infoLevel, InfoPrinter::Type type, OutputStream& os, int firstBlockId = 1); /* NOTE(review): the destructor is empty although _blocks holds raw BlockInfo pointers; confirm that every record is released before destruction. */ ~InfoPrinter() {} void processEvent(const Event& evt); private: #ifdef CONCURRENCY_ENABLED // Ordered-phase handling void processBlockEventOrdered(const Event& evt); #endif // Actual event processing + printing void processEventOrdered(const Event& evt); // Header-only info void processHeaderInfo(const Event& evt); OutputStream& _os; InfoPrinter::Type _type; int _level; int _headerInfo; // Per-block state std::map _blocks; /* six event types, one per slot */ Event::Type _thresholds[6]; #ifdef CONCURRENCY_ENABLED std::mutex _mutex; #endif std::map > _pendingBlocks; atomic_int_t _nextBlockId; }; } #endif kanzi-cpp-2.5.2/src/app/Kanzi.cpp000066400000000000000000001210611516423635400165550ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
*/ #include #include #include #include #include "BlockCompressor.hpp" #include "BlockDecompressor.hpp" #include "../Error.hpp" #include "../util/Printer.hpp" #if defined(WIN32) || defined(_WIN32) || defined(_WIN64) #include #include #include #endif using namespace kanzi; using namespace std; static const string CMD_LINE_ARGS[15] = { "-c", "-d", "-i", "-o", "-b", "-t", "-e", "-j", "-v", "-l", "-s", "-x", "-f", "-h", "-y" }; //static const int ARG_IDX_COMPRESS = 0; //static const int ARG_IDX_DECOMPRESS = 1; static const int ARG_IDX_INPUT = 2; static const int ARG_IDX_OUTPUT = 3; static const int ARG_IDX_BLOCK = 4; static const int ARG_IDX_TRANSFORM = 5; static const int ARG_IDX_ENTROPY = 6; static const int ARG_IDX_JOBS = 7; static const int ARG_IDX_VERBOSE = 8; static const int ARG_IDX_LEVEL = 9; //static const int ARG_IDX_CHECKSUM = 10; //static const int ARG_IDX_FROM = 11; //static const int ARG_IDX_TO = 12; static const string KANZI_VERSION = "2.5.2"; static const string APP_HEADER = "Kanzi " + KANZI_VERSION + " (c) Frederic Langlet"; static const string APP_SUB_HEADER = "Fast lossless data compressor."; static const string APP_USAGE = "Usage: kanzi [-c|-d|-y] [flags and files in any order]"; #ifdef CONCURRENCY_ENABLED static const int MAX_CONCURRENCY = 64; #endif void printHelp(Printer& log, const string& mode, bool showHeader) { log.println("", true); if (showHeader == true) { log.println(APP_HEADER, true); log.println("", true); log.println(APP_SUB_HEADER, true); } log.println(APP_USAGE, true); log.println("", true); log.println("Options\n", true); log.println(" -h, --help", true); if ((mode != "c") && (mode != "d") && (mode != "y")) { log.println(" Display this message.", true); log.println(" Use in conjunction with -c to print information for compression,", true); log.println(" or -d to print information for decompression.\n", true); log.println(" -c, --compress", true); log.println(" Compress mode\n", true); log.println(" -d, --decompress", true); 
log.println(" Decompress mode\n", true); log.println(" -y, --info", true); log.println(" Info mode: display information about compressed files\n", true); } else { log.println(" Display this message.\n", true); } log.println(" -i, --input=", true); log.println(" Name of the input file or directory or 'stdin'", true); log.println(" When the source is a directory, all files in it will be processed.", true); #if defined(WIN32) || defined(_WIN32) || defined(_WIN64) log.println(" Provide \\. at the end of the directory name to avoid recursion", true); log.println(" (EG: myDir\\. => no recursion)", true); #else log.println(" Provide /. at the end of the directory name to avoid recursion", true); log.println(" (EG: myDir/. => no recursion)", true); #endif log.println(" If this option is not provided, kanzi reads data from stdin.\n", true); if (mode != "y") { log.println(" -o, --output=", true); if (mode == "c") { log.println(" Optional name of the output file or directory (defaults to", true); log.println(" if input is or 'stdout' if input is 'stdin').", true); log.println(" or 'none' or 'stdout'.\n", true); } else if (mode == "d") { log.println(" Optional name of the output file or directory (defaults to", true); log.println(" if input is or 'stdout' if input is 'stdin').", true); log.println(" or 'none' or 'stdout'.\n", true); } else { log.println(" Optional name of the output file or 'none' or 'stdout'.\n", true); } } if (mode == "c") { log.println(" -b, --block=", true); log.println(" Size of blocks (default 4|8|16|32 MiB based on level, max 1 GiB, min 1 KiB).", true); log.println(" 'auto' means that the compressor derives the best value", true); log.println(" based on input size (when available) and number of jobs.\n", true); log.println(" -l, --level=", true); log.println(" Set the compression level [0..9]", true); log.println(" Providing this option forces entropy and transform.", true); log.println(" Defaults to level 3 if not provided.\n", true); log.println(" 0 = 
NONE&NONE (store)", true); log.println(" 1 = LZX&NONE", true); log.println(" 2 = DNA+LZ&HUFFMAN", true); log.println(" 3 = TEXT+UTF+PACK+MM+LZX&HUFFMAN", true); log.println(" 4 = TEXT+UTF+EXE+PACK+MM+ROLZ&NONE", true); log.println(" 5 = TEXT+UTF+BWT+RANK+ZRLT&ANS0", true); log.println(" 6 = TEXT+UTF+BWT+SRT+ZRLT&FPAQ", true); log.println(" 7 = LZP+TEXT+UTF+BWT+LZP&CM", true); log.println(" 8 = EXE+RLT+TEXT+UTF+DNA&TPAQ", true); log.println(" 9 = EXE+RLT+TEXT+UTF+DNA&TPAQX\n", true); log.println(" Kanzi utilizes various algorithm combinations across different compression", true); log.println(" levels. While these levels are calibrated to improve compression monotonically", true); log.println(" this is not guaranteed. Certain data may respond better to specific algorithms", true); log.println(" meaning higher compression levels could occasionally yield lower compression ratios.\n", true); log.println(" -e, --entropy=", true); log.println(" Entropy codec [None|Huffman|ANS0|ANS1|Range|FPAQ|TPAQ|TPAQX|CM]\n", true); log.println(" -t, --transform=", true); log.println(" Transform [None|BWT|BWTS|LZ|LZX|LZP|ROLZ|ROLZX|RLT|ZRLT]", true); log.println(" [MTFT|RANK|SRT|TEXT|MM|EXE|UTF|PACK]", true); log.println(" EG: BWT+RANK or BWTS+MTFT\n", true); log.println(" -x, -x32, -x64, --checksum=", true); log.println(" Enable block checksum (32 or 64 bits).", true); log.println(" -x is equivalent to -x32.\n", true); log.println(" -s, --skip", true); log.println(" Copy blocks with high entropy instead of compressing them.\n", true); } log.println(" -j, --jobs=", true); log.println(" Maximum number of jobs the program may start concurrently", true); #ifdef CONCURRENCY_ENABLED int cores = min(max(int(thread::hardware_concurrency()) / 2, 1), MAX_CONCURRENCY); char msg[96]; snprintf(msg, sizeof(msg), " Default is half of available cores (%d on this machine).\n", cores); log.println(" If 0 is provided, use all available cores (maximum is 64).", true); log.println(msg, true); #else 
log.println(" (always 1 in this version).\n", true); #endif log.println(" -v, --verbose=", true); log.println(" 0=silent, 1=default, 2=display details, 3=display configuration,", true); log.println(" 4=display block size and timings, 5=display extra information", true); log.println(" Verbosity is reduced to 1 when files are processed concurrently.", true); log.println(" Verbosity is reduced to 0 when the output is 'stdout'.\n", true); if (mode != "y") { log.println(" -f, --force", true); log.println(" Overwrite the output file if it already exists\n", true); } if (mode == "c") { log.println(" --rm", true); log.println(" Remove the input file after successful compression.", true); log.println(" If the input is a folder, all processed files under the folder are removed.\n", true); } else if (mode == "d") { log.println(" --rm", true); log.println(" Remove the input file after successful decompression.", true); log.println(" If the input is a folder, all processed files under the folder are removed.\n", true); } log.println(" --skip-links", true); log.println(" Do not follow links\n", true); log.println(" --skip-dot-files", true); log.println(" Skip dot files\n", true); if (mode == "d") { log.println(" --from=blockId", true); log.println(" Decompress starting at the provided block (included).", true); log.println(" The first block ID is 1.\n", true); log.println(" --to=blockId", true); log.println(" Decompress ending at the provided block (excluded).\n", true); log.println("", true); log.println("Examples\n", true); log.println(" kanzi -d -i foo.knz -f -v 2 -j 2\n", true); log.println(" kanzi --decompress --input=foo.knz --force --verbose=2 --jobs=2\n", true); } if (mode == "c") { log.println("", true); log.println("Transforms\n", true); log.println(" BWT: Burrows Wheeler Transform is a transform that reorders symbols", true); log.println(" in a reversible way that is more amenable to entropy coding.", true); log.println(" This implementation uses a linear time forward 
transform and parallel", true); log.println(" inverse transform.\n", true); log.println(" BWTS: Burrows Wheeler Transform by Scott is a bijective variant of the BWT.\n", true); log.println(" LZ: Lempel Ziv implementation of the dictionary based LZ77 transform that", true); log.println(" removes redundancy in the data.\n", true); log.println(" LZX: Lempel Ziv Extra. Same as above with a bigger hash table and more", true); log.println(" match searches.\n", true); log.println(" LZP: Lempel Ziv Prediction can be described as an LZ implementation with only", true); log.println(" one possible match (no offset is emitted).\n", true); log.println(" RLT: Run Length Transform is a simple transform that replaces runs of similar", true); log.println(" symbols with a compact representation.\n", true); log.println(" ZRLT: Zero Run Length Transform. Similar to RLT but only processes runs of 0.", true); log.println(" Usually used post BWT.\n", true); log.println(" MTFT: Move-To-Front Transform is a transform that reduces entropy by assigning", true); log.println(" shorter symbols to recent data (like a LRU cache). Usually used post BWT.\n", true); log.println(" RANK: Rank Transform is a transform that that reduces entropy by assigning shorter", true); log.println(" symbols based on symbol frequency ranks. 
Usually used post BWT.\n", true); log.println(" EXE: a transform that reduces the entropy of executable files (X86 & ARM64)", true); log.println(" by replacing relative jump addresses with absolute ones.\n", true); log.println(" TEXT: a text transform that uses a dictionary to replace common words with", true); log.println(" their dictionary index.\n", true); log.println(" ROLZ: Reduced Offset Lempel Ziv is an implementation of LZ that replaces match offsets", true); log.println(" with indexes, creating a more compact output with slower decoding speeds.\n", true); log.println(" ROLZX: Extended ROLZ with more match searches and a more compact encoding.\n", true); log.println(" SRT: Sorted Rank Transform is a transform that that reduces entropy by assigning", true); log.println(" shorter symbols based on symbol frequency ranks. Usually used post BWT.\n", true); log.println(" MM: Multimedia transform is a fast transform that removes redundancy in correlated", true); log.println(" channels in some multimedia files (EG. wav, pnm).\n", true); log.println(" UTF: a fast transform replacing UTF-8 codewords with aliases based on frequencies.\n", true); log.println(" PACK: a fast transform replacing unused symbols with aliases based on frequencies.\n", true); log.println(" DNA: same as PACK but triggered only when DNA data is detected.\n", true); log.println("", true); log.println("Entropy codecs\n", true); log.println(" Huffman: a fast implementation of canonical Huffman. Both encoder and decoder", true); log.println(" use code tables and multi-streams to improve performance.\n", true); log.println(" RANGE: a fast implementation of a static range codec.\n", true); log.println(" ANS: based on Range Asymmetric Numeral Systems by Jarek Duda (specifically", true); log.println(" an implementation by Fabian Giesen). 
Works in a similar fashion to the Range", true); log.println(" codec but uses only 1 state instead of 2, and encodes in reverse byte order.\n", true); log.println(" FPAQ: a binary arithmetic codec based on FPAQ1 by Matt Mahoney. Uses a simple", true); log.println(" adaptive order 0 predictor based on frequencies.\n", true); log.println(" CM: a binary arithmetic codec derived from BCM by Ilya Muravyov. Uses context", true); log.println(" mixing of counters to generate a prediction of the next bit value.\n", true); log.println(" TPAQ: a binary arithmetic codec based initially on Tangelo 2.4 (itself derived", true); log.println(" from FPAQ8). Uses context mixing of predictions produced by one layer", true); log.println(" neural networks. The initial code has been heavily tuned to improve", true); log.println(" compression ratio and speed. Slow but usually excellent compression ratio.\n", true); log.println(" TPAQX: Extended TPAQ with more predictions and more memory usage. Slowest but", true); log.println(" usually the best compression ratio.\n", true); log.println("", true); log.println("Examples\n", true); log.println(" kanzi -c -i foo.txt -o none -b 4m -l 4 -v 3\n", true); log.println(" kanzi -c -i foo.txt -f -t BWT+MTFT+ZRLT -b 4m -e FPAQ -j 4\n", true); log.println(" kanzi --compress --input=foo.txt --output=foo.knz --force", true); log.println(" --transform=BWT+MTFT+ZRLT --block=4m --entropy=FPAQ --jobs=4\n", true); } log.println("", true); log.println("Credits\n", true); log.println(" Matt Mahoney, Yann Collet, Jan Ondrus, Yuta Mori, Ilya Muravyov,", true); log.println(" Neal Burns, Fabian Giesen, Jarek Duda, Ilya Grebnov\n", true); } void printHeader(Printer& log, int verbose, bool& showHeader) { if (verbose < 1) return; log.println("", true); log.println(APP_HEADER, true); log.println("", verbose > 1); log.println(APP_SUB_HEADER, verbose > 1); if (verbose >= 5) { stringstream extraHeader; #ifdef __clang__ extraHeader << "Compiled with clang version "; 
extraHeader << __clang_major__ << "." << __clang_minor__; #else #ifdef _MSC_VER extraHeader << "Compiled with Visual Studio"; #ifdef _MSC_VER_STR // see types.h extraHeader << " " << _MSC_VER_STR; #endif #else #ifdef __INTEL_COMPILER extraHeader << "Compiled with Intel compiler "; extraHeader << "(" << __INTEL_COMPILER_BUILD_DATE << ")"; #else #ifdef __GNUC__ extraHeader << "Compiled with gcc version "; extraHeader << __GNUC__ << "." << __GNUC_MINOR__ << "." << __GNUC_PATCHLEVEL__; #endif #endif #endif #endif if (extraHeader.str().length() > 0) { #if defined(__AVX2__) extraHeader << " - AVX2"; #elif defined(__AVX__) extraHeader << " - AVX"; #elif defined(__AVX512F__) extraHeader << " - AVX512"; #elif defined(__SSE4_1__) extraHeader << " - SSE4.1"; #elif defined(__SSE3__) extraHeader << " - SSE3"; #elif defined(__SSE2__) extraHeader << " - SSE2"; #elif defined(__SSE__) extraHeader << " - SSE"; #endif log.println(extraHeader.str(), true); } } showHeader = false; } #define WARNING_OPT_NOVALUE(opt) \ stringstream ss; \ ss << "Warning: ignoring option [" << opt << "] with no value."; \ log.println(ss.str(), verbose > 0) #define WARNING_OPT_COMP_ONLY(opt) \ stringstream ss; \ ss << "Warning: ignoring option [" << opt << "]. Only applicable in compression mode."; \ log.println(ss.str(), verbose > 0) #define WARNING_OPT_DECOMP_ONLY(opt) \ stringstream ss; \ ss << "Warning: ignoring option [" << opt << "]. Only applicable in decompression mode."; \ log.println(ss.str(), verbose > 0) #define WARNING_OPT_INVALID(opt) \ stringstream ss; \ ss << "Warning: ignoring option [" << opt << "]. 
Not applicable in this mode."; \ log.println(ss.str(), verbose > 0) #define WARNING_OPT_DUPLICATE(opt, val) \ stringstream ss; \ ss << "Warning: ignoring duplicate option [" << opt << "]: " << val;\ log.println(ss.str(), verbose > 0) static bool toInt(const string& s, int& res) { // Check that all characters are valid for (size_t i = 0; i < s.length(); i++) { if ((s[i] < '0') || (s[i] > '9')) return false; } // Use atoi because stoi can throw res = atoi(s.c_str()); return true; } int processCommandLine(int argc, const char* argv[], Context& map, Printer& log) { string inputName; string outputName; int remove = -1; int overwrite = -1; int checksum = 0; int skip = -1; int reorder = -1; int noDotFiles = -1; int noLinks = -1; string codec; string transf; bool verboseFlag = false; int verbose = 1; int ctx = -1; int level = -1; int from = -1; int to = -1; int tasks = -1; int blockSize = -1; int autoBlockSize = -1; string mode; bool showHeader = true; bool showHelp = false; for (int i = 1; i < argc; i++) { string arg(argv[i]); trim(arg); if (arg == "-v") { ctx = ARG_IDX_VERBOSE; continue; } if (arg == "-i") { ctx = ARG_IDX_INPUT; continue; } if (arg == "-o") { ctx = ARG_IDX_OUTPUT; continue; } // Extract verbosity, output and mode first if ((arg == "-c") || (arg.compare(0, 10, "--compress") == 0)) { if (mode != "") { cerr << "Only one mode can be provided (already got '" << mode << "'" << endl; return Error::ERR_INVALID_PARAM; } mode = "c"; continue; } if ((arg == "-d") || (arg.compare(0, 12, "--decompress") == 0)) { if (mode != "") { cerr << "Only one mode can be provided (already got '" << mode << "'" << endl; return Error::ERR_INVALID_PARAM; } mode = "d"; continue; } if ((arg == "-y") || (arg.compare(0, 10, "--info") == 0)) { if (mode != "") { cerr << "Only one mode can be provided (already got '" << mode << "')" << endl; return Error::ERR_INVALID_PARAM; } mode = "y"; continue; } if ((ctx == ARG_IDX_VERBOSE) || (arg.compare(0, 10, "--verbose=") == 0)) { if (verboseFlag 
== true) { WARNING_OPT_DUPLICATE("verbosity level", arg); } else { if (ctx != ARG_IDX_VERBOSE) arg = arg.substr(10); if ((toInt(arg, verbose) == false) || (verbose < 0) || (verbose > 5)) { cerr << "Invalid verbosity level provided on command line: " << arg << endl; return Error::ERR_INVALID_PARAM; } verboseFlag = true; } } else if ((ctx == ARG_IDX_OUTPUT) || (arg.compare(0, 9, "--output=") == 0)) { if (ctx != ARG_IDX_OUTPUT) arg = arg.substr(9); outputName = trim(arg); } else if ((ctx == ARG_IDX_INPUT) || (arg.compare(0, 8, "--input=") == 0)) { if (ctx != ARG_IDX_INPUT) arg = arg.substr(8); inputName = trim(arg); } else if ((arg == "--help") || (arg == "-h")) { showHelp = true; } ctx = -1; } if ((argc == 1) || (showHelp == true)) { printHeader(log, verbose, showHeader); printHelp(log, mode, showHeader); return 0; } // Overwrite verbosity if the output goes to stdout if (outputName.length() == 0) { if (inputName.length() == 0) { verbose = 0; verboseFlag = true; } } else { if (mode == "y") { WARNING_OPT_INVALID(outputName); } else { string str = outputName; transform(str.begin(), str.end(), str.begin(), safeToUpper); if (str == "STDOUT") { verbose = 0; verboseFlag = true; } } } printHeader(log, verbose, showHeader); inputName.clear(); outputName.clear(); ctx = -1; for (int i = 1; i < argc; i++) { string arg(argv[i]); if ((arg.length() > 0) && (arg[0] == 0x20)) { size_t k = 1; // Left trim limited to spaces (due to possible unicode chars in names) while ((k < arg.length()) && (arg[k] == 0x20)) k++; arg = arg.substr(k); } if ((arg == "-c") || (arg == "-d") || (arg == "-y") || (arg == "--compress") || (arg == "--decompress") || (arg == "--info")) { if (ctx != -1) { WARNING_OPT_NOVALUE(CMD_LINE_ARGS[ctx]); } ctx = -1; continue; } if ((arg == "--force") || (arg == "-f")) { if (ctx != -1) { WARNING_OPT_NOVALUE(CMD_LINE_ARGS[ctx]); } else if (overwrite >= 0) { WARNING_OPT_DUPLICATE(arg, "true"); } if (mode == "y") { WARNING_OPT_INVALID(arg); ctx = - 1; continue; } overwrite 
= 1; ctx = -1; continue; } if ((arg == "--skip") || (arg == "-s")) { if (ctx != -1) { WARNING_OPT_NOVALUE(CMD_LINE_ARGS[ctx]); } else if (skip >= 0) { WARNING_OPT_DUPLICATE(arg, "true"); } ctx = -1; if (mode != "c") { WARNING_OPT_COMP_ONLY(arg); continue; } skip = 1; continue; } if ((arg == "-x") || (arg == "-x32") || (arg == "-x64")) { if (checksum > 0) { WARNING_OPT_DUPLICATE(arg, "true"); } ctx = -1; if (mode != "c") { WARNING_OPT_COMP_ONLY(arg); continue; } checksum = (arg == "-x64") ? 64 : 32; continue; } if (arg == "--rm") { if (ctx != -1) { WARNING_OPT_NOVALUE(CMD_LINE_ARGS[ctx]); } else if (remove >= 0) { WARNING_OPT_DUPLICATE(arg, "true"); } ctx = -1; if (mode == "y") { WARNING_OPT_INVALID(arg); continue; } remove = 1; continue; } if (arg == "--no-file-reorder") { if (ctx != -1) { WARNING_OPT_NOVALUE(CMD_LINE_ARGS[ctx]); } else if (reorder >= 0) { WARNING_OPT_DUPLICATE(arg, "true"); } ctx = -1; if (mode != "c") { WARNING_OPT_COMP_ONLY(arg); continue; } reorder = 0; continue; } if (arg == "--skip-dot-files") { if (ctx != -1) { WARNING_OPT_NOVALUE(CMD_LINE_ARGS[ctx]); } else if (noDotFiles >= 0) { WARNING_OPT_DUPLICATE(arg, "true"); } ctx = -1; noDotFiles = 1; continue; } if (arg == "--skip-links") { if (ctx != -1) { WARNING_OPT_NOVALUE(CMD_LINE_ARGS[ctx]); } else if (noLinks >= 0) { WARNING_OPT_DUPLICATE(arg, "true"); } ctx = -1; noLinks = 1; continue; } if (ctx == -1) { for (int j = 0; j < 10; j++) { if (arg == CMD_LINE_ARGS[j]) { ctx = j; break; } } if (ctx != -1) continue; } if ((ctx == ARG_IDX_OUTPUT) || (arg.compare(0, 9, "--output=") == 0)) { if (ctx != ARG_IDX_OUTPUT) arg = arg.substr(9); if (outputName != "") { string msg = (ctx == ARG_IDX_OUTPUT) ? CMD_LINE_ARGS[ctx] : arg; if (mode == "y") { WARNING_OPT_INVALID(msg); ctx = -1; continue; } WARNING_OPT_DUPLICATE(msg, arg); } else { if (mode == "y") { string msg = (ctx == ARG_IDX_OUTPUT) ? 
CMD_LINE_ARGS[ctx] : arg; WARNING_OPT_INVALID(msg); ctx = -1; continue; } if ((arg.length() >= 2) && (arg[0] == '.') && (arg[1] == PATH_SEPARATOR)) { arg = (arg.length() == 2) ? arg.substr(0, 1) : arg.substr(2); } outputName = arg; } ctx = -1; continue; } if ((ctx == ARG_IDX_INPUT) || (arg.compare(0, 8, "--input=") == 0)) { if (ctx != ARG_IDX_INPUT) arg = arg.substr(8); if (inputName != "") { string msg = (ctx == ARG_IDX_INPUT) ? CMD_LINE_ARGS[ctx] : arg; WARNING_OPT_DUPLICATE(msg, arg); } else { if ((arg.length() >= 2) && (arg[0] == '.') && (arg[1] == PATH_SEPARATOR)) { arg = (arg.length() == 2) ? arg.substr(0, 1) : arg.substr(2); } inputName = arg; } ctx = -1; continue; } if ((ctx == ARG_IDX_ENTROPY) || (arg.compare(0, 10, "--entropy=") == 0)) { if (ctx != ARG_IDX_ENTROPY) arg = arg.substr(10); if (mode != "c"){ string msg = (ctx == ARG_IDX_ENTROPY) ? CMD_LINE_ARGS[ctx] : arg; WARNING_OPT_COMP_ONLY(msg); ctx = -1; continue; } if (codec != "") { string msg = (ctx == ARG_IDX_ENTROPY) ? CMD_LINE_ARGS[ctx] : arg; WARNING_OPT_DUPLICATE(msg, arg); } else { if (arg.length() == 0) { cerr << "Invalid empty entropy provided on command line" << endl; return Error::ERR_INVALID_PARAM; } codec = arg; transform(codec.begin(), codec.end(), codec.begin(), safeToUpper); } ctx = -1; continue; } if ((ctx == ARG_IDX_TRANSFORM) || (arg.compare(0, 12, "--transform=") == 0)) { if (ctx != ARG_IDX_TRANSFORM) arg = arg.substr(12); if (mode != "c"){ string msg = (ctx == ARG_IDX_TRANSFORM) ? CMD_LINE_ARGS[ctx] : arg; WARNING_OPT_COMP_ONLY(msg); ctx = -1; continue; } if (transf != "") { string msg = (ctx == ARG_IDX_TRANSFORM) ? 
CMD_LINE_ARGS[ctx] : arg; WARNING_OPT_DUPLICATE(msg, arg); } else { if (arg.length() == 0) { cerr << "Invalid empty transform provided on command line" << endl; return Error::ERR_INVALID_PARAM; } transf = arg; transform(transf.begin(), transf.end(), transf.begin(), safeToUpper); } while ((transf.length() > 0) && (transf[0] == '+')) { transf = transf.substr(1); } while ((transf.length() > 0) && (transf[transf.length() - 1] == '+')) { transf.resize(transf.length() - 1); } ctx = -1; continue; } if ((ctx == ARG_IDX_LEVEL) || (arg.compare(0, 8, "--level=") == 0)) { if (ctx != ARG_IDX_LEVEL) arg = arg.substr(8); if (mode != "c") { string msg = (ctx == ARG_IDX_LEVEL) ? CMD_LINE_ARGS[ctx] : arg; WARNING_OPT_COMP_ONLY(msg); ctx = -1; continue; } if (level >= 0) { string msg = (ctx == ARG_IDX_LEVEL) ? CMD_LINE_ARGS[ctx] : arg; WARNING_OPT_DUPLICATE(msg, arg); } else { if ((toInt(arg, level) == false) || ((level < 0) || (level > 9))) { cerr << "Invalid compression level provided on command line: " << arg << endl; return Error::ERR_INVALID_PARAM; } } ctx = -1; continue; } if (arg.compare(0, 11, "--checksum=") == 0) { arg = arg.substr(11); if (mode != "c") { WARNING_OPT_COMP_ONLY(arg); ctx = -1; continue; } if (checksum > 0) { WARNING_OPT_DUPLICATE("--checksum", arg); } else { if ((toInt(arg, checksum) == false) || ((checksum != 32) && (checksum != 64))) { cerr << "Invalid block checksum size provided on command line: " << arg << endl; return Error::ERR_INVALID_PARAM; } } ctx = -1; continue; } if ((ctx == ARG_IDX_BLOCK) || (arg.compare(0, 8, "--block=") == 0)) { if (ctx != ARG_IDX_BLOCK) arg = arg.substr(8); if (arg.length() == 0) { cerr << "Invalid block size provided on command line: " << arg << endl; return Error::ERR_INVALID_PARAM; } if (mode != "c") { string msg = (ctx == ARG_IDX_BLOCK) ? CMD_LINE_ARGS[ctx] : arg; WARNING_OPT_COMP_ONLY(msg); ctx = -1; continue; } if ((blockSize >= 0) || (autoBlockSize >= 0)) { string msg = (ctx == ARG_IDX_BLOCK) ? 
CMD_LINE_ARGS[ctx] : arg; WARNING_OPT_DUPLICATE(msg, arg); ctx = -1; continue; } transform(arg.begin(), arg.end(), arg.begin(), safeToUpper); if (arg == "AUTO") { autoBlockSize = 1; } else { uint64 scale = 1; char lastChar = arg[arg.length() - 1]; // Process K or M or G suffix if ('K' == lastChar) { scale = 1024; arg.resize(arg.length() - 1); } else if ('M' == lastChar) { scale = 1024 * 1024; arg.resize(arg.length() - 1); } else if ('G' == lastChar) { scale = 1024 * 1024 * 1024; arg.resize(arg.length() - 1); } if (toInt(arg, blockSize) == false) { cerr << "Invalid block size provided on command line: " << arg << endl; return Error::ERR_INVALID_PARAM; } stringstream ss1; ss1 << arg; ss1 >> blockSize; blockSize = int(uint64(blockSize) * scale); } ctx = -1; continue; } if ((ctx == ARG_IDX_JOBS) || (arg.compare(0, 7, "--jobs=") == 0)) { if (ctx != ARG_IDX_JOBS) arg = arg.substr(7); if (tasks >= 0) { string msg = (ctx == ARG_IDX_BLOCK) ? CMD_LINE_ARGS[ctx] : arg; WARNING_OPT_DUPLICATE(msg, arg); ctx = -1; continue; } if ((toInt(arg, tasks) == false) || (tasks < 0)) { cerr << "Invalid number of jobs provided on command line: " << arg << endl; return Error::ERR_INVALID_PARAM; } ctx = -1; continue; } if ((arg.compare(0, 7, "--from=") == 0) && (ctx == -1)) { if (mode != "d"){ WARNING_OPT_DECOMP_ONLY("--from"); continue; } arg = arg.substr(7); if (from >= 0) { WARNING_OPT_DUPLICATE("--from", arg); } else { if ((toInt(arg, from) == false) || (from < 0)) { cerr << "Invalid start block provided on command line: " << arg << endl; if (from == 0) { cerr << "The first block ID is 1." 
<< endl; } return Error::ERR_INVALID_PARAM; } } continue; } if ((arg.compare(0, 5, "--to=") == 0) && (ctx == -1)) { if (mode != "d"){ WARNING_OPT_DECOMP_ONLY("--to"); continue; } arg = arg.substr(5); if (to >= 0) { WARNING_OPT_DUPLICATE("--to", arg); } else { if ((toInt(arg, to) == false) || (to <= 0)) { // Must be > 0 (0 means nothing to do) cerr << "Invalid end block provided on command line: " << arg << endl; return Error::ERR_INVALID_PARAM; } } continue; } if ((arg.compare(0, 10, "--verbose=") != 0) && (ctx == -1)) { stringstream ss; ss << "Warning: ignoring unknown option [" << arg << "]"; log.println(ss.str(), verbose > 0); } ctx = -1; } if (ctx != -1) { stringstream ss; ss << "Warning: ignoring option with missing value [" << CMD_LINE_ARGS[ctx] << "]"; log.println(ss.str(), verbose > 0); } if (level >= 0) { if (codec.length() > 0) { stringstream ss; ss << "Warning: providing the 'level' option forces the entropy codec. Ignoring [" << codec << "]"; log.println(ss.str(), verbose > 0); } if (transf.length() > 0) { stringstream ss; ss << "Warning: providing the 'level' option forces the transform. Ignoring [" << transf << "]"; log.println(ss.str(), verbose > 0); } } if (blockSize >= 0) map.putInt("blockSize", blockSize); map.putInt("verbosity", (verboseFlag == false) ? 
1 : verbose); map.putString("mode", mode); map.putString("inputName", inputName); map.putString("outputName", outputName); map.putInt("checksum", checksum); if (autoBlockSize == 1) map.putInt("autoBlock", 1); if ((mode == "c") && (level >= 0)) map.putInt("level", level); if (overwrite == 1) map.putInt("overwrite", 1); if (remove == 1) map.putInt("remove", 1); if (codec.length() > 0) map.putString("entropy", codec); if (transf.length() > 0) map.putString("transform", transf); if (skip == 1) map.putInt("skipBlocks", 1); if (reorder == 0) map.putInt("fileReorder", 0); else map.putInt("fileReorder", 1); if (noDotFiles == 1) // Skip dot files map.putInt("noDotFiles", 1); if (noLinks == 1) // Do not follow links map.putInt("noLinks", 1); if (from >= 0) map.putInt("from", from); if (to >= 0) map.putInt("to", to); if (tasks >= 0) map.putInt("jobs", tasks); return 0; } int main(int argc, const char* argv[]) { #if defined(WIN32) || defined(_WIN32) || defined(_WIN64) // Force standard input and output to binary mode (void) _setmode(_fileno(stdout), _O_BINARY); (void) _setmode(_fileno(stdin), _O_BINARY); // Users can provide a custom code page to properly display some non ASCII file names // eg. 1252 for ANSI Latin-1 or 65001 for utf-8 size_t size; getenv_s(&size, nullptr, 0, "KANZI_CODE_PAGE"); if (size != 0) { char* p = new char[size]; getenv_s(&size, p, size, "KANZI_CODE_PAGE"); string s(p); int cp; if (toInt(s, cp) == true) { SetConsoleCP(cp); SetConsoleOutputCP(cp); } delete[] p; } #endif Context ctx; Printer log(cout); try { int status = processCommandLine(argc, argv, ctx, log); // Command line processing error ? if (status != 0) return status; // Help mode only ? 
if (ctx.has("mode") == false) return 0; string mode = ctx.getString("mode"); int jobs = ctx.getInt("jobs", -1); #ifndef CONCURRENCY_ENABLED if (jobs > 1) { const int verbosity = ctx.getInt("verbosity"); stringstream ss; ss << "Warning: the number of jobs is limited to 1 in this version"; log.println(ss.str(), verbosity > 0); } jobs = 1; #else if (jobs == 0) { int cores = max(int(thread::hardware_concurrency()), 1); // User provided 0 => use all the cores jobs = min(cores, MAX_CONCURRENCY); } else if (jobs == -1) { int cores = max(int(thread::hardware_concurrency()) / 2, 1); // Defaults to half the cores jobs = min(cores, MAX_CONCURRENCY); } else if (jobs > MAX_CONCURRENCY) { const int verbosity = ctx.getInt("verbosity"); stringstream ss; ss << "Warning: the number of jobs is too high, defaulting to " << MAX_CONCURRENCY; log.println(ss.str(), verbosity > 0); jobs = MAX_CONCURRENCY; } #endif ctx.putInt("jobs", jobs); if (mode == "c") { try { BlockCompressor bc(ctx); uint64 written = 0; int code = bc.compress(written); return code; } catch (const exception& e) { cerr << "Could not create the compressor: " << e.what() << endl; return Error::ERR_CREATE_COMPRESSOR; } } if ((mode == "d") || (mode == "y")) { try { BlockDecompressor bd(ctx); uint64 read = 0; int code = bd.decompress(read); return code; } catch (const exception& e) { cerr << "Could not create the decompressor: " << e.what() << endl; return Error::ERR_CREATE_DECOMPRESSOR; } } cout << "Missing arguments: try --help or -h" << endl; return Error::ERR_MISSING_PARAM; } #if __cplusplus >= 201703L catch (const bad_variant_access& e) { // May be thrown by Context cerr << e.what() << endl; return Error::ERR_UNKNOWN; } #endif catch (const invalid_argument& e) { // May be thrown by ThreadPool cerr << e.what() << endl; return Error::ERR_INVALID_PARAM; } catch (const runtime_error& e) { // May be thrown by ThreadPool cerr << e.what() << endl; return Error::ERR_INVALID_PARAM; } } 
kanzi-cpp-2.5.2/src/bitstream/000077500000000000000000000000001516423635400162065ustar00rootroot00000000000000kanzi-cpp-2.5.2/src/bitstream/DebugInputBitStream.cpp000066400000000000000000000076531516423635400226060ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include #include #include "../bitstream/DebugInputBitStream.hpp" using namespace kanzi; using namespace std; DebugInputBitStream::DebugInputBitStream(InputBitStream& ibs) : _delegate(ibs), _out(cout), _width(80) { _idx = 0; _mark = false; _show = false; _hexa = false; _current = kanzi::byte(0); } DebugInputBitStream::DebugInputBitStream(InputBitStream& ibs, ostream& os) : _delegate(ibs), _out(os), _width(80) { _idx = 0; _mark = false; _show = false; _hexa = false; _current = kanzi::byte(0); } DebugInputBitStream::DebugInputBitStream(InputBitStream& ibs, ostream& os, int width) : _delegate(ibs), _out(os) { if ((width != -1) && (width < 8)) width = 8; if (width != -1) width &= 0xFFFFFFF8; _width = width; _idx = 0; _mark = false; _show = false; _hexa = false; _current = kanzi::byte(0); } DebugInputBitStream::~DebugInputBitStream() { _close(); } // Returns 1 or 0 int DebugInputBitStream::readBit() { int res = _delegate.readBit(); _current <<= 1; _current |= kanzi::byte(res); _out << ((res & 1) == 1 ? 
"1" : "0"); _idx++; if (_mark == true) _out << "r"; if ((_width != -1) && ((_idx - 1) % _width == _width - 1)) { if (showByte()) printByte(_current); _out << endl; _idx = 0; } else if ((_idx & 7) == 0) { if (showByte()) printByte(_current); else _out << " "; } return res; } uint64 DebugInputBitStream::readBits(uint count) { uint64 res = _delegate.readBits(count); for (uint i = 1; i <= count; i++) { int bit = (res >> (count - i)) & 1; _idx++; _current <<= 1; _current |= kanzi::byte(bit); _out << ((bit == 1) ? "1" : "0"); if ((_mark == true) && (i == count)) _out << "r"; if ((_width != -1) && (_idx % _width == 0)) { if (showByte()) printByte(_current); _out << endl; _idx = 0; } else if ((_idx & 7) == 0) { if (showByte()) printByte(_current); else _out << " "; } } return res; } uint DebugInputBitStream::readBits(kanzi::byte bits[], uint count) { count = _delegate.readBits(bits, count); for (uint i = 0; i < count; i++) { const int bit = int(bits[i >> 3] >> (7 - (i & 7))) & 1; _idx++; _current <<= 1; _current |= kanzi::byte(bit); _out << ((bit == 1) ? "1" : "0"); if ((_mark == true) && (i + 1 == count)) _out << "r"; if ((_width != -1) && (_idx % _width == 0)) { if (showByte()) printByte(_current); _out << endl; _idx = 0; } else if ((_idx & 7) == 0) { if (showByte()) printByte(_current); else _out << " "; } } return count; } void DebugInputBitStream::printByte(kanzi::byte b) { int val = int(b); if (_hexa == true) { _out << hex << " [0x"; _out << ((val < 16) ? "0" : ""); _out << val << "] "; _out << dec; return; } _out << " ["; if (val < 10) _out << "00"; else if (val < 100) _out << "0"; _out << val << "] "; } kanzi-cpp-2.5.2/src/bitstream/DebugInputBitStream.hpp000066400000000000000000000037571516423635400226140ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #pragma once #ifndef knz_DebugInputBitStream #define knz_DebugInputBitStream #include "../InputBitStream.hpp" #include "../OutputStream.hpp" namespace kanzi { class DebugInputBitStream FINAL : public InputBitStream { private: InputBitStream& _delegate; OutputStream& _out; int _width; int _idx; bool _mark; bool _hexa; bool _show; byte _current; void printByte(byte val); void _close() { _delegate.close(); } public: DebugInputBitStream(InputBitStream& ibs); DebugInputBitStream(InputBitStream& ibs, OutputStream& os); DebugInputBitStream(InputBitStream& ibs, OutputStream& os, int width); ~DebugInputBitStream(); // Returns 1 or 0 int readBit(); uint64 readBits(uint length); uint readBits(byte bits[], uint length); // Number of bits read uint64 read() const { return _delegate.read(); } // Return false when the bitstream is closed or the End-Of-Stream has been reached bool hasMoreToRead() { return _delegate.hasMoreToRead(); } void close() { _close(); } void showByte(bool show) { _show = show; } void setHexa(bool hexa) { _hexa = hexa; } bool hexa() const { return _hexa; } bool showByte() const { return _show; } void setMark(bool mark) { _mark = mark; } bool mark() const { return _mark; } }; } #endif kanzi-cpp-2.5.2/src/bitstream/DebugOutputBitStream.cpp000066400000000000000000000076341516423635400230060ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include #include "DebugOutputBitStream.hpp" using namespace kanzi; using namespace std; DebugOutputBitStream::DebugOutputBitStream(OutputBitStream& obs) : _delegate(obs), _out(cout), _width(80) { _mark = false; _hexa = false; _show = false; _current = kanzi::byte(0); _idx = 0; } DebugOutputBitStream::DebugOutputBitStream(OutputBitStream& obs, OutputStream& os) : _delegate(obs), _out(os), _width(80) { _mark = false; _hexa = false; _show = false; _current = kanzi::byte(0); _idx = 0; } DebugOutputBitStream::DebugOutputBitStream(OutputBitStream& obs, OutputStream& os, int width) : _delegate(obs), _out(os) { if ((width != -1) && (width < 8)) width = 8; if (width != -1) width &= 0xFFFFFFF8; _width = width; _mark = false; _hexa = false; _show = false; _current = kanzi::byte(0); _idx = 0; } DebugOutputBitStream::~DebugOutputBitStream() { _close(); } void DebugOutputBitStream::writeBit(int bit) { bit &= 1; _out << ((bit == 1) ? "1" : "0"); _current <<= 1; _current |= kanzi::byte(bit); _idx++; if (_mark == true) _out << "w"; if ((_width != -1) && ((_idx - 1) % _width == _width - 1)) { if (showByte()) printByte(_current); _out << endl; _idx = 0; } else if ((_idx & 7) == 0) { if (showByte()) printByte(_current); else _out << " "; } _delegate.writeBit(bit); } uint DebugOutputBitStream::writeBits(uint64 bits, uint count) { uint res = _delegate.writeBits(bits, count); for (uint i = 1; i <= res; i++) { uint64 bit = (bits >> (res - i)) & 1; _current <<= 1; _current |= kanzi::byte(bit); _idx++; _out << ((bit == 1) ? 
"1" : "0"); if ((_mark == true) && (i == res)) _out << "w"; if ((_width != -1) && (_idx % _width == 0)) { if (showByte()) printByte(_current); _out << endl; _idx = 0; } else if ((_idx & 7) == 0) { if (showByte()) printByte(_current); else _out << " "; } } return res; } uint DebugOutputBitStream::writeBits(const kanzi::byte bits[], uint count) { uint res = _delegate.writeBits(bits, count); for (uint i = 0; i < res; i++) { const uint64 bit = uint64(bits[i >> 3] >> (7 - (i & 7))) & 1; _current <<= 1; _current |= kanzi::byte(bit); _idx++; _out << ((bit == 1) ? "1" : "0"); if ((_mark == true) && (i + 1 == res)) _out << "w"; if ((_width != -1) && (_idx % _width == 0)) { if (showByte()) printByte(_current); _out << endl; _idx = 0; } else if ((_idx & 7) == 0) { if (showByte()) printByte(_current); else _out << " "; } } return res; } void DebugOutputBitStream::printByte(kanzi::byte b) { int val = int(b); if (_hexa == true) { _out << hex << " [0x"; _out << ((val < 16) ? "0" : ""); _out << val << "] "; _out << dec; return; } _out << " ["; if (val < 10) _out << "00"; else if (val < 100) _out << "0"; _out << val << "] "; } kanzi-cpp-2.5.2/src/bitstream/DebugOutputBitStream.hpp000066400000000000000000000035711516423635400230070ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once #ifndef knz_DebugOutputBitStream #define knz_DebugOutputBitStream #include "../OutputBitStream.hpp" #include "../OutputStream.hpp" namespace kanzi { class DebugOutputBitStream FINAL : public OutputBitStream { private: OutputBitStream& _delegate; OutputStream& _out; int _width; int _idx; bool _mark; bool _show; bool _hexa; byte _current; void printByte(byte val); void _close() { _delegate.close(); } public: DebugOutputBitStream(OutputBitStream& obs); DebugOutputBitStream(OutputBitStream& obs, OutputStream& os); DebugOutputBitStream(OutputBitStream& obs, OutputStream& os, int width); ~DebugOutputBitStream(); void writeBit(int bit); uint writeBits(uint64 bits, uint length); uint writeBits(const byte bits[], uint length); // Return number of bits written so far uint64 written() const { return _delegate.written(); } void close() { _close(); } void showByte(bool show) { _show = show; } void setHexa(bool hexa) { _hexa = hexa; } bool hexa() const { return _hexa; } bool showByte() const { return _show; } void setMark(bool mark) { _mark = mark; } bool mark() const { return _mark; } }; } #endif kanzi-cpp-2.5.2/src/bitstream/DefaultInputBitStream.cpp000066400000000000000000000147751516423635400231470ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #include #include "DefaultInputBitStream.hpp" using namespace kanzi; using namespace std; DefaultInputBitStream::DefaultInputBitStream(InputStream& is, uint bufferSize) : _is(is) { if (bufferSize < 1024) throw invalid_argument("Invalid buffer size (must be at least 1024)"); if (bufferSize > 1 << 29) throw invalid_argument("Invalid buffer size (must be at most 536870912)"); if ((bufferSize & 7) != 0) throw invalid_argument("Invalid buffer size (must be a multiple of 8)"); _bufferSize = bufferSize; _buffer = new kanzi::byte[_bufferSize]; _availBits = 0; _maxPosition = -1; _position = 0; _current = 0; _read = 0; _closed = false; } DefaultInputBitStream::~DefaultInputBitStream() { _close(); delete[] _buffer; } uint DefaultInputBitStream::readBits(kanzi::byte bits[], uint count) { if (isClosed() == true) throw BitStreamException("Stream closed", BitStreamException::STREAM_CLOSED); if (count == 0) return 0; uint remaining = count; uint start = 0; // Byte aligned cursor ? if ((_availBits & 7) == 0) { if (_availBits == 0) _availBits = pullCurrent(); // Empty _current while ((_availBits > 0) && (remaining >= 8)) { bits[start] = kanzi::byte(readBits(8)); start++; remaining -= 8; } prefetchRead(&_buffer[_position]); uint availBytes = uint(_maxPosition + 1 - _position); // Copy internal buffer to bits array while ((remaining >> 3) > availBytes) { memcpy(&bits[start], &_buffer[_position], availBytes); start += availBytes; remaining -= (availBytes << 3); _position = _maxPosition + 1; const int read = readFromInputStream(_bufferSize); availBytes = uint(_maxPosition + 1 - _position); if (read < int(_bufferSize)) break; } const uint r = min((remaining >> 6) << 3, availBytes); if (r > 0) { memcpy(&bits[start], &_buffer[_position], r); _position += r; start += r; remaining -= (r << 3); } } else if (remaining >= 64) { // Not kanzi::byte aligned const uint a = _availBits; const uint r = 64 - a; while (remaining >= 256) { const uint64 v0 = _current; if (_position + 32 > _maxPosition) 
{ _availBits = pullCurrent(); if (_availBits < r) throw BitStreamException("No more data to read in the bitstream", BitStreamException::END_OF_STREAM); _availBits -= r; BigEndian::writeLong64(&bits[start], (v0 << r) | (_current >> _availBits)); start += 8; remaining -= 64; continue; } const uint64 v1 = BigEndian::readLong64(&_buffer[_position + 0]); const uint64 v2 = BigEndian::readLong64(&_buffer[_position + 8]); const uint64 v3 = BigEndian::readLong64(&_buffer[_position + 16]); const uint64 v4 = BigEndian::readLong64(&_buffer[_position + 24]); _current = v4; _position += 32; BigEndian::writeLong64(&bits[start + 0], (v0 << r) | (v1 >> a)); BigEndian::writeLong64(&bits[start + 8], (v1 << r) | (v2 >> a)); BigEndian::writeLong64(&bits[start + 16], (v2 << r) | (v3 >> a)); BigEndian::writeLong64(&bits[start + 24], (v3 << r) | (v4 >> a)); start += 32; remaining -= 256; } while (remaining >= 64) { const uint64 v = _current; _availBits = pullCurrent(); if (_availBits < r) throw BitStreamException("No more data to read in the bitstream", BitStreamException::END_OF_STREAM); _availBits -= r; BigEndian::writeLong64(&bits[start], (v << r) | (_current >> _availBits)); start += 8; remaining -= 64; } } // Last bytes while (remaining >= 8) { bits[start] = kanzi::byte(readBits(8)); start++; remaining -= 8; } if (remaining > 0) bits[start] = kanzi::byte(readBits(remaining) << (8 - remaining)); return count; } void DefaultInputBitStream::_close() { if (isClosed() == true) return; _closed = true; // Reset fields to force a readFromInputStream() and trigger an exception // on readBit() or readBits() _read -= int64(_availBits); // can be negative _availBits = 0; _maxPosition = -1; } int DefaultInputBitStream::readFromInputStream(uint count) { if (isClosed() == true) throw BitStreamException("Stream closed", BitStreamException::STREAM_CLOSED); if (count == 0) return 0; int size = -1; try { _read += (int64(_position) << 3); _is.read(reinterpret_cast(_buffer), count); _position = 0; size = 
(_is.good() == true) ? int(count) : int(_is.gcount()); _maxPosition = (size <= 0) ? -1 : size - 1; // Clear flags (required for future seeks when EOF is reached) _is.clear(); } catch (const runtime_error& e) { // Catch IOException without depending on io package throw BitStreamException(e.what(), BitStreamException::INPUT_OUTPUT); } if (size <= 0) { _is.clear(); throw BitStreamException("No more data to read in the bitstream", BitStreamException::END_OF_STREAM); } return size; } // Return false when the bitstream is closed or the End-Of-Stream has been reached bool DefaultInputBitStream::hasMoreToRead() { if (isClosed() == true) return false; if ((_position <= _maxPosition) || (_availBits > 0)) return true; try { readFromInputStream(_bufferSize); } catch (const BitStreamException&) { return false; } return true; } kanzi-cpp-2.5.2/src/bitstream/DefaultInputBitStream.hpp000066400000000000000000000120351516423635400231370ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once #ifndef knz_DefaultInputBitStream #define knz_DefaultInputBitStream #include "../BitStreamException.hpp" #include "../InputBitStream.hpp" #include "../InputStream.hpp" #include "../Memory.hpp" #include "../Seekable.hpp" #include "../util/strings.hpp" namespace kanzi { #if defined(_MSC_VER) && _MSC_VER <= 1500 class DefaultInputBitStream FINAL : public InputBitStream #else class DefaultInputBitStream FINAL : public InputBitStream, public Seekable #endif { private: InputStream& _is; byte* _buffer; int _position; // index of current byte (consumed if bitIndex == -1) uint _availBits; // bits not consumed in _current int64 _read; uint64 _current; bool _closed; int _maxPosition; uint _bufferSize; int readFromInputStream(uint count); // return number of available bits uint pullCurrent(); void _close(); public: // Returns 1 or 0 int readBit(); uint64 readBits(uint length); uint readBits(byte bits[], uint count); void close() { _close(); } // Number of bits read uint64 read() const { return uint64(_read + (int64(_position) << 3) - int64(_availBits)); } // Return false when the bitstream is closed or the End-Of-Stream has been reached bool hasMoreToRead(); bool isClosed() const { return _closed; } #if !defined(_MSC_VER) || _MSC_VER > 1500 int64 tell(); bool seek(int64 pos); #endif DefaultInputBitStream(InputStream& is, uint bufferSize = 65536); ~DefaultInputBitStream(); }; // Returns 1 or 0 inline int DefaultInputBitStream::readBit() { if (_availBits == 0) _availBits = pullCurrent(); // Triggers an exception if stream is closed _availBits--; return int(_current >> _availBits) & 1; } inline uint64 DefaultInputBitStream::readBits(uint count) { if ((count == 0) || (count > 64)) throw BitStreamException("Invalid bit count: " + TOSTR(count) + " (must be in [1..64])"); if (count <= _availBits) { _availBits -= count; return (_current >> _availBits) & (uint64(-1) >> (64 - count)); } // Not enough spots available in 'current' count -= _availBits; uint64 res = _current 
& ((uint64(1) << _availBits) - 1); _availBits = pullCurrent(); if (_availBits < count) throw BitStreamException("No more data to read in the bitstream", BitStreamException::END_OF_STREAM); _availBits -= count; const uint64 tail = (_current >> _availBits) & (uint64(-1) >> (64 - count)); return (count == 64) ? tail : ((res << count) | tail); } // Pull 64 bits of current value from buffer. inline uint DefaultInputBitStream::pullCurrent() { if (_position + 7 > _maxPosition) { if (_position > _maxPosition) readFromInputStream(_bufferSize); if (_position + 7 > _maxPosition) { // End of stream: overshoot max position => adjust bit index uint shift = uint(_maxPosition - _position) * 8; _availBits = shift + 8; uint64 val = 0; while (_position <= _maxPosition) { val |= (uint64(_buffer[_position++]) << shift); shift -= 8; } _current = val; return _availBits; } } // Regular processing, buffer length is multiple of 8 _current = uint64(BigEndian::readLong64(&_buffer[_position])); _position += 8; return 64; } #if !defined(_MSC_VER) || _MSC_VER > 1500 inline int64 DefaultInputBitStream::tell() { if (isClosed()) return -1; _is.clear(); const int64 res = int64(_is.tellg()); return (res < 0) ? 
-1 : 8 * (res - int64(_maxPosition + 1 - _position)) - int64(_availBits); } inline bool DefaultInputBitStream::seek(int64 pos) { if (isClosed()) return false; if (pos < 0) return false; // Update internal states to force read at new stream position _read += (8 * int64(_position) - int64(_availBits)); _availBits = 0; _position = 0; _maxPosition = -1; _is.clear(); _is.seekg(std::streampos(pos >> 3)); if (_is.fail()) return false; if ((pos & 7) != 0) readBits(pos & 7); return true; } #endif } #endif kanzi-cpp-2.5.2/src/bitstream/DefaultOutputBitStream.cpp000066400000000000000000000143321516423635400233350ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
// Create a bitstream writing to 'os' through an internal buffer of
// 'bufferSize' bytes.
// Throws invalid_argument unless bufferSize is in [1024..1<<29] and is a
// multiple of 8.
DefaultOutputBitStream::DefaultOutputBitStream(OutputStream& os, uint bufferSize) : _os(os)
{
    if (bufferSize < 1024)
        throw invalid_argument("Invalid buffer size (must be at least 1024)");

    if (bufferSize > (uint(1) << 29))
        throw invalid_argument("Invalid buffer size (must be at most 536870912)");

    if ((bufferSize & 7) != 0)
        throw invalid_argument("Invalid buffer size (must be a multiple of 8)");

    // Zero-filled buffer
    _bufferSize = bufferSize;
    _buffer = new kanzi::byte[_bufferSize];
    memset(_buffer, 0, size_t(_bufferSize));

    // Nothing cached, nothing written yet
    _closed = false;
    _written = 0;
    _position = 0;
    _current = 0;
    _availBits = 64;
}
BigEndian::writeLong64(&_buffer[_position], _current); BigEndian::writeLong64(&_buffer[_position + 8], (v1 << a) | (v2 >> r)); BigEndian::writeLong64(&_buffer[_position + 16], (v2 << a) | (v3 >> r)); BigEndian::writeLong64(&_buffer[_position + 24], (v3 << a) | (v4 >> r)); _position += 32; _current = (v4 << a); start += 32; remaining -= 256; _availBits = 64; } while (remaining >= 64) { const uint64 v = uint64(BigEndian::readLong64(&bits[start])); _current |= (v >> r); pushCurrent(); _current = v << a; start += 8; remaining -= 64; } _availBits = a; } // Last bytes while (remaining >= 8) { writeBits(uint64(bits[start]), 8); start++; remaining -= 8; } if (remaining > 0) writeBits(uint64(bits[start]) >> (8 - remaining), remaining); return count; } void DefaultOutputBitStream::_close() { if (isClosed() == true) return; uint savedBitIndex = _availBits; uint savedPosition = _position; uint64 savedCurrent = _current; try { // Push last bytes (the very last kanzi::byte may be incomplete) uint shift = 56; while (_availBits < 64) { _buffer[_position++] = kanzi::byte(_current >> shift); shift -= 8; _availBits += 8; } _written -= int64(_availBits - 64); // can be negative _availBits = 64; flush(); } catch (const BitStreamException&) { // Revert fields to allow subsequent attempts in case of transient failure _position = savedPosition; _availBits = savedBitIndex; _current = savedCurrent; throw; // re-throw } try { _os.flush(); if (_os.bad()) throw BitStreamException("Write to bitstream failed.", BitStreamException::INPUT_OUTPUT); } catch (const ios_base::failure& e) { throw BitStreamException(e.what(), BitStreamException::INPUT_OUTPUT); } _closed = true; _position = 0; _availBits = 0; _written -= 64; // adjust because _availBits = 0 // Reset fields to force a flush() and trigger an exception // on writeBit() or writeBits() delete[] _buffer; _bufferSize = 8; _buffer = new kanzi::byte[_bufferSize]; memset(&_buffer[0], 0, size_t(_bufferSize)); } // Write buffer to underlying stream 
void DefaultOutputBitStream::flush() { if (isClosed() == true) throw BitStreamException("Stream closed", BitStreamException::STREAM_CLOSED); try { if (_position > 0) { _os.write(reinterpret_cast(_buffer), _position); if (_os.bad()) throw BitStreamException("Write to bitstream failed", BitStreamException::INPUT_OUTPUT); _written += (int64(_position) << 3); _position = 0; } } catch (const ios_base::failure& e) { throw BitStreamException(e.what(), BitStreamException::INPUT_OUTPUT); } } DefaultOutputBitStream::~DefaultOutputBitStream() { try { _close(); } catch (const exception&) { // Ignore and continue } delete[] _buffer; } kanzi-cpp-2.5.2/src/bitstream/DefaultOutputBitStream.hpp000066400000000000000000000111131516423635400233340ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once #ifndef knz_DefaultOutputBitStream #define knz_DefaultOutputBitStream #include "../BitStreamException.hpp" #include "../OutputStream.hpp" #include "../OutputBitStream.hpp" #include "../Memory.hpp" #include "../Seekable.hpp" #include "../util/strings.hpp" namespace kanzi { #if defined(_MSC_VER) && _MSC_VER <= 1500 class DefaultOutputBitStream FINAL : public OutputBitStream #else class DefaultOutputBitStream FINAL : public OutputBitStream, public Seekable #endif { private: OutputStream& _os; byte* _buffer; bool _closed; uint _bufferSize; uint _position; // index of current byte in buffer uint _availBits; // bits not consumed in _current int64 _written; uint64 _current; // cached bits void pushCurrent(); void flush(); void _close(); public: DefaultOutputBitStream(OutputStream& os, uint bufferSize=65536); ~DefaultOutputBitStream(); void writeBit(int bit); uint writeBits(uint64 bits, uint length); uint writeBits(const byte bits[], uint length); void close() { _close(); } #if !defined(_MSC_VER) || _MSC_VER > 1500 int64 tell(); bool seek(int64 pos); #endif // Return number of bits written so far uint64 written() const { // Number of bits flushed + bytes written in memory + bits written in memory return uint64(_written + (int64(_position) << 3) + int64(64 - _availBits)); } bool isClosed() const { return _closed; } }; // Write least significant bit of the input integer. Trigger exception if stream is closed inline void DefaultOutputBitStream::writeBit(int bit) { if (_availBits <= 1) { // _availBits = 0 if stream is closed => force pushCurrent() _current |= (uint64(bit) & 1); pushCurrent(); } else { _availBits--; _current |= (uint64(bit & 1) << _availBits); } } // Write 'count' (in [1..64]) bits. 
Trigger exception if stream is closed inline uint DefaultOutputBitStream::writeBits(uint64 value, uint count) { if ((count == 0) || (count > 64)) return 0; if (count < _availBits) { _availBits -= count; _current |= (value << _availBits); } else { // Not enough spots available in 'current' const uint remaining = count - _availBits; _current |= (_availBits == 0 ? 0 : (value >> remaining) & (~uint64(0) >> (64 - _availBits))); pushCurrent(); if (remaining != 0) { _availBits -= remaining; _current = value << _availBits; } } return count; } // Push 64 bits of current value into buffer. inline void DefaultOutputBitStream::pushCurrent() { BigEndian::writeLong64(&_buffer[_position], int64(_current)); _availBits = 64; _current = 0; _position += 8; if (_position >= _bufferSize - 8) flush(); } #if !defined(_MSC_VER) || _MSC_VER > 1500 inline int64 DefaultOutputBitStream::tell() { if (isClosed() == true) return -1; _os.clear(); const int64 res = int64(_os.tellp()); return (res < 0) ? -1 : 8 * res + (int64(_position) << 3) + int64(64 - _availBits); } // Only support a new position at the byte boundary (pos & 7 == 0) inline bool DefaultOutputBitStream::seek(int64 pos) { if (isClosed() == true) return false; if ((pos < 0) || ((pos & 7) != 0)) return false; // Flush buffer // Round down to byte alignment const uint a = _availBits - (_availBits & 7); for (int i = 56; i >= int(a); i -= 8) { _buffer[_position++] = byte(_current >> uint(i)); if (_position >= _bufferSize) flush(); } _availBits = 64; flush(); _os.clear(); _os.seekp(std::streampos(pos >> 3)); return _os.fail() == false; } #endif } #endif kanzi-cpp-2.5.2/src/concurrent.hpp000066400000000000000000000164661516423635400171240ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
// Interface of a unit of work producing a result of type T.
// Implementations provide run(), which returns the result.
template <class T>
class Task {
public:
    Task() {}

    // Virtual: implementations may be destroyed through a Task pointer
    virtual ~Task() {}

    virtual T run() = 0;
};
args); #endif ~ThreadPool() noexcept; private: std::vector _workers; std::queue> _tasks; std::mutex _mutex; std::condition_variable _condition; bool _stop; }; inline ThreadPool::ThreadPool(int threads) : _stop(false) { if ((threads <= 0) || (threads > 1024)) throw std::invalid_argument("The number of threads must be in [1..1024]"); // Start and run threads for (int i = 0; i < threads; i++) _workers.emplace_back( [this] { for(;;) { std::function task; { std::unique_lock lock(_mutex); _condition.wait(lock, [this] { return _stop || !_tasks.empty(); }); if (_stop && _tasks.empty()) return; task = std::move(_tasks.front()); _tasks.pop(); } task(); } } ); } template #if __cplusplus >= 201703L // result_of deprecated from C++17 std::future > ThreadPool::schedule(F&& f, Args&&... args) { using return_type = typename std::invoke_result::type; #else std::future::type> ThreadPool::schedule(F&& f, Args&&... args) { using return_type = typename std::result_of::type; #endif #if __cplusplus >= 201703L auto task = std::make_shared>( [fn = std::forward(f), params = std::make_tuple(std::forward(args)...)]() mutable -> return_type { return std::apply(std::move(fn), std::move(params)); } ); #else auto task = std::make_shared>( std::bind(std::forward(f), std::forward(args)...) ); #endif std::future res = task->get_future(); { std::unique_lock lock(_mutex); if (_stop == true) throw std::runtime_error("ThreadPool stopped"); _tasks.emplace([task](){ (*task)(); }); } _condition.notify_one(); return res; } // the destructor joins all threads inline ThreadPool::~ThreadPool() noexcept { { std::unique_lock lock(_mutex); _stop = true; } _condition.notify_all(); for (std::thread& w : _workers) w.join(); } template class BoundedConcurrentQueue { public: BoundedConcurrentQueue(int nbItems, T* data) : _index(0), _size(nbItems), _data(data) {} ~BoundedConcurrentQueue() { } T* get() { int idx = _index.fetch_add(1, std::memory_order_acq_rel); return (idx >= _size) ? 
// Fallbacks used when std::atomic is not available: same contracts as the
// atomic operations but implemented with plain (non atomic) int accesses.

// Store 'v' into 'a' and return the value previously held by 'a'.
inline int exchange_atomic_int(int& a, int v)
{
    const int previous = a;
    a = v;
    return previous;
}

// Add 'v' to 'a' and return the value held by 'a' before the addition.
inline int fetch_add_atomic_int(int& a, int v)
{
    const int previous = a;
    a = previous + v;
    return previous;
}

// If 'obj' equals 'expected', store 'desired' into 'obj' and return true.
// Otherwise copy the current value of 'obj' into 'expected' and return false.
inline bool compare_exchange_fallback(int& obj, int& expected, int desired)
{
    if (obj != expected) {
        expected = obj; // update expected on failure
        return false;
    }

    obj = desired;
    return true;
}
you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include #include #include "../BitStreamException.hpp" #include "../Global.hpp" #include "ANSRangeDecoder.hpp" #include "EntropyUtils.hpp" using namespace kanzi; using namespace std; const uint ANSRangeDecoder::ANS_TOP = 1 << 15; // max possible for ANS_TOP=1<<23 const int ANSRangeDecoder::DEFAULT_ANS0_CHUNK_SIZE = 16384; const int ANSRangeDecoder::DEFAULT_LOG_RANGE = 12; const int ANSRangeDecoder::MIN_CHUNK_SIZE = 1024; const int ANSRangeDecoder::MAX_CHUNK_SIZE = 1 << 27; // 8*MAX_CHUNK_SIZE must not overflow // The chunk size indicates how many bytes are encoded (per block) before // resetting the frequency stats.
// Create an ANS decoder reading from 'bitstream'.
// order     : 0 or 1
// chunkSize : in [MIN_CHUNK_SIZE..MAX_CHUNK_SIZE]; with order 1 the value is
//             multiplied by 256 and capped to MAX_CHUNK_SIZE
// Throws invalid_argument if 'order' or 'chunkSize' is out of range.
ANSRangeDecoder::ANSRangeDecoder(InputBitStream& bitstream, int order, int chunkSize)
    : _bitstream(bitstream)
{
    if ((order != 0) && (order != 1))
        throw invalid_argument("ANS Codec: The order must be 0 or 1");

    if (chunkSize < MIN_CHUNK_SIZE) {
        stringstream ss;
        ss << "ANS Codec: The chunk size must be at least " << MIN_CHUNK_SIZE;
        throw invalid_argument(ss.str());
    }

    if (chunkSize > MAX_CHUNK_SIZE) {
        stringstream ss;
        ss << "ANS Codec: The chunk size must be at most " << MAX_CHUNK_SIZE;
        throw invalid_argument(ss.str());
    }

    // Scale in 64 bits: with order 1, 'chunkSize << 8' does not fit in an int
    // when chunkSize >= 1 << 23 (a wrapped value of 0 would stall decode()).
    const int64 scaledChunkSize = int64(chunkSize) << (8 * order);
    _chunkSize = uint(min(scaledChunkSize, int64(MAX_CHUNK_SIZE)));
    _order = order;

    // One table of 256 entries per context: 1 context for order 0, 256 for order 1
    const int dim = 255 * order + 1;
    _freqs = new uint[dim * 256];
    _symbols = new ANSDecSymbol[dim * 256];

    // Allocated on demand
    _buffer = nullptr;
    _bufferSize = 0;
    _f2s = nullptr;
    _f2sSize = 0;
    _logRange = DEFAULT_LOG_RANGE;
}
8 : 6; uint sum = 0; // Decode all frequencies (but the first one) by chunks for (int i = 1; i < alphabetSize; i += chkSize) { // Read frequencies size for current chunk const uint logMax = uint(_bitstream.readBits(llr)); if (logMax > _logRange) { stringstream ss; ss << "Invalid bitstream: incorrect frequency size "; ss << logMax << " in ANS range decoder"; throw BitStreamException(ss.str(), BitStreamException::INVALID_STREAM); } const int endj = min(i + chkSize, alphabetSize); // Read frequencies for (int j = i; j < endj; j++) { const uint freq = (logMax == 0) ? 1 : uint(_bitstream.readBits(logMax) + 1); if (freq >= scale) { stringstream ss; ss << "Invalid bitstream: incorrect frequency " << freq; ss << " for symbol '" << alphabet[j] << "' in ANS range decoder"; throw BitStreamException(ss.str(), BitStreamException::INVALID_STREAM); } f[alphabet[j]] = freq; sum += freq; } } // Infer first frequency if (scale <= sum) { stringstream ss; ss << "Invalid bitstream: incorrect frequency " << frequencies[alphabet[0]]; ss << " for symbol '" << alphabet[0] << "' in ANS range decoder"; throw BitStreamException(ss.str(), BitStreamException::INVALID_STREAM); } f[alphabet[0]] = uint(scale - sum); sum = 0; ANSDecSymbol* symb = &_symbols[k << 8]; uint8* freq2sym = &_f2s[k << _logRange]; // Create reverse mapping for (int i = 0; i < 256; i++) { if (f[i] == 0) continue; memset(&freq2sym[sum], i, size_t(f[i])); symb[i].reset(sum, f[i], _logRange); sum += f[i]; } res += alphabetSize; } return res; } int ANSRangeDecoder::decode(kanzi::byte block[], uint blkptr, uint count) { if (count <= 32) { _bitstream.readBits(&block[blkptr], 8 * count); return count; } const uint minBufSize = 2 * uint(_chunkSize); if (_bufferSize < minBufSize) { if (_buffer != nullptr) delete[] _buffer; _bufferSize = minBufSize; _buffer = new kanzi::byte[_bufferSize]; } const uint end = blkptr + count; uint startChunk = blkptr; uint alphabet[256]; while (startChunk < end) { const uint sizeChunk = 
min(uint(_chunkSize), end - startChunk); const int alphabetSize = decodeHeader(_freqs, alphabet); if (alphabetSize == 0) return startChunk - blkptr; if ((_order == 0) && (alphabetSize == 1)) { // Shortcut for chunks with only one symbol memset(&block[startChunk], alphabet[0], size_t(sizeChunk)); } else { if (decodeChunk(&block[startChunk], sizeChunk) == false) return -1; } startChunk += sizeChunk; } return count; } bool ANSRangeDecoder::decodeChunk(kanzi::byte block[], uint count) { // Read chunk size const uint sz = uint(EntropyUtils::readVarInt(_bitstream)); if ((sz >= MAX_CHUNK_SIZE) || (sz > _bufferSize - 2)) return false; // Read initial ANS states uint st0 = uint(_bitstream.readBits(32)); uint st1 = uint(_bitstream.readBits(32)); uint st2 = uint(_bitstream.readBits(32)); uint st3 = uint(_bitstream.readBits(32)); if (count == 0) return true; // Read encoded data from bitstream memset(_buffer, 0, _bufferSize); _bitstream.readBits(&_buffer[0], 8 * sz); kanzi::byte* p = &_buffer[0]; const int mask = (1 << _logRange) - 1; const int count4 = count & -4; if (_order == 0) { for (int i = 0; i < count4; i += 4) { const uint8 cur3 = _f2s[st3 & mask]; block[i] = kanzi::byte(cur3); st3 = decodeSymbol(p, st3, _symbols[cur3], mask); const uint8 cur2 = _f2s[st2 & mask]; block[i + 1] = kanzi::byte(cur2); st2 = decodeSymbol(p, st2, _symbols[cur2], mask); const uint8 cur1 = _f2s[st1 & mask]; block[i + 2] = kanzi::byte(cur1); st1 = decodeSymbol(p, st1, _symbols[cur1], mask); const uint8 cur0 = _f2s[st0 & mask]; block[i + 3] = kanzi::byte(cur0); st0 = decodeSymbol(p, st0, _symbols[cur0], mask); } } else { const int quarter = count4 >> 2; int i0 = 0; int i1 = 1 * quarter; int i2 = 2 * quarter; int i3 = 3 * quarter; int prv0 = 0, prv1 = 0, prv2 = 0, prv3 = 0; for ( ; i0 < quarter; i0++, i1++, i2++, i3++) { const uint8 cur3 = _f2s[(prv3 << _logRange) + (st3 & mask)]; const uint8 cur2 = _f2s[(prv2 << _logRange) + (st2 & mask)]; const uint8 cur1 = _f2s[(prv1 << _logRange) + (st1 & 
mask)]; const uint8 cur0 = _f2s[(prv0 << _logRange) + (st0 & mask)]; st3 = decodeSymbol(p, st3, _symbols[(prv3 << 8) | cur3], mask); st2 = decodeSymbol(p, st2, _symbols[(prv2 << 8) | cur2], mask); st1 = decodeSymbol(p, st1, _symbols[(prv1 << 8) | cur1], mask); st0 = decodeSymbol(p, st0, _symbols[(prv0 << 8) | cur0], mask); block[i3] = kanzi::byte(cur3); block[i2] = kanzi::byte(cur2); block[i1] = kanzi::byte(cur1); block[i0] = kanzi::byte(cur0); prv3 = cur3; prv2 = cur2; prv1 = cur1; prv0 = cur0; } } for (uint i = count4; i < count; i++) block[i] = *p++; return true; } kanzi-cpp-2.5.2/src/entropy/ANSRangeDecoder.hpp000066400000000000000000000056521516423635400213210ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #pragma once #ifndef knz_ANSRangeDecoder #define knz_ANSRangeDecoder #include "../EntropyDecoder.hpp" #include "../types.hpp" // Implementation of an Asymmetric Numeral System decoder. 
// Decoding step for one symbol: update the ANS state, then refill it with 16
// bits of encoded data if it dropped below ANS_TOP.
// p    : read cursor in the encoded data, advanced by 2 bytes upon refill
// st   : ANS state, updated in place
// sym  : frequency and cumulated frequency of the symbol
// mask : applied to the state to extract its low bits (decodeChunk passes
//        (1 << _logRange) - 1)
// Returns the new state (the value also stored in 'st').
inline uint ANSRangeDecoder::decodeSymbol(byte*& p, uint& st, const ANSDecSymbol& sym, const int mask) const
{
    // Compute next ANS state
    // D(x) = (s, q_s (x/M) + mod(x,M) - b_s) where s is such b_s <= x mod M < b_{s+1}
    st = uint(sym._freq) * (st >> _logRange) + (st & mask) - uint(sym._cumFreq);

    // Normalize
    // Branchless: x is -1 (all bits set) when a refill is needed, 0 otherwise
    const int x = (st < ANS_TOP) ? -1 : 0;

    // Refill: shift the state by 16 bits and append p[0] (high) and p[1] (low).
    // No refill: both the shift count and the appended value are masked to 0.
    // p[0] and p[1] are read in both cases.
    st = (st << (x & 16)) | (x & ((uint(p[0]) << 8) | uint(p[1])));

    // x + x is -2 or 0: move the cursor forward by 2 bytes upon refill only
    p -= (x + x);
    return st;
}
// Create an ANS encoder writing to 'bitstream'.
// order     : 0 or 1
// chunkSize : in [MIN_CHUNK_SIZE..MAX_CHUNK_SIZE]; with order 1 the value is
//             multiplied by 256 and capped to MAX_CHUNK_SIZE
// logRange  : in [8..16]; with order 1 the value is decremented (minimum 8)
// Throws invalid_argument if a parameter is out of range.
ANSRangeEncoder::ANSRangeEncoder(OutputBitStream& bitstream, int order, int chunkSize, int logRange)
    : _bitstream(bitstream)
{
    if ((order != 0) && (order != 1))
        throw invalid_argument("ANS Codec: The order must be 0 or 1");

    if (chunkSize < MIN_CHUNK_SIZE) {
        stringstream ss;
        ss << "ANS Codec: The chunk size must be at least " << MIN_CHUNK_SIZE;
        throw invalid_argument(ss.str());
    }

    if (chunkSize > MAX_CHUNK_SIZE) {
        stringstream ss;
        ss << "ANS Codec: The chunk size must be at most " << MAX_CHUNK_SIZE;
        throw invalid_argument(ss.str());
    }

    if ((logRange < 8) || (logRange > 16)) {
        stringstream ss;
        ss << "ANS Codec: Invalid range: " << logRange << " (must be in [8..16])";
        throw invalid_argument(ss.str());
    }

    // Scale in 64 bits: with order 1, 'chunkSize << 8' does not fit in an int
    // when chunkSize >= 1 << 23 (a wrapped value of 0 would stall encode()).
    const int64 scaledChunkSize = int64(chunkSize) << (8 * order);
    _chunkSize = uint(min(scaledChunkSize, int64(MAX_CHUNK_SIZE)));
    _order = order;

    // One table per context: 1 context for order 0, 256 for order 1
    const int dim = 255 * order + 1;
    _symbols = new ANSEncSymbol[dim * 256];
    _freqs = new uint[dim * 257]; // freqs[x][256] = total(freqs[x][0..255])

    // Allocated on demand by encode()
    _buffer = nullptr;
    _bufferSize = 0;
    _logRange = (order == 0) ? logRange : max(logRange - 1, 8);
}
uint frequencies[], uint lr) const { const int encoded = EntropyUtils::encodeAlphabet(_bitstream, alphabet, 256, alphabetSize); if (encoded < 0) return false; if (encoded <= 1) return true; const int chkSize = (alphabetSize >= 64) ? 8 : 6; const int llr = Global::_log2(lr) + 1; // Encode all frequencies (but the first one) by chunks for (int i = 1; i < alphabetSize; i += chkSize) { uint max = frequencies[alphabet[i]] - 1; const int endj = min(i + chkSize, alphabetSize); // Search for max frequency log size in next chunk for (int j = i + 1; j < endj; j++) { if (frequencies[alphabet[j]] - 1 > max) max = frequencies[alphabet[j]] - 1; } const uint logMax = (max == 0) ? 0 : Global::_log2(max) + 1; _bitstream.writeBits(logMax, llr); if (logMax == 0) // all frequencies equal one in this chunk continue; // Write frequencies for (int j = i; j < endj; j++) _bitstream.writeBits(frequencies[alphabet[j]] - 1, logMax); } return true; } // Dynamically compute the frequencies for every chunk of data in the block int ANSRangeEncoder::encode(const kanzi::byte block[], uint blkptr, uint count) { if (count <= 32) { _bitstream.writeBits(&block[blkptr], 8 * count); return count; } const uint end = blkptr + count; uint startChunk = blkptr; uint sz = uint(_chunkSize); const uint size = max(min(sz + (sz >> 3), 2 * count), uint(65536)); if (_bufferSize < size) { if (_buffer != nullptr) delete[] _buffer; _bufferSize = size; _buffer = new kanzi::byte[_bufferSize]; } while (startChunk < end) { const uint sizeChunk = min(sz, end - startChunk); const int alphabetSize = rebuildStatistics(&block[startChunk], sizeChunk, _logRange); // Skip chunk if only one symbol if ((alphabetSize <= 1) && (_order == 0)) { startChunk += sizeChunk; continue; } encodeChunk(&block[startChunk], sizeChunk); startChunk += sizeChunk; } return count; } void ANSRangeEncoder::encodeChunk(const kanzi::byte block[], int end) { int st0 = ANS_TOP; int st1 = ANS_TOP; int st2 = ANS_TOP; int st3 = ANS_TOP; kanzi::byte* p = 
&_buffer[_bufferSize - 1]; const kanzi::byte* p0 = p; const int end4 = end & -4; for (int i = end - 1; i >= end4; i--) *p-- = block[i]; if (_order == 0) { for (int i = end4 - 1; i > 0; i -= 4) { st0 = encodeSymbol(p, st0, _symbols[int(block[i])]); st1 = encodeSymbol(p, st1, _symbols[int(block[i - 1])]); st2 = encodeSymbol(p, st2, _symbols[int(block[i - 2])]); st3 = encodeSymbol(p, st3, _symbols[int(block[i - 3])]); } } else { // order 1 const int quarter = end4 >> 2; int i0 = 1 * quarter - 2; int i1 = 2 * quarter - 2; int i2 = 3 * quarter - 2; int i3 = end4 - 2; int prv0 = int(block[i0 + 1]); int prv1 = int(block[i1 + 1]); int prv2 = int(block[i2 + 1]); int prv3 = int(block[i3 + 1]); for ( ; i0 >= 0; i0--, i1--, i2--, i3--) { const int cur0 = int(block[i0]); st0 = encodeSymbol(p, st0, _symbols[(cur0 << 8) | prv0]); const int cur1 = int(block[i1]); st1 = encodeSymbol(p, st1, _symbols[(cur1 << 8) | prv1]); const int cur2 = int(block[i2]); st2 = encodeSymbol(p, st2, _symbols[(cur2 << 8) | prv2]); const int cur3 = int(block[i3]); st3 = encodeSymbol(p, st3, _symbols[(cur3 << 8) | prv3]); prv0 = cur0; prv1 = cur1; prv2 = cur2; prv3 = cur3; } // Last symbols st0 = encodeSymbol(p, st0, _symbols[prv0]); st1 = encodeSymbol(p, st1, _symbols[prv1]); st2 = encodeSymbol(p, st2, _symbols[prv2]); st3 = encodeSymbol(p, st3, _symbols[prv3]); } // Write chunk size EntropyUtils::writeVarInt(_bitstream, uint32(p0 - p)); // Write final ANS states _bitstream.writeBits(st0, 32); _bitstream.writeBits(st1, 32); _bitstream.writeBits(st2, 32); _bitstream.writeBits(st3, 32); if (p != p0) { // Write encoded data to bitstream _bitstream.writeBits(&p[1], 8 * uint(p0 - p)); } } // Compute chunk frequencies, cumulated frequencies and encode chunk header int ANSRangeEncoder::rebuildStatistics(const kanzi::byte block[], int end, uint lr) { const int dim = 255 * _order + 1; memset(_freqs, 0, size_t(257 * dim) * sizeof(uint)); if (_order == 0){ Global::computeHistogram(block, end, _freqs, true, true); 
} else { const int quarter = end >> 2; if (quarter == 0) { Global::computeHistogram(block, end, _freqs, false, true); } else { Global::computeHistogram(&block[0 * quarter], quarter, _freqs, false, true); Global::computeHistogram(&block[1 * quarter], quarter, _freqs, false, true); Global::computeHistogram(&block[2 * quarter], quarter, _freqs, false, true); Global::computeHistogram(&block[3 * quarter], quarter, _freqs, false, true); } } return updateFrequencies(_freqs, lr); } kanzi-cpp-2.5.2/src/entropy/ANSRangeEncoder.hpp000066400000000000000000000076541516423635400213370ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #pragma once #ifndef knz_ANSRangeEncoder #define knz_ANSRangeEncoder #include "../EntropyEncoder.hpp" // Implementation of an Asymmetric Numeral System encoder. 
namespace kanzi
{
    // Per-symbol constants precomputed by reset() so that encodeSymbol() can
    // advance the ANS state without a division.
    struct ANSEncSymbol
    {
        // Recompute every field from the cumulated frequency, the frequency
        // and the log2 of the frequency range.
        void reset(int cumFreq, int freq, uint logRange);

        int _xMax; // (Exclusive) upper bound of pre-normalization interval
        int _bias; // Bias
        int _cmplFreq; // Complement of frequency: (1 << scale_bits) - freq
        int _invShift; // Reciprocal shift
        uint64 _invFreq; // Fixed-point reciprocal frequency
    };

    // Asymmetric Numeral System entropy encoder writing to an OutputBitStream.
    class ANSRangeEncoder : public EntropyEncoder
    {
    public:
        static const int ANS_TOP;

        // order, chunkSize and logRange fall back to the class defaults when omitted.
        ANSRangeEncoder(OutputBitStream& bitstream,
            int order = 0,
            int chunkSize = DEFAULT_ANS0_CHUNK_SIZE,
            int logRange = DEFAULT_LOG_RANGE);

        ~ANSRangeEncoder();

        int updateFrequencies(uint frequencies[], uint lr);

        int encode(const byte block[], uint blkptr, uint len);

        OutputBitStream& getBitStream() const { return _bitstream; }

        void dispose() { _dispose(); }

    private:
        static const int DEFAULT_ANS0_CHUNK_SIZE;
        static const int DEFAULT_LOG_RANGE;
        static const int MIN_CHUNK_SIZE;
        static const int MAX_CHUNK_SIZE;

        ANSEncSymbol* _symbols; // per-symbol encoding constants
        uint* _freqs;           // frequency table
        byte* _buffer;          // output staging buffer (encodeSymbol writes into it backwards)
        uint _bufferSize;
        OutputBitStream& _bitstream;
        uint _chunkSize;
        uint _logRange;
        uint _order;

        int rebuildStatistics(const byte block[], int end, uint lr);

        void encodeChunk(const byte block[], int end);

        // Encode one symbol: may emit 16 bits of 'st' through 'p' (which moves
        // backwards), updates 'st' in place and returns the next state.
        int encodeSymbol(byte*& p, int& st, const ANSEncSymbol& sym) const;

        bool encodeHeader(int alphabetSize, const uint alphabet[], const uint frequencies[], uint lr) const;

        // Nothing to release here.
        void _dispose() const {}
    };

    // Precompute the encoding constants of one symbol.
    //
    // cumFreq  : cumulated frequency of the symbols preceding this one
    // freq     : frequency of this symbol (clamped below 1 << logRange)
    // logRange : log2 of the sum of all frequencies
    inline void ANSEncSymbol::reset(int cumFreq, int freq, uint logRange)
    {
        // Make sure xMax is a positive int32. Compatibility with Java implementation
        if (freq >= 1 << logRange)
            freq = (1 << logRange) - 1;

        _xMax = ((ANSRangeEncoder::ANS_TOP >> logRange) << 16) * freq;
        _cmplFreq = (1 << logRange) - freq;

        if (freq < 2) {
            // freq 0 or 1: fixed reciprocal, the bias absorbs the rounding
            // (with these values (st * _invFreq) >> 32 equals st - 1 for st >= 1).
            _invFreq = uint64(0xFFFFFFFF);
            _invShift = 32;
            _bias = cumFreq + (1 << logRange) - 1;
        }
        else {
            // shift = smallest value such that freq <= (1 << shift)
            int shift = 0;

            while (freq > (1 << shift))
                shift++;

            // Alverson, "Integer Division using reciprocals"
            _invFreq = (((uint64(1) << (shift + 31)) + freq - 1) / freq) & uint64(0xFFFFFFFF);
            _invShift = 32 + shift - 1;
            _bias = cumFreq;
        }
    }

    // Push one symbol into the ANS state.
    //
    // p   : write cursor, moves towards lower addresses
    // st  : current state, renormalized in place (shifted right by 16 when
    //       it has reached sym._xMax)
    // sym : constants prepared by ANSEncSymbol::reset()
    // Returns the next state; the caller stores it back.
    inline int ANSRangeEncoder::encodeSymbol(byte*& p, int& st, const ANSEncSymbol& sym) const
    {
        // x is 1 when the state must be renormalized, 0 otherwise.
        const int x = (st >= sym._xMax) ? 1 : 0;

        // Branch-free output: both low bytes are always stored, but the cursor
        // only moves when x == 1, so with x == 0 the stores land on the same
        // byte and are overwritten by a later call.
        *p = byte(st);
        p -= x;
        *p = byte(st >> 8);
        p -= x;

        // (-x & 16) is 16 when x == 1 and 0 when x == 0.
        st >>= (-x & 16);

        // Compute next ANS state
        // C(s,x) = M floor(x/q_s) + mod(x,q_s) + b_s where b_s = q_0 + ... + q_{s-1}
        // st = ((st / freq) << lr) + (st % freq) + cumFreq;
        return st + sym._bias + int((st * sym._invFreq) >> sym._invShift) * sym._cmplFreq;
    }
}
namespace kanzi { template class LinearAdaptiveProbMap { public: LinearAdaptiveProbMap(int n); ~LinearAdaptiveProbMap() { delete[] _data; } int get(int bit, int pr, int ctx); private: int _index; // last p, context uint16* _data; // [NbCtx][33]: p, context -> p }; template inline LinearAdaptiveProbMap::LinearAdaptiveProbMap(int n) { const int size = (n == 0) ? 65 : n * 65; _data = new uint16[size]; _index = 0; for (int j = 0; j <= 64; j++) { _data[j] = uint16(j << 6) << 4; } for (int i = 1; i < n; i++) { memcpy(&_data[i * 65], &_data[0], 65 * sizeof(uint16)); } } // Return improved prediction given current bit, prediction and context template inline int LinearAdaptiveProbMap::get(int bit, int pr, int ctx) { // Update probability based on error and learning rate const int g = -bit & 65528; _data[_index] += (((g - int(_data[_index])) >> RATE) + bit); _data[_index + 1] += (((g - int(_data[_index + 1])) >> RATE) + bit); // Find index: 65*ctx + quantized prediction in [0..64] _index = (pr >> 6) + 65 * ctx; // Return interpolated probabibility const uint16 w = uint16(pr & 127); return int(_data[_index] * (128 - w) + _data[_index + 1] * w) >> 11; } template class LogisticAdaptiveProbMap { public: LogisticAdaptiveProbMap(int n); ~LogisticAdaptiveProbMap() { delete[] _data; } int get(int bit, int pr, int ctx); private: int _index; // last p, context uint16* _data; // [NbCtx][33]: p, context -> p }; template inline LogisticAdaptiveProbMap::LogisticAdaptiveProbMap(int n) { const int mult = (FAST == false) ? 
33 : 32; _index = 0; if (n == 0) { _data = new uint16[mult]; } else { _data = new uint16[n * mult]; for (int j = 0; j < mult; j++) _data[j] = uint16(Global::squash((j - 16) * 128) << 4); for (int i = 1; i < n; i++) memcpy(&_data[i * mult], &_data[0], mult * sizeof(uint16)); } } // Return improved prediction given current bit, prediction and context template inline int LogisticAdaptiveProbMap::get(int bit, int pr, int ctx) { // Update probability based on error and learning rate const int g = -bit & 65528; _data[_index] += (((g - int(_data[_index])) >> RATE) + bit); if (FAST == false) { _data[_index + 1] += (((g - int(_data[_index + 1])) >> RATE) + bit); pr = Global::stretch(pr); _index = ((pr + 2048) >> 7) + 33 * ctx; // Return interpolated probabibility const uint16 w = uint16(pr & 127); return int(_data[_index] * (128 - w) + _data[_index + 1] * w) >> 11; } else { _index = ((Global::stretch(pr) + 2048) >> 7) + 32 * ctx; return int(_data[_index]) >> 4; } } } #endif kanzi-cpp-2.5.2/src/entropy/BinaryEntropyDecoder.cpp000066400000000000000000000103111516423635400225070ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #include #include #include "BinaryEntropyDecoder.hpp" #include "../Memory.hpp" #include "EntropyUtils.hpp" using namespace kanzi; using namespace std; const uint64 BinaryEntropyDecoder::TOP = 0x00FFFFFFFFFFFFFF; const uint64 BinaryEntropyDecoder::MASK_0_56 = 0x00FFFFFFFFFFFFFF; const uint64 BinaryEntropyDecoder::MASK_0_32 = 0x00000000FFFFFFFF; const int BinaryEntropyDecoder::MAX_BLOCK_SIZE = 1 << 30; const int BinaryEntropyDecoder::MAX_CHUNK_SIZE = 1 << 26; BinaryEntropyDecoder::BinaryEntropyDecoder(InputBitStream& bitstream, Predictor* predictor, bool deallocate) : _predictor(predictor) , _bitstream(bitstream) , _deallocate(deallocate) , _sba(nullptr, 0) { if (predictor == nullptr) throw invalid_argument("Invalid null predictor parameter"); _low = 0; _high = TOP; _current = 0; } BinaryEntropyDecoder::~BinaryEntropyDecoder() { _dispose(); if (_sba._array != nullptr) delete[] _sba._array; if (_deallocate) delete _predictor; } int BinaryEntropyDecoder::decode(kanzi::byte block[], uint blkptr, uint count) { if (count >= MAX_BLOCK_SIZE) throw invalid_argument("Invalid block size parameter (max is 1<<30)"); uint startChunk = blkptr; const uint end = blkptr + count; uint length = max(count, 64u); if (length >= MAX_CHUNK_SIZE) { // If the block is big (>=64MB), split the decoding to avoid allocating // too much memory. length = (length / 8 < MAX_CHUNK_SIZE) ? 
count >> 3 : count >> 4; } const uint bufSize = length + (length >> 3); if (_sba._length < int(bufSize)) { if (_sba._array != nullptr) delete[] _sba._array; _sba._length = int(bufSize); _sba._array = new kanzi::byte[_sba._length]; } // Split block into chunks, read bit array from bitstream and decode chunk while (startChunk < end) { const uint chunkSize = min(length, end - startChunk); const uint szBytes = uint(EntropyUtils::readVarInt(_bitstream)); if (szBytes > bufSize) return 0; _current = _bitstream.readBits(56); if (szBytes != 0) _bitstream.readBits(&_sba._array[0], 8 * szBytes); _sba._index = 0; const uint endChunk = startChunk + chunkSize; for (uint i = startChunk; i < endChunk; i++) { block[i] = kanzi::byte((decodeBit(_predictor->get()) << 7) | (decodeBit(_predictor->get()) << 6) | (decodeBit(_predictor->get()) << 5) | (decodeBit(_predictor->get()) << 4) | (decodeBit(_predictor->get()) << 3) | (decodeBit(_predictor->get()) << 2) | (decodeBit(_predictor->get()) << 1) | decodeBit(_predictor->get())); } startChunk = endChunk; } return count; } // no inline void BinaryEntropyDecoder::read() { _low = (_low << 32) & MASK_0_56; _high = ((_high << 32) | MASK_0_32) & MASK_0_56; const uint64 val = BigEndian::readInt32(&_sba._array[_sba._index]) & MASK_0_32; _current = ((_current << 32) | val) & MASK_0_56; _sba._index += 4; } // no inline kanzi::byte BinaryEntropyDecoder::decodeByte() { return kanzi::byte((decodeBit(_predictor->get()) << 7) | (decodeBit(_predictor->get()) << 6) | (decodeBit(_predictor->get()) << 5) | (decodeBit(_predictor->get()) << 4) | (decodeBit(_predictor->get()) << 3) | (decodeBit(_predictor->get()) << 2) | (decodeBit(_predictor->get()) << 1) | decodeBit(_predictor->get())); } kanzi-cpp-2.5.2/src/entropy/BinaryEntropyDecoder.hpp000066400000000000000000000044131516423635400225220ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in 
compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #pragma once #ifndef knz_BinaryEntropyDecoder #define knz_BinaryEntropyDecoder #include "../EntropyDecoder.hpp" #include "../Predictor.hpp" #include "../SliceArray.hpp" namespace kanzi { // This class is a generic implementation of a bool entropy decoder class BinaryEntropyDecoder FINAL : public EntropyDecoder { private: static const uint64 TOP; static const uint64 MASK_0_56; static const uint64 MASK_0_32; static const int MAX_BLOCK_SIZE; static const int MAX_CHUNK_SIZE; Predictor* _predictor; uint64 _low; uint64 _high; uint64 _current; InputBitStream& _bitstream; bool _deallocate; SliceArray _sba; void read(); void _dispose() const {} public: BinaryEntropyDecoder(InputBitStream& bitstream, Predictor* predictor, bool deallocate=true); ~BinaryEntropyDecoder(); int decode(byte block[], uint blkptr, uint count); InputBitStream& getBitStream() const { return _bitstream; } void dispose() { _dispose(); } byte decodeByte(); int decodeBit(int pred = 2048); }; inline int BinaryEntropyDecoder::decodeBit(int pred) { // Calculate interval split const uint64 split = ((((_high - _low) >> 4) * uint64(pred)) >> 8) + _low; int bit; // Update predictor if (split >= _current) { bit = 1; _high = split; _predictor->update(1); } else { bit = 0; _low = split + 1; _predictor->update(0); } // Read 32 bits from bitstream if (((_low ^ _high) >> 24) == 0) read(); return bit; } } #endif kanzi-cpp-2.5.2/src/entropy/BinaryEntropyEncoder.cpp000066400000000000000000000104111516423635400225220ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache 
License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include #include #include "BinaryEntropyEncoder.hpp" #include "../Memory.hpp" #include "EntropyUtils.hpp" using namespace kanzi; using namespace std; const uint64 BinaryEntropyEncoder::TOP = 0x00FFFFFFFFFFFFFF; const uint64 BinaryEntropyEncoder::MASK_0_24 = 0x0000000000FFFFFF; const uint64 BinaryEntropyEncoder::MASK_0_32 = 0x00000000FFFFFFFF; const int BinaryEntropyEncoder::MAX_BLOCK_SIZE = 1 << 30; const int BinaryEntropyEncoder::MAX_CHUNK_SIZE = 1 << 26; BinaryEntropyEncoder::BinaryEntropyEncoder(OutputBitStream& bitstream, Predictor* predictor, bool deallocate) : _predictor(predictor) , _bitstream(bitstream) , _deallocate(deallocate) , _sba(nullptr, 0) { if (predictor == nullptr) throw invalid_argument("Invalid null predictor parameter"); _low = 0; _high = TOP; _disposed = false; } BinaryEntropyEncoder::~BinaryEntropyEncoder() { _dispose(); if (_sba._array != nullptr) delete[] _sba._array; if (_deallocate) delete _predictor; } int BinaryEntropyEncoder::encode(const kanzi::byte block[], uint blkptr, uint count) { if (count >= MAX_BLOCK_SIZE) throw invalid_argument("Invalid block size parameter (max is 1<<30)"); uint startChunk = blkptr; const uint end = blkptr + count; uint length = max(count, 64u); if (length >= MAX_CHUNK_SIZE) { // If the block is big (>=64MB), split the encoding to avoid allocating // too much memory. length = (length / 8 < MAX_CHUNK_SIZE) ? 
count >> 3 : count >> 4; } const uint bufSize = length + (length >> 3); if (_sba._length < int(bufSize)) { if (_sba._array != nullptr) delete[] _sba._array; _sba._length = int(bufSize); _sba._array = new kanzi::byte[_sba._length]; } // Split block into chunks, encode chunk and write bit array to bitstream while (startChunk < end) { const uint chunkSize = min(length, end - startChunk); const uint endChunk = startChunk + chunkSize; _sba._index = 0; for (uint i = startChunk; i < endChunk; i++) { encodeBit(int(block[i]) & 0x80, _predictor->get()); encodeBit(int(block[i]) & 0x40, _predictor->get()); encodeBit(int(block[i]) & 0x20, _predictor->get()); encodeBit(int(block[i]) & 0x10, _predictor->get()); encodeBit(int(block[i]) & 0x08, _predictor->get()); encodeBit(int(block[i]) & 0x04, _predictor->get()); encodeBit(int(block[i]) & 0x02, _predictor->get()); encodeBit(int(block[i]) & 0x01, _predictor->get()); } EntropyUtils::writeVarInt(_bitstream, uint32(_sba._index)); _bitstream.writeBits(&_sba._array[0], 8 * _sba._index); startChunk = endChunk; if (startChunk < end) _bitstream.writeBits(_low | MASK_0_24, 56); } return count; } void BinaryEntropyEncoder::_dispose() { if (_disposed == true) return; _disposed = true; _bitstream.writeBits(_low | MASK_0_24, 56); } // no inline void BinaryEntropyEncoder::flush() { BigEndian::writeInt32(&_sba._array[_sba._index], int32(_high >> 24)); _sba._index += 4; _low <<= 32; _high = (_high << 32) | MASK_0_32; } // no inline void BinaryEntropyEncoder::encodeByte(kanzi::byte val) { encodeBit(int(val) & 0x80, _predictor->get()); encodeBit(int(val) & 0x40, _predictor->get()); encodeBit(int(val) & 0x20, _predictor->get()); encodeBit(int(val) & 0x10, _predictor->get()); encodeBit(int(val) & 0x08, _predictor->get()); encodeBit(int(val) & 0x04, _predictor->get()); encodeBit(int(val) & 0x02, _predictor->get()); encodeBit(int(val) & 0x01, _predictor->get()); } 
kanzi-cpp-2.5.2/src/entropy/BinaryEntropyEncoder.hpp000066400000000000000000000042051516423635400225330ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #pragma once #ifndef knz_BinaryEntropyEncoder #define knz_BinaryEntropyEncoder #include "../EntropyEncoder.hpp" #include "../Predictor.hpp" #include "../SliceArray.hpp" namespace kanzi { // This class is a generic implementation of a bool entropy encoder class BinaryEntropyEncoder FINAL : public EntropyEncoder { private: static const uint64 TOP; static const uint64 MASK_0_24; static const uint64 MASK_0_32; static const int MAX_BLOCK_SIZE; static const int MAX_CHUNK_SIZE; Predictor* _predictor; uint64 _low; uint64 _high; OutputBitStream& _bitstream; bool _disposed; bool _deallocate; SliceArray _sba; void _dispose(); void flush(); public: BinaryEntropyEncoder(OutputBitStream& bitstream, Predictor* predictor, bool deallocate=true); ~BinaryEntropyEncoder(); int encode(const byte block[], uint blkptr, uint count); OutputBitStream& getBitStream() const { return _bitstream; } void dispose() { _dispose(); } void encodeByte(byte val); void encodeBit(int bit, int pred = 2048); }; inline void BinaryEntropyEncoder::encodeBit(int bit, int pred) { // Update fields with new interval bounds and predictor const uint64 mid = _low + ((((_high - _low) >> 4) * uint64(pred)) >> 8); (bit != 0) ? 
_high = mid : _low = mid + 1; _predictor->update(bit != 0); // Write unchanged first 32 bits to bitstream if (((_low ^ _high) >> 24) == 0) flush(); } } #endif kanzi-cpp-2.5.2/src/entropy/CMPredictor.cpp000066400000000000000000000022601516423635400205730ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "CMPredictor.hpp" using namespace kanzi; const int CMPredictor::FAST_RATE = 2; const int CMPredictor::MEDIUM_RATE = 4; const int CMPredictor::SLOW_RATE = 6; const int CMPredictor::PSCALE = 65536; CMPredictor::CMPredictor() { _ctx = 1; _runMask = 0; _c1 = 0; _c2 = 0; for (int i = 0; i < 256; i++) { for (int j = 0; j <= 256; j++) _counter1[i][j] = 32768; for (int j = 0; j <= 16; j++) { _counter2[2 * i][j] = j << 12; _counter2[2 * i + 1][j] = j << 12; } } _pc1 = _counter1[_ctx]; _pc2 = &_counter2[_ctx][8]; } kanzi-cpp-2.5.2/src/entropy/CMPredictor.hpp000066400000000000000000000045241516423635400206050ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ #pragma once #ifndef knz_CMPredictor #define knz_CMPredictor #include "../Predictor.hpp" #include "../types.hpp" namespace kanzi { class CMPredictor FINAL : public Predictor { private: static const int FAST_RATE; static const int MEDIUM_RATE; static const int SLOW_RATE; static const int PSCALE; int _c1; int _c2; int _ctx; int _runMask; int _counter1[256][257]; int _counter2[512][17]; int* _pc1; int* _pc2; public: CMPredictor(); ~CMPredictor(){} void update(int bit); int get(); }; // Update the probability model inline void CMPredictor::update(int bit) { if (bit == 0) { _pc1[256] -= (_pc1[256] >> FAST_RATE); _pc1[_c1] -= (_pc1[_c1] >> MEDIUM_RATE); _pc2[0] -= (_pc2[0]>> SLOW_RATE); _pc2[1] -= (_pc2[1]>> SLOW_RATE); _ctx += _ctx; } else { _pc1[256] -= ((_pc1[256] - PSCALE + 16) >> FAST_RATE); _pc1[_c1] -= ((_pc1[_c1] - PSCALE + 16) >> MEDIUM_RATE); _pc2[0] -= ((_pc2[0] - PSCALE + 16) >> SLOW_RATE); _pc2[1] -= ((_pc2[1] - PSCALE + 16) >> SLOW_RATE); _ctx += (_ctx + 1); } if (_ctx > 255) { _c2 = _c1; _c1 = _ctx & 0xFF; _ctx = 1; _runMask = (_c1 == _c2) ? 0x100 : 0; } } // Return the split value representing the probability of 1 in the [0..4095] range. inline int CMPredictor::get() { _pc1 = _counter1[_ctx]; const int p = (13 * (_pc1[256] + _pc1[_c1]) + 6 * _pc1[_c2]) >> 5; _pc2 = &_counter2[_ctx | _runMask][p >> 12]; return (p + p + 3 * (_pc2[0] + _pc2[1]) + 64) >> 7; // rescale to [0..4095] } } #endif kanzi-cpp-2.5.2/src/entropy/EntropyDecoderFactory.hpp000066400000000000000000000114051516423635400227040ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
// Factory for entropy decoders. Also maps codec type identifiers to their
// names and back.
class EntropyDecoderFactory {
public:
    // Codec type identifiers
    static const short NONE_TYPE = 0; // No compression
    static const short HUFFMAN_TYPE = 1; // Huffman
    static const short FPAQ_TYPE = 2; // Fast PAQ (order 0)
    static const short PAQ_TYPE = 3; // Obsolete
    static const short RANGE_TYPE = 4; // Range
    static const short ANS0_TYPE = 5; // Asymmetric Numerical System order 0
    static const short CM_TYPE = 6; // Context Model
    static const short TPAQ_TYPE = 7; // Tangelo PAQ
    static const short ANS1_TYPE = 8; // Asymmetric Numerical System order 1
    static const short TPAQX_TYPE = 9; // Tangelo PAQ Extra
    static const short RESERVED1 = 10; //Reserved
    static const short RESERVED2 = 11; //Reserved
    static const short RESERVED3 = 12; //Reserved
    static const short RESERVED4 = 13; //Reserved
    static const short RESERVED5 = 14; //Reserved
    static const short RESERVED6 = 15; //Reserved

    // Allocate a decoder of the given type reading from 'ibs'.
    // Throws std::invalid_argument for an unknown type.
    static EntropyDecoder* newDecoder(InputBitStream& ibs, Context& ctx, short entropyType);

    // Upper-case name of a codec type. Throws std::invalid_argument for an unknown type.
    static const char* getName(short entropyType);

    // Codec type for a name (case-insensitive). Throws std::invalid_argument for an unknown name.
    static short getType(const char* name);
};
entropy decoder to reset block statistics case HUFFMAN_TYPE: return new HuffmanDecoder(ibs, &ctx); case ANS0_TYPE: return new ANSRangeDecoder(ibs, 0); case ANS1_TYPE: return new ANSRangeDecoder(ibs, 1); case RANGE_TYPE: return new RangeDecoder(ibs); case FPAQ_TYPE: return new FPAQDecoder(ibs); case CM_TYPE: return new BinaryEntropyDecoder(ibs, new CMPredictor()); case TPAQ_TYPE: return new BinaryEntropyDecoder(ibs, new TPAQPredictor(&ctx)); case TPAQX_TYPE: return new BinaryEntropyDecoder(ibs, new TPAQPredictor(&ctx)); case NONE_TYPE: return new NullEntropyDecoder(ibs); default: std::string msg = "Unknown entropy codec type: '"; msg += char(entropyType); msg += '\''; throw std::invalid_argument(msg); } } inline const char* EntropyDecoderFactory::getName(short entropyType) { switch (entropyType) { case HUFFMAN_TYPE: return "HUFFMAN"; case ANS0_TYPE: return "ANS0"; case ANS1_TYPE: return "ANS1"; case RANGE_TYPE: return "RANGE"; case FPAQ_TYPE: return "FPAQ"; case CM_TYPE: return "CM"; case TPAQ_TYPE: return "TPAQ"; case TPAQX_TYPE: return "TPAQX"; case NONE_TYPE: return "NONE"; default: std::string msg = "Unknown entropy codec type: '"; msg += char(entropyType); msg += '\''; throw std::invalid_argument(msg); } } inline short EntropyDecoderFactory::getType(const char* str) { std::string name = str; transform(name.begin(), name.end(), name.begin(), ::toupper); if (name == "HUFFMAN") return HUFFMAN_TYPE; if (name == "ANS0") return ANS0_TYPE; if (name == "ANS1") return ANS1_TYPE; if (name == "FPAQ") return FPAQ_TYPE; if (name == "RANGE") return RANGE_TYPE; if (name == "CM") return CM_TYPE; if (name == "TPAQ") return TPAQ_TYPE; if (name == "TPAQX") return TPAQX_TYPE; if (name == "NONE") return NONE_TYPE; throw std::invalid_argument("Unsupported entropy codec type: '" + name + "'"); } } #endif kanzi-cpp-2.5.2/src/entropy/EntropyEncoderFactory.hpp000066400000000000000000000112261516423635400227170ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed 
// Factory for entropy encoders. Also maps codec type identifiers to their
// names and back.
class EntropyEncoderFactory {
public:
    // Codec type identifiers
    static const short NONE_TYPE = 0; // No compression
    static const short HUFFMAN_TYPE = 1; // Huffman
    static const short FPAQ_TYPE = 2; // Fast PAQ (order 0)
    static const short PAQ_TYPE = 3; // Obsolete
    static const short RANGE_TYPE = 4; // Range
    static const short ANS0_TYPE = 5; // Asymmetric Numerical System order 0
    static const short CM_TYPE = 6; // Context Model
    static const short TPAQ_TYPE = 7; // Tangelo PAQ
    static const short ANS1_TYPE = 8; // Asymmetric Numerical System order 1
    static const short TPAQX_TYPE = 9; // Tangelo PAQ Extra
    static const short RESERVED1 = 10; //Reserved
    static const short RESERVED2 = 11; //Reserved
    static const short RESERVED3 = 12; //Reserved
    static const short RESERVED4 = 13; //Reserved
    static const short RESERVED5 = 14; //Reserved
    static const short RESERVED6 = 15; //Reserved

    // Allocate an encoder of the given type writing to 'obs'.
    // Throws std::invalid_argument for an unknown type.
    static EntropyEncoder* newEncoder(OutputBitStream& obs, Context& ctx, short entropyType);

    // Upper-case name of a codec type. Throws std::invalid_argument for an unknown type.
    static const char* getName(short entropyType);

    // Codec type for a name (case-insensitive). Throws std::invalid_argument for an unknown name.
    static short getType(const char* name);
};
EntropyEncoderFactory::newEncoder(OutputBitStream& obs, Context& ctx, short entropyType) { switch (entropyType) { case HUFFMAN_TYPE: return new HuffmanEncoder(obs); case ANS0_TYPE: return new ANSRangeEncoder(obs, 0); case ANS1_TYPE: return new ANSRangeEncoder(obs, 1); case RANGE_TYPE: return new RangeEncoder(obs); case FPAQ_TYPE: return new FPAQEncoder(obs); case CM_TYPE: return new BinaryEntropyEncoder(obs, new CMPredictor()); case TPAQ_TYPE: return new BinaryEntropyEncoder(obs, new TPAQPredictor(&ctx)); case TPAQX_TYPE: return new BinaryEntropyEncoder(obs, new TPAQPredictor(&ctx)); case NONE_TYPE: return new NullEntropyEncoder(obs); default: std::string msg = "Unknown entropy codec type: '"; msg += char(entropyType); msg += '\''; throw std::invalid_argument(msg); } } inline const char* EntropyEncoderFactory::getName(short entropyType) { switch (entropyType) { case HUFFMAN_TYPE: return "HUFFMAN"; case ANS0_TYPE: return "ANS0"; case ANS1_TYPE: return "ANS1"; case RANGE_TYPE: return "RANGE"; case FPAQ_TYPE: return "FPAQ"; case CM_TYPE: return "CM"; case TPAQ_TYPE: return "TPAQ"; case TPAQX_TYPE: return "TPAQX"; case NONE_TYPE: return "NONE"; default: std::string msg = "Unknown entropy codec type: '"; msg += char(entropyType); msg += '\''; throw std::invalid_argument(msg); } } inline short EntropyEncoderFactory::getType(const char* str) { std::string name = str; transform(name.begin(), name.end(), name.begin(), ::toupper); if (name == "HUFFMAN") return HUFFMAN_TYPE; if (name == "ANS0") return ANS0_TYPE; if (name == "ANS1") return ANS1_TYPE; if (name == "FPAQ") return FPAQ_TYPE; if (name == "RANGE") return RANGE_TYPE; if (name == "CM") return CM_TYPE; if (name == "TPAQ") return TPAQ_TYPE; if (name == "TPAQX") return TPAQX_TYPE; if (name == "NONE") return NONE_TYPE; throw std::invalid_argument("Unsupported entropy codec type: '" + name + "'"); } } #endif 
kanzi-cpp-2.5.2/src/entropy/EntropyUtils.cpp000066400000000000000000000165121516423635400211060ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include #include #include #include "EntropyUtils.hpp" #include "../BitStreamException.hpp" using namespace kanzi; using namespace std; const int EntropyUtils::FULL_ALPHABET = 0; const int EntropyUtils::PARTIAL_ALPHABET = 1; const int EntropyUtils::ALPHABET_256 = 0; const int EntropyUtils::ALPHABET_0 = 1; const int EntropyUtils::INCOMPRESSIBLE_THRESHOLD = 973; // 0.95*1024 class FreqSortData { public: uint* _freq; uint8 _symbol; FreqSortData(uint* freq, uint8 symbol) : _freq(freq) , _symbol(symbol) { } }; struct FreqDataComparator { bool operator()(FreqSortData const& fd1, FreqSortData const& fd2) const { // Decreasing frequency then decreasing symbol int r; return ((r = int(*fd1._freq - *fd2._freq)) == 0) ? 
fd1._symbol > fd2._symbol: r > 0; } }; // alphabet must be sorted in increasing order // length = alphabet array length up to 256 int EntropyUtils::encodeAlphabet(OutputBitStream& obs, const uint alphabet[], int length, int count) { // Alphabet length must be a power of 2 if ((length & (length - 1)) != 0) return -1; if ((length > 256) || (count > length)) return -1; if (count == 0) { obs.writeBit(FULL_ALPHABET); obs.writeBit(ALPHABET_0); } else if (count == 256) { obs.writeBit(FULL_ALPHABET); obs.writeBit(ALPHABET_256); } else { // Partial alphabet obs.writeBit(PARTIAL_ALPHABET); kanzi::byte masks[32] = { kanzi::byte(0) }; // Encode presence flags for (int i = 0; i < count; i++) masks[alphabet[i] >> 3] |= kanzi::byte(1 << (alphabet[i] & 7)); const int lastMask = alphabet[count - 1] >> 3; obs.writeBits(lastMask, 5); obs.writeBits(masks, 8 * (lastMask + 1)); } return count; } int EntropyUtils::decodeAlphabet(InputBitStream& ibs, uint alphabet[]) { // Read encoding mode from bitstream if (ibs.readBit() == FULL_ALPHABET) { const int alphabetSize = (ibs.readBit() == ALPHABET_256) ? 256 : 0; // Full alphabet for (int i = 0; i < alphabetSize; i++) alphabet[i] = i; return alphabetSize; } // Partial alphabet const int lastMask = int(ibs.readBits(5)); kanzi::byte masks[32] = { kanzi::byte(0) }; int count = 0; // Decode presence flags ibs.readBits(masks, 8 * (lastMask + 1)); for (int i = 0; i <= lastMask; i++) { const int n = 8 * i; for (uint j = 0; j < 8; j++) { const int bit = int(masks[i] >> j) & 1; alphabet[count] = n + j; count += bit; } } return count; } // Returns the size of the alphabet // length is the length of the alphabet array // 'totalFreq' is the sum of frequencies. 
// 'scale' is the target new total of frequencies // The alphabet and freqs parameters are updated int EntropyUtils::normalizeFrequencies(uint freqs[], uint alphabet[], int length, uint totalFreq, uint scale) { if (length > 256) { stringstream ss; ss << "Invalid alphabet size parameter: " << scale << " (must be less than or equal to 256)"; throw invalid_argument(ss.str()); } if ((scale < 256) || (scale > 65536)) { stringstream ss; ss << "Invalid scale parameter: " << scale << " (must be in [256..65536])"; throw invalid_argument(ss.str()); } if ((length == 0) || (totalFreq == 0)) return 0; // Number of present symbols int alphabetSize = 0; // shortcut if (totalFreq == scale) { for (int i = 0; i < 256; i++) { if (freqs[i] != 0) alphabet[alphabetSize++] = i; } return alphabetSize; } uint sumScaledFreq = 0; uint sumFreq = 0; int idxMax = 0; // Scale frequencies by squeezing/stretching distribution over complete range for (int i = 0; i < length; i++) { alphabet[i] = 0; const uint f = freqs[i]; if (f == 0) continue; alphabet[alphabetSize++] = i; const int64 sf = int64(f) * int64(scale); const uint scaledFreq = sf <= int64(totalFreq) ? 1 : uint((sf + (int64(totalFreq) >> 1)) / int64(totalFreq)); sumScaledFreq += scaledFreq; freqs[i] = scaledFreq; sumFreq += f; idxMax = (scaledFreq > freqs[idxMax]) ? i : idxMax; if (sumFreq >= totalFreq) break; } if (alphabetSize == 0) return 0; if (alphabetSize == 1) { freqs[alphabet[0]] = scale; return 1; } if (sumScaledFreq == scale) return alphabetSize; int delta = int(sumScaledFreq - scale); const int errThr = int(freqs[idxMax]) >> 4; if (abs(delta) <= errThr) { // Fast path (small error): just adjust the max frequency freqs[idxMax] -= delta; return alphabetSize; } if (delta < 0) { delta += errThr; freqs[idxMax] += uint(errThr); } else { delta -= errThr; freqs[idxMax] -= uint(errThr); } // Slow path: spread error across frequencies const int inc = (delta < 0) ? 
1 : -1; delta = abs(delta); int round = 0; while ((++round < 6) && (delta > 0)) { int adjustments = 0; for (int i = 0; i < alphabetSize; i++) { const int idx = alphabet[i]; // Skip small frequencies to avoid big distortion // Do not zero out frequencies if (freqs[idx] <= 2) continue; // Adjust frequency freqs[idx] += inc; adjustments++; delta--; if (delta == 0) break; } if (adjustments == 0) break; } freqs[idxMax] = max(freqs[idxMax] - delta, uint(1)); return alphabetSize; } int EntropyUtils::writeVarInt(OutputBitStream& obs, uint32 value) { uint32 res = 0; while (value >= 128) { obs.writeBits(0x80 | (value & 0x7F), 8); value >>= 7; res++; } obs.writeBits(value, 8); return res; } uint32 EntropyUtils::readVarInt(InputBitStream& ibs) { uint32 value = uint32(ibs.readBits(8)); uint32 res = value & 0x7F; for (int shift = 7; value >= 128; shift += 7) { value = uint32(ibs.readBits(8)); if (shift == 28) { // uint32 varint: last byte may only carry 4 payload bits and // must terminate the sequence. if ((value >= 128) || ((value & 0x70) != 0)) { throw BitStreamException("Invalid variable-length integer in bitstream", BitStreamException::INVALID_STREAM); } res |= ((value & 0x0F) << shift); return res; } res |= ((value & 0x7F) << shift); } return res; } kanzi-cpp-2.5.2/src/entropy/EntropyUtils.hpp000066400000000000000000000026721516423635400211150ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/

#pragma once
#ifndef knz_EntropyUtils
#define knz_EntropyUtils

#include "../InputBitStream.hpp"
#include "../OutputBitStream.hpp"

namespace kanzi
{

   // Static helpers shared by the entropy codecs: alphabet (symbol set)
   // serialization, frequency normalization and variable length integers.
   class EntropyUtils
   {
   private:
       // First flag bit written by encodeAlphabet: full or partial alphabet
       static const int FULL_ALPHABET;
       static const int PARTIAL_ALPHABET;

       // Second flag bit for a full alphabet: 256 symbols or none
       static const int ALPHABET_256;
       static const int ALPHABET_0;

   public:
       static const int INCOMPRESSIBLE_THRESHOLD;

       EntropyUtils() {}

       ~EntropyUtils() {}

       // Writes the 'count' first symbols of 'alphabet' (sorted in increasing
       // order) to the bitstream. Returns count, or -1 when length is not a
       // power of 2, length > 256 or count > length.
       static int encodeAlphabet(OutputBitStream& obs, const uint alphabet[], int length, int count);

       // Reads an alphabet from the bitstream into 'alphabet' and returns the
       // number of symbols decoded.
       static int decodeAlphabet(InputBitStream& ibs, uint alphabet[]);

       // Rescales freqs[] in place towards a total of 'scale' and fills
       // 'alphabet' with the indexes of the non zero frequencies.
       // Returns the number of present symbols. Throws std::invalid_argument
       // when length > 256 or scale is not in [256..65536].
       static int normalizeFrequencies(uint freqs[], uint alphabet[], int length, uint totalFreq, uint scale);

       // Writes 'val' with 7 payload bits per byte, bit 7 flagging that more
       // bytes follow. Returns the number of continuation bytes written.
       static int writeVarInt(OutputBitStream& obs, uint32 val);

       // Reads a value written by writeVarInt. Throws BitStreamException when
       // the encoding does not terminate within 5 bytes or overflows 32 bits.
       static uint32 readVarInt(InputBitStream& ibs);
   };

}
#endif
kanzi-cpp-2.5.2/src/entropy/ExpGolombDecoder.cpp000066400000000000000000000024361516423635400216070ustar00rootroot00000000000000/*
Copyright 2011-2026 Frederic Langlet
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
you may obtain a copy of the License at

                http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/ #include "ExpGolombDecoder.hpp" using namespace kanzi; ExpGolombDecoder::ExpGolombDecoder(InputBitStream& bitstream, bool sgn) : _bitstream(bitstream), _signed(sgn) { } int ExpGolombDecoder::decode(kanzi::byte block[], uint blkptr, uint len) { kanzi::byte* buf = &block[blkptr]; const uint len8 = len & uint(-8); for (uint i = 0; i < len8; i += 8) { buf[i] = decodeByte(); buf[i+1] = decodeByte(); buf[i+2] = decodeByte(); buf[i+3] = decodeByte(); buf[i+4] = decodeByte(); buf[i+5] = decodeByte(); buf[i+6] = decodeByte(); buf[i+7] = decodeByte(); } for (uint i = len8; i < len; i++) buf[i] = decodeByte(); return len; } kanzi-cpp-2.5.2/src/entropy/ExpGolombDecoder.hpp000066400000000000000000000036201516423635400216100ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once #ifndef knz_ExpGolombDecoder #define knz_ExpGolombDecoder #include "../EntropyDecoder.hpp" namespace kanzi { class ExpGolombDecoder : public EntropyDecoder { private: InputBitStream& _bitstream; const bool _signed; void flush(); void _dispose() const {} public: ExpGolombDecoder(InputBitStream& bitstream, bool sign=true); ~ExpGolombDecoder() { _dispose(); } int decode(byte arr[], uint blkptr, uint len); InputBitStream& getBitStream() const { return _bitstream; } byte decodeByte(); void dispose() { _dispose(); } bool isSigned() const { return _signed; } }; inline byte ExpGolombDecoder::decodeByte() { if (_bitstream.readBit() == 1) return byte(0); uint log2 = 1; while (_bitstream.readBit() == 0) log2++; // Clamp. Do not attempt to detect a corrupted bitstream log2 &= 7; if (_signed == true) { // Decode signed: read value + sign int res = int(_bitstream.readBits(log2 + 1)); const int sgn = res & 1; res = (res >> 1) + (1 << log2) - 1; return byte((res - sgn) ^ -sgn); // res or -res } // Decode unsigned return byte((1 << log2) - 1 + _bitstream.readBits(log2)); } } #endif kanzi-cpp-2.5.2/src/entropy/ExpGolombEncoder.cpp000066400000000000000000000112701516423635400216150ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/

#include "ExpGolombEncoder.hpp"

using namespace kanzi;

// Precomputed codes indexed by [signed flag][byte value].
// Each entry packs the code as (number of bits << 9) | code bits: encodeByte()
// emits (entry & 0x1FF) using (entry >> 9) bits.
const int ExpGolombEncoder::CACHE[2][256] = {
    // Unsigned
    {
        513,
        1538, 1539,
        2564, 2565, 2566, 2567,
        3592, 3593, 3594, 3595, 3596, 3597, 3598, 3599,
        4624, 4625, 4626, 4627, 4628, 4629, 4630, 4631, 4632, 4633, 4634, 4635, 4636, 4637, 4638, 4639,
        5664, 5665, 5666, 5667, 5668, 5669, 5670, 5671, 5672, 5673, 5674, 5675, 5676, 5677, 5678, 5679,
        5680, 5681, 5682, 5683, 5684, 5685, 5686, 5687, 5688, 5689, 5690, 5691, 5692, 5693, 5694, 5695,
        6720, 6721, 6722, 6723, 6724, 6725, 6726, 6727, 6728, 6729, 6730, 6731, 6732, 6733, 6734, 6735,
        6736, 6737, 6738, 6739, 6740, 6741, 6742, 6743, 6744, 6745, 6746, 6747, 6748, 6749, 6750, 6751,
        6752, 6753, 6754, 6755, 6756, 6757, 6758, 6759, 6760, 6761, 6762, 6763, 6764, 6765, 6766, 6767,
        6768, 6769, 6770, 6771, 6772, 6773, 6774, 6775, 6776, 6777, 6778, 6779, 6780, 6781, 6782, 6783,
        7808, 7809, 7808,
        6783, 6782, 6781, 6780, 6779, 6778, 6777, 6776, 6775, 6774, 6773, 6772, 6771, 6770, 6769, 6768,
        6767, 6766, 6765, 6764, 6763, 6762, 6761, 6760, 6759, 6758, 6757, 6756, 6755, 6754, 6753, 6752,
        6751, 6750, 6749, 6748, 6747, 6746, 6745, 6744, 6743, 6742, 6741, 6740, 6739, 6738, 6737, 6736,
        6735, 6734, 6733, 6732, 6731, 6730, 6729, 6728, 6727, 6726, 6725, 6724, 6723, 6722, 6721, 6720,
        5695, 5694, 5693, 5692, 5691, 5690, 5689, 5688, 5687, 5686, 5685, 5684, 5683, 5682, 5681, 5680,
        5679, 5678, 5677, 5676, 5675, 5674, 5673, 5672, 5671, 5670, 5669, 5668, 5667, 5666, 5665, 5664,
        4639, 4638, 4637, 4636, 4635, 4634, 4633, 4632, 4631, 4630, 4629, 4628, 4627, 4626, 4625, 4624,
        3599, 3598, 3597, 3596, 3595, 3594, 3593, 3592,
        2567, 2566, 2565, 2564,
        1539, 1538
    },

    // Signed
    {
        513,
        2052, 2054,
        3080, 3082, 3084, 3086,
        4112, 4114, 4116, 4118, 4120, 4122, 4124, 4126,
        5152, 5154, 5156, 5158, 5160, 5162, 5164, 5166, 5168, 5170, 5172, 5174, 5176, 5178, 5180, 5182,
        6208, 6210, 6212, 6214, 6216, 6218, 6220, 6222, 6224, 6226, 6228, 6230, 6232, 6234, 6236, 6238,
        6240, 6242, 6244, 6246, 6248, 6250, 6252, 6254, 6256, 6258, 6260, 6262, 6264, 6266, 6268, 6270,
        7296, 7298, 7300, 7302, 7304, 7306, 7308, 7310, 7312, 7314, 7316, 7318, 7320, 7322, 7324, 7326,
        7328, 7330, 7332, 7334, 7336, 7338, 7340, 7342, 7344, 7346, 7348, 7350, 7352, 7354, 7356, 7358,
        7360, 7362, 7364, 7366, 7368, 7370, 7372, 7374, 7376, 7378, 7380, 7382, 7384, 7386, 7388, 7390,
        7392, 7394, 7396, 7398, 7400, 7402, 7404, 7406, 7408, 7410, 7412, 7414, 7416, 7418, 7420, 7422,
        8448, 8451, 8449,
        7423, 7421, 7419, 7417, 7415, 7413, 7411, 7409, 7407, 7405, 7403, 7401, 7399, 7397, 7395, 7393,
        7391, 7389, 7387, 7385, 7383, 7381, 7379, 7377, 7375, 7373, 7371, 7369, 7367, 7365, 7363, 7361,
        7359, 7357, 7355, 7353, 7351, 7349, 7347, 7345, 7343, 7341, 7339, 7337, 7335, 7333, 7331, 7329,
        7327, 7325, 7323, 7321, 7319, 7317, 7315, 7313, 7311, 7309, 7307, 7305, 7303, 7301, 7299, 7297,
        6271, 6269, 6267, 6265, 6263, 6261, 6259, 6257, 6255, 6253, 6251, 6249, 6247, 6245, 6243, 6241,
        6239, 6237, 6235, 6233, 6231, 6229, 6227, 6225, 6223, 6221, 6219, 6217, 6215, 6213, 6211, 6209,
        5183, 5181, 5179, 5177, 5175, 5173, 5171, 5169, 5167, 5165, 5163, 5161, 5159, 5157, 5155, 5153,
        4127, 4125, 4123, 4121, 4119, 4117, 4115, 4113,
        3087, 3085, 3083, 3081,
        2055, 2053
    }
};

// Binds the encoder to its output bitstream. The signed flag is stored as
// 0 or 1 so that it can directly select a row of CACHE.
ExpGolombEncoder::ExpGolombEncoder(OutputBitStream& bitstream, bool sgn)
    : _bitstream(bitstream), _signed((sgn == true) ? 1 : 0)
{
}

// Encodes block[blkptr..blkptr+len-1] to the bitstream and returns len.
int ExpGolombEncoder::encode(const kanzi::byte block[], uint blkptr, uint len)
{
    const kanzi::byte* buf = &block[blkptr];

    // Largest multiple of 8 not greater than len
    const uint len8 = len & uint(-8);

    // Main part, manually unrolled 8 times
    for (uint i = 0; i < len8; i += 8) {
        encodeByte(buf[i]);
        encodeByte(buf[i+1]);
        encodeByte(buf[i+2]);
        encodeByte(buf[i+3]);
        encodeByte(buf[i+4]);
        encodeByte(buf[i+5]);
        encodeByte(buf[i+6]);
        encodeByte(buf[i+7]);
    }

    // Remaining 0..7 bytes
    for (uint i = len8; i < len; i++)
        encodeByte(buf[i]);

    return len;
}
kanzi-cpp-2.5.2/src/entropy/ExpGolombEncoder.hpp000066400000000000000000000031141516423635400216200ustar00rootroot00000000000000/*
Copyright 2011-2026 Frederic Langlet
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
you may obtain a copy of the License at

                http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/ #pragma once #ifndef knz_ExpGolombEncoder #define knz_ExpGolombEncoder #include "../EntropyEncoder.hpp" namespace kanzi { class ExpGolombEncoder : public EntropyEncoder { private: static const int CACHE[2][256]; OutputBitStream& _bitstream; const int _signed; void _dispose() const {} public: ExpGolombEncoder(OutputBitStream& bitstream, bool sign=true); ~ExpGolombEncoder() { _dispose(); } int encode(const byte block[], uint blkptr, uint len); OutputBitStream& getBitStream() const { return _bitstream; } void encodeByte(byte val); void dispose() { _dispose(); } bool isSigned() const { return _signed == 1; } }; inline void ExpGolombEncoder::encodeByte(byte val) { if (val == byte(0)) { // shortcut when input is 0 _bitstream.writeBit(1); return; } const int emit = CACHE[_signed][uint8(val)]; _bitstream.writeBits(emit & 0x1FF, emit >> 9); } } #endif kanzi-cpp-2.5.2/src/entropy/FPAQDecoder.cpp000066400000000000000000000056071516423635400204450ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #include #include #include "FPAQDecoder.hpp" #include "EntropyUtils.hpp" using namespace kanzi; using namespace std; const uint64 FPAQDecoder::TOP = 0x00FFFFFFFFFFFFFF; const uint64 FPAQDecoder::MASK_0_56 = 0x00FFFFFFFFFFFFFF; const uint64 FPAQDecoder::MASK_0_32 = 0x00000000FFFFFFFF; const uint FPAQDecoder::DEFAULT_CHUNK_SIZE = 4 * 1024 * 1024; const uint FPAQDecoder::MAX_BLOCK_SIZE = 1 << 30; const int FPAQDecoder::PSCALE = 65536; FPAQDecoder::FPAQDecoder(InputBitStream& bitstream) : _bitstream(bitstream) { reset(); } FPAQDecoder::~FPAQDecoder() { _dispose(); } bool FPAQDecoder::reset() { _low = 0; _high = TOP; _current = 0; _ctx = 1; _index = 0; for (int i = 0; i < 4; i++) { for (int j = 0; j < 256; j++) _probs[i][j] = PSCALE >> 1; } _p = _probs[0]; return true; } int FPAQDecoder::decode(kanzi::byte block[], uint blkptr, uint count) { if (count >= MAX_BLOCK_SIZE) throw invalid_argument("Invalid block size parameter (max is 1<<30)"); uint startChunk = blkptr; const uint end = blkptr + count; // Read bit array from bitstream and decode chunk while (startChunk < end) { const uint szBytes = uint(EntropyUtils::readVarInt(_bitstream)); // Sanity check if (szBytes >= 2 * count) return 0; const size_t bufSize = max(szBytes + (szBytes >> 3), 8192u); if (_buf.size() < bufSize) _buf.resize(bufSize); _current = _bitstream.readBits(56); if (bufSize > szBytes) memset(&_buf[szBytes], 0, bufSize - szBytes); _bitstream.readBits(&_buf[0], 8 * szBytes); _index = 0; const uint chunkSize = min(DEFAULT_CHUNK_SIZE, end - startChunk); const uint endChunk = startChunk + chunkSize; _p = _probs[0]; for (uint i = startChunk; i < endChunk; i++) { _ctx = 1; decodeBit(_p[_ctx]); decodeBit(_p[_ctx]); decodeBit(_p[_ctx]); decodeBit(_p[_ctx]); decodeBit(_p[_ctx]); decodeBit(_p[_ctx]); decodeBit(_p[_ctx]); decodeBit(_p[_ctx]); block[i] = kanzi::byte(_ctx); _p = _probs[(_ctx & 0xFF) >> 6]; } startChunk = endChunk; } return count; } 
kanzi-cpp-2.5.2/src/entropy/FPAQDecoder.hpp000066400000000000000000000055461516423635400204540ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #pragma once #ifndef knz_FPAQDecoder #define knz_FPAQDecoder #include #include "../EntropyDecoder.hpp" #include "../Memory.hpp" #include "../SliceArray.hpp" namespace kanzi { // Derived from fpaq0r by Matt Mahoney & Alexander Ratushnyak. // See http://mattmahoney.net/dc/#fpaq0. // Simple (and fast) adaptive entropy bit coder class FPAQDecoder : public EntropyDecoder { private: static const uint64 TOP; static const uint64 MASK_0_56; static const uint64 MASK_0_32; static const uint DEFAULT_CHUNK_SIZE; static const uint MAX_BLOCK_SIZE; static const int PSCALE; uint64 _low; uint64 _high; uint64 _current; InputBitStream& _bitstream; std::vector _buf; uint _index; uint16 _probs[4][256]; // probability of bit=1 uint16* _p; // pointer to current prob int _ctx; // previous bits void _dispose() const {} int decodeBit(int pred = 2048); bool reset(); public: FPAQDecoder(InputBitStream& bitstream); ~FPAQDecoder(); int decode(byte block[], uint blkptr, uint count); InputBitStream& getBitStream() const { return _bitstream; } void dispose() { _dispose(); } void read(); }; inline int FPAQDecoder::decodeBit(int prob) { // Calculate interval split // Written in a way to maximize accuracy of multiplication/division const uint64 split = ((((_high - _low) >> 8) * uint64(prob)) >> 8) + _low; int bit; // Update 
probabilities if (split >= _current) { _high = split; _p[_ctx] -= uint16((_p[_ctx] - PSCALE + 64) >> 6); _ctx += (_ctx + 1); bit = 1; } else { _low = split + 1; _p[_ctx] -= uint16(_p[_ctx] >> 6); _ctx += _ctx; bit = 0; } // Read 32 bits from bitstream if (((_low ^ _high) >> 24) == 0) read(); return bit; } inline void FPAQDecoder::read() { _low = (_low << 32) & MASK_0_56; _high = ((_high << 32) | MASK_0_32) & MASK_0_56; const uint64 val = BigEndian::readInt32(&_buf[_index]) & MASK_0_32; _current = ((_current << 32) | val) & MASK_0_56; _index += 4; } } #endif kanzi-cpp-2.5.2/src/entropy/FPAQEncoder.cpp000066400000000000000000000060411516423635400204500ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #include #include #include "FPAQEncoder.hpp" #include "EntropyUtils.hpp" using namespace kanzi; using namespace std; const uint64 FPAQEncoder::TOP = 0x00FFFFFFFFFFFFFF; const uint64 FPAQEncoder::MASK_0_24 = 0x0000000000FFFFFF; const uint64 FPAQEncoder::MASK_0_32 = 0x00000000FFFFFFFF; const uint FPAQEncoder::DEFAULT_CHUNK_SIZE = 4 * 1024 * 1024; const uint FPAQEncoder::MAX_BLOCK_SIZE = 1 << 30; const int FPAQEncoder::PSCALE = 65536; FPAQEncoder::FPAQEncoder(OutputBitStream& bitstream) : _bitstream(bitstream) { reset(); } FPAQEncoder::~FPAQEncoder() { _dispose(); } bool FPAQEncoder::reset() { _index = 0; _low = 0; _high = TOP; _disposed = false; for (int i = 0; i < 4; i++) { for (int j = 0; j < 256; j++) _probs[i][j] = PSCALE >> 1; } return true; } int FPAQEncoder::encode(const kanzi::byte block[], uint blkptr, uint count) { if (count >= MAX_BLOCK_SIZE) throw invalid_argument("Invalid block size parameter (max is 1<<30)"); uint startChunk = blkptr; const uint end = blkptr + count; const size_t bufSize = max(DEFAULT_CHUNK_SIZE + (DEFAULT_CHUNK_SIZE >> 3), 1024u); if (_buf.size() < bufSize) _buf.resize(bufSize); // Split block into chunks, encode chunk and write bit array to bitstream while (startChunk < end) { const uint chunkSize = min(DEFAULT_CHUNK_SIZE, end - startChunk); _index = 0; const uint endChunk = startChunk + chunkSize; uint16* p = _probs[0]; for (uint i = startChunk; i < endChunk; i++) { const int val = int(block[i]); const int bits = val + 256; encodeBit(val & 0x80, p[1]); encodeBit(val & 0x40, p[bits >> 7]); encodeBit(val & 0x20, p[bits >> 6]); encodeBit(val & 0x10, p[bits >> 5]); encodeBit(val & 0x08, p[bits >> 4]); encodeBit(val & 0x04, p[bits >> 3]); encodeBit(val & 0x02, p[bits >> 2]); encodeBit(val & 0x01, p[bits >> 1]); p = _probs[val >> 6]; } EntropyUtils::writeVarInt(_bitstream, uint32(_index)); _bitstream.writeBits(&_buf[0], 8 * _index); startChunk += chunkSize; if (startChunk < end) _bitstream.writeBits(_low | MASK_0_24, 56); } return 
count; } void FPAQEncoder::_dispose() { if (_disposed == true) return; _disposed = true; _bitstream.writeBits(_low | MASK_0_24, 56); } kanzi-cpp-2.5.2/src/entropy/FPAQEncoder.hpp000066400000000000000000000047171516423635400204650ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #pragma once #ifndef knz_FPAQEncoder #define knz_FPAQEncoder #include #include "../EntropyEncoder.hpp" #include "../Memory.hpp" #include "../SliceArray.hpp" namespace kanzi { // Derived from fpaq0r by Matt Mahoney & Alexander Ratushnyak. // See http://mattmahoney.net/dc/#fpaq0. 
// Simple (and fast) adaptive entropy bit coder class FPAQEncoder : public EntropyEncoder { private: static const uint64 TOP; static const uint64 MASK_0_24; static const uint64 MASK_0_32; static const uint DEFAULT_CHUNK_SIZE; static const uint MAX_BLOCK_SIZE; static const int PSCALE; uint64 _low; uint64 _high; bool _disposed; OutputBitStream& _bitstream; std::vector _buf; uint _index; uint16 _probs[4][256]; // probability of bit=1 void encodeBit(int bit, uint16& prob); bool reset(); void _dispose(); public: FPAQEncoder(OutputBitStream& bitstream); ~FPAQEncoder(); int encode(const byte block[], uint blkptr, uint count); OutputBitStream& getBitStream() const { return _bitstream; } void dispose() { _dispose(); } void flush(); }; inline void FPAQEncoder::encodeBit(int bit, uint16& prob) { // Update probabilities if (bit == 0) { _low = _low + ((((_high - _low) >> 8) * uint64(prob)) >> 8) + 1; prob -= uint16(prob >> 6); } else { _high = _low + ((((_high - _low) >> 8) * uint64(prob)) >> 8); prob -= uint16((prob - PSCALE + 64) >> 6); } // Write unchanged first 32 bits to bitstream if (((_low ^ _high) >> 24) == 0) flush(); } inline void FPAQEncoder::flush() { BigEndian::writeInt32(&_buf[_index], int32(_high >> 24)); _index += 4; _low <<= 32; _high = (_high << 32) | MASK_0_32; } } #endif kanzi-cpp-2.5.2/src/entropy/HuffmanCommon.cpp000066400000000000000000000033621516423635400211610ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #include "HuffmanCommon.hpp" using namespace kanzi; const int HuffmanCommon::LOG_MAX_CHUNK_SIZE = 14; const int HuffmanCommon::MAX_CHUNK_SIZE = 1 << LOG_MAX_CHUNK_SIZE; const int HuffmanCommon::MAX_SYMBOL_SIZE = 12; const int HuffmanCommon::BUFFER_SIZE = (MAX_SYMBOL_SIZE << 8) + 256; // Return the number of codes generated // codes and symbols are updated int HuffmanCommon::generateCanonicalCodes(const uint16 sizes[], uint16 codes[], uint symbols[], int count) { if (count == 0) return 0; if (count > 1) { int8 buf[BUFFER_SIZE] = { int8(0) }; for (int i = 0; i < count; i++) { const uint s = symbols[i]; if ((s > 255) || (sizes[s] > MAX_SYMBOL_SIZE)) return -1; buf[((sizes[s] - 1) << 8) | s] = int8(1); } for (int i = 0, n = 0; n < count; i++) { symbols[n] = i & 0xFF; n += buf[i]; } } int curLen = sizes[symbols[0]]; for (int i = 0, code = 0; i < count; i++) { const int s = symbols[i]; code <<= (sizes[s] - curLen); curLen = sizes[s]; codes[s] = uint16(code); code++; } return count; } kanzi-cpp-2.5.2/src/entropy/HuffmanCommon.hpp000066400000000000000000000020041516423635400211560ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/

#pragma once
#ifndef knz_HuffmanCommon
#define knz_HuffmanCommon

#include "../types.hpp"

namespace kanzi
{

   // Constants and code generation shared by the Huffman encoder and decoder.
   class HuffmanCommon
   {
   public:
       static const int LOG_MAX_CHUNK_SIZE;
       static const int MAX_CHUNK_SIZE;  // 1 << LOG_MAX_CHUNK_SIZE
       static const int MAX_SYMBOL_SIZE; // longest accepted code length, in bits

       // Sorts the 'count' first entries of 'ranks' by increasing code length
       // (sizes[]) then symbol value, and stores the canonical code of each
       // symbol in codes[]. Returns count (0 when count is 0), or -1 when a
       // symbol is above 255 or its code length exceeds MAX_SYMBOL_SIZE.
       static int generateCanonicalCodes(const uint16 sizes[], uint16 codes[], uint ranks[], int count);

   private:
       // Size of the scratch array used to sort symbols by code length
       static const int BUFFER_SIZE;
   };

}
#endif
kanzi-cpp-2.5.2/src/entropy/HuffmanDecoder.cpp000066400000000000000000000367041516423635400213040ustar00rootroot00000000000000/*
Copyright 2011-2026 Frederic Langlet
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
you may obtain a copy of the License at

                http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

// NOTE(review): the names of the two system headers below appear to have
// been lost when this file was extracted - restore them from upstream.
#include
#include
#include "HuffmanDecoder.hpp"
#include "EntropyUtils.hpp"
#include "ExpGolombDecoder.hpp"
#include "../BitStreamException.hpp"
#include "../Memory.hpp"

using namespace kanzi;
using namespace std;

// Number of bits used to index the decoding table
const int HuffmanDecoder::DECODING_BATCH_SIZE = 12; // ensures decoding table fits in L1 cache
const int HuffmanDecoder::TABLE_MASK = (1 << DECODING_BATCH_SIZE) - 1;

// The chunk size indicates how many bytes are encoded (per block) before
// resetting the frequency stats.
HuffmanDecoder::HuffmanDecoder(InputBitStream& bitstream, Context* pCtx, int chunkSize) : _bitstream(bitstream) { if (chunkSize < 1024) throw invalid_argument("Huffman codec: The chunk size must be at least 1024"); if (chunkSize > HuffmanCommon::MAX_CHUNK_SIZE) { stringstream ss; ss << "Huffman codec: The chunk size must be at most " << HuffmanCommon::MAX_CHUNK_SIZE; throw invalid_argument(ss.str()); } _chunkSize = chunkSize; _buffer = nullptr; _bufferSize = 0; _pCtx = pCtx; reset(); } bool HuffmanDecoder::reset() { // Default lengths & canonical codes for (uint16 i = 0; i < 256; i++) { _codes[i] = i; _sizes[i] = 8; } memset(_alphabet, 0, sizeof(_alphabet)); memset(_table, 0, sizeof(_table)); return true; } int HuffmanDecoder::readLengths() { const int count = EntropyUtils::decodeAlphabet(_bitstream, _alphabet); if (count == 0) return 0; ExpGolombDecoder egdec(_bitstream, true); int8 curSize = 2; // Read lengths from bitstream for (int i = 0; i < count; i++) { const uint s = _alphabet[i]; if (s > 255) { stringstream ss; ss << "Invalid bitstream: incorrect Huffman symbol " << s; throw BitStreamException(ss.str(), BitStreamException::INVALID_STREAM); } _codes[s] = 0; curSize += int8(egdec.decodeByte()); if ((curSize <= 0) || (curSize > HuffmanCommon::MAX_SYMBOL_SIZE)) { stringstream ss; ss << "Invalid bitstream: incorrect size " << int(curSize); ss << " for Huffman symbol " << s; throw BitStreamException(ss.str(), BitStreamException::INVALID_STREAM); } _sizes[s] = uint16(curSize); } // Create canonical codes if (HuffmanCommon::generateCanonicalCodes(_sizes, _codes, _alphabet, count) < 0) { stringstream ss; ss << "Could not generate Huffman codes: max code length ("; ss << HuffmanCommon::MAX_SYMBOL_SIZE; ss << " bits) exceeded"; throw BitStreamException(ss.str(), BitStreamException::INVALID_STREAM); } return count; } // max(CodeLen) must be <= MAX_SYMBOL_SIZE bool HuffmanDecoder::buildDecodingTable(int count) { // Initialize table with non zero values. 
// If the bitstream is altered, the decoder may access these default table values. // The number of consumed bits cannot be 0. memset(_table, 7, sizeof(_table)); uint16 length = 0; for (int i = 0; i < count; i++) { const uint s = _alphabet[i]; // All DECODING_BATCH_SIZE bit values read from the bit stream and // starting with the same prefix point to symbol s length = max(_sizes[s], length); const int w = 1 << (DECODING_BATCH_SIZE - length); int idx = int(_codes[s]) * w; const int end = idx + w; if (end > TABLE_MASK + 1) return false; // code -> size, symbol const uint16 val = (uint16(s) << 8) | _sizes[s]; while (idx < end) _table[idx++] = val; } return true; } int HuffmanDecoder::decode(kanzi::byte block[], uint blkptr, uint count) { if (count == 0) return 0; int bsVersion = _pCtx == nullptr ? 6 : _pCtx->getInt("bsVersion", 6); if (bsVersion < 6) return decodeV5(block, blkptr, count); return decodeV6(block, blkptr, count); } int HuffmanDecoder::decodeV6(kanzi::byte block[], uint blkptr, uint count) { const uint minBufSize = 2 * uint(_chunkSize); if (_bufferSize < minBufSize) { if (_buffer != nullptr) delete[] _buffer; _bufferSize = minBufSize; _buffer = new kanzi::byte[_bufferSize]; } uint startChunk = blkptr; const uint end = blkptr + count; while (startChunk < end) { const uint sizeChunk = min(uint(_chunkSize), end - startChunk); if (sizeChunk < 32) { // Special case for small chunks _bitstream.readBits(&block[startChunk], 8 * sizeChunk); } else { // For each chunk, read code lengths, rebuild codes, rebuild decoding table const int alphabetSize = readLengths(); if (alphabetSize <= 0) return startChunk - blkptr; if (alphabetSize == 1) { // Shortcut for chunks with only one symbol memset(&block[startChunk], _alphabet[0], size_t(sizeChunk)); } else { if (buildDecodingTable(alphabetSize) == false) return -1; if (decodeChunk(&block[startChunk], sizeChunk) == false) return -1; } } startChunk += sizeChunk; } return count; } // count is at least 32 bool 
HuffmanDecoder::decodeChunk(kanzi::byte block[], uint count) { // Read fragment sizes const int szBits0 = EntropyUtils::readVarInt(_bitstream); const int szBits1 = EntropyUtils::readVarInt(_bitstream); const int szBits2 = EntropyUtils::readVarInt(_bitstream); const int szBits3 = EntropyUtils::readVarInt(_bitstream); if ((szBits0 < 0) || (szBits1 < 0) || (szBits2 < 0) || (szBits3 < 0)) return false; // Each of the 4 streams is stored in one quarter of _buffer. const int maxFragBits = int((_bufferSize >> 2) << 3); if ((szBits0 > maxFragBits) || (szBits1 > maxFragBits) || (szBits2 > maxFragBits) || (szBits3 > maxFragBits)) return false; memset(_buffer, 0, _bufferSize); int idx0 = 0 * (_bufferSize / 4); int idx1 = 1 * (_bufferSize / 4); int idx2 = 2 * (_bufferSize / 4); int idx3 = 3 * (_bufferSize / 4); // Read all compressed data from bitstream _bitstream.readBits(&_buffer[idx0], szBits0); _bitstream.readBits(&_buffer[idx1], szBits1); _bitstream.readBits(&_buffer[idx2], szBits2); _bitstream.readBits(&_buffer[idx3], szBits3); // State variables for each of the four parallel streams uint64 state0 = 0, state1 = 0, state2 = 0, state3 = 0; // bits read from bitstream uint8 bits0 = 0, bits1 = 0, bits2 = 0, bits3 = 0; // number of available bits in state #define READ_STATE(shift, state, idx, bits) do {\ const uint8 shift = (56 - bits) & -8; \ bits += shift - DECODING_BATCH_SIZE; \ state = (state << shift) | (uint64(BigEndian::readLong64(&_buffer[idx])) >> 1 >> (63 - shift)); /* handle shift = 0 */ \ idx += (shift >> 3); \ } while (0); const int szFrag = count / 4; kanzi::byte* block0 = &block[0 * szFrag]; kanzi::byte* block1 = &block[1 * szFrag]; kanzi::byte* block2 = &block[2 * szFrag]; kanzi::byte* block3 = &block[3 * szFrag]; int n = 0; while (n < szFrag - 4) { // Fill 64 bits of state from the bitstream for each stream READ_STATE(shift, state0, idx0, bits0); READ_STATE(shift, state1, idx1, bits1); READ_STATE(shift, state2, idx2, bits2); READ_STATE(shift, state3, idx3, 
bits3); // Decompress 4 symbols per stream const uint16 val00 = _table[(state0 >> bits0) & TABLE_MASK]; bits0 -= uint8(val00); const uint16 val10 = _table[(state1 >> bits1) & TABLE_MASK]; bits1 -= uint8(val10); const uint16 val20 = _table[(state2 >> bits2) & TABLE_MASK]; bits2 -= uint8(val20); const uint16 val30 = _table[(state3 >> bits3) & TABLE_MASK]; bits3 -= uint8(val30); const uint16 val01 = _table[(state0 >> bits0) & TABLE_MASK]; bits0 -= uint8(val01); const uint16 val11 = _table[(state1 >> bits1) & TABLE_MASK]; bits1 -= uint8(val11); const uint16 val21 = _table[(state2 >> bits2) & TABLE_MASK]; bits2 -= uint8(val21); const uint16 val31 = _table[(state3 >> bits3) & TABLE_MASK]; bits3 -= uint8(val31); const uint16 val02 = _table[(state0 >> bits0) & TABLE_MASK]; bits0 -= uint8(val02); const uint16 val12 = _table[(state1 >> bits1) & TABLE_MASK]; bits1 -= uint8(val12); const uint16 val22 = _table[(state2 >> bits2) & TABLE_MASK]; bits2 -= uint8(val22); const uint16 val32 = _table[(state3 >> bits3) & TABLE_MASK]; bits3 -= uint8(val32); const uint16 val03 = _table[(state0 >> bits0) & TABLE_MASK]; bits0 -= uint8(val03); const uint16 val13 = _table[(state1 >> bits1) & TABLE_MASK]; bits1 -= uint8(val13); const uint16 val23 = _table[(state2 >> bits2) & TABLE_MASK]; bits2 -= uint8(val23); const uint16 val33 = _table[(state3 >> bits3) & TABLE_MASK]; bits3 -= uint8(val33); bits0 += DECODING_BATCH_SIZE; bits1 += DECODING_BATCH_SIZE; bits2 += DECODING_BATCH_SIZE; bits3 += DECODING_BATCH_SIZE; block0[n + 0] = kanzi::byte(val00 >> 8); block1[n + 0] = kanzi::byte(val10 >> 8); block2[n + 0] = kanzi::byte(val20 >> 8); block3[n + 0] = kanzi::byte(val30 >> 8); block0[n + 1] = kanzi::byte(val01 >> 8); block1[n + 1] = kanzi::byte(val11 >> 8); block2[n + 1] = kanzi::byte(val21 >> 8); block3[n + 1] = kanzi::byte(val31 >> 8); block0[n + 2] = kanzi::byte(val02 >> 8); block1[n + 2] = kanzi::byte(val12 >> 8); block2[n + 2] = kanzi::byte(val22 >> 8); block3[n + 2] = kanzi::byte(val32 >> 8); 
block0[n + 3] = kanzi::byte(val03 >> 8); block1[n + 3] = kanzi::byte(val13 >> 8); block2[n + 3] = kanzi::byte(val23 >> 8); block3[n + 3] = kanzi::byte(val33 >> 8); n += 4; } // Fill 64 bits of state from the bitstream for each stream READ_STATE(shift, state0, idx0, bits0); READ_STATE(shift, state1, idx1, bits1); READ_STATE(shift, state2, idx2, bits2); READ_STATE(shift, state3, idx3, bits3); while (n < szFrag) { // Decompress 1 symbol per stream const uint16 val0 = _table[(state0 >> bits0) & TABLE_MASK]; bits0 -= uint8(val0); const uint16 val1 = _table[(state1 >> bits1) & TABLE_MASK]; bits1 -= uint8(val1); const uint16 val2 = _table[(state2 >> bits2) & TABLE_MASK]; bits2 -= uint8(val2); const uint16 val3 = _table[(state3 >> bits3) & TABLE_MASK]; bits3 -= uint8(val3); block0[n] = kanzi::byte(val0 >> 8); block1[n] = kanzi::byte(val1 >> 8); block2[n] = kanzi::byte(val2 >> 8); block3[n] = kanzi::byte(val3 >> 8); n++; } // Process any remaining bytes at the end of the whole chunk const uint count4 = 4 * szFrag; for (uint i = count4; i < count; i++) block[i] = kanzi::byte(_bitstream.readBits(8)); return true; } int HuffmanDecoder::decodeV5(kanzi::byte block[], uint blkptr, uint count) { uint startChunk = blkptr; const uint end = blkptr + count; while (startChunk < end) { const uint endChunk = min(startChunk + _chunkSize, end); const uint sizeChunk = endChunk - startChunk; // For each chunk, read code lengths, rebuild codes, rebuild decoding table const int alphabetSize = readLengths(); if (alphabetSize <= 0) return startChunk - blkptr; if (alphabetSize == 1) { // Shortcut for chunks with only one symbol memset(&block[startChunk], _alphabet[0], size_t(endChunk - startChunk)); startChunk = endChunk; continue; } if (buildDecodingTable(alphabetSize) == false) return -1; // Read number of streams. 
Only 1 steam supported for now if (_bitstream.readBits(2) != 0) return -1; // Read chunk size const int szBits = EntropyUtils::readVarInt(_bitstream); if ((szBits < 0) || (szBits > int(sizeChunk) * HuffmanCommon::MAX_SYMBOL_SIZE)) return -1; // Read compressed data from bitstream if (szBits != 0) { const int sz = (szBits + 7) >> 3; const uint minLenBuf = uint(max(sz + (sz >> 3), 1024)); if (_bufferSize < minLenBuf) { if (_buffer != nullptr) delete[] _buffer; _bufferSize = minLenBuf; _buffer = new kanzi::byte[_bufferSize]; } _bitstream.readBits(&_buffer[0], szBits); uint64 state = 0; // holds bits read from bitstream uint8 bits = 0; // number of available bits in state int idx = 0; uint n = startChunk; while (idx < sz - 8) { const uint8 shift = (56 - bits) & -8; state = (state << shift) | (uint64(BigEndian::readLong64(&_buffer[idx])) >> 1 >> (63 - shift)); // handle shift = 0 idx += (shift >> 3); uint8 bs = bits + shift - DECODING_BATCH_SIZE; const uint16 val0 = _table[(state >> bs) & TABLE_MASK]; bs -= uint8(val0); const uint16 val1 = _table[(state >> bs) & TABLE_MASK]; bs -= uint8(val1); const uint16 val2 = _table[(state >> bs) & TABLE_MASK]; bs -= uint8(val2); const uint16 val3 = _table[(state >> bs) & TABLE_MASK]; bs -= uint8(val3); bits = bs + DECODING_BATCH_SIZE; block[n + 0] = kanzi::byte(val0 >> 8); block[n + 1] = kanzi::byte(val1 >> 8); block[n + 2] = kanzi::byte(val2 >> 8); block[n + 3] = kanzi::byte(val3 >> 8); n += 4; } // Last bytes uint nbBits = idx * 8; while (n < endChunk) { while ((bits < HuffmanCommon::MAX_SYMBOL_SIZE) && (idx < sz)) { state = (state << 8) | uint64(_buffer[idx] & kanzi::byte(0xFF)); idx++; nbBits = (idx == sz) ? 
szBits : nbBits + 8; // 'bits' may overshoot when idx == sz due to padding state bits // It is necessary to compute proper _table indexes // and has no consequence (except bits != 0 at end of chunk) bits += 8; } // Sanity check if (bits > 64) return n; uint16 val; if (bits >= DECODING_BATCH_SIZE) val = _table[(state >> (bits - DECODING_BATCH_SIZE)) & TABLE_MASK]; else val = _table[(state << (DECODING_BATCH_SIZE - bits)) & TABLE_MASK]; bits -= uint8(val); block[n++] = kanzi::byte(val >> 8); } } startChunk = endChunk; } return count; } kanzi-cpp-2.5.2/src/entropy/HuffmanDecoder.hpp000066400000000000000000000036121516423635400213010ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #pragma once #ifndef knz_HuffmanDecoder #define knz_HuffmanDecoder #include "HuffmanCommon.hpp" #include "../Context.hpp" #include "../EntropyDecoder.hpp" namespace kanzi { // Implementation of a static Huffman coder. 
// Decodes blocks chunk by chunk: for each chunk the code lengths are read from the
// bitstream, the canonical codes and the decoding table are rebuilt, then the
// symbols are decoded with table lookups.
class HuffmanDecoder : public EntropyDecoder
{
public:
    // bitstream : source of the compressed data
    // pCtx      : optional context; its "bsVersion" entry selects the bitstream layout
    // chunkSize : must be in [1024..HuffmanCommon::MAX_CHUNK_SIZE], otherwise the
    //             constructor throws
    HuffmanDecoder(InputBitStream& bitstream, Context* pCtx = nullptr, int chunkSize = HuffmanCommon::MAX_CHUNK_SIZE) ;

    // Releases the working buffer
    ~HuffmanDecoder() { _dispose(); if (_buffer != nullptr) delete[] _buffer; }

    // Decodes 'len' bytes into block[blkptr..blkptr+len)
    int decode(byte block[], uint blkptr, uint len);

    InputBitStream& getBitStream() const { return _bitstream; }

    void dispose() { _dispose(); }

private:
    static const int DECODING_BATCH_SIZE;
    static const int TABLE_MASK;

    InputBitStream& _bitstream;
    byte* _buffer;          // holds the compressed data of the current chunk, grown on demand
    uint _bufferSize;       // size of _buffer in bytes
    uint16 _codes[256];     // canonical code of each symbol
    uint _alphabet[256];    // symbols present in the current chunk
    uint16 _sizes[256];     // code length (in bits) of each symbol
    uint16 _table[1 << 12]; // decoding table: code -> size, symbol
    int _chunkSize;
    Context* _pCtx;         // may be null

    int readLengths();
    bool decodeChunk(byte block[], uint count);
    bool buildDecodingTable(int count);
    bool reset();
    int decodeV5(byte block[], uint blkptr, uint len); // bitstream versions below 6
    int decodeV6(byte block[], uint blkptr, uint len); // bitstream version 6 and above

    // Nothing to release besides the buffer freed by the destructor
    void _dispose() const {}
};

}
#endif
kanzi-cpp-2.5.2/src/entropy/HuffmanEncoder.cpp000066400000000000000000000272071516423635400213140ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
you may obtain a copy of the License at

                http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

// NOTE(review): the three directives below have no operand in this copy of the file;
// the header names appear to have been lost during extraction.
#include
#include
#include
#include "HuffmanEncoder.hpp"
#include "EntropyUtils.hpp"
#include "ExpGolombEncoder.hpp"
#include "../Global.hpp"
#include "../Memory.hpp"

using namespace kanzi;
using namespace std;

// The chunk size indicates how many bytes are encoded (per block) before
// resetting the frequency stats.
0 means that frequencies calculated at the // beginning of the block apply to the whole block. HuffmanEncoder::HuffmanEncoder(OutputBitStream& bitstream, int chunkSize) : _bitstream(bitstream) { if (chunkSize < 1024) throw invalid_argument("Huffman codec: The chunk size must be at least 1024"); if (chunkSize > HuffmanCommon::MAX_CHUNK_SIZE) { stringstream ss; ss << "Huffman codec: The chunk size must be at most " << HuffmanCommon::MAX_CHUNK_SIZE; throw invalid_argument(ss.str()); } _chunkSize = chunkSize; _buffer = nullptr; _bufferSize = 0; reset(); } bool HuffmanEncoder::reset() { for (uint16 i = 0; i < 256; i++) _codes[i] = i; return true; } // Rebuild Huffman codes int HuffmanEncoder::updateFrequencies(uint freqs[]) { int count = 0; uint16 sizes[256] = { 0 }; uint alphabet[256] = { 0 }; for (int i = 0; i < 256; i++) { _codes[i] = 0; if (freqs[i] > 0) alphabet[count++] = i; } EntropyUtils::encodeAlphabet(_bitstream, alphabet, 256, count); if (count == 0) return 0; if (count == 1) { _codes[alphabet[0]] = 1 << 12; sizes[alphabet[0]] = 1; } else { uint ranks[256]; // sorted ranks for (int i = 0; i < count; i++) ranks[i] = (freqs[alphabet[i]] << 8) | alphabet[i]; int maxCodeLen = computeCodeLengths(sizes, ranks, count); if (maxCodeLen == 0) throw invalid_argument("Could not generate Huffman codes: invalid code length 0"); if (maxCodeLen > HuffmanCommon::MAX_SYMBOL_SIZE) { maxCodeLen = limitCodeLengths(alphabet, freqs, sizes, ranks, count); if (maxCodeLen == 0) throw invalid_argument("Could not generate Huffman codes: invalid code length 0"); } if (maxCodeLen > HuffmanCommon::MAX_SYMBOL_SIZE) { uint16 n = 0; for (int i = 0; i < count; i++) { _codes[alphabet[i]] = n; sizes[alphabet[i]] = 8; n++; } } else { HuffmanCommon::generateCanonicalCodes(sizes, _codes, ranks, count); } } // Transmit code lengths only, freqs and codes do not matter ExpGolombEncoder egenc(_bitstream, true); uint16 prevSize = 2; // Pack size and code (size <= MAX_SYMBOL_SIZE bits) // Unary encode 
the code length differences for (int i = 0; i < count; i++) { const int s = alphabet[i]; _codes[s] |= uint16(sizes[s] << 12); egenc.encodeByte(kanzi::byte(sizes[s] - prevSize)); prevSize = sizes[s]; } return count; } int HuffmanEncoder::limitCodeLengths(const uint alphabet[], uint freqs[], uint16 sizes[], uint ranks[], int count) const { int n = 0; int debt = 0; // Fold over-the-limit sizes, skip at-the-limit sizes => incur bit debt while (sizes[ranks[n]] >= HuffmanCommon::MAX_SYMBOL_SIZE) { debt += (sizes[ranks[n]] - HuffmanCommon::MAX_SYMBOL_SIZE); sizes[ranks[n]] = HuffmanCommon::MAX_SYMBOL_SIZE; n++; } if (debt == 0) return HuffmanCommon::MAX_SYMBOL_SIZE; // Check (up to) 6 levels; one vector per size delta vector v[6]; size_t vHead[6] = { 0 }; for (int i = 0; i < 6; i++) v[i].reserve(count - n); while (n < count) { const int idx = HuffmanCommon::MAX_SYMBOL_SIZE - 1 - sizes[ranks[n]]; if ((idx > 5) || (debt < (1 << idx))) break; v[idx].push_back(n); n++; } int idx = 5; // Repay bit debt in a "semi optimized" way while ((debt > 0) && (idx >= 0)) { if ((vHead[idx] >= v[idx].size()) || (debt < (1 << idx))) { idx--; continue; } // Access element at current head sizes[ranks[v[idx][vHead[idx]]]]++; debt -= (1 << idx); // Advance head vHead[idx]++; } idx = 0; // Adjust if necessary while ((debt > 0) && (idx < 6)) { if (vHead[idx] >= v[idx].size()) { idx++; continue; } sizes[ranks[v[idx][vHead[idx]]]]++; debt -= (1 << idx); vHead[idx]++; } if (debt > 0) { // Fallback to slow (more accurate) path if fast path failed to repay the debt uint alpha[256] = { 0 }; uint f[256]; uint totalFreq = 0; for (int i = 0; i < count; i++) { f[i] = freqs[alphabet[i]]; totalFreq += f[i]; } // Renormalize to a smaller scale EntropyUtils::normalizeFrequencies(f, alpha, count, totalFreq, HuffmanCommon::MAX_CHUNK_SIZE >> 3); for (int i = 0; i < count; i++) { freqs[alphabet[i]] = f[i]; ranks[i] = (f[i] << 8) | alphabet[i]; } return computeCodeLengths(sizes, ranks, count); } return 
HuffmanCommon::MAX_SYMBOL_SIZE; } // Called only when more than 1 symbol int HuffmanEncoder::computeCodeLengths(uint16 sizes[], uint ranks[], int count) const { // Sort ranks by increasing freqs (first key) and increasing value (second key) sort(ranks, ranks + count); uint freqs[256] = { 0 }; bool valid = true; for (int i = 0; i < count; i++) { freqs[i] = ranks[i] >> 8; ranks[i] = ranks[i] & 0xFF; valid &= (freqs[i] != 0); } if (valid == false) return 0; // See [In-Place Calculation of Minimum-Redundancy Codes] // by Alistair Moffat & Jyrki Katajainen computeInPlaceSizesPhase1(freqs, count); const int maxCodeLen = computeInPlaceSizesPhase2(freqs, count); for (int i = 0; i < count; i++) sizes[ranks[i]] = uint16(freqs[i]); return maxCodeLen; } void HuffmanEncoder::computeInPlaceSizesPhase1(uint data[], int n) { for (int s = 0, r = 0, t = 0; t < n - 1; t++) { uint sum = 0; for (int i = 0; i < 2; i++) { if ((s >= n) || ((r < t) && (data[r] < data[s]))) { sum += data[r]; data[r] = t; r++; continue; } sum += data[s]; if (s > t) data[s] = 0; s++; } data[t] = sum; } } // n must be at least 2 // return max symbol length uint HuffmanEncoder::computeInPlaceSizesPhase2(uint data[], int n) { if (n < 2) return 0; uint topLevel = n - 2; //root uint depth = 1; uint totalNodesAtLevel = 2; while (n > 0) { uint k = topLevel; while ((k != 0) && (data[k - 1] >= topLevel)) k--; const int internalNodesAtLevel = topLevel - k; const int leavesAtLevel = totalNodesAtLevel - internalNodesAtLevel; for (int j = 0; j < leavesAtLevel; j++) data[--n] = depth; totalNodesAtLevel = internalNodesAtLevel << 1; topLevel = k; depth++; } return depth - 1; } // Dynamically compute the frequencies for every chunk of data in the block int HuffmanEncoder::encode(const kanzi::byte block[], uint blkptr, uint count) { if (count == 0) return 0; const uint sz = uint(_chunkSize); const uint minLenBuf = max(min(sz + (sz >> 3), 2 * count), uint(65536)); if (_bufferSize < minLenBuf) { if (_buffer != nullptr) delete[] 
_buffer; _bufferSize = minLenBuf; _buffer = new kanzi::byte[_bufferSize]; } uint startChunk = blkptr; const uint end = startChunk + count; while (startChunk < end) { // Update frequencies and rebuild Huffman codes const uint sizeChunk = min(uint(_chunkSize), end - startChunk); if (sizeChunk < 32) { // Special case for small chunks _bitstream.writeBits(&block[startChunk], 8 * sizeChunk); } else { uint freqs[256] = { 0 }; Global::computeHistogram(&block[startChunk], sizeChunk, freqs); // Skip chunk if only one symbol if (updateFrequencies(freqs) > 1) { encodeChunk(&block[startChunk], sizeChunk); } } startChunk += sizeChunk; } return count; } // count is at least 32 void HuffmanEncoder::encodeChunk(const kanzi::byte block[], uint count) { uint nbBits[4] = { 0 }; const uint szFrag = count / 4; const uint szFrag4 = szFrag & ~3; const uint szBuf = _bufferSize / 4; // Encode chunk for (int j = 0; j < 4; j++) { const kanzi::byte* src = &block[j * szFrag]; kanzi::byte* buf = &_buffer[j * szBuf]; int idx = 0; int bits = 0; // number of accumulated bits uint64 state = 0; // Encode fragments sequentially for (uint i = 0; i < szFrag4; i += 4) { const uint16 code0 = _codes[int(src[i])]; const uint16 codeLen0 = code0 >> 12; const uint16 code1 = _codes[int(src[i + 1])]; const uint16 codeLen1 = code1 >> 12; const uint16 code2 = _codes[int(src[i + 2])]; const uint16 codeLen2 = code2 >> 12; const uint16 code3 = _codes[int(src[i + 3])]; const uint16 codeLen3 = code3 >> 12; state = (state << codeLen0) | uint64(code0 & 0x0FFF); state = (state << codeLen1) | uint64(code1 & 0x0FFF); state = (state << codeLen2) | uint64(code2 & 0x0FFF); state = (state << codeLen3) | uint64(code3 & 0x0FFF); bits += (codeLen0 + codeLen1 + codeLen2 + codeLen3); BigEndian::writeLong64(&buf[idx], state << (64 - bits)); // bits cannot be 0 idx += (bits >> 3); bits &= 7; } // Fragment last bytes for (uint i = szFrag4; i < szFrag; i++) { const uint16 code = _codes[int(src[i])]; const uint16 codeLen = code >> 12; 
state = (state << codeLen) | uint64(code & 0x0FFF); bits += codeLen; } nbBits[j] = (idx * 8) + bits; while (bits >= 8) { bits -= 8; buf[idx++] = kanzi::byte(state >> bits); } if (bits > 0) buf[idx++] = kanzi::byte(state << (8 - bits)); } // Write chunk size in bits EntropyUtils::writeVarInt(_bitstream, nbBits[0]); EntropyUtils::writeVarInt(_bitstream, nbBits[1]); EntropyUtils::writeVarInt(_bitstream, nbBits[2]); EntropyUtils::writeVarInt(_bitstream, nbBits[3]); // Write compressed data to bitstream _bitstream.writeBits(&_buffer[0 * szBuf], nbBits[0]); _bitstream.writeBits(&_buffer[1 * szBuf], nbBits[1]); _bitstream.writeBits(&_buffer[2 * szBuf], nbBits[2]); _bitstream.writeBits(&_buffer[3 * szBuf], nbBits[3]); // Chunk last bytes const uint count4 = 4 * szFrag; for (uint i = count4; i < count; i++) _bitstream.writeBits(uint64(block[i]), 8); } kanzi-cpp-2.5.2/src/entropy/HuffmanEncoder.hpp000066400000000000000000000036031516423635400213130ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #pragma once #ifndef knz_HuffmanEncoder #define knz_HuffmanEncoder #include "HuffmanCommon.hpp" #include "../EntropyEncoder.hpp" namespace kanzi { // Implementation of a static Huffman encoder. 
// Uses in place generation of canonical codes instead of a tree class HuffmanEncoder : public EntropyEncoder { public: HuffmanEncoder(OutputBitStream& bitstream, int chunkSize = HuffmanCommon::MAX_CHUNK_SIZE); ~HuffmanEncoder() { _dispose(); if (_buffer != nullptr) delete[] _buffer; } int updateFrequencies(uint frequencies[]); int encode(const byte block[], uint blkptr, uint len); OutputBitStream& getBitStream() const { return _bitstream; } void dispose() { _dispose(); } private: OutputBitStream& _bitstream; uint16 _codes[256]; int _chunkSize; byte* _buffer; uint _bufferSize; void encodeChunk(const byte block[], uint count); int computeCodeLengths(uint16 sizes[], uint sranks[], int count) const; int limitCodeLengths(const uint alphabet[], uint freqs[], uint16 sizes[], uint sranks[], int count) const; void _dispose() const {} bool reset(); static void computeInPlaceSizesPhase1(uint data[], int n); static uint computeInPlaceSizesPhase2(uint data[], int n); }; } #endif kanzi-cpp-2.5.2/src/entropy/NullEntropyDecoder.hpp000066400000000000000000000035271516423635400222150ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once #ifndef knz_NullEntropyDecoder #define knz_NullEntropyDecoder #include "../EntropyDecoder.hpp" #include "../InputBitStream.hpp" namespace kanzi { // Null entropy decoder // Pass through that writes the data directly to the bitstream class NullEntropyDecoder FINAL : public EntropyDecoder { private: InputBitStream& _bitstream; public: NullEntropyDecoder(InputBitStream& bitstream); ~NullEntropyDecoder() {} int decode(byte block[], uint blkptr, uint len); byte decodeByte(); InputBitStream& getBitStream() const { return _bitstream; } void dispose() {} }; inline NullEntropyDecoder::NullEntropyDecoder(InputBitStream& bitstream) : _bitstream(bitstream) { } inline int NullEntropyDecoder::decode(byte block[], uint blkptr, uint count) { uint res = 0; while (count != 0) { const uint ckSize = (count < 1<<23) ? count : 1<<23; const uint r = uint(_bitstream.readBits(&block[blkptr], 8 * ckSize) >> 3); if (r == 0) break; res += r; blkptr += r; count -= r; } return res; } inline byte NullEntropyDecoder::decodeByte() { return byte(_bitstream.readBits(8)); } } #endif kanzi-cpp-2.5.2/src/entropy/NullEntropyEncoder.hpp000066400000000000000000000035721516423635400222270ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once #ifndef knz_NullEntropyEncoder #define knz_NullEntropyEncoder #include "../EntropyEncoder.hpp" #include "../OutputBitStream.hpp" namespace kanzi { // Null entropy encoder // Pass through that writes the data directly to the bitstream class NullEntropyEncoder FINAL : public EntropyEncoder { private: OutputBitStream& _bitstream; public: NullEntropyEncoder(OutputBitStream& bitstream); ~NullEntropyEncoder() {} int encode(const byte block[], uint blkptr, uint len); void encodeByte(byte val); OutputBitStream& getBitStream() const { return _bitstream; } void dispose() {} }; inline NullEntropyEncoder::NullEntropyEncoder(OutputBitStream& bitstream) : _bitstream(bitstream) { } inline int NullEntropyEncoder::encode(const byte block[], uint blkptr, uint count) { uint res = 0; while (count != 0) { const uint ckSize = (count < 1<<23) ? count : 1<<23; const uint w = uint(_bitstream.writeBits(&block[blkptr], 8 * ckSize) >> 3); if (w == 0) break; res += w; blkptr += w; count -= w; } return res; } inline void NullEntropyEncoder::encodeByte(byte val) { _bitstream.writeBits(uint64(val), 8); } } #endif kanzi-cpp-2.5.2/src/entropy/RangeDecoder.cpp000066400000000000000000000144141516423635400207460ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #include #include #include #include "../BitStreamException.hpp" #include "RangeDecoder.hpp" #include "EntropyUtils.hpp" using namespace kanzi; using namespace std; const int RangeDecoder::DECODING_BATCH_SIZE = 12; // in bits const int RangeDecoder::DECODING_MASK = (1 << DECODING_BATCH_SIZE) - 1; const uint64 RangeDecoder::TOP_RANGE = 0x0FFFFFFFFFFFFFFF; const uint64 RangeDecoder::BOTTOM_RANGE = 0x000000000000FFFF; const uint64 RangeDecoder::RANGE_MASK = 0x0FFFFFFF00000000; const int RangeDecoder::DEFAULT_CHUNK_SIZE = 1 << 15; // 32 KB by default const int RangeDecoder::DEFAULT_LOG_RANGE = 12; const int RangeDecoder::MAX_CHUNK_SIZE = 1 << 30; // The chunk size indicates how many bytes are encoded (per block) before // resetting the frequency stats. RangeDecoder::RangeDecoder(InputBitStream& bitstream, int chunkSize) : _bitstream(bitstream) { if (chunkSize < 1024) throw invalid_argument("The chunk size must be at least 1024"); if (chunkSize > MAX_CHUNK_SIZE) throw invalid_argument("The chunk size must be at most 2^30"); _f2s = nullptr; _chunkSize = chunkSize; reset(); } bool RangeDecoder::reset() { _low = 0; _range = TOP_RANGE; _code = 0; _lenF2S = 0; _shift = 0; memset(_alphabet, 0, sizeof(uint) * 256); memset(_freqs, 0, sizeof(uint) * 256); memset(_cumFreqs, 0, sizeof(uint64) * 257); return true; } int RangeDecoder::decodeHeader(uint frequencies[]) { int alphabetSize = EntropyUtils::decodeAlphabet(_bitstream, _alphabet); if (alphabetSize == 0) return 0; if (alphabetSize != 256) { memset(frequencies, 0, sizeof(uint) * 256); } const uint logRange = uint(8 + _bitstream.readBits(3)); const int scale = 1 << logRange; _shift = logRange; int sum = 0; const int chkSize = (alphabetSize >= 64) ? 
8 : 6; int llr = 3; while (uint(1 << llr) <= logRange) llr++; // Decode all frequencies (but the first one) by chunks of size 'inc' for (int i = 1; i < alphabetSize; i += chkSize) { const int logMax = int(_bitstream.readBits(llr)); if ((1 << logMax) > scale) { stringstream ss; ss << "Invalid bitstream: incorrect frequency size "; ss << logMax << " in ANS range decoder"; throw BitStreamException(ss.str(), BitStreamException::INVALID_STREAM); } const int endj = min(i + chkSize, alphabetSize); // Read frequencies for (int j = i; j < endj; j++) { const int freq = (logMax == 0) ? 1 : int(_bitstream.readBits(logMax) + 1); if ((freq <= 0) || (freq >= scale)) { stringstream ss; ss << "Invalid bitstream: incorrect frequency " << freq; ss << " for symbol '" << _alphabet[j] << "' in range decoder"; throw BitStreamException(ss.str(), BitStreamException::INVALID_STREAM); } frequencies[_alphabet[j]] = uint(freq); sum += freq; } } // Infer first frequency if (scale <= sum) { stringstream ss; ss << "Invalid bitstream: incorrect frequency " << frequencies[_alphabet[0]]; ss << " for symbol '" << _alphabet[0] << "' in range decoder"; throw BitStreamException(ss.str(), BitStreamException::INVALID_STREAM); } frequencies[_alphabet[0]] = uint(scale - sum); _cumFreqs[0] = 0; if (_lenF2S < scale) { if (_f2s != nullptr) delete[] _f2s; _lenF2S = scale; _f2s = new short[_lenF2S]; } // Create histogram of frequencies scaled to 'range' and reverse mapping for (int i = 0; i < 256; i++) { _cumFreqs[i + 1] = _cumFreqs[i] + frequencies[i]; const int base = int(_cumFreqs[i]); for (int j = frequencies[i] - 1; j >= 0; j--) _f2s[base + j] = short(i); } return alphabetSize; } // Initialize once (if necessary) at the beginning, the use the faster decodeByte_() // Reset frequency stats for each chunk of data in the block int RangeDecoder::decode(kanzi::byte block[], uint blkptr, uint count) { if (count == 0) return 0; const uint end = blkptr + count; const uint sz = _chunkSize; uint startChunk = blkptr; 
while (startChunk < end) { const uint endChunk = min(startChunk + sz, end); const int alphabetSize = decodeHeader(_freqs); if (alphabetSize == 0) return startChunk - blkptr; if (alphabetSize == 1) { // Shortcut for chunks with only one symbol memset(&block[startChunk], _alphabet[0], size_t(endChunk - startChunk)); startChunk = endChunk; continue; } _range = TOP_RANGE; _low = 0; _code = _bitstream.readBits(60); for (uint i = startChunk; i < endChunk; i++) block[i] = decodeByte(); startChunk = endChunk; } return count; } kanzi::byte RangeDecoder::decodeByte() { // Compute next low and range _range >>= _shift; const int symbol = _f2s[int((_code - _low) / _range)]; const uint64 cumFreq = _cumFreqs[symbol]; const uint64 freq = _cumFreqs[symbol + 1] - cumFreq; _low += (cumFreq * _range); _range *= freq; // If the left-most digits are the same throughout the range, read bits from bitstream while (true) { if (((_low ^ (_low + _range)) & RANGE_MASK) != 0) { if (_range > BOTTOM_RANGE) break; // Normalize _range = ~(_low-1) & BOTTOM_RANGE; } _code = (_code << 28) | _bitstream.readBits(28); _range <<= 28; _low <<= 28; } return kanzi::byte(symbol); } kanzi-cpp-2.5.2/src/entropy/RangeDecoder.hpp000066400000000000000000000040521516423635400207500ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once #ifndef knz_RangeDecoder #define knz_RangeDecoder #include "../EntropyDecoder.hpp" namespace kanzi { // Based on Order 0 range coder by Dmitry Subbotin itself derived from the algorithm // described by G.N.N Martin in his seminal article in 1979. // [G.N.N. Martin on the Data Recording Conference, Southampton, 1979] // Optimized for speed. class RangeDecoder : public EntropyDecoder { public: static const int DECODING_BATCH_SIZE; static const int DECODING_MASK; RangeDecoder(InputBitStream& bitstream, int chunkSize = DEFAULT_CHUNK_SIZE); ~RangeDecoder() { _dispose(); if (_f2s != nullptr) delete[] _f2s; } int decode(byte block[], uint blkptr, uint len); InputBitStream& getBitStream() const { return _bitstream; } void dispose() { _dispose(); } private: static const uint64 TOP_RANGE; static const uint64 BOTTOM_RANGE; static const uint64 RANGE_MASK; static const int DEFAULT_CHUNK_SIZE; static const int DEFAULT_LOG_RANGE; static const int MAX_CHUNK_SIZE; uint64 _code; uint64 _low; uint64 _range; uint _alphabet[256]; uint _freqs[256]; uint64 _cumFreqs[257]; short* _f2s; int _lenF2S; InputBitStream& _bitstream; uint _chunkSize; uint _shift; int decodeHeader(uint frequencies[]); byte decodeByte(); bool reset(); void _dispose() const {} }; } #endif kanzi-cpp-2.5.2/src/entropy/RangeEncoder.cpp000066400000000000000000000134301516423635400207550ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #include #include #include #include "RangeEncoder.hpp" #include "EntropyUtils.hpp" #include "../Global.hpp" using namespace kanzi; using namespace std; const uint64 RangeEncoder::TOP_RANGE = 0x0FFFFFFFFFFFFFFF; const uint64 RangeEncoder::BOTTOM_RANGE = 0x000000000000FFFF; const uint64 RangeEncoder::RANGE_MASK = 0x0FFFFFFF00000000; const int RangeEncoder::DEFAULT_CHUNK_SIZE = 1 << 15; // 32 KB by default const int RangeEncoder::DEFAULT_LOG_RANGE = 12; const int RangeEncoder::MAX_CHUNK_SIZE = 1 << 30; // The chunk size indicates how many bytes are encoded (per block) before // resetting the frequency stats. RangeEncoder::RangeEncoder(OutputBitStream& bitstream, int chunkSize, int logRange) : _bitstream(bitstream) { if (chunkSize < 1024) throw invalid_argument("The chunk size must be at least 1024"); if (chunkSize > MAX_CHUNK_SIZE) throw invalid_argument("The chunk size must be at most 2^30"); if ((logRange < 8) || (logRange > 16)) { stringstream ss; ss << "Invalid range parameter: " << logRange << " (must be in [8..16])"; throw invalid_argument(ss.str()); } _logRange = logRange; _chunkSize = chunkSize; reset(); } bool RangeEncoder::reset() { _low = 0; _range = TOP_RANGE; _shift = 0; memset(_alphabet, 0, 256 * sizeof(uint)); memset(_freqs, 0, 256 * sizeof(uint)); memset(_cumFreqs, 0, 257 * sizeof(uint64)); return true; } int RangeEncoder::updateFrequencies(uint frequencies[], int size, int lr) { int alphabetSize = EntropyUtils::normalizeFrequencies(frequencies, _alphabet, 256, size, 1 << lr); if (alphabetSize > 0) { _cumFreqs[0] = 0; // Create histogram of frequencies scaled to 'range' for (int i = 0; i < 256; i++) _cumFreqs[i + 1] = _cumFreqs[i] + frequencies[i]; } encodeHeader(alphabetSize, _alphabet, frequencies, lr); return alphabetSize; } bool RangeEncoder::encodeHeader(int alphabetSize, const uint alphabet[], const uint frequencies[], int lr) const { const int encoded = EntropyUtils::encodeAlphabet(_bitstream, alphabet, 256, alphabetSize); if (encoded < 0) 
return false; if (encoded == 0) return true; _bitstream.writeBits(lr - 8, 3); // logRange if (encoded == 1) return true; int chkSize = (alphabetSize >= 64) ? 8 : 6; int llr = 3; while (1 << llr <= lr) llr++; // Encode all frequencies (but the first one) by chunks for (int i = 1; i < alphabetSize; i += chkSize) { uint max = frequencies[alphabet[i]] - 1; int endj = min(i + chkSize, alphabetSize); // Search for max frequency log size in next chunk for (int j = i + 1; j < endj; j++) { if (frequencies[alphabet[j]] - 1 > max) max = frequencies[alphabet[j]] - 1; } const uint logMax = (max == 0) ? 0 : Global::_log2(max) + 1; _bitstream.writeBits(logMax, llr); if (logMax == 0) // all frequencies equal one in this chunk continue; // Write frequencies for (int j = i; j < endj; j++) _bitstream.writeBits(frequencies[alphabet[j]] - 1, logMax); } return true; } // Reset frequency stats for each chunk of data in the block int RangeEncoder::encode(const kanzi::byte block[], uint blkptr, uint count) { if (count == 0) return 0; const uint end = blkptr + count; const uint sz = _chunkSize; uint startChunk = blkptr; while (startChunk < end) { const uint endChunk = min(startChunk + sz, end); _range = TOP_RANGE; _low = 0; int lr = _logRange; // Lower log range if the size of the data chunk is small while ((lr > 8) && (uint(1 << lr) > endChunk - startChunk)) lr--; if (rebuildStatistics(block, startChunk, endChunk, lr) <= 1) { // Skip chunk if only one symbol startChunk = endChunk; continue; } _shift = lr; for (uint i = startChunk; i < endChunk; i++) encodeByte(block[i]); // Flush 'low' _bitstream.writeBits(_low, 60); startChunk = endChunk; } return count; } void RangeEncoder::encodeByte(kanzi::byte b) { // Compute next low and range const int symbol = int(b); const uint64 cumFreq = _cumFreqs[symbol]; const uint64 freq = _cumFreqs[symbol + 1] - cumFreq; _range >>= _shift; _low += (cumFreq * _range); _range *= freq; // If the left-most digits are the same throughout the range, write bits to 
bitstream while (true) { if (((_low ^ (_low + _range)) & RANGE_MASK) != 0) { if (_range > BOTTOM_RANGE) break; // Normalize _range = ~(_low - 1) & BOTTOM_RANGE; } _bitstream.writeBits(_low >> 32, 28); _range <<= 28; _low <<= 28; } } // Compute chunk frequencies, cumulated frequencies and encode chunk header int RangeEncoder::rebuildStatistics(const kanzi::byte block[], int start, int end, int lr) { memset(_freqs, 0, sizeof(_freqs)); Global::computeHistogram(&block[start], end - start, _freqs); return updateFrequencies(_freqs, end - start, lr); } kanzi-cpp-2.5.2/src/entropy/RangeEncoder.hpp000066400000000000000000000042161516423635400207640ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #pragma once #ifndef knz_RangeEncoder #define knz_RangeEncoder #include "../EntropyEncoder.hpp" namespace kanzi { // Based on Order 0 range coder by Dmitry Subbotin itself derived from the algorithm // described by G.N.N Martin in his seminal article in 1979. // [G.N.N. Martin on the Data Recording Conference, Southampton, 1979] // Optimized for speed. 
class RangeEncoder : public EntropyEncoder { public: RangeEncoder(OutputBitStream& bitstream, int chunkSize = DEFAULT_CHUNK_SIZE, int logRange=DEFAULT_LOG_RANGE); ~RangeEncoder() { _dispose(); } int encode(const byte block[], uint blkptr, uint len); OutputBitStream& getBitStream() const { return _bitstream; } void dispose() { _dispose(); } private: static const uint64 TOP_RANGE; static const uint64 BOTTOM_RANGE; static const uint64 RANGE_MASK; static const int DEFAULT_CHUNK_SIZE; static const int DEFAULT_LOG_RANGE; static const int MAX_CHUNK_SIZE; uint64 _low; uint64 _range; uint _alphabet[256]; uint _freqs[256]; uint64 _cumFreqs[257]; OutputBitStream& _bitstream; uint _chunkSize; uint _logRange; uint _shift; int rebuildStatistics(const byte block[], int start, int end, int lr); int updateFrequencies(uint frequencies[], int size, int lr); void encodeByte(byte b); bool encodeHeader(int alphabetSize, const uint alphabet[], const uint frequencies[], int lr) const; bool reset(); void _dispose() const {} }; } #endif kanzi-cpp-2.5.2/src/entropy/TPAQPredictor.cpp000066400000000000000000000035671516423635400210540ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #include "TPAQPredictor.hpp" using namespace kanzi; const int TPAQMixer::BEGIN_LEARN_RATE = 60 << 7; const int TPAQMixer::END_LEARN_RATE = 11 << 7; template<> const int TPAQPredictor::MAX_LENGTH = 88; template<> const int TPAQPredictor::BUFFER_SIZE = 64 * 1024 * 1024; template<> const int TPAQPredictor::HASH_SIZE = 16 * 1024 * 1024; template<> const int TPAQPredictor::HASH = 0x7FEB352D; template<> const uint TPAQPredictor::MASK_80808080 = 0x80808080u; template<> const uint TPAQPredictor::MASK_F0F0F000 = 0xF0F0F000u; template<> const uint TPAQPredictor::MASK_4F4FFFFF = 0x4F4FFFFFu; template<> const int TPAQPredictor::MAX_LENGTH = 88; template<> const int TPAQPredictor::BUFFER_SIZE = 64 * 1024 * 1024; template<> const int TPAQPredictor::HASH_SIZE = 16 * 1024 * 1024; template<> const int TPAQPredictor::HASH = 0x7FEB352D; template<> const uint TPAQPredictor::MASK_80808080 = 0x80808080u; template<> const uint TPAQPredictor::MASK_F0F0F000 = 0xF0F0F000u; template<> const uint TPAQPredictor::MASK_4F4FFFFF = 0x4F4FFFFFu; TPAQMixer::TPAQMixer() { _pr = 2048; _skew = 0; _w0 = _w1 = _w2 = _w3 = _w4 = _w5 = _w6 = _w7 = 32768; _p0 = _p1 = _p2 = _p3 = _p4 = _p5 = _p6 = _p7 = 0; _learnRate = BEGIN_LEARN_RATE; } kanzi-cpp-2.5.2/src/entropy/TPAQPredictor.hpp000066400000000000000000000511761516423635400210600ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once #ifndef knz_TPAQPredictor #define knz_TPAQPredictor #include #include "../Context.hpp" #include "../Predictor.hpp" #include "../Memory.hpp" #include "AdaptiveProbMap.hpp" namespace kanzi { // TPAQ predictor // Initially based on Tangelo 2.4 (by Jan Ondrus). // PAQ8 is written by Matt Mahoney. // See http://encode.su/threads/1738-TANGELO-new-compressor-(derived-from-PAQ8-FP8) // Mixer combines models using neural networks with 8 inputs. class TPAQMixer { public: TPAQMixer(); ~TPAQMixer() { } void update(int bit); int get(int p0, int p1, int p2, int p3, int p4, int p5, int p6, int p7); private: static const int BEGIN_LEARN_RATE; static const int END_LEARN_RATE; int _w0, _w1, _w2, _w3, _w4, _w5, _w6, _w7; int _p0, _p1, _p2, _p3, _p4, _p5, _p6, _p7; int _pr; int _skew; int _learnRate; }; template class TPAQPredictor FINAL : public Predictor { public: TPAQPredictor(Context* ctx = nullptr); ~TPAQPredictor(); void update(int bit); // Return the split value representing the probability of 1 in the [0..4095] range. int get() { return _pr; } private: static const int MAX_LENGTH; static const int BUFFER_SIZE; static const int HASH_SIZE; static const int HASH; static const uint MASK_80808080; static const uint MASK_F0F0F000; static const uint MASK_4F4FFFFF; #define SSE0_RATE(T) ((T == true) ? 
6 : 7) int _pr; // next predicted value (0-4095) uint _c0; // bitwise context: last 0-7 bits with a leading 1 (1-255) uint _c4; // last 4 whole bytes, last is in low 8 bits uint _c8; // last 8 to 4 whole bytes, last is in low 8 bits int _bpos; // number of bits in c0 (0-7) int _pos; int _binCount; int _matchLen; int _matchPos; int _matchVal; uint _hash; LogisticAdaptiveProbMap _sse0; LogisticAdaptiveProbMap _sse1; TPAQMixer* _mixers; TPAQMixer* _mixer; // current mixer byte* _buffer; int* _hashes; // hash table(context, buffer position) uint8* _bigStatesMap;// hash table(context, prediction) uint8* _smallStatesMap0; // hash table(context, prediction) uint8* _smallStatesMap1; // hash table(context, prediction) uint _statesMask; uint _mixersMask; uint _hashMask; uint _bufferMask; uint8* _cp0; // context pointers uint8* _cp1; uint8* _cp2; uint8* _cp3; uint8* _cp4; uint8* _cp5; uint8* _cp6; int _ctx0; // contexts int _ctx1; int _ctx2; int _ctx3; int _ctx4; int _ctx5; int _ctx6; int hash(uint x, uint y) const; int createContext(uint ctxId, uint cx) const; int getMatchContextPred(); void findMatch(); bool reset(); }; // Adjust weights to minimize coding cost of last prediction inline void TPAQMixer::update(int bit) { const int err = (((bit << 12) - _pr) * _learnRate) >> 10; if (err == 0) return; // Quickly decaying learn rate _learnRate -= (uint32(END_LEARN_RATE - _learnRate) >> 31); _skew += err; // Train Neural Network: update weights _w0 += ((_p0 * err + 0) >> 12); _w1 += ((_p1 * err + 0) >> 12); _w2 += ((_p2 * err + 0) >> 12); _w3 += ((_p3 * err + 0) >> 12); _w4 += ((_p4 * err + 0) >> 12); _w5 += ((_p5 * err + 0) >> 12); _w6 += ((_p6 * err + 0) >> 12); _w7 += ((_p7 * err + 0) >> 12); } inline int TPAQMixer::get(int p0, int p1, int p2, int p3, int p4, int p5, int p6, int p7) { _p0 = p0; _p1 = p1; _p2 = p2; _p3 = p3; _p4 = p4; _p5 = p5; _p6 = p6; _p7 = p7; // Neural Network dot product (sum weights*inputs) _pr = Global::squash(((p0 * _w0) + (p1 * _w1) + (p2 * _w2) + 
(p3 * _w3) + (p4 * _w4) + (p5 * _w5) + (p6 * _w6) + (p7 * _w7) + _skew + 65536) >> 17); return _pr; } ///////////////////////// state table //////////////////////// // States represent a bit history within some context. // State 0 is the starting state (no bits seen). // States 1-30 represent all possible sequences of 1-4 bits. // States 31-252 represent a pair of counts, (n0,n1), the number // of 0 and 1 bits respectively. If n0+n1 < 16 then there are // two states for each pair, depending on if a 0 or 1 was the last // bit seen. // If n0 and n1 are too large, then there is no state to represent this // pair, so another state with about the same ratio of n0/n1 is substituted. // Also, when a bit is observed and the count of the opposite bit is large, // then part of this count is discarded to favor newer data over old. const uint8 STATE_TRANSITIONS[2][256] = { // Bit 0 { 1, 3, 143, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 47, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 6, 71, 71, 71, 61, 75, 56, 77, 78, 77, 80, 81, 82, 83, 84, 85, 86, 87, 88, 77, 90, 91, 92, 80, 94, 95, 96, 97, 98, 99, 90, 101, 94, 103, 101, 102, 104, 107, 104, 105, 108, 111, 112, 113, 114, 115, 116, 92, 118, 94, 103, 119, 122, 123, 94, 113, 126, 113, 128, 129, 114, 131, 132, 112, 134, 111, 134, 110, 134, 134, 128, 128, 142, 143, 115, 113, 142, 128, 148, 149, 79, 148, 142, 148, 150, 155, 149, 157, 149, 159, 149, 131, 101, 98, 115, 114, 91, 79, 58, 1, 170, 129, 128, 110, 174, 128, 176, 129, 174, 179, 174, 176, 141, 157, 179, 185, 157, 187, 188, 168, 151, 191, 192, 188, 187, 172, 175, 170, 152, 185, 170, 176, 170, 203, 148, 185, 203, 185, 192, 209, 188, 211, 192, 213, 214, 188, 216, 168, 84, 54, 54, 221, 54, 55, 85, 69, 63, 56, 86, 58, 230, 231, 57, 229, 56, 224, 54, 54, 66, 58, 54, 61, 57, 222, 78, 85, 82, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0 }, // Bit 1 { 2, 163, 169, 163, 165, 89, 245, 217, 245, 245, 233, 244, 227, 74, 221, 221, 218, 226, 243, 218, 238, 242, 74, 238, 241, 240, 239, 224, 225, 221, 232, 72, 224, 228, 223, 225, 238, 73, 167, 76, 237, 234, 231, 72, 31, 63, 225, 237, 236, 235, 53, 234, 53, 234, 229, 219, 229, 233, 232, 228, 226, 72, 74, 222, 75, 220, 167, 57, 218, 70, 168, 72, 73, 74, 217, 76, 167, 79, 79, 166, 162, 162, 162, 162, 165, 89, 89, 165, 89, 162, 93, 93, 93, 161, 100, 93, 93, 93, 93, 93, 161, 102, 120, 104, 105, 106, 108, 106, 109, 110, 160, 134, 108, 108, 126, 117, 117, 121, 119, 120, 107, 124, 117, 117, 125, 127, 124, 139, 130, 124, 133, 109, 110, 135, 110, 136, 137, 138, 127, 140, 141, 145, 144, 124, 125, 146, 147, 151, 125, 150, 127, 152, 153, 154, 156, 139, 158, 139, 156, 139, 130, 117, 163, 164, 141, 163, 147, 2, 2, 199, 171, 172, 173, 177, 175, 171, 171, 178, 180, 172, 181, 182, 183, 184, 186, 178, 189, 181, 181, 190, 193, 182, 182, 194, 195, 196, 197, 198, 169, 200, 201, 202, 204, 180, 205, 206, 207, 208, 210, 194, 212, 184, 215, 193, 184, 208, 193, 163, 219, 168, 94, 217, 223, 224, 225, 76, 227, 217, 229, 219, 79, 86, 165, 217, 214, 225, 216, 216, 234, 75, 214, 237, 74, 74, 163, 217, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }; const int STATE_MAP[] = { -31, -400, 406, -547, -642, -743, -827, -901, -901, -974, -945, -955, -1060, -1031, -1044, -956, -994, -1035, -1147, -1069, -1111, -1145, -1096, -1084, -1171, -1199, -1062, -1498, -1199, -1199, -1328, -1405, -1275, -1248, -1167, -1448, -1441, -1199, -1357, -1160, -1437, -1428, -1238, -1343, -1526, -1331, -1443, -2047, -2047, -2044, -2047, -2047, -2047, -232, -414, -573, -517, -768, -627, -666, -644, -740, -721, -829, -770, -963, -863, -1099, -811, -830, -277, -1036, -286, -218, -42, -411, 141, -1014, -1028, -226, -469, -540, -573, -581, -594, -610, -628, -711, -670, -144, -408, -485, -464, -173, -221, -310, -335, -375, -324, -413, -99, -179, -105, -150, -63, -9, 56, 83, 119, 144, 198, 118, -42, -96, -188, -285, -376, 107, -138, 
38, -82, 186, -114, -190, 200, 327, 65, 406, 108, -95, 308, 171, -18, 343, 135, 398, 415, 464, 514, 494, 508, 519, 92, -123, 343, 575, 585, 516, -7, -156, 209, 574, 613, 621, 670, 107, 989, 210, 961, 246, 254, -12, -108, 97, 281, -143, 41, 173, -209, 583, -55, 250, 354, 558, 43, 274, 14, 488, 545, 84, 528, 519, 587, 634, 663, 95, 700, 94, -184, 730, 742, 162, -10, 708, 692, 773, 707, 855, 811, 703, 790, 871, 806, 9, 867, 840, 990, 1023, 1409, 194, 1397, 183, 1462, 178, -23, 1403, 247, 172, 1, -32, -170, 72, -508, -46, -365, -26, -146, 101, -18, -163, -422, -461, -146, -69, -78, -319, -334, -232, -99, 0, 47, -74, 0, -452, 14, -57, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, }; const int MATCH_PRED[] = { 0, 64, 128, 192, 256, 320, 384, 448, 512, 576, 640, 704, 768, 832, 896, 960, 1024, 1038, 1053, 1067, 1082, 1096, 1111, 1125, 1139, 1154, 1168, 1183, 1197, 1211, 1226, 1240, 1255, 1269, 1284, 1298, 1312, 1327, 1341, 1356, 1370, 1385, 1399, 1413, 1428, 1442, 1457, 1471, 1486, 1500, 1514, 1529, 1543, 1558, 1572, 1586, 1601, 1615, 1630, 1644, 1659, 1673, 1687, 1702, 1716, 1731, 1745, 1760, 1774, 1788, 1803, 1817, 1832, 1846, 1861, 1875, 1889, 1904, 1918, 1933, 1947, 1961, 1976, 1990, 2005, 2019, 2034, 2047, }; template TPAQPredictor::TPAQPredictor(Context* ctx) : _sse0(256) , _sse1((T == true) ? 65536 : 256) { uint statesSize = 1 << 28; uint mixersSize = 1 << 12; uint hashSize = HASH_SIZE; uint extraMem = (T == true) ? 1 : 0; uint bufferSize = BUFFER_SIZE; uint bsVersion = 6; if (ctx != nullptr) { // Block size requested by the user // The user can request a big block size to force more states const int rbsz = ctx->getInt("blockSize", 32768); if (rbsz >= 64 * 1024 * 1024) statesSize = 1 << 28; else if (rbsz >= 16 * 1024 * 1024) statesSize = 1 << 27; else if (rbsz >= 4 * 1024 * 1024) statesSize = 1 << 26; else statesSize = (rbsz >= 1024 * 1024) ? 1 << 24 : 1 << 22; // Actual size of the current block // Too many mixers hurts compression for small blocks. 
// Too few mixers hurts compression for big blocks. const int absz = ctx->getInt("size", rbsz); if (absz >= 32 * 1024 * 1024) mixersSize = 1 << 16; else if (absz >= 16 * 1024 * 1024) mixersSize = 1 << 15; else if (absz >= 8 * 1024 * 1024) mixersSize = 1 << 14; else if (absz >= 4 * 1024 * 1024) mixersSize = 1 << 13; else mixersSize = (absz >= 1 * 1024 * 1024) ? 1 << 11 : 1 << 8; bufferSize = rbsz < BUFFER_SIZE ? rbsz : BUFFER_SIZE; const uint mxsz = absz < (1 << 26) ? absz * 16 : 1 << 30; hashSize = hashSize < mxsz ? hashSize : mxsz; bsVersion = ctx->getInt("bsVersion", bsVersion); } mixersSize <<= (2 * extraMem); statesSize <<= (2 * extraMem); hashSize <<= (2 * extraMem); // Cap hash size for java compatibility if ((bsVersion > 5) && (hashSize > 1024 * 1024 * 1024)) hashSize = 1024 * 1024 * 1024; _statesMask = statesSize - 1; _mixersMask = (mixersSize - 1) & ~1; _hashMask = hashSize - 1; _bufferMask = bufferSize - 1; _mixers = new TPAQMixer[mixersSize]; _bigStatesMap = new uint8[statesSize]; _smallStatesMap0 = new uint8[1 << 16]; _smallStatesMap1 = new uint8[1 << 24]; _hashes = new int[hashSize]; _buffer = new byte[bufferSize]; reset(); } template bool TPAQPredictor::reset() { _pr = 2048; _c0 = 1; _c4 = 0; _c8 = 0; _pos = 0; _bpos = 8; _binCount = 0; _matchLen = 0; _matchPos = 0; _matchVal = 0; _hash = 0; _mixer = &_mixers[0]; memset(_bigStatesMap, 0, size_t(_statesMask + 1)); memset(_smallStatesMap0, 0, 1 << 16); memset(_smallStatesMap1, 0, 1 << 24); memset(_hashes, 0, sizeof(int) * size_t(_hashMask + 1)); memset(_buffer, 0, size_t(_bufferMask + 1)); _cp0 = &_smallStatesMap0[0]; _cp1 = &_smallStatesMap1[0]; _cp2 = &_bigStatesMap[0]; _cp3 = &_bigStatesMap[0]; _cp4 = &_bigStatesMap[0]; _cp5 = &_bigStatesMap[0]; _cp6 = &_bigStatesMap[0]; _ctx0 = _ctx1 = _ctx2 = _ctx3 = 0; _ctx4 = _ctx5 = _ctx6 = 0; return true; } template TPAQPredictor::~TPAQPredictor() { delete[] _bigStatesMap; delete[] _smallStatesMap0; delete[] _smallStatesMap1; delete[] _hashes; delete[] _buffer; 
delete[] _mixers; } // Update the probability model template void TPAQPredictor::update(int bit) { _mixer->update(bit); _c0 += (_c0 + bit); _bpos--; if (_bpos == 0) { _buffer[_pos & _bufferMask] = byte(_c0); _pos++; _c8 = (_c8 << 8) | ((_c4 >> 24) & 0xFF); _c4 = (_c4 << 8) | (_c0 & 0xFF); _hash = (((_hash * HASH) << 4) + _c4) & _hashMask; _c0 = 1; _bpos = 8; _binCount += ((_c4 >> 7) & 1); // Select Neural Net _mixer = &_mixers[(_c4 & _mixersMask) + (_matchLen != 0 ? 1 : 0)]; // Add contexts to NN _ctx0 = (_c4 & 0xFF) << 8; _ctx1 = (_c4 & 0xFFFF) << 8; _ctx2 = createContext(2, _c4 & 0x00FFFFFF); _ctx3 = createContext(3, _c4); if (_binCount < (_pos >> 2)) { // Mostly text or mixed _ctx4 = createContext(_ctx1, _c4 ^ (_c8 & 0xFFFF)); _ctx5 = (_c8 & MASK_F0F0F000) | ((_c4 & MASK_F0F0F000) >> 4); if (T == true) { const uint h1 = ((_c4 & MASK_80808080) == 0) ? _c4 & MASK_4F4FFFFF : _c4 & MASK_80808080; const uint h2 = ((_c8 & MASK_80808080) == 0) ? _c8 & MASK_4F4FFFFF : _c8 & MASK_80808080; _ctx6 = hash(h1 << 2, h2 >> 2); } } else { // Mostly binary _ctx4 = createContext(HASH + _matchLen, _c4 ^ (_c4 & 0x000FFFFF)); _ctx5 = _ctx0 | (_c8 << 16); if (T == true) { _ctx6 = hash(_c4 & 0xFFFF0000, _c8 >> 16); } } findMatch(); _matchVal = int(_buffer[_matchPos & _bufferMask]) | 0x100; // Keep track current position _hashes[_hash] = _pos; } // Get initial predictions // It has been observed that accessing memory via [ctx ^ c] is significantly faster // on SandyBridge/Windows and slower on SkyLake/Linux except when [ctx & 255 == 0] // (with c < 256). Hence, use XOR for _ctx5 which is the only context that fulfills // the condition. 
const int idx2 = (uint(_ctx2) + _c0) & _statesMask; const int idx3 = (uint(_ctx3) + _c0) & _statesMask; const int idx4 = (uint(_ctx4) + _c0) & _statesMask; const int idx5 = (uint(_ctx5) ^ _c0) & _statesMask; prefetchRead(&_bigStatesMap[idx2]); prefetchRead(&_bigStatesMap[idx3]); prefetchRead(&_bigStatesMap[idx4]); prefetchRead(&_bigStatesMap[idx5]); const uint8* table = STATE_TRANSITIONS[bit]; *_cp0 = table[*_cp0]; *_cp1 = table[*_cp1]; *_cp2 = table[*_cp2]; *_cp3 = table[*_cp3]; *_cp4 = table[*_cp4]; *_cp5 = table[*_cp5]; _cp0 = &_smallStatesMap0[_ctx0 + _c0]; const int p0 = STATE_MAP[*_cp0]; _cp1 = &_smallStatesMap1[_ctx1 + _c0]; const int p1 = STATE_MAP[*_cp1]; _cp2 = &_bigStatesMap[idx2]; const int p2 = STATE_MAP[*_cp2]; _cp3 = &_bigStatesMap[idx3]; const int p3 = STATE_MAP[*_cp3]; _cp4 = &_bigStatesMap[idx4]; const int p4 = STATE_MAP[*_cp4]; _cp5 = &_bigStatesMap[idx5]; const int p5 = STATE_MAP[*_cp5]; const int p7 = (_matchLen == 0) ? 0 : getMatchContextPred(); int p; if (T == false) { // Mix predictions using NN p = _mixer->get(p0, p1, p2, p3, p4, p5, p7, p7); // SSE (Secondary Symbol Estimation) if (_binCount < (_pos >> 3)) { p = (3 * _sse0.get(bit, p, _c0) + p) >> 2; } } else { // One more prediction const int idx6 = (uint(_ctx6) + _c0) & _statesMask; prefetchRead(&_bigStatesMap[idx6]); *_cp6 = table[*_cp6]; _cp6 = &_bigStatesMap[idx6]; const int p6 = STATE_MAP[*_cp6]; // Mix predictions using NN p = _mixer->get(p0, p1, p2, p3, p4, p5, p6, p7); // SSE (Secondary Symbol Estimation) if (_binCount < (_pos >> 3)) { p = _sse1.get(bit, p, _ctx0 + _c0); } else { if (_binCount >= (_pos >> 2)) p = (3 * _sse0.get(bit, p, _c0) + p) >> 2; p = (3 * _sse1.get(bit, p, _ctx0 + _c0) + p) >> 2; } } _pr = p + ((p < 2048) ? 
1 : 0); } template void TPAQPredictor::findMatch() { // Update ongoing sequence match or detect match in the buffer (LZ like) if (_matchLen > 0) { if (_matchLen < MAX_LENGTH) _matchLen++; _matchPos++; return; } // Retrieve match position _matchPos = _hashes[_hash]; // Detect match if ((_matchPos != 0) && (uint(_pos - _matchPos) <= _bufferMask)) { int r = _matchLen + 2; while (r <= MAX_LENGTH) { if ((_buffer[(_pos - r - 1) & _bufferMask]) != (_buffer[(_matchPos - r - 1) & _bufferMask])) break; if ((_buffer[(_pos - r) & _bufferMask]) != (_buffer[(_matchPos - r) & _bufferMask])) break; r += 2; } _matchLen = r - 2; } } template inline int TPAQPredictor::hash(uint x, uint y) const { const int h = x * HASH ^ y * HASH; return (h >> 1) ^ (h >> 9) ^ (x >> 2) ^ (y >> 3) ^ HASH; } template inline int TPAQPredictor::createContext(uint ctxId, uint cx) const { cx = cx * 987654323 + ctxId; cx = (cx << 16) | (cx >> 16); return cx * 123456791 + ctxId; } // Get a prediction from the match model in [-2047..2048] template inline int TPAQPredictor::getMatchContextPred() { const uint matchPrefix = uint(_matchVal) >> _bpos; if (_c0 == matchPrefix) { return (((_matchVal >> (_bpos - 1)) & 1) != 0) ? MATCH_PRED[_matchLen - 1] : -MATCH_PRED[_matchLen - 1]; } _matchLen = 0; return 0; } } #endif kanzi-cpp-2.5.2/src/io/000077500000000000000000000000001516423635400146235ustar00rootroot00000000000000kanzi-cpp-2.5.2/src/io/CompressedInputStream.cpp000066400000000000000000001011031516423635400216230ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ #include #include "CompressedInputStream.hpp" #include "IOException.hpp" #include "../Error.hpp" #include "../entropy/EntropyDecoderFactory.hpp" #include "../transform/TransformFactory.hpp" #include "../util/fixedbuf.hpp" using namespace kanzi; using namespace std; const int CompressedInputStream::BITSTREAM_TYPE = 0x4B414E5A; // "KANZ" const int CompressedInputStream::BITSTREAM_FORMAT_VERSION = 6; const int CompressedInputStream::DEFAULT_BUFFER_SIZE = 256 * 1024; const int CompressedInputStream::EXTRA_BUFFER_SIZE = 512; const kanzi::byte CompressedInputStream::COPY_BLOCK_MASK = kanzi::byte(0x80); const kanzi::byte CompressedInputStream::TRANSFORMS_MASK = kanzi::byte(0x10); const int CompressedInputStream::MIN_BITSTREAM_BLOCK_SIZE = 1024; const int CompressedInputStream::MAX_BITSTREAM_BLOCK_SIZE = 1024 * 1024 * 1024; const int CompressedInputStream::CANCEL_TASKS_ID = -1; const int CompressedInputStream::MAX_CONCURRENCY = 64; const int CompressedInputStream::MAX_BLOCK_ID = int((uint(1) << 31) - 1); CompressedInputStream::CompressedInputStream(InputStream& is, int tasks, const string& entropy, const string& transform, int blockSize, int checksum, uint64 originalSize, #ifdef CONCURRENCY_ENABLED ThreadPool* pool, #endif bool headerless, int bsVersion) : InputStream(is.rdbuf()) , _parentCtx(nullptr) { #ifdef CONCURRENCY_ENABLED if ((tasks <= 0) || (tasks > MAX_CONCURRENCY)) { stringstream ss; ss << "The number of jobs must be in [1.." 
<< MAX_CONCURRENCY << "], got " << tasks; throw invalid_argument(ss.str()); } _pool = pool; // may be null #else if (tasks != 1) throw invalid_argument("The number of jobs is limited to 1 in this version"); #endif _hasher32 = nullptr; _hasher64 = nullptr; _blockId = 0; _bufferId = 0; _maxBufferId = 0; _submitBlockId = 0; _blockSize = blockSize; _bufferThreshold = 0; _available = 0; _entropyType = EntropyDecoderFactory::getType(entropy.c_str()); // throws on error _transformType = TransformFactory::getType(transform.c_str()); // throws on error _initialized = 0; _closed = 0; _gcount = 0; _ibs = new DefaultInputBitStream(is, DEFAULT_BUFFER_SIZE); _jobs = tasks; _outputSize = originalSize; _nbInputBlocks = 0; _buffers = new SliceArray*[2 * _jobs]; _headless = headerless; _consumeBlockId = 0; if (_headless == true) { if ((_blockSize < MIN_BITSTREAM_BLOCK_SIZE) || (_blockSize > MAX_BITSTREAM_BLOCK_SIZE)) { stringstream ss; ss << "Invalid or missing block size: " << _blockSize; throw invalid_argument(ss.str()); } _ctx.putInt("bsVersion", bsVersion); _ctx.putString("entropy", entropy); _ctx.putString("transform", transform); _ctx.putInt("blockSize", blockSize); if (checksum == 32) { _hasher32 = new XXHash32(BITSTREAM_TYPE); _hasher64 = nullptr; } else if (checksum == 64) { _hasher32 = nullptr; _hasher64 = new XXHash64(BITSTREAM_TYPE); } else if (checksum != 0) { throw invalid_argument("The block checksum size must be 0, 32 or 64"); } } _jobsPerTask.resize(_jobs); std::fill(_jobsPerTask.begin(), _jobsPerTask.end(), 1); #ifdef CONCURRENCY_ENABLED _futures.resize(_jobs); #else _results.resize(_jobs); #endif for (int i = 0; i < 2 * _jobs; i++) _buffers[i] = new SliceArray(nullptr, 0, 0); } CompressedInputStream::CompressedInputStream(InputStream& is, Context& ctx, bool headerless) : InputStream(is.rdbuf()) , _ctx(ctx) , _parentCtx(&ctx) { int tasks = _ctx.getInt("jobs", 1); #ifdef CONCURRENCY_ENABLED if ((tasks <= 0) || (tasks > MAX_CONCURRENCY)) { stringstream ss; ss << "The 
number of jobs must be in [1.." << MAX_CONCURRENCY << "], got " << tasks; throw invalid_argument(ss.str()); } _pool = _ctx.getPool(); // may be null #else if (tasks != 1) throw invalid_argument("The number of jobs is limited to 1 in this version"); #endif _blockId = 0; _bufferId = 0; _maxBufferId = 0; _submitBlockId = 0; _blockSize = 0; _bufferThreshold = 0; _available = 0; _entropyType = EntropyDecoderFactory::NONE_TYPE; _transformType = TransformFactory::NONE_TYPE; _initialized = 0; _closed = 0; _gcount = 0; _ibs = new DefaultInputBitStream(is, DEFAULT_BUFFER_SIZE); _jobs = tasks; _hasher32 = nullptr; _hasher64 = nullptr; _outputSize = 0; _nbInputBlocks = 0; _headless = headerless; _consumeBlockId = 0; if (_headless == true) { // Validation of required values // Optional bsVersion const int bsVersion = _ctx.getInt("bsVersion", BITSTREAM_FORMAT_VERSION); if (bsVersion > BITSTREAM_FORMAT_VERSION) { stringstream ss; ss << "Invalid or missing bitstream version, cannot read this version of the stream: " << bsVersion; throw invalid_argument(ss.str()); } _ctx.putInt("bsVersion", bsVersion); string entropy = _ctx.getString("entropy"); _entropyType = EntropyDecoderFactory::getType(entropy.c_str()); // throws on error string transform = _ctx.getString("transform"); _transformType = TransformFactory::getType(transform.c_str()); // throws on error _blockSize = _ctx.getInt("blockSize", 0); if ((_blockSize < MIN_BITSTREAM_BLOCK_SIZE) || (_blockSize > MAX_BITSTREAM_BLOCK_SIZE)) { stringstream ss; ss << "Invalid or missing block size: " << _blockSize; throw invalid_argument(ss.str()); } _bufferThreshold = _blockSize; // Optional outputSize if (_ctx.has("outputSize")) { _outputSize = _ctx.getLong("outputSize", 0); if ((_outputSize < 0) || (_outputSize >= (int64(1) << 48))) _outputSize = 0; // not provided } const int nbBlocks = int((_outputSize + int64(_blockSize - 1)) / int64(_blockSize)); _nbInputBlocks = min(nbBlocks, MAX_CONCURRENCY - 1); // Optional checksum int checksum = 
ctx.getInt("checksum", 0); if (checksum == 0) { _hasher32 = nullptr; _hasher64 = nullptr; } else if (checksum == 32) { _hasher32 = new XXHash32(BITSTREAM_TYPE); _hasher64 = nullptr; } else if (checksum == 64) { _hasher32 = nullptr; _hasher64 = new XXHash64(BITSTREAM_TYPE); } else { throw invalid_argument("The block checksum size must be 0, 32 or 64"); } } _jobsPerTask.resize(_jobs); std::fill(_jobsPerTask.begin(), _jobsPerTask.end(), 1); #ifdef CONCURRENCY_ENABLED _futures.resize(_jobs); #else _results.resize(_jobs); #endif _buffers = new SliceArray*[2 * _jobs]; for (int i = 0; i < 2 * _jobs; i++) _buffers[i] = new SliceArray(nullptr, 0, 0); } CompressedInputStream::~CompressedInputStream() { try { close(); } catch (const exception&) { // Ignore and continue } for (int i = 0; i < 2 * _jobs; i++) { if (_buffers[i]->_array != nullptr) delete[] _buffers[i]->_array; delete _buffers[i]; } delete[] _buffers; delete _ibs; if (_hasher32 != nullptr) { delete _hasher32; _hasher32 = nullptr; } if (_hasher64 != nullptr) { delete _hasher64; _hasher64 = nullptr; } } void CompressedInputStream::submitBlock(int bufferId) { const int blkSize = max(_blockSize + EXTRA_BUFFER_SIZE, _blockSize + (_blockSize >> 4)); if (_buffers[bufferId]->_length < blkSize) { if (_buffers[bufferId]->_array != nullptr) delete[] _buffers[bufferId]->_array; _buffers[bufferId]->_array = new kanzi::byte[blkSize]; _buffers[bufferId]->_length = blkSize; } Context copyCtx(_ctx); copyCtx.putLong("tType", _transformType); copyCtx.putInt("eType", _entropyType); copyCtx.putInt("blockId", _submitBlockId + 1); copyCtx.putInt("jobs", _jobsPerTask[bufferId]); copyCtx.putInt("tasks", _jobs); _buffers[bufferId]->_index = 0; _buffers[_jobs + bufferId]->_index = 0; DecodingTask* task = new DecodingTask( _buffers[bufferId], _buffers[_jobs + bufferId], blkSize, _ibs, _hasher32, _hasher64, #ifdef CONCURRENCY_ENABLED &_blockMutex, &_blockCondition, #endif &_blockId, _listeners, copyCtx); #ifdef CONCURRENCY_ENABLED 
std::shared_ptr> safeTask(task); auto taskRunner = [safeTask]() { return safeTask->run(); }; if (_pool == nullptr) { // std::async returns std::future _futures[bufferId] = std::async(std::launch::async, taskRunner); } else { // pool->schedule returns std::future _futures[bufferId] = _pool->schedule(taskRunner); } #else // Synchronous execution try { _results[bufferId] = task->run(); delete task; } catch (...) { delete task; throw; } #endif _submitBlockId++; } int CompressedInputStream::_get(int inc) { try { if (LOAD_ATOMIC(_initialized) == 0) { readHeader(); for (int i = 0; i < _jobs; i++) submitBlock(i); } if (_available == 0) { if (LOAD_ATOMIC(_closed) == 1) throw ios_base::failure("Stream closed"); DecodingTaskResult res; #ifdef CONCURRENCY_ENABLED if (_futures[_bufferId].valid()) { res = _futures[_bufferId].get(); } else { setstate(ios::eofbit); return EOF; } #else res = _results[_bufferId]; #endif if (res._error != 0) throw IOException(res._msg, res._error); if (res._decoded > _blockSize) { stringstream ss; ss << "Block " << res._blockId << " incorrectly decompressed"; throw IOException(ss.str(), Error::ERR_PROCESS_BLOCK); } // Fire events if (!_listeners.empty()) { Event::HashType hashType = Event::NO_HASH; if (_hasher32 != nullptr) hashType = Event::SIZE_32; else if (_hasher64 != nullptr) hashType = Event::SIZE_64; Event evt(Event::AFTER_TRANSFORM, res._blockId, int64(res._decoded), res._completionTime, res._checksum, hashType); CompressedInputStream::notifyListeners(_listeners, evt); } _available = res._decoded; if (_available == 0) { if (res._skipped == false) { setstate(ios::eofbit); return EOF; } submitBlock(_bufferId); _bufferId = (_bufferId + 1) % _jobs; _consumeBlockId++; } _buffers[_bufferId]->_index = 0; } int res = int(_buffers[_bufferId]->_array[_buffers[_bufferId]->_index]); if (inc == 0) return res; _available -= inc; _buffers[_bufferId]->_index += inc; if (_available == 0) { submitBlock(_bufferId); _bufferId = (_bufferId + 1) % _jobs; 
_consumeBlockId++; } return res; } catch (const IOException&) { setstate(ios::badbit); throw; } catch (const exception&) { setstate(ios::badbit); throw; } } istream& CompressedInputStream::read(char* data, streamsize length) { int remaining = int(length); if (remaining < 0) throw ios_base::failure("Invalid buffer size"); _gcount = 0; while (remaining > 0) { // Reuse _get(0) logic logic implicitly if (LOAD_ATOMIC(_initialized) == 0) { readHeader(); for (int i = 0; i < _jobs; i++) submitBlock(i); } if (_available == 0) { DecodingTaskResult res; #ifdef CONCURRENCY_ENABLED if (_futures[_bufferId].valid()) { res = _futures[_bufferId].get(); } else { setstate(ios::eofbit); break; } #else res = _results[_bufferId]; #endif if (res._error != 0) throw IOException(res._msg, res._error); if (res._decoded > _blockSize) { stringstream ss; ss << "Block " << res._blockId << " incorrectly decompressed"; throw IOException(ss.str(), Error::ERR_PROCESS_BLOCK); } if (!_listeners.empty()) { Event::HashType hashType = Event::NO_HASH; if (_hasher32 != nullptr) hashType = Event::SIZE_32; else if (_hasher64 != nullptr) hashType = Event::SIZE_64; Event evt(Event::AFTER_TRANSFORM, res._blockId, int64(res._decoded), res._completionTime, res._checksum, hashType); CompressedInputStream::notifyListeners(_listeners, evt); } _available = res._decoded; _buffers[_bufferId]->_index = 0; if ((_available == 0) && (res._skipped == false)) { setstate(ios::eofbit); break; } } const int lenChunk = min(remaining, int(_available)); if (lenChunk > 0) { memcpy(&data[_gcount], &_buffers[_bufferId]->_array[_buffers[_bufferId]->_index], lenChunk); _buffers[_bufferId]->_index += lenChunk; _gcount += lenChunk; remaining -= lenChunk; _available -= lenChunk; } if (_available == 0) { submitBlock(_bufferId); _bufferId = (_bufferId + 1) % _jobs; _consumeBlockId++; } } return *this; } void CompressedInputStream::readHeader() { if (EXCHANGE_ATOMIC(_initialized, 1) == 1) return; if (_headless == true) return; // Read stream 
type const int type = int(_ibs->readBits(32)); // Sanity check if (type != BITSTREAM_TYPE) { throw IOException("Invalid stream type", Error::ERR_INVALID_FILE); } // Read stream version const int bsVersion = int(_ibs->readBits(4)); // Sanity check if (bsVersion > BITSTREAM_FORMAT_VERSION) { stringstream ss; ss << "Invalid bitstream, cannot read this version of the stream: " << bsVersion; throw IOException(ss.str(), Error::ERR_STREAM_VERSION); } _ctx.putInt("bsVersion", bsVersion); uint64 ckSize = 0; // Read block checksum if (bsVersion >= 6) { ckSize = _ibs->readBits(2); if (ckSize == 1) { _hasher32 = new XXHash32(BITSTREAM_TYPE); } else if (ckSize == 2) { _hasher64 = new XXHash64(BITSTREAM_TYPE); } else if (ckSize == 3) { throw IOException("Invalid bitstream, incorrect block checksum size", Error::ERR_INVALID_FILE); } } else { if (_ibs->readBit() == 1) _hasher32 = new XXHash32(BITSTREAM_TYPE); } try { // Read entropy codec _entropyType = short(_ibs->readBits(5)); _ctx.putString("entropy", EntropyDecoderFactory::getName(_entropyType)); } catch (const invalid_argument&) { stringstream err; err << "Invalid bitstream, unknown entropy type: " << _entropyType; throw IOException(err.str(), Error::ERR_INVALID_CODEC); } try { // Read transform: 8*6 bits _transformType = _ibs->readBits(48); _ctx.putString("transform", TransformFactory::getName(_transformType)); } catch (const invalid_argument&) { stringstream err; err << "Invalid bitstream, unknown transform type: " << _transformType; throw IOException(err.str(), Error::ERR_INVALID_CODEC); } // Read block size _blockSize = int(_ibs->readBits(28) << 4); _ctx.putInt("blockSize", _blockSize); _bufferThreshold = _blockSize; if ((_blockSize < MIN_BITSTREAM_BLOCK_SIZE) || (_blockSize > MAX_BITSTREAM_BLOCK_SIZE)) { stringstream ss; ss << "Invalid bitstream, incorrect block size: " << _blockSize; throw IOException(ss.str(), Error::ERR_BLOCK_SIZE); } // Read original size // 0 -> not provided, <2^16 -> 1, <2^32 -> 2, <2^48 -> 3 const 
int szMask = int(_ibs->readBits(2)); if (szMask != 0) { _outputSize = _ibs->readBits(16 * szMask); if (_parentCtx != nullptr) _parentCtx->putLong("outputSize", _outputSize); const int nbBlocks = int((_outputSize + int64(_blockSize - 1)) / int64(_blockSize)); _nbInputBlocks = min(nbBlocks, MAX_CONCURRENCY - 1); } if (bsVersion >= 6) { // Padding _ibs->readBits(15); } // Assign optimal number of tasks and jobs per task (if the number of blocks is available) if (_jobs > 1) { // Limit the number of tasks if there are fewer blocks that _jobs int nbTasks = (_nbInputBlocks != 0) ? min(_nbInputBlocks, _jobs) : _jobs; Global::computeJobsPerTask(&_jobsPerTask[0], _jobs, nbTasks); } else { _jobsPerTask[0] = 1; } // Read & verify checksum const int crcSize = bsVersion <= 5 ? 16 : 24; const uint32 cksum1 = uint32(_ibs->readBits(crcSize)); uint32 seed = (bsVersion >= 6 ? 0x01030507 : 1) * uint32(bsVersion); const uint32 HASH = 0x1E35A7BD; uint32 cksum2 = HASH * seed; if (bsVersion >= 6) cksum2 ^= (HASH * uint32(~ckSize)); cksum2 ^= (HASH * uint32(~_entropyType)); cksum2 ^= (HASH * uint32((~_transformType) >> 32)); cksum2 ^= (HASH * uint32(~_transformType)); cksum2 ^= (HASH * uint32(~_blockSize)); if (szMask != 0) { cksum2 ^= (HASH * uint32((~_outputSize) >> 32)); cksum2 ^= (HASH * uint32(~_outputSize)); } cksum2 = (cksum2 >> 23) ^ (cksum2 >> 3); if (cksum1 != (cksum2 & ((1 << crcSize) - 1))) throw IOException("Invalid bitstream, header checksum mismatch", Error::ERR_CRC_CHECK); if (_listeners.size() > 0) { Event::HeaderInfo info; info.inputName = _ctx.getString("inputName", ""); info.bsVersion = bsVersion; info.checksumSize = int(32 * ckSize); info.blockSize = _blockSize; info.entropyType = EntropyDecoderFactory::getName(_entropyType); info.transformType = TransformFactory::getName(_transformType); int64 fileSize = _ctx.getLong("fileSize", 0); info.fileSize = (fileSize >= 0) ? fileSize : -1; info.originalSize = (szMask != 0) ? 
_outputSize : -1; WallTimer timer; Event evt(Event::AFTER_HEADER_DECODING, 0, info, timer.getCurrentTime()); notifyListeners(_listeners, evt); } } bool CompressedInputStream::addListener(Listener& bl) { _listeners.push_back(&bl); return true; } bool CompressedInputStream::removeListener(Listener& bl) { std::vector*>::iterator it = find(_listeners.begin(), _listeners.end(), &bl); if (it == _listeners.end()) return false; _listeners.erase(it); return true; } void CompressedInputStream::close() { if (EXCHANGE_ATOMIC(_closed, 1) == 1) return; // Signal to break the waits in DecodingTask::run immediately and // ensure no thread is writing to _buffers before we delete them. #ifdef CONCURRENCY_ENABLED { std::lock_guard lock(_blockMutex); STORE_ATOMIC(_blockId, CANCEL_TASKS_ID); } _blockCondition.notify_all(); for (size_t i = 0; i < _futures.size(); i++) { if (_futures[i].valid()) { try { _futures[i].get(); } catch (...) { // Ignore exceptions, we are closing anyway. } } } #else STORE_ATOMIC(_blockId, CANCEL_TASKS_ID); #endif try { _ibs->close(); } catch (const BitStreamException& e) { throw IOException(e.what(), e.error()); } _available = 0; // Force subsequent reads to trigger submitBlock immediately _bufferThreshold = 0; // Buffer cleanup: force error on any subsequent read attempt for (int i = 0; i < 2 * _jobs; i++) { if (_buffers[i]->_array != nullptr) delete[] _buffers[i]->_array; _buffers[i]->_array = nullptr; _buffers[i]->_length = 0; _buffers[i]->_index = 0; } } void CompressedInputStream::notifyListeners(vector*>& listeners, const Event& evt) { for (vector*>::iterator it = listeners.begin(); it != listeners.end(); ++it) (*it)->processEvent(evt); } template DecodingTask::DecodingTask(SliceArray* iBuffer, SliceArray* oBuffer, int blockSize, DefaultInputBitStream* ibs, XXHash32* hasher32, XXHash64* hasher64, #ifdef CONCURRENCY_ENABLED std::mutex* blockMutex, std::condition_variable* blockCondition, #endif atomic_int_t* processedBlockId, vector*>& listeners, const 
Context& ctx) : _listeners(listeners) , _ctx(ctx) { _blockLength = blockSize; _data = iBuffer; _buffer = oBuffer; _ibs = ibs; _hasher32 = hasher32; _hasher64 = hasher64; #ifdef CONCURRENCY_ENABLED _blockMutex = blockMutex; _blockCondition = blockCondition; #endif _processedBlockId = processedBlockId; } // Decode mode + transformed entropy coded data // mode | 0b1yy0xxxx => copy block // | 0b0yy00000 => size(size(block))-1 // case 4 transforms or less // | 0b0001xxxx => transform sequence skip flags (1 means skip) // case more than 4 transforms // | 0b0yy00000 0bxxxxxxxx => transform sequence skip flags in next kanzi::byte (1 means skip) template T DecodingTask::run() { int blockId = _ctx.getInt("blockId"); bool streamPerTask = _ctx.getInt("tasks") > 1; uint64 tType = _ctx.getLong("tType"); short eType = short(_ctx.getInt("eType")); auto storeProcessedBlockId = [this](int value) { #ifdef CONCURRENCY_ENABLED { std::lock_guard lock(*_blockMutex); STORE_ATOMIC(*_processedBlockId, value); } _blockCondition->notify_all(); #else STORE_ATOMIC(*_processedBlockId, value); #endif }; #ifdef CONCURRENCY_ENABLED { std::unique_lock lock(*_blockMutex); _blockCondition->wait(lock, [this, blockId]() { const int taskId = LOAD_ATOMIC(*_processedBlockId); return (taskId == CompressedInputStream::CANCEL_TASKS_ID) || (taskId == blockId - 1); }); } if (LOAD_ATOMIC(*_processedBlockId) == CompressedInputStream::CANCEL_TASKS_ID) { // Skip, an error occurred return T(*_data, blockId, 0, 0, 0, "Canceled"); } #endif uint64 checksum1 = 0; EntropyDecoder* ed = nullptr; InputBitStream* ibs = nullptr; TransformSequence* transform = nullptr; try { // Read shared bitstream sequentially (each task is gated by _processedBlockId) #if !defined(_MSC_VER) || _MSC_VER > 1500 const uint64 blockOffset = _ibs->tell(); #endif const uint lr = 3 + uint(_ibs->readBits(5)); uint64 read = _ibs->readBits(lr); if (read == 0) { storeProcessedBlockId(CompressedInputStream::CANCEL_TASKS_ID); return T(*_data, blockId, 0, 
0, 0, "Success"); } if (read > (uint64(1) << 34)) { storeProcessedBlockId(CompressedInputStream::CANCEL_TASKS_ID); return T(*_data, blockId, 0, 0, Error::ERR_BLOCK_SIZE, "Invalid block size"); } const int from = _ctx.getInt("from", 1); const int to = _ctx.getInt("to", CompressedInputStream::MAX_BLOCK_ID); const uint r = uint((read + 7) >> 3); // Read from the shared bitstream if // - there is one that one task (each with their own local bitstream) // - the block is going to be skipped (bits must be consumed) if ((streamPerTask == true) || (blockId < from)) { if (_data->_length < int(max(_blockLength, r))) { _data->_length = int(max(_blockLength, r)); delete[] _data->_array; _data->_array = new kanzi::byte[_data->_length]; } for (int n = 0; read > 0; ) { const uint chkSize = uint(min(read, uint64(1) << 30)); _ibs->readBits(&_data->_array[n], chkSize); n += ((chkSize + 7) >> 3); read -= uint64(chkSize); } } // After completion of the bitstream reading, increment the block id. // It unblocks the task processing the next block (if any) storeProcessedBlockId(blockId); // Check if the block must be skipped if (blockId < from) { return T(*_data, blockId, 0, 0, 0, "Skipped", true); } else if (blockId >= to) { return T(*_data, blockId, 0, 0, 0, "Success"); } ifixedbuf buf(reinterpret_cast(&_data->_array[0]), streamsize(r)); istream ios(&buf); ibs = (streamPerTask == true) ? 
new DefaultInputBitStream(ios) : _ibs; // Extract block header from bitstream kanzi::byte mode = kanzi::byte(ibs->readBits(8)); kanzi::byte skipFlags = kanzi::byte(0); if ((mode & CompressedInputStream::COPY_BLOCK_MASK) != kanzi::byte(0)) { tType = TransformFactory::NONE_TYPE; eType = EntropyDecoderFactory::NONE_TYPE; } else { if ((mode & CompressedInputStream::TRANSFORMS_MASK) != kanzi::byte(0)) skipFlags = kanzi::byte(ibs->readBits(8)); else skipFlags = (mode << 4) | kanzi::byte(0x0F); } const int dataSize = 1 + (int(mode >> 5) & 0x03); const int length = dataSize << 3; const uint64 mask = (uint64(1) << length) - 1; const int preTransformLength = int(ibs->readBits(length) & mask); const int maxTransformSize = int(min(max(_blockLength + _blockLength / 2, 2048u), uint(CompressedInputStream::MAX_BITSTREAM_BLOCK_SIZE))); if ((preTransformLength <= 0) || (preTransformLength > maxTransformSize)) { // Error => cancel concurrent decoding tasks storeProcessedBlockId(CompressedInputStream::CANCEL_TASKS_ID); stringstream ss; ss << "Invalid compressed block length: " << preTransformLength; if (streamPerTask == true) delete ibs; return T(*_data, blockId, 0, checksum1, Error::ERR_READ_FILE, ss.str()); } Event::HashType hashType = Event::NO_HASH; WallTimer timer; // Extract checksum from bitstream (if any) if (_hasher32 != nullptr) { checksum1 = ibs->readBits(32); hashType = Event::SIZE_32; } else if (_hasher64 != nullptr) { checksum1 = ibs->readBits(64); hashType = Event::SIZE_64; } if (_listeners.size() > 0) { #if !defined(_MSC_VER) || _MSC_VER > 1500 if (_ctx.getInt("verbosity", 0) > 4) { Event evt1(Event::BLOCK_INFO, blockId, int64(r), timer.getCurrentTime(), checksum1, hashType, blockOffset, uint8(skipFlags)); CompressedInputStream::notifyListeners(_listeners, evt1); } #endif // Notify before entropy Event evt2(Event::BEFORE_ENTROPY, blockId, int64(r), timer.getCurrentTime(), checksum1, hashType); CompressedInputStream::notifyListeners(_listeners, evt2); } const int 
bufferSize = max(int(_blockLength), preTransformLength + CompressedInputStream::EXTRA_BUFFER_SIZE); if (_buffer->_length < bufferSize) { _buffer->_length = bufferSize; if (_buffer->_array != nullptr) delete[] _buffer->_array; _buffer->_array = new kanzi::byte[_buffer->_length]; } const int savedIdx = _data->_index; _ctx.putInt("size", preTransformLength); // Each block is decoded separately // Rebuild the entropy decoder to reset block statistics ed = EntropyDecoderFactory::newDecoder(*ibs, _ctx, eType); // Block entropy decode if (ed->decode(_buffer->_array, 0, preTransformLength) != preTransformLength) { // Error => cancel concurrent decoding tasks storeProcessedBlockId(CompressedInputStream::CANCEL_TASKS_ID); delete ed; if (streamPerTask == true) delete ibs; return T(*_data, blockId, 0, checksum1, Error::ERR_PROCESS_BLOCK, "Entropy decoding failed"); } if (streamPerTask == true) { delete ibs; ibs = nullptr; } delete ed; ed = nullptr; if (_listeners.size() > 0) { // Notify after entropy Event evt1(Event::AFTER_ENTROPY, blockId, int64(preTransformLength), timer.getCurrentTime(), checksum1, hashType); CompressedInputStream::notifyListeners(_listeners, evt1); // Notify before transform (block size after entropy decoding) Event evt2(Event::BEFORE_TRANSFORM, blockId, int64(preTransformLength), timer.getCurrentTime(), checksum1, hashType); CompressedInputStream::notifyListeners(_listeners, evt2); } transform = TransformFactory::newTransform(_ctx, tType); transform->setSkipFlags(skipFlags); _buffer->_index = 0; // Inverse transform bool res = transform->inverse(*_buffer, *_data, preTransformLength); delete transform; transform = nullptr; if (res == false) { storeProcessedBlockId(CompressedInputStream::CANCEL_TASKS_ID); return T(*_data, blockId, 0, checksum1, Error::ERR_PROCESS_BLOCK, "Transform inverse failed"); } const int decoded = _data->_index - savedIdx; // Verify checksum if (_hasher32 != nullptr) { const uint32 checksum2 = 
_hasher32->hash(&_data->_array[savedIdx], decoded); if (checksum2 != uint32(checksum1)) { storeProcessedBlockId(CompressedInputStream::CANCEL_TASKS_ID); stringstream ss; ss << "Corrupted bitstream: expected checksum " << std::hex << checksum1 << ", found " << std::hex << checksum2; return T(*_data, blockId, decoded, checksum1, Error::ERR_CRC_CHECK, ss.str()); } } else if (_hasher64 != nullptr) { const uint64 checksum2 = _hasher64->hash(&_data->_array[savedIdx], decoded); if (checksum2 != checksum1) { storeProcessedBlockId(CompressedInputStream::CANCEL_TASKS_ID); stringstream ss; ss << "Corrupted bitstream: expected checksum " << std::hex << checksum1 << ", found " << std::hex << checksum2; return T(*_data, blockId, decoded, checksum1, Error::ERR_CRC_CHECK, ss.str()); } } return T(*_data, blockId, decoded, checksum1, 0, "Success"); } catch (const exception& e) { // Cancel any in-flight task waiting on this block. storeProcessedBlockId(CompressedInputStream::CANCEL_TASKS_ID); if (transform != nullptr) delete transform; if (ed != nullptr) delete ed; if ((streamPerTask == true) && (ibs != nullptr)) delete ibs; return T(*_data, blockId, 0, checksum1, Error::ERR_PROCESS_BLOCK, e.what()); } } kanzi-cpp-2.5.2/src/io/CompressedInputStream.hpp000066400000000000000000000254541516423635400216460ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once #ifndef knz_CompressedInputStream #define knz_CompressedInputStream #include // definition of EOF #include #include #include "../concurrent.hpp" #include "../Context.hpp" #include "../Event.hpp" #include "../Listener.hpp" #include "../InputStream.hpp" #include "../SliceArray.hpp" #include "../bitstream/DefaultInputBitStream.hpp" #include "../util/XXHash.hpp" #if __cplusplus >= 201103L #include #endif #ifdef CONCURRENCY_ENABLED #include #endif namespace kanzi { class DecodingTaskResult FINAL { public: int _blockId; int _decoded; byte* _data; int _error; // 0 = OK std::string _msg; uint64 _checksum; bool _skipped; WallTimer::TimeData _completionTime; DecodingTaskResult() { _blockId = -1; _decoded = 0; _data = nullptr; _error = 0; _checksum = 0; _skipped = false; WallTimer timer; _completionTime = timer.getCurrentTime(); } DecodingTaskResult(const SliceArray& data, int blockId, int decoded, uint64 checksum, int error, const std::string& msg, bool skipped = false) : _blockId(blockId) , _decoded(decoded) , _data(data._array) , _error(error) , _msg(msg) , _checksum(checksum) , _skipped(skipped) { WallTimer timer; _completionTime = timer.getCurrentTime(); } DecodingTaskResult(const DecodingTaskResult& result) : _blockId(result._blockId) , _decoded(result._decoded) , _data(result._data) , _error(result._error) , _msg(result._msg) , _checksum(result._checksum) , _skipped(result._skipped) , _completionTime(result._completionTime) { } DecodingTaskResult& operator = (const DecodingTaskResult& result) { _msg = result._msg; _data = result._data; _blockId = result._blockId; _error = result._error; _decoded = result._decoded; _checksum = result._checksum; _completionTime = result._completionTime; _skipped = result._skipped; return *this; } #if __cplusplus >= 201103L // Check for C++11 or later DecodingTaskResult(DecodingTaskResult&& other) noexcept : _blockId(other._blockId) , _decoded(other._decoded) , _data(other._data) , _error(other._error) , 
_msg(std::move(other._msg)) , _checksum(other._checksum) , _skipped(other._skipped) , _completionTime(other._completionTime) { other._data = nullptr; } DecodingTaskResult& operator=(DecodingTaskResult&& other) noexcept { if (this != &other) { _blockId = other._blockId; _decoded = other._decoded; _data = other._data; // No ownership so don't need to delete _error = other._error; _msg = std::move(other._msg); _checksum = other._checksum; _skipped = other._skipped; _completionTime = other._completionTime; other._data = nullptr; } return *this; } #endif ~DecodingTaskResult() {} }; // A task used to decode a block // Several tasks (transform+entropy) may run in parallel template class DecodingTask FINAL : public Task { private: SliceArray* _data; SliceArray* _buffer; uint _blockLength; DefaultInputBitStream* _ibs; XXHash32* _hasher32; XXHash64* _hasher64; #ifdef CONCURRENCY_ENABLED std::mutex* _blockMutex; std::condition_variable* _blockCondition; #endif atomic_int_t* _processedBlockId; std::vector*> _listeners; Context _ctx; public: DecodingTask(SliceArray* iBuffer, SliceArray* oBuffer, int blockSize, DefaultInputBitStream* ibs, XXHash32* hasher32, XXHash64* hasher64, #ifdef CONCURRENCY_ENABLED std::mutex* blockMutex, std::condition_variable* blockCondition, #endif atomic_int_t* processedBlockId, std::vector*>& listeners, const Context& ctx); ~DecodingTask(){} T run(); }; class CompressedInputStream : public InputStream { friend class DecodingTask; public: // If headerless == false, all provided compression parameters will be overwritten // with values read from the bitstream header. 
CompressedInputStream(InputStream& is, int jobs = 1, const std::string& entropy = "NONE", const std::string& transform = "NONE", int blockSize = 4*1024*1024, int checksum = 0, uint64 originalSize = 0, #ifdef CONCURRENCY_ENABLED ThreadPool* pool = nullptr, #endif bool headerless = false, int bsVersion = BITSTREAM_FORMAT_VERSION); // If headerless == true, the context must contain "entropy", "transform", "checksum" & "blockSize" // If "bsVersion" is missing, the current value of BITSTREAM_FORMAT_VERSION is assumed. CompressedInputStream(InputStream& is, Context& ctx, bool headerless = false); ~CompressedInputStream(); bool addListener(Listener& bl); bool removeListener(Listener& bl); std::streampos tellg(); std::istream& seekg(std::streampos pos); std::istream& putback(char c); std::istream& unget(); std::istream& read(char* s, std::streamsize n); std::streamsize gcount() const { return _gcount; } int get(); int peek(); void close(); uint64 getRead() const { return (_ibs->read() + 7) >> 3; } #if !defined(_MSC_VER) || _MSC_VER > 1500 bool seek(int64 bitPos); int64 tell(); #endif protected: void readHeader(); private: static const int BITSTREAM_TYPE; static const int BITSTREAM_FORMAT_VERSION; static const int DEFAULT_BUFFER_SIZE; static const int EXTRA_BUFFER_SIZE; static const byte COPY_BLOCK_MASK; static const byte TRANSFORMS_MASK; static const int MIN_BITSTREAM_BLOCK_SIZE; static const int MAX_BITSTREAM_BLOCK_SIZE; static const int CANCEL_TASKS_ID; static const int MAX_CONCURRENCY; static const int MAX_BLOCK_ID; int _blockSize; int _bufferId; // index of current read buffer int _maxBufferId; // max index of read buffer int _nbInputBlocks; int _jobs; int _bufferThreshold; int64 _available; // decoded not consumed bytes int64 _outputSize; XXHash32* _hasher32; XXHash64* _hasher64; SliceArray** _buffers; // input & output per block short _entropyType; uint64 _transformType; DefaultInputBitStream* _ibs; atomic_int_t _initialized; atomic_int_t _closed; atomic_int_t 
_blockId; atomic_int_t _submitBlockId; // Next block to submit to pool int _consumeBlockId; // Next block to be consumed by read() std::vector*> _listeners; std::streamsize _gcount; Context _ctx; Context* _parentCtx; // not owner bool _headless; std::vector _jobsPerTask; #ifdef CONCURRENCY_ENABLED ThreadPool* _pool; std::vector> _futures; std::mutex _blockMutex; std::condition_variable _blockCondition; #else std::vector _results; #endif void submitBlock(int bufferId); int _get(int inc); static void notifyListeners(std::vector*>& listeners, const Event& evt); }; inline int CompressedInputStream::get() { const int res = _get(1); _gcount = (res != EOF) ? 1 : 0; return res; } inline int CompressedInputStream::peek() { return _get(0); } inline std::streampos CompressedInputStream::tellg() { throw std::ios_base::failure("Not supported"); } inline std::istream& CompressedInputStream::seekg(std::streampos) { throw std::ios_base::failure("Not supported"); } inline std::istream& CompressedInputStream::putback(char) { setstate(std::ios::badbit); throw std::ios_base::failure("Not supported"); } inline std::istream& CompressedInputStream::unget() { setstate(std::ios::badbit); throw std::ios_base::failure("Not supported"); } #if !defined(_MSC_VER) || _MSC_VER > 1500 inline bool CompressedInputStream::seek(int64 bitPos) { // The only valid positions are block boundaries. if (LOAD_ATOMIC(_closed) == 1) return false; if (bitPos < 0) return false; #ifdef CONCURRENCY_ENABLED // Cancel any in-flight decode pipeline tied to the previous position. STORE_ATOMIC(_blockId, CANCEL_TASKS_ID); // Drain futures so no task can still consume the old underlying bitstream. for (int i = 0; i < _jobs; i++) { if (_futures[i].valid()) { try { (void) _futures[i].get(); } catch (...) { // Ignore: we are resetting the stream state anyway. } } } #endif // Reset decode state. 
_available = 0; _gcount = 0; _bufferId = 0; _maxBufferId = 0; _submitBlockId = 0; _consumeBlockId = 0; STORE_ATOMIC(_blockId, 0); if (_ibs->seek(bitPos) == false) return false; // Clear eof/fail flags potentially set by prior reads. this->clear(); // If stream was already initialized, bootstrap decoding tasks from new pos now. // If not initialized, read()/get() will initialize and submit as usual. if (LOAD_ATOMIC(_initialized) == 1) { for (int i = 0; i < _jobs; i++) submitBlock(i); } return true; } inline int64 CompressedInputStream::tell() { return _ibs->tell(); } #endif } #endif kanzi-cpp-2.5.2/src/io/CompressedOutputStream.cpp000066400000000000000000000723531516423635400220420ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #include #include #include #include "CompressedOutputStream.hpp" #include "IOException.hpp" #include "../Error.hpp" #include "../Magic.hpp" #include "../entropy/EntropyEncoderFactory.hpp" #include "../entropy/EntropyUtils.hpp" #include "../transform/TransformFactory.hpp" #include "../util/fixedbuf.hpp" using namespace kanzi; using namespace std; const int CompressedOutputStream::BITSTREAM_TYPE = 0x4B414E5A; // "KANZ" const int CompressedOutputStream::BITSTREAM_FORMAT_VERSION = 6; const int CompressedOutputStream::DEFAULT_BUFFER_SIZE = 256 * 1024; const kanzi::byte CompressedOutputStream::COPY_BLOCK_MASK = kanzi::byte(0x80); const kanzi::byte CompressedOutputStream::TRANSFORMS_MASK = kanzi::byte(0x10); const int CompressedOutputStream::MIN_BITSTREAM_BLOCK_SIZE = 1024; const int CompressedOutputStream::MAX_BITSTREAM_BLOCK_SIZE = 1024 * 1024 * 1024; const int CompressedOutputStream::SMALL_BLOCK_SIZE = 15; const int CompressedOutputStream::CANCEL_TASKS_ID = -1; const int CompressedOutputStream::MAX_CONCURRENCY = 64; CompressedOutputStream::CompressedOutputStream(OutputStream& os, int tasks, const string& entropy, const string& transform, int blockSize, int checksum, uint64 fileSize, #ifdef CONCURRENCY_ENABLED ThreadPool* pool, #endif bool headerless) : OutputStream(os.rdbuf()) { #ifdef CONCURRENCY_ENABLED if ((tasks <= 0) || (tasks > MAX_CONCURRENCY)) { stringstream ss; ss << "The number of jobs must be in [1.." 
<< MAX_CONCURRENCY << "], got " << tasks; throw invalid_argument(ss.str()); } _pool = pool; // can be null #else if (tasks != 1) throw invalid_argument("The number of jobs is limited to 1 in this version"); #endif if (blockSize > MAX_BITSTREAM_BLOCK_SIZE) { std::stringstream ss; ss << "The block size must be at most " << (MAX_BITSTREAM_BLOCK_SIZE >> 20) << " MB"; throw invalid_argument(ss.str()); } if (blockSize < MIN_BITSTREAM_BLOCK_SIZE) { std::stringstream ss; ss << "The block size must be at least " << MIN_BITSTREAM_BLOCK_SIZE; throw invalid_argument(ss.str()); } if ((blockSize & -16) != blockSize) throw invalid_argument("The block size must be a multiple of 16"); _blockId = 0; _inputBlockId = 0; _bufferId = 0; _blockSize = blockSize; _bufferThreshold = blockSize; _inputSize = fileSize; const int nbBlocks = (_inputSize == 0) ? 0 : int((_inputSize + int64(blockSize - 1)) / int64(blockSize)); _nbInputBlocks = min(nbBlocks, MAX_CONCURRENCY - 1); _headless = headerless; _initialized = 0; _closed = 0; _obs = new DefaultOutputBitStream(os, DEFAULT_BUFFER_SIZE); _entropyType = EntropyEncoderFactory::getType(entropy.c_str()); _transformType = TransformFactory::getType(transform.c_str()); if (checksum == 0) { _hasher32 = nullptr; _hasher64 = nullptr; } else if (checksum == 32) { _hasher32 = new XXHash32(BITSTREAM_TYPE); _hasher64 = nullptr; } else if (checksum == 64) { _hasher32 = nullptr; _hasher64 = new XXHash64(BITSTREAM_TYPE); } else { throw invalid_argument("The block checksum size must be 0, 32 or 64"); } _jobs = tasks; _ctx.putInt("blockSize", _blockSize); _ctx.putInt("checksum", checksum); _ctx.putString("entropy", entropy); _ctx.putString("transform", transform); _ctx.putInt("bsVersion", BITSTREAM_FORMAT_VERSION); #ifdef CONCURRENCY_ENABLED _futures.resize(_jobs); #endif _jobsPerTask.resize(_jobs); // Assign optimal number of tasks and jobs per task (if the number of blocks is available) if (_jobs > 1) { // Limit the number of tasks if there are fewer blocks 
that _jobs // It allows more jobs per task and reduces memory usage. int nbTasks = (_nbInputBlocks != 0) ? min(_nbInputBlocks, _jobs) : _jobs; Global::computeJobsPerTask(&_jobsPerTask[0], _jobs, nbTasks); } else { _jobsPerTask[0] = 1; } // Allocate first buffer and add padding for incompressible blocks _buffers = new SliceArray*[2 * _jobs]; const int bufSize = max(_blockSize + (_blockSize >> 3), DEFAULT_BUFFER_SIZE); _buffers[0] = new SliceArray(new kanzi::byte[bufSize], bufSize, 0); for (int i = 1; i < 2 * _jobs; i++) _buffers[i] = new SliceArray(nullptr, 0, 0); } CompressedOutputStream::CompressedOutputStream(OutputStream& os, Context& ctx, bool headerless) : OutputStream(os.rdbuf()) , _ctx(ctx) { int tasks = ctx.getInt("jobs", 1); #ifdef CONCURRENCY_ENABLED if ((tasks <= 0) || (tasks > MAX_CONCURRENCY)) { stringstream ss; ss << "The number of jobs must be in [1.." << MAX_CONCURRENCY << "], got " << tasks; throw invalid_argument(ss.str()); } _pool = _ctx.getPool(); // can be null #else if (tasks != 1) throw invalid_argument("The number of jobs is limited to 1 in this version"); #endif int blockSize = ctx.getInt("blockSize"); if (blockSize > MAX_BITSTREAM_BLOCK_SIZE) { std::stringstream ss; ss << "The block size must be at most " << (MAX_BITSTREAM_BLOCK_SIZE >> 20) << " MB"; throw invalid_argument(ss.str()); } if (blockSize < MIN_BITSTREAM_BLOCK_SIZE) { std::stringstream ss; ss << "The block size must be at least " << MIN_BITSTREAM_BLOCK_SIZE; throw invalid_argument(ss.str()); } if ((blockSize & -16) != blockSize) throw invalid_argument("The block size must be a multiple of 16"); _inputSize = ctx.getLong("fileSize", 0); const int nbBlocks = (_inputSize == 0) ? 
0 : int((_inputSize + int64(blockSize - 1)) / int64(blockSize)); _nbInputBlocks = min(nbBlocks, MAX_CONCURRENCY - 1); _jobs = tasks; _blockId = 0; _inputBlockId = 0; _bufferId = 0; _blockSize = blockSize; _bufferThreshold = blockSize; _initialized = 0; _closed = 0; _headless = headerless; _obs = new DefaultOutputBitStream(os, DEFAULT_BUFFER_SIZE); _ctx.putInt("bsVersion", BITSTREAM_FORMAT_VERSION); string entropyCodec = ctx.getString("entropy"); string transform = ctx.getString("transform"); _entropyType = EntropyEncoderFactory::getType(entropyCodec.c_str()); _transformType = TransformFactory::getType(transform.c_str()); int checksum = ctx.getInt("checksum", 0); if (checksum == 0) { _hasher32 = nullptr; _hasher64 = nullptr; } else if (checksum == 32) { _hasher32 = new XXHash32(BITSTREAM_TYPE); _hasher64 = nullptr; } else if (checksum == 64) { _hasher32 = nullptr; _hasher64 = new XXHash64(BITSTREAM_TYPE); } else { throw invalid_argument("The block checksum size must be 0, 32 or 64"); } #ifdef CONCURRENCY_ENABLED _futures.resize(_jobs); #endif _jobsPerTask.resize(_jobs); // Assign optimal number of tasks and jobs per task (if the number of blocks is available) if (_jobs > 1) { // Limit the number of tasks if there are fewer blocks that _jobs // It allows more jobs per task and reduces memory usage. int nbTasks = (_nbInputBlocks != 0) ? 
min(_nbInputBlocks, _jobs) : _jobs; Global::computeJobsPerTask(&_jobsPerTask[0], _jobs, nbTasks); } else { _jobsPerTask[0] = 1; } _buffers = new SliceArray*[2 * _jobs]; // Allocate first buffer and add padding for incompressible blocks const int bufSize = max(_blockSize + (_blockSize >> 3), DEFAULT_BUFFER_SIZE); _buffers[0] = new SliceArray(new kanzi::byte[bufSize], bufSize, 0); for (int i = 1; i < 2 * _jobs; i++) _buffers[i] = new SliceArray(nullptr, 0, 0); } CompressedOutputStream::~CompressedOutputStream() { try { close(); } catch (const exception&) { // Ignore and continue } for (int i = 0; i < 2 * _jobs; i++) { if (_buffers[i]->_array != nullptr) delete[] _buffers[i]->_array; delete _buffers[i]; } delete[] _buffers; delete _obs; if (_hasher32 != nullptr) { delete _hasher32; _hasher32 = nullptr; } if (_hasher64 != nullptr) { delete _hasher64; _hasher64 = nullptr; } } void CompressedOutputStream::writeHeader() { if ((_headless == true) || (EXCHANGE_ATOMIC(_initialized, 1) == 1)) return; if (_obs->writeBits(BITSTREAM_TYPE, 32) != 32) throw IOException("Cannot write bitstream type to header", Error::ERR_WRITE_FILE); if (_obs->writeBits(BITSTREAM_FORMAT_VERSION, 4) != 4) throw IOException("Cannot write bitstream version to header", Error::ERR_WRITE_FILE); uint ckSize = 0; if (_hasher32 != nullptr) ckSize = 1; else if (_hasher64 != nullptr) ckSize = 2; if (_obs->writeBits(ckSize, 2) != 2) throw IOException("Cannot write block checksum size to header", Error::ERR_WRITE_FILE); if (_obs->writeBits(_entropyType, 5) != 5) throw IOException("Cannot write entropy type to header", Error::ERR_WRITE_FILE); if (_obs->writeBits(_transformType, 48) != 48) throw IOException("Cannot write transform types to header", Error::ERR_WRITE_FILE); if (_obs->writeBits(_blockSize >> 4, 28) != 28) throw IOException("Cannot write block size to header", Error::ERR_WRITE_FILE); // _inputSize not provided or >= 2^48 -> 0, <2^16 -> 1, <2^32 -> 2, <2^48 -> 3 const uint szMask = ((_inputSize == 0) 
|| (_inputSize >= (int64(1) << 48))) ? 0 : (Global::log2(uint64(_inputSize)) >> 4) + 1; if (_obs->writeBits(szMask, 2) != 2) throw IOException("Cannot write size of input to header", Error::ERR_WRITE_FILE); if (szMask != 0) { if (_obs->writeBits(_inputSize, 16 * szMask) != 16 * szMask) throw IOException("Cannot write size of input to header", Error::ERR_WRITE_FILE); } const uint64 padding = 0; if (_obs->writeBits(padding, 15) != 15) throw IOException("Cannot write padding to header", Error::ERR_WRITE_FILE); uint32 seed = 0x01030507 * BITSTREAM_FORMAT_VERSION; // no const to avoid VS2008 warning const uint32 HASH = 0x1E35A7BD; uint32 cksum = HASH * seed; cksum ^= (HASH * uint32(~ckSize)); cksum ^= (HASH * uint32(~_entropyType)); cksum ^= (HASH * uint32((~_transformType) >> 32)); cksum ^= (HASH * uint32(~_transformType)); cksum ^= (HASH * uint32(~_blockSize)); if (szMask != 0) { cksum ^= (HASH * uint32((~_inputSize) >> 32)); cksum ^= (HASH * uint32(~_inputSize)); } cksum = (cksum >> 23) ^ (cksum >> 3); if (_obs->writeBits(cksum, 24) != 24) throw IOException("Cannot write checksum to header", Error::ERR_WRITE_FILE); } bool CompressedOutputStream::addListener(Listener& bl) { _listeners.push_back(&bl); return true; } bool CompressedOutputStream::removeListener(Listener& bl) { std::vector*>::iterator it = find(_listeners.begin(), _listeners.end(), &bl); if (it == _listeners.end()) return false; _listeners.erase(it); return true; } ostream& CompressedOutputStream::write(const char* data, streamsize length) { int off = 0; int remaining = int(length); if (remaining < 0) throw IOException("Invalid buffer size"); while (remaining > 0) { const int lenChunk = min(remaining, _bufferThreshold - _buffers[_bufferId]->_index); if (lenChunk > 0) { memcpy(&_buffers[_bufferId]->_array[_buffers[_bufferId]->_index], &data[off], lenChunk); _buffers[_bufferId]->_index += lenChunk; off += lenChunk; remaining -= lenChunk; if (_buffers[_bufferId]->_index >= _bufferThreshold) { 
processBuffer(); } } else { // Handle full buffer / closed stream processBuffer(); } } return *this; } void CompressedOutputStream::close() { if (LOAD_ATOMIC(_closed) == 1) return; string errMsg; try { // Submit the last partial block (if any) submitBlock(); #ifdef CONCURRENCY_ENABLED // Wait for ALL pending tasks to complete for (int i = 0; i < _jobs; i++) { if (_futures[i].valid()) { EncodingTaskResult res = _futures[i].get(); if (res._error != 0) throw IOException(res._msg, res._error); } } #endif // Write last block: length-3 (0) and 0 bits _obs->writeBits(uint64(0), 5); _obs->writeBits(uint64(0), 3); _obs->close(); } catch (const exception& e) { setstate(ios::badbit); errMsg = e.what(); } STORE_ATOMIC(_closed, 1); // Force subsequent writes to trigger submitBlock immediately _bufferThreshold = 0; // Release resources for (int i = 0; i < 2 * _jobs; i++) { if (_buffers[i]->_array != nullptr) delete[] _buffers[i]->_array; _buffers[i]->_array = nullptr; _buffers[i]->_length = 0; _buffers[i]->_index = 0; } if (errMsg != "") throw IOException(errMsg, Error::ERR_WRITE_FILE); setstate(ios::eofbit); } void CompressedOutputStream::processBuffer() { submitBlock(); _bufferId = (_bufferId + 1) % _jobs; #ifdef CONCURRENCY_ENABLED if (_futures[_bufferId].valid()) { EncodingTaskResult res = _futures[_bufferId].get(); if (res._error != 0) throw IOException(res._msg, res._error); } #endif const int bSize = _blockSize + (_blockSize >> 6); const int bufSize = max(bSize, 65536); if (_buffers[_bufferId]->_length == 0) { if (_buffers[_bufferId]->_array != nullptr) delete[] _buffers[_bufferId]->_array; _buffers[_bufferId]->_array = new kanzi::byte[bufSize]; _buffers[_bufferId]->_length = bufSize; } _buffers[_bufferId]->_index = 0; } void CompressedOutputStream::submitBlock() { if (LOAD_ATOMIC(_closed) == 1) throw IOException("Stream closed", Error::ERR_WRITE_FILE); writeHeader(); // Ensure header is written before first block processing const int dataLength = 
_buffers[_bufferId]->_index; if (dataLength == 0) return; // Increment input block counter (1-based for the Task logic) _inputBlockId++; Context copyCtx(_ctx); copyCtx.putLong("tType", _transformType); copyCtx.putInt("eType", _entropyType); copyCtx.putInt("blockId", _inputBlockId); copyCtx.putInt("size", dataLength); copyCtx.putInt("jobs", _jobsPerTask[_bufferId]); // Prepare the buffer for processing _buffers[_bufferId]->_index = 0; // Create the task // Note: Input is _buffers[_bufferId], Output is _buffers[_jobs + _bufferId] EncodingTask* task = new EncodingTask( _buffers[_bufferId], _buffers[_jobs + _bufferId], _obs, _hasher32, _hasher64, #ifdef CONCURRENCY_ENABLED &_blockMutex, &_blockCondition, #endif &_blockId, _listeners, copyCtx); #ifdef CONCURRENCY_ENABLED std::shared_ptr> safeTask(task); auto taskWrapper = [safeTask]() { return safeTask->run(); }; if (_pool == nullptr) { _futures[_bufferId] = std::async(std::launch::async, taskWrapper); } else { // REQUIRES: Pool size > Number of concurrent tasks to avoid deadlock _futures[_bufferId] = _pool->schedule(taskWrapper); } #else // Synchronous fallback try { EncodingTaskResult res = task->run(); if (res._error != 0) throw IOException(res._msg, res._error); } catch (...) 
{ delete task; throw; } delete task; #endif } ostream& CompressedOutputStream::put(char c) { try { if (_buffers[_bufferId]->_index >= _bufferThreshold) { // Submit current buffer submitBlock(); // Rotate to next buffer _bufferId = (_bufferId + 1) % _jobs; // If concurrent, wait if the target buffer is still busy #ifdef CONCURRENCY_ENABLED if (_futures[_bufferId].valid()) { EncodingTaskResult res = _futures[_bufferId].get(); if (res._error != 0) throw IOException(res._msg, res._error); } #endif // Allocation / Reset logic const int bufSize = max(_blockSize + (_blockSize >> 6), 65536); if (_buffers[_bufferId]->_length == 0) { if (_buffers[_bufferId]->_array != nullptr) delete[] _buffers[_bufferId]->_array; _buffers[_bufferId]->_array = new kanzi::byte[bufSize]; _buffers[_bufferId]->_length = bufSize; } _buffers[_bufferId]->_index = 0; } _buffers[_bufferId]->_array[_buffers[_bufferId]->_index++] = kanzi::byte(c); return *this; } catch (const exception& e) { setstate(std::ios::badbit); throw std::ios_base::failure(e.what()); } } void CompressedOutputStream::notifyListeners(vector*>& listeners, const Event& evt) { for (vector*>::iterator it = listeners.begin(); it != listeners.end(); ++it) (*it)->processEvent(evt); } template EncodingTask::EncodingTask(SliceArray* iBuffer, SliceArray* oBuffer, DefaultOutputBitStream* obs, XXHash32* hasher32, XXHash64* hasher64, #ifdef CONCURRENCY_ENABLED std::mutex* blockMutex, std::condition_variable* blockCondition, #endif atomic_int_t* processedBlockId, vector*>& listeners, const Context& ctx) : _obs(obs) , _listeners(listeners) , _ctx(ctx) { _data = iBuffer; _buffer = oBuffer; _hasher32 = hasher32; _hasher64 = hasher64; #ifdef CONCURRENCY_ENABLED _blockMutex = blockMutex; _blockCondition = blockCondition; #endif _processedBlockId = processedBlockId; } // Encode mode + transformed entropy coded data // mode | 0b1yy0xxxx => copy block // | 0b0yy00000 => size(size(block))-1 // case 4 transforms or less // | 0b0001xxxx => transform 
sequence skip flags (1 means skip) // case more than 4 transforms // | 0b0yy00000 0bxxxxxxxx => transform sequence skip flags in next kanzi::byte (1 means skip) template T EncodingTask::run() { const int blockId = _ctx.getInt("blockId"); const int blockLength = _ctx.getInt("size"); TransformSequence* transform = nullptr; EntropyEncoder* ee = nullptr; auto storeProcessedBlockId = [this](int value) { #ifdef CONCURRENCY_ENABLED { std::lock_guard lock(*_blockMutex); STORE_ATOMIC(*_processedBlockId, value); } _blockCondition->notify_all(); #else STORE_ATOMIC(*_processedBlockId, value); #endif }; auto fetchAddProcessedBlockId = [this]() { #ifdef CONCURRENCY_ENABLED { std::lock_guard lock(*_blockMutex); FETCH_ADD_ATOMIC(*_processedBlockId, 1); } _blockCondition->notify_all(); #else FETCH_ADD_ATOMIC(*_processedBlockId, 1); #endif }; try { if (blockLength == 0) { // Last block (only block with 0 length) fetchAddProcessedBlockId(); return T(blockId, 0, "Success"); } kanzi::byte mode = kanzi::byte(0); int postTransformLength = blockLength; uint64 checksum = 0; uint64 tType = _ctx.getLong("tType"); short eType = short(_ctx.getInt("eType")); Event::HashType hashType = Event::NO_HASH; WallTimer timer; // Compute block checksum if (_hasher32 != nullptr) { checksum = _hasher32->hash(&_data->_array[_data->_index], blockLength); hashType = Event::SIZE_32; } else if (_hasher64 != nullptr) { checksum = _hasher64->hash(&_data->_array[_data->_index], blockLength); hashType = Event::SIZE_64; } if (_listeners.size() > 0) { // Notify before transform Event evt(Event::BEFORE_TRANSFORM, blockId, int64(blockLength), timer.getCurrentTime(), checksum, hashType); CompressedOutputStream::notifyListeners(_listeners, evt); } if (blockLength <= CompressedOutputStream::SMALL_BLOCK_SIZE) { tType = TransformFactory::NONE_TYPE; eType = EntropyEncoderFactory::NONE_TYPE; mode |= CompressedOutputStream::COPY_BLOCK_MASK; } else { int checkSkip = _ctx.getInt("skipBlocks", 0); if (checkSkip != 0) { bool skip 
= Magic::isCompressed(Magic::getType(&_data->_array[_data->_index])); if (skip == false) { uint histo[256] = { 0 }; Global::computeHistogram(&_data->_array[_data->_index], blockLength, histo); const int entropy = Global::computeFirstOrderEntropy1024(blockLength, histo); skip = entropy >= EntropyUtils::INCOMPRESSIBLE_THRESHOLD; //_ctx.putString("histo0", toString(histo, 256)); } if (skip == true) { tType = TransformFactory::NONE_TYPE; eType = EntropyEncoderFactory::NONE_TYPE; mode |= CompressedOutputStream::COPY_BLOCK_MASK; } } } _ctx.putInt("size", blockLength); transform = TransformFactory::newTransform(_ctx, tType); const int requiredSize = transform->getMaxEncodedLength(blockLength); if (blockLength >= 4) { uint magic = Magic::getType(&_data->_array[_data->_index]); if (Magic::isCompressed(magic) == true) _ctx.putInt("dataType", Global::BIN); else if (Magic::isMultimedia(magic) == true) _ctx.putInt("dataType", Global::MULTIMEDIA); else if (Magic::isExecutable(magic) == true) _ctx.putInt("dataType", Global::EXE); } if (_buffer->_length < requiredSize) { if (_buffer->_array != nullptr) delete[] _buffer->_array; _buffer->_array = new kanzi::byte[requiredSize]; _buffer->_length = requiredSize; } // Forward transform (ignore error, encode skipFlags) // _data->_length is at least blockLength _buffer->_index = 0; transform->forward(*_data, *_buffer, blockLength); const int nbTransforms = transform->getNbTransforms(); const kanzi::byte skipFlags = transform->getSkipFlags(); delete transform; transform = nullptr; postTransformLength = _buffer->_index; if (postTransformLength < 0) { storeProcessedBlockId(CompressedOutputStream::CANCEL_TASKS_ID); return T(blockId, Error::ERR_WRITE_FILE, "Invalid transform size"); } _ctx.putInt("size", postTransformLength); const int dataSize = (postTransformLength < 256) ? 
1 : (Global::_log2(uint32(postTransformLength)) >> 3) + 1; if (dataSize > 4) { storeProcessedBlockId(CompressedOutputStream::CANCEL_TASKS_ID); return T(blockId, Error::ERR_WRITE_FILE, "Invalid block data length"); } // Record size of 'block size' - 1 in bytes mode |= kanzi::byte(((dataSize - 1) & 0x03) << 5); if (_listeners.size() > 0) { // Notify after transform Event evt(Event::AFTER_TRANSFORM, blockId, int64(postTransformLength), timer.getCurrentTime(), checksum, hashType); CompressedOutputStream::notifyListeners(_listeners, evt); } const int bufSize = max(CompressedOutputStream::DEFAULT_BUFFER_SIZE, max(postTransformLength, blockLength + (blockLength >> 3))); if (_data->_length < bufSize) { // Rare case where the transform expanded the input or // entropy coder may expand size. delete[] _data->_array; _data->_length = bufSize; _data->_array = new kanzi::byte[_data->_length]; } _data->_index = 0; ofixedbuf buf(reinterpret_cast(&_data->_array[_data->_index]), streamsize(_data->_length)); ostream os(&buf); DefaultOutputBitStream obs(os); // Write block 'header' (mode + compressed length) if (((mode & CompressedOutputStream::COPY_BLOCK_MASK) != kanzi::byte(0)) || (nbTransforms <= 4)) { mode |= kanzi::byte(skipFlags >> 4); obs.writeBits(uint64(mode), 8); } else { mode |= CompressedOutputStream::TRANSFORMS_MASK; obs.writeBits(uint64(mode), 8); obs.writeBits(uint64(skipFlags), 8); } obs.writeBits(postTransformLength, 8 * dataSize); // Write checksum if (_hasher32 != nullptr) obs.writeBits(checksum, 32); else if (_hasher64 != nullptr) obs.writeBits(checksum, 64); if (_listeners.size() > 0) { // Notify before entropy Event evt(Event::BEFORE_ENTROPY, blockId, int64(postTransformLength), timer.getCurrentTime(), checksum, hashType); CompressedOutputStream::notifyListeners(_listeners, evt); } // Each block is encoded separately // Rebuild the entropy encoder to reset block statistics ee = EntropyEncoderFactory::newEncoder(obs, _ctx, eType); // Entropy encode block if 
(ee->encode(_buffer->_array, 0, postTransformLength) != postTransformLength) { delete ee; storeProcessedBlockId(CompressedOutputStream::CANCEL_TASKS_ID); return T(blockId, Error::ERR_PROCESS_BLOCK, "Entropy coding failed"); } // Dispose before processing statistics (may write to the bitstream) ee->dispose(); delete ee; ee = nullptr; obs.close(); uint64 written = obs.written(); const uint lw = (written < 8) ? 3 : uint(Global::log2(uint32(written >> 3)) + 4); #ifdef CONCURRENCY_ENABLED { std::unique_lock lock(*_blockMutex); _blockCondition->wait(lock, [this, blockId]() { const int taskId = LOAD_ATOMIC(*_processedBlockId); return (taskId == CompressedOutputStream::CANCEL_TASKS_ID) || (taskId == blockId - 1); }); } if (LOAD_ATOMIC(*_processedBlockId) == CompressedOutputStream::CANCEL_TASKS_ID) return T(blockId, 0, "Canceled"); #endif // Emit block size in bits (max size pre-entropy is 1 GB = 1 << 30 bytes) #if !defined(_MSC_VER) || _MSC_VER > 1500 const int64 blockOffset = _obs->tell(); #endif _obs->writeBits(lw - 3, 5); // write length-3 (5 bits max) _obs->writeBits(written, lw); int64 ww = int64((written + 7) >> 3); // Emit data to shared bitstream for (uint n = 0; written > 0; ) { uint chkSize = uint(min(written, uint64(1) << 30)); _obs->writeBits(&_data->_array[n], chkSize); n += ((chkSize + 7) >> 3); written -= uint64(chkSize); } // After completion of the entropy coding, increment the block id. // It unblocks the task processing the next block (if any). 
storeProcessedBlockId(blockId); if (_listeners.size() > 0) { // Notify after entropy Event evt1(Event::AFTER_ENTROPY, blockId, ww, timer.getCurrentTime(), checksum, hashType); CompressedOutputStream::notifyListeners(_listeners, evt1); #if !defined(_MSC_VER) || _MSC_VER > 1500 if (_ctx.getInt("verbosity", 0) > 4) { string oName = _ctx.getString("outputName"); if (oName.length() == 4) { std::transform(oName.begin(), oName.end(), oName.begin(), ::toupper); } Event evt2(Event::BLOCK_INFO, blockId, int64((written + 7) >> 3), timer.getCurrentTime(), checksum, hashType, blockOffset, uint8(skipFlags)); CompressedOutputStream::notifyListeners(_listeners, evt2); } #endif } return T(blockId, 0, "Success"); } catch (const exception& e) { // Cancel any in-flight task waiting on this block. storeProcessedBlockId(CompressedOutputStream::CANCEL_TASKS_ID); if (transform != nullptr) delete transform; if (ee != nullptr) delete ee; return T(blockId, Error::ERR_PROCESS_BLOCK, e.what()); } } kanzi-cpp-2.5.2/src/io/CompressedOutputStream.hpp000066400000000000000000000147541516423635400220500ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once #ifndef knz_CompressedOutputStream #define knz_CompressedOutputStream #include #include #include "../concurrent.hpp" #include "../Context.hpp" #include "../Event.hpp" #include "../Listener.hpp" #include "../OutputStream.hpp" #include "../SliceArray.hpp" #include "../bitstream/DefaultOutputBitStream.hpp" #include "../util/XXHash.hpp" #if __cplusplus >= 201103L #include #endif #ifdef CONCURRENCY_ENABLED #include #endif namespace kanzi { class EncodingTaskResult FINAL { public: int _blockId; int _error; // 0 = OK std::string _msg; EncodingTaskResult() { _blockId = -1; _error = 0; } EncodingTaskResult(int blockId, int error, const std::string& msg) : _blockId(blockId) , _error(error) , _msg(msg) { } EncodingTaskResult(const EncodingTaskResult& result) : _blockId(result._blockId) , _error(result._error) , _msg(result._msg) { } EncodingTaskResult& operator = (const EncodingTaskResult& result) { _msg = result._msg; _blockId = result._blockId; _error = result._error; return *this; } #if __cplusplus >= 201103L EncodingTaskResult(EncodingTaskResult&& other) noexcept : _blockId(other._blockId) , _error(other._error) , _msg(std::move(other._msg)) // Transfer ownership of string buffer { } // Move Assignment Operator EncodingTaskResult& operator=(EncodingTaskResult&& other) noexcept { if (this != &other) { _blockId = other._blockId; _error = other._error; _msg = std::move(other._msg); // Transfer ownership of string buffer } return *this; } #endif ~EncodingTaskResult() {} }; // A task used to encode a block // Several tasks (transform+entropy) may run in parallel template class EncodingTask FINAL : public Task { private: SliceArray* _data; SliceArray* _buffer; DefaultOutputBitStream* _obs; XXHash32* _hasher32; XXHash64* _hasher64; #ifdef CONCURRENCY_ENABLED std::mutex* _blockMutex; std::condition_variable* _blockCondition; #endif atomic_int_t* _processedBlockId; std::vector*> _listeners; Context _ctx; public: EncodingTask(SliceArray* iBuffer, SliceArray* 
   // Output stream that splits the written bytes into blocks, encodes each
   // block with an EncodingTask (transform + entropy coding) and writes the
   // result to a bitstream created over the stream passed to the constructor.
   class CompressedOutputStream : public OutputStream {
       friend class EncodingTask;

   public:
       // jobs: number of jobs, entropy/transform: codec names,
       // blockSize: block size in bytes, checksum: 0, 32 or 64 bits,
       // originalSize: size of the input in bytes (0 = not provided),
       // headerless: when true no bitstream header is written.
       CompressedOutputStream(OutputStream& os,
                   int jobs = 1,
                   const std::string& entropy = "NONE",
                   const std::string& transform = "NONE",
                   int blockSize = 4*1024*1024,
                   int checksum = 0,
                   uint64 originalSize = 0,
#ifdef CONCURRENCY_ENABLED
                   ThreadPool* pool = nullptr,
#endif
                   bool headerless = false);

       // Same as above with the parameters read from the context
       // ("jobs", "blockSize", "fileSize", "entropy", "transform", "checksum")
       CompressedOutputStream(OutputStream& os, Context& ctx, bool headerless = false);

       // Closes the stream (errors are ignored) and releases the resources
       ~CompressedOutputStream();

       // Registers a listener notified of the block encoding events
       bool addListener(Listener& bl);

       // Returns false when the listener was not registered
       bool removeListener(Listener& bl);

       std::ostream& write(const char* s, std::streamsize n);

       std::ostream& put(char c);

       // No-op
       std::ostream& flush();

       // Not supported: always throws std::ios_base::failure
       std::streampos tellp();

       // Not supported: always throws std::ios_base::failure
       std::ostream& seekp(std::streampos pos);

       // Submits the last block, writes the end of stream marker and
       // releases the buffers. Subsequent calls do nothing.
       void close();

       // Number of bytes written to the bitstream (bit count rounded up)
       uint64 getWritten() const { return (_obs->written() + 7) >> 3; }

   protected:
       // Writes the bitstream header once; does nothing in headerless mode
       void writeHeader();

   private:
       static const int BITSTREAM_TYPE;
       static const int BITSTREAM_FORMAT_VERSION;
       static const int DEFAULT_BUFFER_SIZE;
       static const byte COPY_BLOCK_MASK;
       static const byte TRANSFORMS_MASK;
       static const int MIN_BITSTREAM_BLOCK_SIZE;
       static const int MAX_BITSTREAM_BLOCK_SIZE;
       static const int SMALL_BLOCK_SIZE;
       static const int CANCEL_TASKS_ID;
       static const int MAX_CONCURRENCY;

       int _blockSize;
       int _bufferId; // index of current write buffer
       int _jobs;
       int _bufferThreshold; // fill level triggering a block submission (block size, 0 once closed)
       int _nbInputBlocks; // derived from the input size, capped at MAX_CONCURRENCY - 1 (0 = unknown)
       int64 _inputSize; // 0 when not provided
       XXHash32* _hasher32; // non null when the block checksum is 32 bits
       XXHash64* _hasher64; // non null when the block checksum is 64 bits
       SliceArray** _buffers; // input & output per block
       short _entropyType;
       uint64 _transformType;
       DefaultOutputBitStream* _obs;
       atomic_int_t _initialized; // set to 1 by the first call to writeHeader()
       atomic_int_t _closed; // set to 1 by close()
       atomic_int_t _blockId; // shared with the tasks: id of the last block emitted or CANCEL_TASKS_ID
       atomic_int_t _inputBlockId; // Counter for input blocks
       std::vector*> _listeners;
       std::vector _jobsPerTask;
       Context _ctx;
       bool _headless; // true: no header written

#ifdef CONCURRENCY_ENABLED
       ThreadPool* _pool; // can be null
       std::vector > _futures; // Futures for async tasks
       std::mutex _blockMutex;
       std::condition_variable _blockCondition;
#endif

       // Submits the current buffer then switches to the next one
       void processBuffer();

       // Creates (and runs or schedules) the task encoding the current buffer
       void submitBlock();

       static void notifyListeners(std::vector*>& listeners, const Event& evt);
   };
Error code: " + TOSTR(error)) { _code = error; } #if __cplusplus >= 201103L IOException(const IOException&) = default; IOException& operator=(const IOException&) = default; #endif int error() const { return _code; } ~IOException() NOEXCEPT {} }; } #endif kanzi-cpp-2.5.2/src/io/IOUtil.hpp000066400000000000000000000231001516423635400164750ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #pragma once #ifndef knz_IOUtil #define knz_IOUtil #include #include #include #include #include #include #if __cplusplus >= 201703L #include #endif #include "../types.hpp" #ifdef _MSC_VER #include "../msvc_dirent.hpp" #include #else #include #endif // Note stat64/lstat64 are deprecated on MacOS/Linux // Use _FILE_OFFSET_BITS and stat/lstat instead #ifdef _WIN32 #define STAT _stat64 #define LSTAT _stat64 #else #define _FILE_OFFSET_BITS 64 #define STAT stat #define LSTAT lstat #endif #if defined(__MINGW32__) // Missing from stat in mingw32 const int S_IFLNK = 40960; #endif namespace kanzi { struct FileData { std::string _path; std::string _name; int64 _size; int64 _modifTime; FileData(const std::string& path, int64 size, int64 _modifTime = 0) : _size(size) , _modifTime(_modifTime) { size_t idx = path.find_last_of(PATH_SEPARATOR); if (idx != std::string::npos) { _path = path.substr(0, idx + 1); _name = path.substr(idx + 1); } else { _path = ""; _name = path; } } std::string fullPath() const { return (_path.length() == 0) ? 
_name : _path + _name; } }; struct FileListConfig { bool _recursive; bool _ignoreLinks; // Do not follow links bool _continueOnErrors; bool _ignoreDotFiles; // Do not process dot files }; static inline void createFileList(std::string& target, std::vector& files, const FileListConfig& cfg, std::vector& errors) { if (target.size() == 0) return; // Note: old version of Windows/Visual Studio require a trailing '/' to stat network folders ! // In this scenario, "//PC/share" does not work but "//PC/share/" does #ifndef _MSC_VER if ((target.size() > 1) && (target[target.size() - 1] == PATH_SEPARATOR)) target.resize(target.size() - 1); #endif if (cfg._ignoreDotFiles == true) { size_t idx = target.rfind(PATH_SEPARATOR); if ((idx != std::string::npos) && (idx < target.length() - 1) && (target[idx + 1] == '.')) return; } struct STAT buffer; int res = cfg._ignoreLinks ? LSTAT(target.c_str(), &buffer) : STAT(target.c_str(), &buffer); if (res != 0) { std::stringstream ss; ss << "Cannot access input file '" << target << "': " << strerror(errno); errors.push_back(ss.str()); if (cfg._continueOnErrors == false) return; } #ifdef _WIN32 if (S_ISREG(buffer.st_mode)) { #else if (S_ISREG(buffer.st_mode) || (!cfg._ignoreLinks && S_ISLNK(buffer.st_mode))) { #endif #if __cplusplus >= 201103L files.emplace_back(target, buffer.st_size, buffer.st_mtime); #else files.push_back(FileData(target, buffer.st_size, buffer.st_mtime)); #endif return; } if (!S_ISDIR(buffer.st_mode)) { // Target is neither regular file nor directory, ignore return; } if (cfg._recursive) { if (target[target.size() - 1] != PATH_SEPARATOR) target += PATH_SEPARATOR; } else { target.resize(target.size() - 1); } DIR* dir = opendir(target.c_str()); if (dir != nullptr) { const struct dirent* ent; while ((ent = readdir(dir)) != nullptr) { std::string dirName = ent->d_name; if ((dirName == ".") || (dirName == "..")) continue; std::string fullpath = target + dirName; res = cfg._ignoreLinks ? 
LSTAT(fullpath.c_str(), &buffer) : STAT(fullpath.c_str(), &buffer); if (res != 0) { std::stringstream ss; ss << "Cannot access input file '" << fullpath << "': " << strerror(errno); errors.push_back(ss.str()); if (cfg._continueOnErrors == false) { closedir(dir); return; } } #ifdef _WIN32 if (S_ISREG(buffer.st_mode)) { #else if (S_ISREG(buffer.st_mode) || (!cfg._ignoreLinks && S_ISLNK(buffer.st_mode))) { #endif // Target is regular file if (cfg._ignoreDotFiles == true) { size_t idx = fullpath.rfind(PATH_SEPARATOR); if ((idx != std::string::npos) && (idx < fullpath.length() - 1) && (fullpath[idx + 1] == '.')) continue; } #if __cplusplus >= 201103L files.emplace_back(fullpath, buffer.st_size, buffer.st_mtime); #else files.push_back(FileData(fullpath, buffer.st_size, buffer.st_mtime)); #endif } else if (cfg._recursive && S_ISDIR(buffer.st_mode)) { if (cfg._ignoreDotFiles == true) { size_t idx = fullpath.rfind(PATH_SEPARATOR); if ((idx != std::string::npos) && (idx < fullpath.length() - 1) && (fullpath[idx + 1] == '.')) continue; } createFileList(fullpath, files, cfg, errors); } } closedir(dir); } else { std::stringstream ss; ss << "Cannot read directory '" << target << "'"; errors.push_back(ss.str()); } } struct FileDataComparator { bool _sortBySize; bool operator() (const FileData& f1, const FileData& f2) const { // First, compare parent directory paths if (f1._path != f2._path) return f1._path < f2._path; // Then compare file sizes (decreasing order) return _sortBySize ? 
f1._size > f2._size : f1._name < f2._name; } }; static inline void sortFilesByPathAndSize(std::vector& files, bool sortBySize = false) { if (files.size() > 1) { FileDataComparator c = { sortBySize }; sort(files.begin(), files.end(), c); } } #if defined(WIN32) || defined(_WIN32) || defined(_WIN64) static inline int mkdirAll(const std::string& path) { bool foundDrive = false; #else static inline int mkdirAll(const std::string& path, mode_t mode = S_IRWXU | S_IRWXG | S_IROTH | S_IWOTH | S_IXOTH) { #endif int res = 0; errno = 0; // Scan path, ignoring potential PATH_SEPARATOR at position 0 for (size_t i = 1; i < path.size(); i++) { if (path[i] == PATH_SEPARATOR) { std::string curPath = path; curPath.resize(i); if (curPath.length() == 0) continue; #if defined(WIN32) || defined(_WIN32) || defined(_WIN64) //Skip if drive if ((foundDrive == false) && (curPath.length() == 2) && (curPath[1] == ':')) { foundDrive = true; continue; } #endif #if defined(_MSC_VER) res = _mkdir(curPath.c_str()); #elif defined(__MINGW32__) res = mkdir(curPath.c_str()); #else res = mkdir(curPath.c_str(), mode); #endif if ((res != 0) && (errno != EEXIST)) return errno; } } errno = 0; #if defined(_MSC_VER) res = _mkdir(path.c_str()); #elif defined(__MINGW32__) res = mkdir(path.c_str()); #else res = mkdir(path.c_str(), mode); #endif return (res == 0) ? 
0 : errno; } static inline bool samePaths(const std::string& f1, const std::string& f2) { #if __cplusplus >= 201703L // Simpler and safer code with C++17 std::error_code ec; return std::filesystem::equivalent(f1, f2, ec); #else if (f1.compare(f2) == 0) return true; struct STAT buf1; int s1 = STAT(f1.c_str(), &buf1); struct STAT buf2; int s2 = STAT(f2.c_str(), &buf2); if ((s1 < 0) && (s2 < 0)) return false; if (s1 != s2) return false; if (buf1.st_dev != buf2.st_dev) return false; if (buf1.st_ino != buf2.st_ino) return false; if (buf1.st_mode != buf2.st_mode) return false; if (buf1.st_nlink != buf2.st_nlink) return false; if (buf1.st_uid != buf2.st_uid) return false; if (buf1.st_gid != buf2.st_gid) return false; if (buf1.st_rdev != buf2.st_rdev) return false; if (buf1.st_size != buf2.st_size) return false; if (buf1.st_atime != buf2.st_atime) return false; if (buf1.st_mtime != buf2.st_mtime) return false; if (buf1.st_ctime != buf2.st_ctime) return false; return true; #endif } } #endif kanzi-cpp-2.5.2/src/io/NullOutputStream.hpp000066400000000000000000000025411516423635400206450ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once #ifndef knz_NullOutputStream #define knz_NullOutputStream namespace kanzi { template > class basic_nullbuf : public std::basic_streambuf { typename traits::int_type overflow(typename traits::int_type c) { return traits::not_eof(c); } void close() {} }; template > class basic_onullstream : public std::basic_ostream { public: basic_onullstream() : std::basic_ios(&_sbuf), std::basic_ostream(&_sbuf) { this->init(&_sbuf); } private: basic_nullbuf _sbuf; }; typedef basic_onullstream NullOutputStream; } #endif kanzi-cpp-2.5.2/src/msvc/000077500000000000000000000000001516423635400151645ustar00rootroot00000000000000kanzi-cpp-2.5.2/src/msvc/Kanzi_VS2008.sln000066400000000000000000000015721516423635400177050ustar00rootroot00000000000000 Microsoft Visual Studio Solution File, Format Version 10.00 # Visual C++ Express 2008 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "Kanzi_VS2008", "Kanzi_VS2008.vcproj", "{A7FB6DB8-D93B-4287-8ED8-0E39A52509B3}" EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Win32 = Debug|Win32 Release|Win32 = Release|Win32 EndGlobalSection GlobalSection(ProjectConfigurationPlatforms) = postSolution {A7FB6DB8-D93B-4287-8ED8-0E39A52509B3}.Debug|Win32.ActiveCfg = Debug|Win32 {A7FB6DB8-D93B-4287-8ED8-0E39A52509B3}.Debug|Win32.Build.0 = Debug|Win32 {A7FB6DB8-D93B-4287-8ED8-0E39A52509B3}.Release|Win32.ActiveCfg = Debug|Win32 {A7FB6DB8-D93B-4287-8ED8-0E39A52509B3}.Release|Win32.Build.0 = Debug|Win32 EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE EndGlobalSection EndGlobal kanzi-cpp-2.5.2/src/msvc/Kanzi_VS2008.vcproj000066400000000000000000000330451516423635400204140ustar00rootroot00000000000000 kanzi-cpp-2.5.2/src/msvc/Kanzi_VS2022.sln000066400000000000000000000050361516423635400177000ustar00rootroot00000000000000Microsoft Visual Studio Solution File, Format Version 12.00 # Visual Studio Version 17 VisualStudioVersion = 17.5.33516.290 MinimumVisualStudioVersion = 
10.0.40219.1 Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution Items", "{B04B8A8D-9F64-4F00-A01D-BE3392588862}" EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "Kanzi", "Kanzi_VS2022.vcxproj", "{A7FB6DB8-D93B-4287-8ED8-0E39A52509B3}" EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "Libkanzi", "Libkanzi_VS2022.vcxproj", "{3E5CE550-982F-42A4-B631-0C65582F1EAF}" EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Win32 = Debug|Win32 Debug|x64 = Debug|x64 Release|Win32 = Release|Win32 Release|x64 = Release|x64 EndGlobalSection GlobalSection(ProjectConfigurationPlatforms) = postSolution {A7FB6DB8-D93B-4287-8ED8-0E39A52509B3}.Debug|Win32.ActiveCfg = Debug|Win32 {A7FB6DB8-D93B-4287-8ED8-0E39A52509B3}.Debug|Win32.Build.0 = Debug|Win32 {A7FB6DB8-D93B-4287-8ED8-0E39A52509B3}.Debug|x64.ActiveCfg = Release|x64 {A7FB6DB8-D93B-4287-8ED8-0E39A52509B3}.Debug|x64.Build.0 = Release|x64 {A7FB6DB8-D93B-4287-8ED8-0E39A52509B3}.Release|Win32.ActiveCfg = Release|Win32 {A7FB6DB8-D93B-4287-8ED8-0E39A52509B3}.Release|Win32.Build.0 = Release|Win32 {A7FB6DB8-D93B-4287-8ED8-0E39A52509B3}.Release|x64.ActiveCfg = Release|x64 {A7FB6DB8-D93B-4287-8ED8-0E39A52509B3}.Release|x64.Build.0 = Release|x64 {3E5CE550-982F-42A4-B631-0C65582F1EAF}.Debug|Win32.ActiveCfg = Debug|Win32 {3E5CE550-982F-42A4-B631-0C65582F1EAF}.Debug|Win32.Build.0 = Debug|Win32 {3E5CE550-982F-42A4-B631-0C65582F1EAF}.Debug|x64.ActiveCfg = Release|x64 {3E5CE550-982F-42A4-B631-0C65582F1EAF}.Debug|x64.Build.0 = Release|x64 {3E5CE550-982F-42A4-B631-0C65582F1EAF}.Release|Win32.ActiveCfg = Release|Win32 {3E5CE550-982F-42A4-B631-0C65582F1EAF}.Release|Win32.Build.0 = Release|Win32 {3E5CE550-982F-42A4-B631-0C65582F1EAF}.Release|x64.ActiveCfg = Release|x64 {3E5CE550-982F-42A4-B631-0C65582F1EAF}.Release|x64.Build.0 = Release|x64 EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE EndGlobalSection 
GlobalSection(NestedProjects) = preSolution {A7FB6DB8-D93B-4287-8ED8-0E39A52509B3} = {B04B8A8D-9F64-4F00-A01D-BE3392588862} {3E5CE550-982F-42A4-B631-0C65582F1EAF} = {B04B8A8D-9F64-4F00-A01D-BE3392588862} EndGlobalSection GlobalSection(ExtensibilityGlobals) = postSolution SolutionGuid = {3D56059B-87AA-45B6-B008-6705531314F5} EndGlobalSection EndGlobal kanzi-cpp-2.5.2/src/msvc/Kanzi_VS2022.vcxproj000066400000000000000000000407061516423635400206020ustar00rootroot00000000000000 Debug Win32 Debug x64 Release Win32 Release x64 {A7FB6DB8-D93B-4287-8ED8-0E39A52509B3} Kanzi 10.0 Kanzi Application v143 MultiByte true Application v143 MultiByte true Application v143 MultiByte Application v143 MultiByte <_ProjectFileVersion>14.0.25431.1 $(SolutionDir)$(Configuration)\ $(Configuration)\ Kanzi64 NativeMinimumRules.ruleset $(SolutionDir)$(Configuration)\ $(Configuration)\ Kanzi64 NativeRecommendedRules.ruleset true Disabled Default false Neither false false _CRT_SECURE_NO_WARNINGS;_SCL_SECURE_NO_WARNINGS;%(PreprocessorDefinitions) Default MultiThreadedDebugDLL Level3 EditAndContinue true MachineX64 Disabled AnySuitable false Neither false false _CRT_SECURE_NO_WARNINGS;_SCL_SECURE_NO_WARNINGS;%(PreprocessorDefinitions) Default MultiThreadedDebugDLL Level3 ProgramDatabase stdcpp17 /Zc:__cplusplus AdvancedVectorExtensions true true MaxSpeed AnySuitable true _CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions) MultiThreadedDLL true Level3 None Speed true Fast StreamingSIMDExtensions CompileAsCpp true true true MachineX86 MaxSpeed AnySuitable true _CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions) MultiThreaded true Level3 ProgramDatabase Speed true Fast true CompileAsCpp AdvancedVectorExtensions stdcpp17 false /Zc:__cplusplus %(AdditionalOptions) Default true true true true false kanzi-cpp-2.5.2/src/msvc/README.md000066400000000000000000000001111516423635400164340ustar00rootroot00000000000000Copy the files corresponding to your version of Visual Studio to ..\src. 
kanzi-cpp-2.5.2/src/msvc/libkanzi_VS2022.vcxproj000066400000000000000000000332711516423635400213300ustar00rootroot00000000000000 Debug Win32 Release Win32 Debug x64 Release x64 15.0 {3E5CE550-982F-42A4-B631-0C65582F1EAF} libkanzi 10.0.17763.0 Application true v141 MultiByte Application false v141 true MultiByte StaticLibrary true v141 MultiByte StaticLibrary false v143 true MultiByte .lib .lib Level3 Disabled true true _CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions) MultiThreadedDebug Console Level3 Disabled true true Console Level3 MaxSpeed true true true true Console true true Level3 MaxSpeed true true true true AnySuitable Speed true MultiThreaded _CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions) stdcpp17 /Zc:__cplusplus %(AdditionalOptions) Console true true kanzi-cpp-2.5.2/src/msvc_dirent.hpp000066400000000000000000000666241516423635400172600ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ /* * Dirent interface for Microsoft Visual Studio * * Copyright (C) 2006-2012 Toni Ronkko * This file is part of dirent. Dirent may be freely distributed * under the MIT license. For all details and documentation, see * https://github.com/tronkko/dirent */ #ifndef DIRENT_H #define DIRENT_H #if defined(WIN32) || defined(_WIN32) || defined(_WIN64) /* * Include windows.h without Windows Sockets 1.1 to prevent conflicts with * Windows Sockets 2.0. 
*/ #ifndef WIN32_LEAN_AND_MEAN # define WIN32_LEAN_AND_MEAN #endif #include #include #include #include #include #include /* Indicates that d_type field is available in dirent structure */ #define _DIRENT_HAVE_D_TYPE /* Indicates that d_namlen field is available in dirent structure */ #define _DIRENT_HAVE_D_NAMLEN /* Entries missing from MSVC 6.0 */ #if !defined(FILE_ATTRIBUTE_DEVICE) # define FILE_ATTRIBUTE_DEVICE 0x40 #endif /* File type and permission flags for stat(), general mask */ #if !defined(S_IFMT) # define S_IFMT _S_IFMT #endif /* Directory bit */ #if !defined(S_IFDIR) # define S_IFDIR _S_IFDIR #endif /* Character device bit */ #if !defined(S_IFCHR) # define S_IFCHR _S_IFCHR #endif /* Pipe bit */ #if !defined(S_IFFIFO) # define S_IFFIFO _S_IFFIFO #endif /* Regular file bit */ #if !defined(S_IFREG) # define S_IFREG _S_IFREG #endif /* Read permission */ #if !defined(S_IREAD) # define S_IREAD _S_IREAD #endif /* Write permission */ #if !defined(S_IWRITE) # define S_IWRITE _S_IWRITE #endif /* Execute permission */ #if !defined(S_IEXEC) # define S_IEXEC _S_IEXEC #endif /* Pipe */ #if !defined(S_IFIFO) # define S_IFIFO _S_IFIFO #endif /* Block device */ #if !defined(S_IFBLK) # define S_IFBLK 0 #endif /* Link */ #if !defined(S_IFLNK) # define S_IFLNK 0 #endif /* Socket */ #if !defined(S_IFSOCK) # define S_IFSOCK 0 #endif /* Read user permission */ #if !defined(S_IRUSR) # define S_IRUSR S_IREAD #endif /* Write user permission */ #if !defined(S_IWUSR) # define S_IWUSR S_IWRITE #endif /* Execute user permission */ #if !defined(S_IXUSR) # define S_IXUSR S_IEXEC #endif /* Read group permission */ #if !defined(S_IRGRP) # define S_IRGRP 0x20 #endif /* Write group permission */ #if !defined(S_IWGRP) # define S_IWGRP 0x10 #endif /* Execute group permission */ #if !defined(S_IXGRP) # define S_IXGRP 0x08 #endif /* Read others permission */ #if !defined(S_IROTH) # define S_IROTH 0x04 #endif /* Write others permission */ #if !defined(S_IWOTH) # define S_IWOTH 0x02 #endif /* 
Execute others permission */ #if !defined(S_IXOTH) # define S_IXOTH 0x01 #endif /* Maximum length of file name */ #if !defined(PATH_MAX) # define PATH_MAX MAX_PATH #endif #if !defined(FILENAME_MAX) # define FILENAME_MAX MAX_PATH #endif #if !defined(NAME_MAX) # define NAME_MAX FILENAME_MAX #endif /* File type flags for d_type */ #define DT_UNKNOWN 0 #define DT_REG S_IFREG #define DT_DIR S_IFDIR #define DT_FIFO S_IFIFO #define DT_SOCK S_IFSOCK #define DT_CHR S_IFCHR #define DT_BLK S_IFBLK #define DT_LNK S_IFLNK /* Macros for converting between st_mode and d_type */ #define IFTODT(mode) ((mode) & S_IFMT) #define DTTOIF(type) (type) /* * File type macros. Note that block devices, sockets and links cannot be * distinguished on Windows and the macros S_ISBLK, S_ISSOCK and S_ISLNK are * only defined for compatibility. These macros should always return false * on Windows. */ #if !defined(S_ISFIFO) # define S_ISFIFO(mode) (((mode) & S_IFMT) == S_IFIFO) #endif #if !defined(S_ISDIR) # define S_ISDIR(mode) (((mode) & S_IFMT) == S_IFDIR) #endif #if !defined(S_ISREG) # define S_ISREG(mode) (((mode) & S_IFMT) == S_IFREG) #endif #if !defined(S_ISLNK) # define S_ISLNK(mode) (((mode) & S_IFMT) == S_IFLNK) #endif #if !defined(S_ISSOCK) # define S_ISSOCK(mode) (((mode) & S_IFMT) == S_IFSOCK) #endif #if !defined(S_ISCHR) # define S_ISCHR(mode) (((mode) & S_IFMT) == S_IFCHR) #endif #if !defined(S_ISBLK) # define S_ISBLK(mode) (((mode) & S_IFMT) == S_IFBLK) #endif /* Return the exact length of the file name without zero terminator */ #define _D_EXACT_NAMLEN(p) ((p)->d_namlen) /* Return the maximum size of a file name */ #define _D_ALLOC_NAMLEN(p) ((PATH_MAX)+1) #ifdef __cplusplus extern "C" { #endif /* Wide-character version */ struct _wdirent { /* Always zero */ long d_ino; /* File position within stream */ long d_off; /* Structure size */ unsigned short d_reclen; /* Length of name without \0 */ size_t d_namlen; /* File type */ int d_type; /* File name */ wchar_t d_name[PATH_MAX+1]; }; 
typedef struct _wdirent _wdirent; struct _WDIR { /* Current directory entry */ struct _wdirent ent; /* Private file data */ WIN32_FIND_DATAW data; /* True if data is valid */ int cached; /* Win32 search handle */ HANDLE handle; /* Initial directory name */ wchar_t *patt; }; typedef struct _WDIR _WDIR; /* Multi-byte character version */ struct dirent { /* Always zero */ long d_ino; /* File position within stream */ long d_off; /* Structure size */ unsigned short d_reclen; /* Length of name without \0 */ size_t d_namlen; /* File type */ int d_type; /* File name */ char d_name[PATH_MAX+1]; }; typedef struct dirent dirent; struct DIR { struct dirent ent; struct _WDIR *wdirp; }; typedef struct DIR DIR; /* Dirent functions */ static DIR *opendir (const char *dirname); static _WDIR *_wopendir (const wchar_t *dirname); static struct dirent *readdir (DIR *dirp); static struct _wdirent *_wreaddir (_WDIR *dirp); static int readdir_r( DIR *dirp, struct dirent *entry, struct dirent **result); static int _wreaddir_r( _WDIR *dirp, struct _wdirent *entry, struct _wdirent **result); static int closedir (DIR *dirp); static int _wclosedir (_WDIR *dirp); static void rewinddir (DIR* dirp); static void _wrewinddir (_WDIR* dirp); static int scandir (const char *dirname, struct dirent ***namelist, int (*filter)(const struct dirent*), int (*compare)(const void *, const void *)); static int alphasort (const struct dirent **a, const struct dirent **b); static int versionsort (const struct dirent **a, const struct dirent **b); /* For compatibility with Symbian */ #define wdirent _wdirent #define WDIR _WDIR #define wopendir _wopendir #define wreaddir _wreaddir #define wclosedir _wclosedir #define wrewinddir _wrewinddir /* Internal utility functions */ static WIN32_FIND_DATAW *dirent_first (_WDIR *dirp); static WIN32_FIND_DATAW *dirent_next (_WDIR *dirp); static int dirent_mbstowcs_s( size_t *pReturnValue, wchar_t *wcstr, size_t sizeInWords, const char *mbstr, size_t count); static int 
dirent_wcstombs_s( size_t *pReturnValue, char *mbstr, size_t sizeInBytes, const wchar_t *wcstr, size_t count); static void dirent_set_errno (int error); /* * Open directory stream DIRNAME for read and return a pointer to the * internal working area that is used to retrieve individual directory * entries. */ static _WDIR* _wopendir( const wchar_t *dirname) { _WDIR *dirp = nullptr; int error; /* Must have directory name */ if (dirname == nullptr || dirname[0] == '\0') { dirent_set_errno (ENOENT); return nullptr; } /* Allocate new _WDIR structure */ dirp = (_WDIR*) malloc (sizeof (struct _WDIR)); if (dirp != nullptr) { DWORD n; /* Reset _WDIR structure */ dirp->handle = INVALID_HANDLE_VALUE; dirp->patt = nullptr; dirp->cached = 0; /* Compute the length of full path plus zero terminator * * Note that on WinRT there's no way to convert relative paths * into absolute paths, so just assume it is an absolute path. */ # if defined(WINAPI_FAMILY) && (WINAPI_FAMILY == WINAPI_FAMILY_PHONE_APP) n = wcslen(dirname); # else n = GetFullPathNameW (dirname, 0, nullptr, nullptr); # endif /* Allocate room for absolute directory name and search pattern */ dirp->patt = (wchar_t*) malloc (sizeof (wchar_t) * n + 16); if (dirp->patt) { /* * Convert relative directory name to an absolute one. This * allows rewinddir() to function correctly even when current * working directory is changed between opendir() and rewinddir(). * * Note that on WinRT there's no way to convert relative paths * into absolute paths, so just assume it is an absolute path. */ # if defined(WINAPI_FAMILY) && (WINAPI_FAMILY == WINAPI_FAMILY_PHONE_APP) wcsncpy_s(dirp->patt, n+1, dirname, n); # else n = GetFullPathNameW (dirname, n, dirp->patt, nullptr); # endif if (n > 0) { wchar_t *p; /* Append search pattern \* to the directory name */ p = dirp->patt + n; if (dirp->patt < p) { switch (p[-1]) { case '\\': case '/': case ':': /* Directory ends in path separator, e.g. 
c:\temp\ */ /*NOP*/; break; default: /* Directory name doesn't end in path separator */ *p++ = '\\'; } } *p++ = '*'; *p = '\0'; /* Open directory stream and retrieve the first entry */ if (dirent_first (dirp)) { /* Directory stream opened successfully */ error = 0; } else { /* Cannot retrieve first entry */ error = 1; dirent_set_errno (ENOENT); } } else { /* Cannot retrieve full path name */ dirent_set_errno (ENOENT); error = 1; } } else { /* Cannot allocate memory for search pattern */ error = 1; } } else { /* Cannot allocate _WDIR structure */ error = 1; } /* Clean up in case of error */ if (error && dirp) { _wclosedir (dirp); dirp = nullptr; } return dirp; } /* * Read next directory entry. * * Returns pointer to static directory entry which may be overwritten by * subsequent calls to _wreaddir(). */ static struct _wdirent* _wreaddir( _WDIR *dirp) { struct _wdirent *entry; /* * Read directory entry to buffer. We can safely ignore the return value * as entry will be set to nullptr in case of error. */ (void) _wreaddir_r (dirp, &dirp->ent, &entry); /* Return pointer to statically allocated directory entry */ return entry; } /* * Read next directory entry. * * Returns zero on success. If end of directory stream is reached, then sets * result to nullptr and returns zero. */ static int _wreaddir_r( _WDIR *dirp, struct _wdirent *entry, struct _wdirent **result) { WIN32_FIND_DATAW *datap; /* Read next directory entry */ datap = dirent_next (dirp); if (datap) { size_t n; DWORD attr; /* * Copy file name as wide-character string. If the file name is too * long to fit in to the destination buffer, then truncate file name * to PATH_MAX characters and zero-terminate the buffer. 
*/ n = 0; while (n < PATH_MAX && datap->cFileName[n] != 0) { entry->d_name[n] = datap->cFileName[n]; n++; } entry->d_name[n] = 0; /* Length of file name excluding zero terminator */ entry->d_namlen = n; /* File type */ attr = datap->dwFileAttributes; if ((attr & FILE_ATTRIBUTE_DEVICE) != 0) { entry->d_type = DT_CHR; } else if ((attr & FILE_ATTRIBUTE_DIRECTORY) != 0) { entry->d_type = DT_DIR; } else { entry->d_type = DT_REG; } /* Reset dummy fields */ entry->d_ino = 0; entry->d_off = 0; entry->d_reclen = sizeof (struct _wdirent); /* Set result address */ *result = entry; } else { /* Return nullptr to indicate end of directory */ *result = nullptr; } return /*OK*/0; } /* * Close directory stream opened by opendir() function. This invalidates the * DIR structure as well as any directory entry read previously by * _wreaddir(). */ static int _wclosedir( _WDIR *dirp) { int ok; if (dirp) { /* Release search handle */ if (dirp->handle != INVALID_HANDLE_VALUE) { FindClose (dirp->handle); dirp->handle = INVALID_HANDLE_VALUE; } /* Release search pattern */ if (dirp->patt) { free (dirp->patt); dirp->patt = nullptr; } /* Release directory structure */ free (dirp); ok = /*success*/0; } else { /* Invalid directory stream */ dirent_set_errno (EBADF); ok = /*failure*/-1; } return ok; } /* * Rewind directory stream such that _wreaddir() returns the very first * file name again. 
*/ static void _wrewinddir( _WDIR* dirp) { if (dirp) { /* Release existing search handle */ if (dirp->handle != INVALID_HANDLE_VALUE) { FindClose (dirp->handle); } /* Open new search handle */ dirent_first (dirp); } } /* Get first directory entry (internal) */ static WIN32_FIND_DATAW* dirent_first( _WDIR *dirp) { WIN32_FIND_DATAW *datap; /* Open directory and retrieve the first entry */ dirp->handle = FindFirstFileExW( dirp->patt, FindExInfoStandard, &dirp->data, FindExSearchNameMatch, nullptr, 0); if (dirp->handle != INVALID_HANDLE_VALUE) { /* a directory entry is now waiting in memory */ datap = &dirp->data; dirp->cached = 1; } else { /* Failed to re-open directory: no directory entry in memory */ dirp->cached = 0; datap = nullptr; } return datap; } /* * Get next directory entry (internal). * * Returns */ static WIN32_FIND_DATAW* dirent_next( _WDIR *dirp) { WIN32_FIND_DATAW *p; /* Get next directory entry */ if (dirp->cached != 0) { /* A valid directory entry already in memory */ p = &dirp->data; dirp->cached = 0; } else if (dirp->handle != INVALID_HANDLE_VALUE) { /* Get the next directory entry from stream */ if (FindNextFileW (dirp->handle, &dirp->data) != FALSE) { /* Got a file */ p = &dirp->data; } else { /* The very last entry has been processed or an error occurred */ FindClose (dirp->handle); dirp->handle = INVALID_HANDLE_VALUE; p = nullptr; } } else { /* End of directory stream reached */ p = nullptr; } return p; } /* * Open directory stream using plain old C-string. 
*/ static DIR* opendir( const char *dirname) { struct DIR *dirp; int error; /* Must have directory name */ if (dirname == nullptr || dirname[0] == '\0') { dirent_set_errno (ENOENT); return nullptr; } /* Allocate memory for DIR structure */ dirp = (DIR*) malloc (sizeof (struct DIR)); if (dirp) { wchar_t wname[PATH_MAX + 1]; size_t n; /* Convert directory name to wide-character string */ error = dirent_mbstowcs_s( &n, wname, PATH_MAX + 1, dirname, PATH_MAX + 1); if (!error) { /* Open directory stream using wide-character name */ dirp->wdirp = _wopendir (wname); if (dirp->wdirp) { /* Directory stream opened */ error = 0; } else { /* Failed to open directory stream */ error = 1; } } else { /* * Cannot convert file name to wide-character string. This * occurs if the string contains invalid multi-byte sequences or * the output buffer is too small to contain the resulting * string. */ error = 1; } } else { /* Cannot allocate DIR structure */ error = 1; } /* Clean up in case of error */ if (error && dirp) { free (dirp); dirp = nullptr; } return dirp; } /* * Read next directory entry. */ static struct dirent* readdir( DIR *dirp) { struct dirent *entry; /* * Read directory entry to buffer. We can safely ignore the return value * as entry will be set to nullptr in case of error. */ (void) readdir_r (dirp, &dirp->ent, &entry); /* Return pointer to statically allocated directory entry */ return entry; } /* * Read next directory entry into called-allocated buffer. * * Returns zero on success. If the end of directory stream is reached, then * sets result to nullptr and returns zero. 
*/ static int readdir_r( DIR *dirp, struct dirent *entry, struct dirent **result) { WIN32_FIND_DATAW *datap; /* Read next directory entry */ datap = dirent_next (dirp->wdirp); if (datap) { size_t n; int error; /* Attempt to convert file name to multi-byte string */ error = dirent_wcstombs_s( &n, entry->d_name, PATH_MAX + 1, datap->cFileName, PATH_MAX + 1); /* * If the file name cannot be represented by a multi-byte string, * then attempt to use old 8+3 file name. This allows traditional * Unix-code to access some file names despite of unicode * characters, although file names may seem unfamiliar to the user. * * Be ware that the code below cannot come up with a short file * name unless the file system provides one. At least * VirtualBox shared folders fail to do this. */ if (error && datap->cAlternateFileName[0] != '\0') { error = dirent_wcstombs_s( &n, entry->d_name, PATH_MAX + 1, datap->cAlternateFileName, PATH_MAX + 1); } if (!error) { DWORD attr; /* Length of file name excluding zero terminator */ entry->d_namlen = n - 1; /* File attributes */ attr = datap->dwFileAttributes; if ((attr & FILE_ATTRIBUTE_DEVICE) != 0) { entry->d_type = DT_CHR; } else if ((attr & FILE_ATTRIBUTE_DIRECTORY) != 0) { entry->d_type = DT_DIR; } else { entry->d_type = DT_REG; } /* Reset dummy fields */ entry->d_ino = 0; entry->d_off = 0; entry->d_reclen = sizeof (struct dirent); } else { /* * Cannot convert file name to multi-byte string so construct * an erroneous directory entry and return that. Note that * we cannot return nullptr as that would stop the processing * of directory entries completely. */ entry->d_name[0] = '?'; entry->d_name[1] = '\0'; entry->d_namlen = 1; entry->d_type = DT_UNKNOWN; entry->d_ino = 0; entry->d_off = -1; entry->d_reclen = 0; } /* Return pointer to directory entry */ *result = entry; } else { /* No more directory entries */ *result = nullptr; } return /*OK*/0; } /* * Close directory stream. 
*/ static int closedir( DIR *dirp) { int ok; if (dirp) { /* Close wide-character directory stream */ ok = _wclosedir (dirp->wdirp); dirp->wdirp = nullptr; /* Release multi-byte character version */ free (dirp); } else { /* Invalid directory stream */ dirent_set_errno (EBADF); ok = /*failure*/-1; } return ok; } /* * Rewind directory stream to beginning. */ static void rewinddir( DIR* dirp) { /* Rewind wide-character string directory stream */ _wrewinddir (dirp->wdirp); } /* * Scan directory for entries. */ static int scandir( const char *dirname, struct dirent ***namelist, int (*filter)(const struct dirent*), int (*compare)(const void*, const void*)) { struct dirent **files = nullptr; size_t size = 0; const size_t init_size = 1; DIR *dir = nullptr; struct dirent *entry; struct dirent *tmp = nullptr; int result = 0; /* Open directory stream */ dir = opendir (dirname); if (dir) { size_t allocated = 0; /* Read directory entries to memory */ while (true) { /* Enlarge pointer table to make room for another pointer */ if (size >= allocated) { void *p; size_t num_entries; /* Compute number of entries in the enlarged pointer table */ if (size < init_size) { /* Allocate initial pointer table */ num_entries = init_size; } else { /* Double the size */ num_entries = size * 2; } /* Allocate first pointer table or enlarge existing table */ p = realloc (files, sizeof (void*) * num_entries); if (p != nullptr) { /* Got the memory */ files = (dirent**) p; allocated = num_entries; } else { /* Out of memory */ result = -1; break; } } /* Allocate room for temporary directory entry */ if (tmp == nullptr) { tmp = (struct dirent*) malloc (sizeof (struct dirent)); if (tmp == nullptr) { /* Cannot allocate temporary directory entry */ result = -1; break; } } /* Read directory entry to temporary area */ if (readdir_r (dir, tmp, &entry) == /*OK*/0) { /* Did we get an entry? 
*/ if (entry != nullptr) { int pass; /* Determine whether to include the entry in result */ if (filter) { /* Let the filter function decide */ pass = filter (tmp); } else { /* No filter function, include everything */ pass = 1; } if (pass) { /* Store the temporary entry to pointer table */ files[size++] = tmp; tmp = nullptr; /* Keep up with the number of files */ result++; } } else { /* * End of directory stream reached => sort entries and * exit. */ qsort (files, size, sizeof (void*), compare); break; } } else { /* Error reading directory entry */ result = /*Error*/ -1; break; } } } else { /* Cannot open directory */ result = /*Error*/ -1; } /* Release temporary directory entry */ if (tmp) { free (tmp); } /* Release allocated memory on error */ if (files && (result < 0)) { for (size_t i = 0; i < size; i++) { free (files[i]); } free (files); files = nullptr; } /* Close directory stream */ if (dir) { closedir (dir); } /* Pass pointer table to caller */ if (namelist) { *namelist = files; } return result; } /* Alphabetical sorting */ static int alphasort( const struct dirent **a, const struct dirent **b) { return strcoll ((*a)->d_name, (*b)->d_name); } /* Sort versions */ static int versionsort( const struct dirent **a, const struct dirent **b) { /* FIXME: implement strverscmp and use that */ return alphasort (a, b); } /* Convert multi-byte string to wide character string */ static int dirent_mbstowcs_s( size_t *pReturnValue, wchar_t *wcstr, size_t sizeInWords, const char *mbstr, size_t count) { int error; #if defined(_MSC_VER) && _MSC_VER >= 1400 /* Microsoft Visual Studio 2005 or later */ error = mbstowcs_s (pReturnValue, wcstr, sizeInWords, mbstr, count); #else /* Older Visual Studio or non-Microsoft compiler */ size_t n; /* Convert to wide-character string (or count characters) */ n = mbstowcs (wcstr, mbstr, sizeInWords); if (!wcstr || n < count) { /* Zero-terminate output buffer */ if (wcstr && sizeInWords) { if (n >= sizeInWords) { n = sizeInWords - 1; } 
wcstr[n] = 0; } /* Length of resulting multi-byte string WITH zero terminator */ if (pReturnValue) { *pReturnValue = n + 1; } /* Success */ error = 0; } else { /* Could not convert string */ error = 1; } #endif return error; } /* Convert wide-character string to multi-byte string */ static int dirent_wcstombs_s( size_t *pReturnValue, char *mbstr, size_t sizeInBytes, /* max size of mbstr */ const wchar_t *wcstr, size_t count) { int error; #if defined(_MSC_VER) && _MSC_VER >= 1400 /* Microsoft Visual Studio 2005 or later */ error = wcstombs_s (pReturnValue, mbstr, sizeInBytes, wcstr, count); #else /* Older Visual Studio or non-Microsoft compiler */ size_t n; /* Convert to multi-byte string (or count the number of bytes needed) */ n = wcstombs (mbstr, wcstr, sizeInBytes); if (!mbstr || n < count) { /* Zero-terminate output buffer */ if (mbstr && sizeInBytes) { if (n >= sizeInBytes) { n = sizeInBytes - 1; } mbstr[n] = '\0'; } /* Length of resulting multi-bytes string WITH zero-terminator */ if (pReturnValue) { *pReturnValue = n + 1; } /* Success */ error = 0; } else { /* Cannot convert string */ error = 1; } #endif return error; } /* Set errno variable */ static void dirent_set_errno( int error) { #if defined(_MSC_VER) && _MSC_VER >= 1400 /* Microsoft Visual Studio 2005 and later */ _set_errno (error); #else /* Non-Microsoft compiler or older Microsoft compiler */ errno = error; #endif } #ifdef __cplusplus } #endif #endif /* WINDOWS */ #endif /*DIRENT_H*/ kanzi-cpp-2.5.2/src/test/000077500000000000000000000000001516423635400151735ustar00rootroot00000000000000kanzi-cpp-2.5.2/src/test/TestAPI.c000066400000000000000000000433041516423635400166140ustar00rootroot00000000000000#include #include #include #include #include "../api/Compressor.hpp" #include "../api/Decompressor.hpp" // Test the API (pure C code) #define ASSERT(cond, msg) \ do { \ if (!(cond)) { \ fprintf(stderr, "ASSERT FAILED: %s (%s:%d)\n", \ msg, __FILE__, __LINE__); \ exit(EXIT_FAILURE); \ } \ } while (0) 
#ifndef PORTABLE_FMEMOPEN_H
#define PORTABLE_FMEMOPEN_H

// Portable stand-in for fmemopen() used by the tests below.
//
// fmemopen() is not available on every target, so the stream is backed by
// tmpfile() on all platforms (the previous per-platform branches were
// identical and have been merged).
//
// buf  : optional data to preload; ignored when NULL
// size : number of bytes of buf to preload; ignored when 0
// mode : fopen-style mode. Data is preloaded only when the mode contains no
//        'w'; the stream is then rewound so the caller reads from the start.
//
// Returns the stream, or NULL when mode is NULL, when the temporary file
// cannot be created, or when the preload could not be written completely.
// The caller owns the stream and must fclose() it.
static inline FILE* portable_fmemopen(void* buf, size_t size, const char* mode)
{
    FILE* f;

    if (mode == NULL)
        return NULL;

    f = tmpfile();

    if (f == NULL)
        return NULL;

    if (buf != NULL && size > 0 && strchr(mode, 'w') == NULL) {
        // Reading: preload the buffer. A short write would hand the caller
        // a silently truncated stream, so treat it as a failure.
        if (fwrite(buf, 1, size, f) != size) {
            fclose(f);
            return NULL;
        }

        rewind(f);
    }

    return f;
}

#endif
ASSERT(initCompressor(&p, f, &ctx) == 0, "init failed"); ASSERT(ctx != NULL, "ctx should not be NULL"); size_t out = (size_t) -1; ASSERT(disposeCompressor(&ctx, &out) == 0, "dispose failed"); fclose(f); } // Compress small block static void test_compress_small(void) { printf("TEST: small compression...\n"); struct cData p = make_params(); struct cContext* ctx = NULL; FILE* f = portable_fmemopen(NULL, 4096, "wb"); ASSERT(f != NULL, "fmemopen failed"); ASSERT(initCompressor(&p, f, &ctx) == 0, "init failed"); uint8_t data[256]; fill_buffer(data, 256); size_t inSize = 256; size_t outSize = 0; ASSERT(compress(ctx, data, inSize, &outSize) == 0, "compress failed"); size_t flushed = 0; ASSERT(disposeCompressor(&ctx, &flushed) == 0, "dispose failed"); fclose(f); } // Oversized block static void test_compress_too_big(void) { printf("TEST: oversized block handling...\n"); struct cData p = make_params(); p.blockSize = 1024; struct cContext* ctx = NULL; FILE* f = portable_fmemopen(NULL, 4096, "wb"); ASSERT(f != NULL, "fmemopen failed"); ASSERT(initCompressor(&p, f, &ctx) == 0, "init failed"); uint8_t big[4096]; fill_buffer(big, 4096); size_t inSize = sizeof(big); size_t outSize = 0; ASSERT(compress(ctx, big, inSize, &outSize) != 0, "compress should fail on oversized input"); ASSERT(outSize == 0, "output size must be zero on error"); size_t flushed = 0; ASSERT(disposeCompressor(&ctx, &flushed) == 0, "dispose failed"); fclose(f); } // Two sequential blocks static void test_compress_two_blocks(void) { printf("TEST: two-block compression...\n"); struct cData p = make_params(); p.blockSize = 1024; struct cContext* ctx = NULL; FILE* f = portable_fmemopen(NULL, 10000, "wb"); ASSERT(f != NULL, "fmemopen failed"); ASSERT(initCompressor(&p, f, &ctx) == 0, "init failed"); uint8_t a[300], b[500]; fill_buffer(a, 300); fill_buffer(b, 500); size_t inSize, outSize; inSize = 300; outSize = 0; ASSERT(compress(ctx, a, inSize, &outSize) == 0, "block 1 failed"); ASSERT(outSize <= 0, "block 1 
written bytes invalid"); inSize = 500; outSize = 0; ASSERT(compress(ctx, b, inSize, &outSize) == 0, "block 2 failed"); size_t flushed = 0; ASSERT(disposeCompressor(&ctx, &flushed) == 0, "dispose failed"); fclose(f); } static void test_compress_invalid_calls(void) { printf("TEST: compress invalid runtime params...\n"); struct cData p = make_params(); struct cContext* ctx = NULL; FILE* f = portable_fmemopen(NULL, 4096, "wb"); uint8_t data[32]; size_t outSize = 123; size_t flushed = 0; ASSERT(f != NULL, "fmemopen failed"); fill_buffer(data, sizeof(data)); ASSERT(initCompressor(&p, f, &ctx) == 0, "init failed"); ASSERT(compress(NULL, data, sizeof(data), &outSize) != 0, "compress should fail on NULL ctx"); ASSERT(compress(ctx, data, sizeof(data), NULL) != 0, "compress should fail on NULL outSize"); ASSERT(compress(ctx, NULL, 1, &outSize) != 0, "compress should fail on NULL src with non-zero size"); outSize = 7; ASSERT(compress(ctx, NULL, 0, &outSize) == 0, "compress should accept NULL src with zero size"); ASSERT(outSize == 0, "compress should not report output for zero-size input"); ASSERT(disposeCompressor(&ctx, &flushed) == 0, "dispose failed"); fclose(f); } static void test_dispose_compressor_invalid(void) { printf("TEST: disposeCompressor invalid params...\n"); struct cContext* ctx = NULL; size_t outSize = 0; ASSERT(disposeCompressor(NULL, &outSize) != 0, "dispose should fail on NULL ctx pointer"); ASSERT(disposeCompressor(&ctx, &outSize) != 0, "dispose should fail on NULL ctx"); ASSERT(disposeCompressor(&ctx, NULL) != 0, "dispose should fail on NULL outSize"); } // Utilities static void write_file(const char* path, const unsigned char* data, size_t len) { FILE* f = fopen(path, "wb"); ASSERT(f != NULL, "failed to write file"); fwrite(data, 1, len, f); fclose(f); } static void compress_to_file(const char* path, const unsigned char* data, size_t len, int headerless) { FILE* f = fopen(path, "wb"); struct cData params; struct cContext* ctx = NULL; size_t inSize = len; 
size_t outSize = 0; size_t flushed = 0; ASSERT(f != NULL, "failed to open output file"); memset(¶ms, 0, sizeof(params)); strcpy(params.transform, "LZ"); strcpy(params.entropy, "ANS0"); params.blockSize = 1 << 15; params.jobs = 1; params.checksum = 0; params.headerless = headerless; ASSERT(initCompressor(¶ms, f, &ctx) == 0, "failed to init compressor"); ASSERT(compress(ctx, data, inSize, &outSize) == 0, "failed to compress sample"); ASSERT(disposeCompressor(&ctx, &flushed) == 0, "failed to dispose compressor"); fclose(f); } static void test_init_decompressor_invalid(void) { printf("TEST: initDecompressor invalid params...\n"); struct dContext* ctx = NULL; struct dData p; FILE* f = portable_fmemopen(NULL, 4096, "rb"); int rc; ASSERT(f != NULL, "fmemopen failed"); memset(&p, 0, sizeof(p)); p.bufferSize = 1024; p.jobs = 1; p.headerless = 0; rc = initDecompressor(NULL, f, &ctx); ASSERT(rc != 0, "initDecompressor should fail on NULL params"); rc = initDecompressor(&p, NULL, &ctx); ASSERT(rc != 0, "initDecompressor should fail on NULL FILE"); rc = initDecompressor(&p, f, NULL); ASSERT(rc != 0, "initDecompressor should fail on NULL ctx"); p.bufferSize = ((size_t)2 * 1024 * 1024 * 1024) + 1; rc = initDecompressor(&p, f, &ctx); ASSERT(rc != 0, "initDecompressor should fail on huge buffer"); fclose(f); } static void test_decompress_invalid_calls(void) { printf("TEST: decompress invalid runtime params...\n"); const unsigned char input[] = "runtime validation sample"; const char* f_name = "tmp_api_invalid.bin"; FILE* fdec; struct dData params; struct dContext* ctx = NULL; unsigned char out[64]; size_t inSize = 0; size_t outSize = sizeof(out); compress_to_file(f_name, input, strlen((const char*)input), 0); fdec = fopen(f_name, "rb"); ASSERT(fdec != NULL, "failed to open file for reading"); memset(¶ms, 0, sizeof(params)); params.bufferSize = sizeof(out); params.jobs = 1; params.headerless = 0; ASSERT(initDecompressor(¶ms, fdec, &ctx) == 0, "failed to init decompressor"); 
ASSERT(decompress(NULL, out, &inSize, &outSize) != 0, "decompress should fail on NULL ctx"); outSize = params.bufferSize + 1; ASSERT(decompress(ctx, out, &inSize, &outSize) != 0, "decompress should fail when output buffer exceeds configured size"); outSize = 1; ASSERT(decompress(ctx, NULL, &inSize, &outSize) != 0, "decompress should fail on NULL dst"); outSize = 0; inSize = 123; ASSERT(decompress(ctx, out, &inSize, &outSize) == 0, "decompress should accept zero-sized output"); ASSERT(disposeDecompressor(&ctx) == 0, "failed to dispose decompressor"); fclose(fdec); remove(f_name); } static void test_dispose_decompressor_invalid(void) { printf("TEST: disposeDecompressor invalid params...\n"); struct dContext* ctx = NULL; ASSERT(disposeDecompressor(NULL) != 0, "disposeDecompressor should fail on NULL ctx pointer"); ASSERT(disposeDecompressor(&ctx) != 0, "disposeDecompressor should fail on NULL ctx"); } // Simple decompression of known data static void test_basic_decompression(void) { printf("TEST: basic decompression...\n"); const char* input = "Hello Kanzi! 
Hello Compression!"; const size_t in_len = strlen(input); // Step 1: Compress to temporary file const char* f_name = "tmp_comp.bin"; FILE* fcomp = fopen(f_name, "wb"); ASSERT(fcomp != NULL, "failed to open file"); struct cData cparams; memset(&cparams, 0, sizeof(cparams)); strcpy(cparams.transform, "LZ"); strcpy(cparams.entropy, "ANS0"); cparams.blockSize = 1 << 16; cparams.jobs = 1; cparams.checksum = 32; cparams.headerless = 0; struct cContext* cctx = NULL; ASSERT(initCompressor(&cparams, fcomp, &cctx) == 0, "failed to init compressor"); size_t inSize = in_len; size_t outSize = 0; ASSERT(compress(cctx, (const unsigned char*)input, inSize, &outSize) == 0, "failed to compress data"); size_t flushed = 0; ASSERT(disposeCompressor(&cctx, &flushed) == 0, "failed to dispose compressor"); fclose(fcomp); // Step 2: Decompress from temporary file FILE* fdec = fopen(f_name, "rb"); ASSERT(fdec != NULL, "failed to open file for reading"); struct dData dparams; memset(&dparams, 0, sizeof(dparams)); dparams.bufferSize = 1 << 16; dparams.jobs = 1; dparams.headerless = 0; struct dContext* dctx = NULL; ASSERT(initDecompressor(&dparams, fdec, &dctx) == 0, "failed to init decompressor"); unsigned char outbuf[1024]; size_t readComp = 0; size_t produced = sizeof(outbuf); ASSERT(decompress(dctx, outbuf, &readComp, &produced) == 0, "failed to decompress data"); // Step 3: Validate output ASSERT(produced == in_len, "failed to decompress data: invalid data size"); ASSERT(memcmp(outbuf, input, in_len) == 0, "failed to decompress data: data differ from original"); ASSERT(disposeDecompressor(&dctx) == 0, "failed to dispose decompressor"); fclose(fdec); remove(f_name); } // Decompress much larger data (multi-block) static void test_large_multi_block(void) { printf("TEST: large multi blocks\n"); size_t size = 2 * 1024 * 1024; // 2MB unsigned char* data = (unsigned char*)malloc(size); ASSERT(data != NULL, "failed to allocate buffer memory"); for (size_t i = 0; i < size; i++) data[i] = (unsigned 
char)(i * 7); const char* f_name = "tmp_large_input.bin"; write_file(f_name, data, size); // Compress const char* fcomp_name = "tmp_large_comp.bin"; FILE* fcomp = fopen(fcomp_name, "wb"); ASSERT(fcomp, "failed to open file for writing"); struct cData cparams; memset(&cparams, 0, sizeof(cparams)); strcpy(cparams.transform, "LZ"); strcpy(cparams.entropy, "FPAQ"); cparams.blockSize = 256 * 1024; cparams.jobs = 1; cparams.checksum = 64; cparams.headerless = 0; struct cContext* cctx = NULL; ASSERT(initCompressor(&cparams, fcomp, &cctx) == 0, "failed to init compressor"); size_t remaining = size; unsigned char* p = data; while (remaining > 0) { size_t chunk = ((remaining > cparams.blockSize) ? cparams.blockSize : remaining); size_t inSize = chunk; size_t outSize = 0; ASSERT(compress(cctx, p, inSize, &outSize) == 0, "failed to compress data"); p += chunk; remaining -= chunk; } size_t flushed = 0; ASSERT(disposeCompressor(&cctx, &flushed) == 0, "failed to dispose compressor"); fclose(fcomp); // Decompress FILE* fdec = fopen(fcomp_name, "rb"); ASSERT(fdec, "failed to open file for reading"); struct dData dparams; memset(&dparams, 0, sizeof(dparams)); dparams.bufferSize = 256 * 1024; dparams.jobs = 1; dparams.headerless = 0; struct dContext* dctx = NULL; ASSERT(initDecompressor(&dparams, fdec, &dctx) == 0, "failed to init decompressor"); unsigned char* out = (unsigned char*)malloc(size); ASSERT(out, "failed to allocate buffer memory"); size_t totalOut = 0; while (1) { size_t inBytes = 0; size_t outBytes = dparams.bufferSize; int r = decompress(dctx, out + totalOut, &inBytes, &outBytes); if (r != 0) break; // expected EOF if (outBytes == 0) break; totalOut += outBytes; } ASSERT(totalOut == size, "failed to decompress: invalid data size"); ASSERT(memcmp(out, data, size) == 0, "failed to decompress: data differ from original"); disposeDecompressor(&dctx); fclose(fdec); remove(f_name); remove(fcomp_name); free(out); free(data); } // Headerless mode static void 
test_headerless(void) { printf("TEST: headerless\n"); const char* input = "HEADERLESS MODE IS ACTIVE"; const char* f_name = "tmp_hl_input.bin"; write_file(f_name, (const unsigned char*)input, strlen(input)); // Compress with headerless = 1 const char* fcomp_name = "tmp_hl_comp.bin"; FILE* fcomp = fopen(fcomp_name, "wb"); ASSERT(fcomp, "failed to open file for writing"); struct cData cparams; memset(&cparams, 0, sizeof(cparams)); strcpy(cparams.transform, "LZ"); strcpy(cparams.entropy, "ANS0"); cparams.blockSize = 1 << 15; cparams.jobs = 1; cparams.checksum = 0; cparams.headerless = 1; struct cContext* cctx = NULL; ASSERT(initCompressor(&cparams, fcomp, &cctx) == 0, "failed to init compressor"); size_t inSize = strlen(input); size_t outSize = 0; ASSERT(compress(cctx, (const unsigned char*)input, inSize, &outSize) == 0, "failed to compress data"); size_t flushed = 0; ASSERT(disposeCompressor(&cctx, &flushed) == 0, "failed to dispose compressor"); fclose(fcomp); // Decompress with headerless = 1 FILE* fdec = fopen(fcomp_name, "rb"); ASSERT(fdec, "failed to open file for reading"); struct dData dparams; memset(&dparams, 0, sizeof(dparams)); dparams.bufferSize = 1 << 15; dparams.jobs = 1; dparams.headerless = 1; strcpy(dparams.transform, "LZ"); strcpy(dparams.entropy, "ANS0"); dparams.blockSize = 1 << 15; dparams.originalSize = strlen(input); dparams.checksum = 0; dparams.bsVersion = 1; struct dContext* dctx = NULL; ASSERT(initDecompressor(&dparams, fdec, &dctx) == 0, "failed to init decompressor"); unsigned char outbuf[256]; size_t inBytes = 0; size_t outBytes = sizeof(outbuf); ASSERT(decompress(dctx, outbuf, &inBytes, &outBytes) == 0, "failed to decompress data"); ASSERT(outBytes == strlen(input), "failed to decompress data: wrong data size"); ASSERT(memcmp(outbuf, input, strlen(input)) == 0, "failed to decompress data: data differ from original"); disposeDecompressor(&dctx); fclose(fdec); remove(f_name); remove(fcomp_name); } int main(void) { // Compressor 
test_init_invalid(); test_init_dispose(); test_compress_small(); test_compress_too_big(); test_compress_two_blocks(); test_compress_invalid_calls(); test_dispose_compressor_invalid(); // Decompressor test_init_decompressor_invalid(); test_decompress_invalid_calls(); test_dispose_decompressor_invalid(); test_basic_decompression(); test_large_multi_block(); test_headerless(); printf("All C API tests passed.\n"); return 0; } kanzi-cpp-2.5.2/src/test/TestBWT.cpp000066400000000000000000000204561516423635400172020ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include #include #include #include #include "../types.hpp" #include "../transform/BWT.hpp" #include "../transform/BWTS.hpp" using namespace std; using namespace kanzi; int testBWTCorrectness(bool isBWT) { // Test behavior cout << endl << endl << (isBWT ? 
"BWT" : "BWTS") << " Correctness test" << endl; srand((uint)time(nullptr)); int res = 0; kanzi::byte* pBuf = new kanzi::byte[8 * 1024 * 1024]; kanzi::byte* buf1 = pBuf; for (int ii = 1; ii <= 20; ii++) { int size = 128; if (ii == 1) { string str("mississippi"); const char* val2 = str.c_str(); cout << val2 << endl; size = int(str.length()); memcpy(buf1, &val2[0], size); } else if (ii == 2) { string str("3.14159265358979323846264338327950288419716939937510"); const char* val2 = str.c_str(); size = int(str.length()); memcpy(buf1, &val2[0], size); } else if (ii == 3) { string str("SIX.MIXED.PIXIES.SIFT.SIXTY.PIXIE.DUST.BOXES"); const char* val2 = str.c_str(); size = int(str.length()); memcpy(buf1, &val2[0], size); } else if (ii < 20) { for (int i = 0; i < size; i++) buf1[i] = kanzi::byte(65 + (rand() % (4 * ii))); } else { size = 8*1024*1024; for (int i = 0; i < size; i++) buf1[i] = kanzi::byte(i); } Transform* tf; if (isBWT) { tf = new BWT(); } else { tf = new BWTS(); } kanzi::byte* input = &buf1[0]; kanzi::byte* transform = new kanzi::byte[size]; kanzi::byte* reverse = new kanzi::byte[size]; cout << endl << "Test " << ii << endl; if (size < 512) { cout << "Input : "; for (int i = 0; i < size; i++) cout << char(input[i]); } SliceArray ia1(input, size, 0); SliceArray ia2(transform, size, 0); tf->forward(ia1, ia2, size); if (size < 512) { cout << endl << "Encoded : "; for (int i = 0; i < size; i++) cout << char(transform[i]); cout << " "; } if (isBWT) { BWT* bwt = (BWT*) tf; int chunks = BWT::getBWTChunks(size); int* pi = new int[chunks]; for (int i=0; igetPrimaryIndex(i); cout << "(Primary index=" << pi[i] << ")" << endl; } delete tf; tf = new BWT(); bwt = (BWT*) tf; for (int i=0; isetPrimaryIndex(i, pi[i]); } delete[] pi; } else { delete tf; tf = new BWTS(); cout << endl; } SliceArray ia3(reverse, size, 0); ia2._index = 0; tf->inverse(ia2, ia3, size); int idx = -1; if (size < 512) { cout << "Reverse : "; for (int i = 0; i < size; i++) cout << char(reverse[i]); } for 
(int j = 0; j < size; j++) { if (input[j] != reverse[j]) { idx = j; res = 1; break; } } cout << endl; if (idx == -1) cout << "Identical" << endl; else cout << "Different at index " << idx << " (" << int(input[idx]) << "<->" << int(reverse[idx]) << ")" << endl; delete tf; delete[] transform; delete[] reverse; } delete[] pBuf; return res; } int testBWTSpeed(bool isBWT, int iter, bool isSmallSize) { // Test speed int size = isSmallSize ? 256 * 1024 : 10 * 1024 * 1024; int res = 0; cout << endl << endl << (isBWT ? "BWT" : "BWTS") << " Speed test" << endl; cout << "Iterations: " << iter << endl; cout << "Transform size: " << size << endl; srand(uint(time(nullptr))); for (int jj = 0; jj < 3; jj++) { kanzi::byte* input = isSmallSize ? new kanzi::byte[256 * 1024] : new kanzi::byte[10 * 1024 * 1024]; kanzi::byte* output = isSmallSize ? new kanzi::byte[256 * 1024] : new kanzi::byte[10 * 1024 * 1024]; kanzi::byte* reverse = isSmallSize ? new kanzi::byte[256 * 1024] : new kanzi::byte[10 * 1024 * 1024]; SliceArray ia1(input, size, 0); SliceArray ia2(output, size, 0); SliceArray ia3(reverse, size, 0); double delta1 = 0, delta2 = 0; Transform* tf = nullptr; Transform* ti = nullptr; const int chunks = BWT::getBWTChunks(size); int pi[8]; for (int ii = 0; ii < iter; ii++) { if (isBWT) { tf = new BWT(); } else { tf = new BWTS(); } for (int i = 0; i < size; i++) { input[i] = kanzi::byte(1 + (rand() % 255)); } clock_t before1 = clock(); ia1._index = 0; ia2._index = 0; tf->forward(ia1, ia2, size); clock_t after1 = clock(); delta1 += (after1 - before1); if (isBWT) { BWT* bwt = (BWT*)tf; for (int i = 0; i < chunks; i++) pi[i] = bwt->getPrimaryIndex(i); } delete tf; clock_t before2 = clock(); ia2._index = 0; ia3._index = 0; if (isBWT) { ti = new BWT(); BWT* bwt = (BWT*)ti; for (int i = 0; i < chunks; i++) bwt->setPrimaryIndex(i, pi[i]); } else { ti = new BWTS(); } ti->inverse(ia2, ia3, size); clock_t after2 = clock(); delta2 += (after2 - before2); delete ti; // Sanity check for (int i = 0; 
i < size; i++) { if (input[i] != reverse[i]) { cout << "Failure at index " << i << " (" << int(input[i]) << "<->" << int(reverse[i]) << ")" << endl; res = 1; break; } } } delete[] input; delete[] output; delete[] reverse; // KB = 1000, KiB = 1024 double prod = double(iter) * double(size); double b2KiB = double(1) / double(1024); double d1_sec = double(delta1) / CLOCKS_PER_SEC; double d2_sec = double(delta2) / CLOCKS_PER_SEC; cout << "Forward transform [ms] : " << int(d1_sec * 1000) << endl; cout << "Throughput [KiB/s] : " << int(prod * b2KiB / d1_sec) << endl; cout << "Reverse transform [ms] : " << int(d2_sec * 1000) << endl; cout << "Throughput [KiB/s] : " << int(prod * b2KiB / d2_sec) << endl; cout << endl; } return res; } #ifdef __GNUG__ int main(int argc, const char* argv[]) #else int TestBWT_main(int argc, const char* argv[]) #endif { bool doPerf = true; if (argc > 1) { string str = argv[1]; transform(str.begin(), str.end(), str.begin(), ::toupper); doPerf = str != "-NOPERF"; } int res = 0; res |= testBWTCorrectness(true); res |= testBWTCorrectness(false); if (doPerf) { res |= testBWTSpeed(true, 200, true); // test MergeTPSI inverse res |= testBWTSpeed(true, 5, false); // test BiPSIv2 inverse res |= testBWTSpeed(false, 200, true); } return res; } kanzi-cpp-2.5.2/src/test/TestCompressedStream.cpp000066400000000000000000000205071516423635400220230ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #include #include #include #include "../io/CompressedInputStream.hpp" #include "../io/CompressedOutputStream.hpp" #include "../io/IOException.hpp" using namespace std; using namespace kanzi; uint64 compress1(kanzi::byte block[], uint length) { cout << "Test - Regular (RLT+TEXT&HUFFMAN)" << endl; uint blockSize = (length / (1 + (rand() & 3))) & -16; kanzi::byte* buf = new kanzi::byte[length]; memcpy(&buf[0], &block[0], size_t(length)); stringbuf buffer; iostream ios(&buffer); CompressedOutputStream* cos = new CompressedOutputStream(ios, 1, "HUFFMAN", "RLT+TEXT", blockSize); cos->write((const char*)block, length); cos->close(); uint64 written = cos->getWritten(); ios.seekg(0); memset(&block[0], 0, size_t(length)); CompressedInputStream* cis = new CompressedInputStream(ios, 1); while (true) { cis->read((char*)block, length); if (cis->gcount() != length) break; } cis->close(); uint64 read = cis->getRead(); delete cos; delete cis; uint64 res = (memcmp(&buf[0], &block[0], length) != 0) ? 3 : read ^ written; delete[] buf; return res; } uint64 compress2(kanzi::byte block[], uint length) { int jobs; srand((uint)time(nullptr)); int checksum = 32 * min(rand() & 3, 2); uint blockSize = (length / (1 + (rand() & 3))) & -16; #ifdef CONCURRENCY_ENABLED jobs = 1 + (rand() & 3); cout << "Test - " << jobs << " job(s) - "; #else jobs = 1; cout << "Test - "; #endif if (checksum == 0) cout << "no checksum"; else cout << checksum << " bits checksum"; cout << " (LZX&ANS0)" << endl; kanzi::byte* buf = new kanzi::byte[length]; memcpy(&buf[0], &block[0], size_t(length)); stringbuf buffer; iostream ios(&buffer); CompressedOutputStream* cos = new CompressedOutputStream(ios, jobs, "ANS0", "LZX", blockSize, checksum); cos->write((const char*)block, length); cos->close(); uint64 written = cos->getWritten(); ios.seekg(0); memset(&block[0], 0, size_t(length)); CompressedInputStream* cis = new CompressedInputStream(ios, jobs); while (true) { cis->read((char*)block, length); if (cis->gcount() != 
length) break; } cis->close(); uint64 read = cis->getRead(); delete cos; delete cis; uint64 res = (memcmp(&buf[0], &block[0], length) != 0) ? 3 : read ^ written; delete[] buf; return res; } uint64 compress3(kanzi::byte block[], uint length) { cout << "Test - incompressible data (LZP+RLT&FPAQ)" << endl; uint blockSize = (length / (1 + (rand() & 3))) & -16; kanzi::byte* buf = new kanzi::byte[length]; memcpy(&buf[0], &block[0], size_t(length)); stringbuf buffer; iostream ios(&buffer); CompressedOutputStream* cos = new CompressedOutputStream(ios, 1, "FPAQ", "LZP+ZRLT", blockSize, 32); cos->write((const char*)block, length); cos->close(); uint64 written = cos->getWritten(); ios.seekg(0); memset(&block[0], 0, size_t(length)); CompressedInputStream* cis = new CompressedInputStream(ios, 1); while (true) { cis->read((char*)block, length); if (cis->gcount() != length) break; } cis->close(); uint64 read = cis->getRead(); delete cos; delete cis; uint64 res = (memcmp(&buf[0], &block[0], length) != 0) ? 
3 : read ^ written; delete[] buf; return res; } uint64 compress4(kanzi::byte block[], uint length) { CompressedOutputStream* cos = nullptr; uint64 res; try { cout << "Test - write after close (TEXT&HUFFMAN)" << endl; stringbuf buffer; iostream os(&buffer); cos = new CompressedOutputStream(os, 1, "HUFFMAN", "TEXT"); cos->write((const char*)block, length); cos->close(); cos->put(char(0)); cout << "Failure: no exception thrown in write after close" << endl; res = 1; } catch (const ios_base::failure& e) { cout << "OK, Exception " << e.what() << endl; res = 0; } delete cos; return res; } uint64 compress5(kanzi::byte block[], uint length) { CompressedOutputStream* cos = nullptr; CompressedInputStream* cis = nullptr; uint64 res; try { cout << "Test - read after close (TEXT&HUFFMAN)" << endl; stringbuf buffer; iostream ios(&buffer); cos = new CompressedOutputStream(ios, 1, "HUFFMAN", "TEXT"); cos->write((const char*)block, length); cos->close(); ios.seekg(0); cis = new CompressedInputStream(ios, 1); while (true) { cis->read((char*)block, length); if (cis->gcount() != length) break; } cis->close(); cis->get(); cout << "Failure: no exception thrown in read after close" << endl; res = 1; } catch (const ios_base::failure& e) { cout << "OK, Exception " << e.what() << endl; res = 0; } delete cos; delete cis; return res; } int testCorrectness(int, const char*[]) { // Test correctness cout << "Correctness Test" << endl; const int length0 = 65536; kanzi::byte* incompressible = new kanzi::byte[length0 << 6]; kanzi::byte* values = new kanzi::byte[length0 << 6]; bool res = true; srand((uint)time(nullptr)); for (int test = 1; test <= 50; test++) { const int length = length0 << (test % 7); for (int i = 0; i < length; i++) { incompressible[i] = kanzi::byte(rand()); values[i] = kanzi::byte(rand() % (4 * test + 1)); } cout << endl; cout << "Iteration " << test << " (size " << length << ")" << endl; uint64 cres; cres = compress1(values, length); cout << ((cres == 0) ? 
"Success" : "Failure") << endl; res &= (cres == 0); cres = compress2(values, length); cout << ((cres == 0) ? "Success" : "Failure") << endl; res &= (cres == 0); cres = compress3(incompressible, length); cout << ((cres == 0) ? "Success" : "Failure") << endl; res &= (cres == 0); if (test == 1) { cres = compress4(values, length); cout << ((cres == 0) ? "Success" : "Failure") << endl; res &= (cres == 0); cres = compress5(values, length); cout << ((cres == 0) ? "Success" : "Failure") << endl; res &= (cres == 0); } } delete[] incompressible; delete[] values; return (res == true) ? 0 : 1; } void testSeek(string name) { #if !defined(_MSC_VER) || _MSC_VER > 1500 ifstream ifs; ifs.open(name.c_str(), ios::binary|ios::in); char buf[1024]; int64 pos1 = 0; int64 pos2 = 0; CompressedInputStream cis(ifs, 1); cis.read((char*)buf, 100); for (int i = 0; i < 100; i++) cout << buf[i]; cout << endl << endl; // Block positions for enwik8 compressed with L5 & version 2.4 // To be updated. int64 positions[4] = { 17729391, 26695626, 8843019, 192 }; for (int i = 0; i< 4; i++) { pos1 = positions[i]; cis.seek(pos1); pos2 = cis.tell(); cout << pos1 << " / " << pos2 << endl; cis.read(buf, 100); for (int j = 0; j < 100; j++) cout << buf[j]; cout << endl << endl; } cis.close(); ifs.close(); #endif } #ifdef __GNUG__ int main(int argc, const char* argv[]) #else int TestCompressedStream_main(int argc, const char* argv[]) #endif { try { //testSeek("/tmp/enwik8.knz"); return testCorrectness(argc, argv); } catch (const IOException& e) { cout << "Exception: " << e.what() << endl; return e.error(); } catch (const runtime_error& e) { cout << "Exception: " << e.what() << endl; return 255; } } kanzi-cpp-2.5.2/src/test/TestDefaultBitStream.cpp000066400000000000000000000524541516423635400217500ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include #include #include #include #include #include #include "../bitstream/DebugOutputBitStream.hpp" #include "../bitstream/DefaultInputBitStream.hpp" #include "../bitstream/DefaultOutputBitStream.hpp" #include "../BitStreamException.hpp" #include "../io/IOUtil.hpp" #include "../io/IOException.hpp" using namespace std; using namespace kanzi; int testBitStreamCorrectnessAligned1() { // Test correctness (kanzi::byte aligned) cout << "Correctness Test - write long - byte aligned" << endl; const int length = 100; int* values = new int[length]; int res = 0; srand((uint)time(nullptr)); cout << "\nInitial" << endl; // Check correctness of read() and written() for (int t = 1; t <= 32; t++) { stringbuf buffer; iostream ios(&buffer); DefaultOutputBitStream obs(ios, 16384); cout << endl; obs.writeBits(0x01234567L, t); cout << "Written (before close): " << obs.written() << endl; obs.close(); cout << "Written (after close): " << obs.written() << endl; ios.rdbuf()->pubseekpos(0); DefaultInputBitStream ibs(ios, 16384); ibs.readBits(t); cout << ((ibs.read() == uint64(t)) ? 
"OK" : "KO") << endl; cout << "Read (before close): " << ibs.read() << endl; ibs.close(); cout << "Read (after close): " << ibs.read() << endl; } for (int test = 1; test <= 10; test++) { stringbuf buffer; iostream ios(&buffer); DefaultOutputBitStream obs(ios, 16384); DebugOutputBitStream dbs(obs, cout); dbs.showByte(true); for (int i = 0; i < length; i++) { values[i] = rand(); cout << values[i] << " "; if ((i % 20) == 19) cout << endl; } cout << endl << endl; for (int i = 0; i < length; i++) { dbs.writeBits(values[i], 32); } // Close first to force flush() dbs.close(); ios.rdbuf()->pubseekpos(0); istringstream is; char* cvalues = new char[4 * length]; for (int i = 0; i < length; i++) { cvalues[4 * i] = (values[i] >> 24) & 0xFF; cvalues[4 * i + 1] = (values[i] >> 16) & 0xFF; cvalues[4 * i + 2] = (values[i] >> 8) & 0xFF; cvalues[4 * i + 3] = values[i] & 0xFF; } is.read(cvalues, length); DefaultInputBitStream ibs(ios, 16384); cout << endl << endl << "Read:" << endl; bool ok = true; for (int i = 0; i < length; i++) { int x = (int)ibs.readBits(32); cout << x; cout << ((x == values[i]) ? " " : "* "); ok &= (x == values[i]); if ((i % 20) == 19) cout << endl; } delete[] cvalues; ibs.close(); res = (ok == true) ? 0 : 1; cout << endl; cout << endl << "Bits written: " << dbs.written() << endl; cout << endl << "Bits read: " << ibs.read() << endl; cout << endl << "\n" << (ok ? 
"Success" : "Failure") << endl; cout << endl; cout << endl; } delete[] values; return res; } int testBitStreamCorrectnessMisaligned1() { // Test correctness (not kanzi::byte aligned) cout << "Correctness Test - write long - not byte aligned" << endl; const int length = 100; int* values = new int[length]; int res = 0; srand((uint)time(nullptr)); cout << "\nInitial" << endl; // Check correctness of read() and written() for (int t = 1; t <= 32; t++) { stringbuf buffer; iostream ios(&buffer); DefaultOutputBitStream obs(ios, 16384); cout << endl; obs.writeBit(1); obs.writeBits(0x01234567L, t); cout << "Written (before close): " << obs.written() << endl; obs.close(); cout << "Written (after close): " << obs.written() << endl; ios.rdbuf()->pubseekpos(0); DefaultInputBitStream ibs(ios, 16384); ibs.readBit(); ibs.readBits(t); cout << ((ibs.read() == uint64(t + 1)) ? "OK" : "KO") << endl; cout << "Read (before close): " << ibs.read() << endl; ibs.close(); cout << "Read (after close): " << ibs.read() << endl; } for (int test = 1; test <= 10; test++) { stringbuf buffer; iostream ios(&buffer); DefaultOutputBitStream obs(ios, 16384); DebugOutputBitStream dbs(obs, cout); dbs.showByte(true); for (int i = 0; i < length; i++) { values[i] = rand(); const int mask = (1 << (1 + (i % 30))) - 1; values[i] &= mask; cout << values[i] << " "; if ((i % 20) == 19) cout << endl; } cout << endl << endl; for (int i = 0; i < length; i++) { dbs.writeBits(values[i], 1 + (i % 30)); } // Close first to force flush() dbs.close(); if (test == 10) { try { cout << "\nTrying to write to closed stream" << endl; dbs.writeBit(1); } catch (const BitStreamException& e) { cout << "\nException: " << e.what() << endl; } } ios.rdbuf()->pubseekpos(0); istringstream is; char* cvalues = new char[4 * length]; for (int i = 0; i < length; i++) { cvalues[4 * i] = (values[i] >> 24) & 0xFF; cvalues[4 * i + 1] = (values[i] >> 16) & 0xFF; cvalues[4 * i + 2] = (values[i] >> 8) & 0xFF; cvalues[4 * i + 3] = (values[i] >> 0) & 
0xFF; } is.read(cvalues, length); DefaultInputBitStream ibs(ios, 16384); cout << endl << endl << "Read:" << endl; bool ok = true; for (int i = 0; i < length; i++) { int x = (int)ibs.readBits((1 + (i % 30))); cout << x; cout << ((x == values[i]) ? " " : "* "); ok &= (x == values[i]); if ((i % 20) == 19) cout << endl; } delete[] cvalues; ibs.close(); res = (ok == true) ? 0 : 1; cout << endl; cout << endl << "Bits written: " << dbs.written() << endl; cout << endl << "Bits read: " << ibs.read() << endl; cout << endl << "\n" << (ok ? "Success" : "Failure") << endl; cout << endl; cout << endl; if (test == 10) { try { cout << "\nTrying to read from closed stream" << endl; ibs.readBit(); } catch (const BitStreamException& e) { cout << "\nException: " << e.what() << endl; } } } delete[] values; return res; } int testBitStreamSpeed1(const string& fileName) { // Test speed cout << "\nSpeed Test1" << endl; int values[] = { 3, 1, 4, 1, 5, 9, 2, 6, 5, 3, 5, 8, 9, 7, 9, 3, 31, 14, 41, 15, 59, 92, 26, 65, 53, 35, 58, 89, 97, 79, 93, 32 }; int iter = 150; uint64 written = 0; uint64 read = 0; double delta1 = 0, delta2 = 0; int nn = 100000 * 32; for (int test = 1; test <= iter; test++) { ofstream os(fileName.c_str(), std::ofstream::binary); DefaultOutputBitStream obs(os, 1024 * 1024); clock_t before = clock(); for (int i = 0; i < nn; i++) { obs.writeBits((uint64)values[i % 32], 1 + (i & 63)); } // Close first to force flush() obs.close(); os.close(); clock_t after = clock(); delta1 += (after - before); written += obs.written(); ifstream is(fileName.c_str(), std::ifstream::binary); DefaultInputBitStream ibs(is, 1024 * 1024); before = clock(); for (int i = 0; i < nn; i++) { ibs.readBits(1 + (i & 63)); } ibs.close(); is.close(); after = clock(); delta2 += (after - before); read += ibs.read(); } // MB = 1000 * 1000, MiB = 1024 * 1024 double d = 8.0 * 1024.0 * 1024.0; cout << written << " bits written (" << (written / 1024 / 1024 / 8) << " MB)" << endl; cout << read << " bits read (" << 
(read / 1024 / 1024 / 8) << " MB)" << endl; cout << endl; cout << "Write [ms] : " << (int)(delta1 / CLOCKS_PER_SEC * 1000) << endl; cout << "Throughput [MiB/s] : " << (int)((double)written / d / (delta1 / CLOCKS_PER_SEC)) << endl; cout << "Read [ms] : " << (int)(delta2 / CLOCKS_PER_SEC * 1000) << endl; cout << "Throughput [MiB/s] : " << (int)((double)read / d / (delta2 / CLOCKS_PER_SEC)) << endl; return 0; } int testBitStreamCorrectnessAligned2() { // Test correctness (kanzi::byte aligned) cout << "Correctness Test - write array - byte aligned" << endl; const int length = 100; kanzi::byte* input = new kanzi::byte[length]; kanzi::byte* output = new kanzi::byte[length]; int res = 0; srand((uint)time(nullptr)); cout << "\nInitial" << endl; for (int test = 1; test <= 10; test++) { stringbuf buffer; iostream ios(&buffer); DefaultOutputBitStream obs(ios, 16384); DebugOutputBitStream dbs(obs, cout); dbs.showByte(true); for (int i = 0; i < length; i++) { input[i] = (kanzi::byte) rand(); cout << (int(input[i]) & 0xFF) << " "; if ((i % 20) == 19) cout << endl; } cout << endl << endl; uint count = 8 + test*(20+(test&1)) + (test&3); dbs.writeBits(input, count); cout << obs.written() << endl; // Close first to force flush() dbs.close(); if (test == 10) { try { cout << "\nTrying to write to closed stream" << endl; dbs.writeBit(1); } catch (const BitStreamException& e) { cout << "\nException: " << e.what() << endl; } } ios.rdbuf()->pubseekpos(0); istringstream is; char* cvalues = new char[length]; for (int i = 0; i < length; i++) { cvalues[i] = char(input[i]) & 0xFF; } is.read(cvalues, length); DefaultInputBitStream ibs(ios, 16384); cout << endl << endl << "Read:" << endl; uint r = ibs.readBits(output, count); bool ok = r == count; if (ok == true) { for (uint i = 0; i < (r>>3); i++) { cout << (int(output[i]) & 0xFF); cout << ((output[i] == input[i]) ? 
" " : "* "); ok &= (output[i] == input[i]); if ((i % 20) == 19) cout << endl; } } delete[] cvalues; ibs.close(); res = (ok == true) ? 0 : 1; cout << endl; cout << endl << "Bits written: " << dbs.written() << endl; cout << endl << "Bits read: " << ibs.read() << endl; cout << endl << "\n" << (ok ? "Success" : "Failure") << endl; cout << endl; cout << endl; if (test == 10) { try { cout << "\nTrying to read from closed stream" << endl; ibs.readBit(); } catch (const BitStreamException& e) { cout << "\nException: " << e.what() << endl; } } } delete[] input; delete[] output; return res; } int testBitStreamCorrectnessMisaligned2() { // Test correctness (not kanzi::byte aligned) cout << "Correctness Test - write array - not byte aligned" << endl; const int length = 100; kanzi::byte* input = new kanzi::byte[length]; kanzi::byte* output = new kanzi::byte[length]; int res = 0; srand((uint)time(nullptr)); cout << "\nInitial" << endl; for (int test = 1; test <= 10; test++) { stringbuf buffer; iostream ios(&buffer); DefaultOutputBitStream obs(ios, 16384); DebugOutputBitStream dbs(obs, cout); dbs.showByte(true); for (int i = 0; i < length; i++) { input[i] = (kanzi::byte) rand(); cout << (int(input[i]) & 0xFF) << " "; if ((i % 20) == 19) cout << endl; } cout << endl << endl; uint count = 8 + test*(20+(test&1)) + (test&3); dbs.writeBit(0); dbs.writeBits(&input[1], count); // Close first to force flush() dbs.close(); ios.rdbuf()->pubseekpos(0); istringstream is; char* cvalues = new char[4 * length]; for (int i = 0; i < length; i++) { cvalues[i] = char(input[i]) & 0xFF; } is.read(cvalues, length); DefaultInputBitStream ibs(ios, 16384); cout << endl << endl << "Read:" << endl; ibs.readBit(); uint r = ibs.readBits(&output[1], count); bool ok = r == count; if (ok == true) { for (uint i = 1; i < 1 + (r>>3); i++) { cout << (int(output[i]) & 0xFF); cout << ((output[i] == input[i]) ? 
" " : "* "); ok &= (output[i] == input[i]); if ((i % 20) == 19) cout << endl; } } delete[] cvalues; ibs.close(); res = (ok == true) ? 0 : 1; cout << endl; cout << endl << "Bits written: " << dbs.written() << endl; cout << endl << "Bits read: " << ibs.read() << endl; cout << endl << "\n" << (ok ? "Success" : "Failure") << endl; cout << endl; cout << endl; } delete[] input; delete[] output; return res; } int testSeek(const string& name) { #if !defined(_MSC_VER) || _MSC_VER > 1500 // Test correctness (not kanzi::byte aligned) cout << endl << "Seek Test" << endl << endl; kanzi::byte input[256]; kanzi::byte output[256]; for (int i = 0; i < 256; i++) input[i] = kanzi::byte(i); cout << "Test OutputBitStream" << endl; ofstream ofs(name.c_str(), ios_base::out | ios_base::binary); DefaultOutputBitStream obs(ofs); for (int i = 0; i < 128; i++) obs.writeBits(uint64(0xAA), 8); obs.seek(8 * 32); obs.writeBits(&input[10], 8 * 32); obs.seek(8 * 2); obs.writeBits(&input[100], 8 * 32); obs.close(); ofs.close(); cout << "Bits written: " << obs.written() << endl; remove(name.c_str()); cout << endl; cout << "Test InputBitStream" << endl; for (int i = 0; i < 256; i++) input[i] = kanzi::byte(i); ofstream ofs2(name.c_str(), ios_base::out | ios_base::binary); ofs2.write(reinterpret_cast(input), 256); ofs2.close(); ifstream ifs(name.c_str(), ios_base::in | ios_base::binary); DefaultInputBitStream ibs(ifs); memset(output, 0, 256); ibs.readBits(&output[0], 8 * 16); for (int i = 0; i < 16; i++) { if (output[i] != kanzi::byte(i)) { cout << "Read failure" << endl; remove(name.c_str()); return 1; } } // Positions in bytes int64 positions[5] = { 50, 0, 20, 33, 0 }; memset(output, 0, 256); for (int i = 0; i < 5; i ++) { int64 pos = positions[i]; cout << "Seek " << pos << endl; ibs.seek(8 * pos); if (ibs.tell() != 8 * pos) { cout << "Seek/tell mismatch" << endl; remove(name.c_str()); return 2; } if (ibs.tell() != 8 * ifs.tellg()) { cout << "Seek/tell mismatch" << endl; remove(name.c_str()); return 
3; } cout << "Read bits at position " << pos << endl; ibs.readBits(&output[pos], 8 * 10); int64 r = ibs.readBits(8); if (r != pos + 10) { cout << "Incorrect number of read bits" << endl; remove(name.c_str()); return 4; } for (int j = 0; j < 10; j++) { if (output[pos + j] != kanzi::byte(pos + j)) { cout << "Read failure" << endl; remove(name.c_str()); return 5; } } cout << "OK" << endl; } cout << "Bits read: " << ibs.read() << endl; remove(name.c_str()); cout << endl << "Success" << endl; #endif return 0; } int testBitStreamSpeed2(const string& fileName) { // Test speed cout << "\nSpeed Test2" << endl; kanzi::byte values[] = { (kanzi::byte)3, (kanzi::byte)1, (kanzi::byte)4, (kanzi::byte)1, (kanzi::byte)5,(kanzi::byte) 9, (kanzi::byte)2, (kanzi::byte)6, (kanzi::byte)5, (kanzi::byte)3, (kanzi::byte)5, (kanzi::byte)8, (kanzi::byte)9, (kanzi::byte)7, (kanzi::byte)9, (kanzi::byte)3, (kanzi::byte)31, (kanzi::byte)14, (kanzi::byte)41, (kanzi::byte)15, (kanzi::byte)59, (kanzi::byte)92, (kanzi::byte)26, (kanzi::byte)65, (kanzi::byte)53, (kanzi::byte)35, (kanzi::byte)58, (kanzi::byte)89, (kanzi::byte)97, (kanzi::byte)79, (kanzi::byte)93, (kanzi::byte)32 }; int iter = 150; uint64 written = 0; uint64 read = 0; double delta1 = 0, delta2 = 0; kanzi::byte* input = new kanzi::byte[3250000*32]; kanzi::byte* output = new kanzi::byte[3250000*32]; for (int i = 0; i < 3250000; i++) { memcpy(&input[i*32], &values[0], 32); } for (int test = 1; test <= iter; test++) { ofstream os(fileName.c_str(), std::ofstream::binary); DefaultOutputBitStream obs(os, 1024 * 1024); clock_t before = clock(); obs.writeBits(input, 3250000*32); // Close first to force flush() obs.close(); os.close(); clock_t after = clock(); delta1 += (after - before); written += obs.written(); ifstream is(fileName.c_str(), std::ifstream::binary); DefaultInputBitStream ibs(is, 1024 * 1024); before = clock(); ibs.readBits(output, 3250000*32); ibs.close(); is.close(); after = clock(); delta2 += (after - before); read += 
ibs.read(); } // MiB = 1024 * 1024, MB = 1000 * 1000 double d = 1024.0 * 8192.0; cout << written << " bits written (" << (written / 1024 / 1024 / 8) << " MiB)" << endl; cout << read << " bits read (" << (read / 1024 / 1024 / 8) << " MiB)" << endl; cout << endl; cout << "Write [ms] : " << (int)(delta1 / CLOCKS_PER_SEC * 1000) << endl; cout << "Throughput [MiB/s] : " << (int)((double)written / d / (delta1 / CLOCKS_PER_SEC)) << endl; cout << "Read [ms] : " << (int)(delta2 / CLOCKS_PER_SEC * 1000) << endl; cout << "Throughput [MiB/s] : " << (int)((double)read / d / (delta2 / CLOCKS_PER_SEC)) << endl; delete[] input; delete[] output; return 0; } int testHasMoreToRead() { cout << endl << "hasMoreToRead Test" << endl << endl; stringbuf buffer; iostream ios(&buffer); const char value = char(0xAB); ios.write(&value, 1); ios.rdbuf()->pubseekpos(0); DefaultInputBitStream ibs(ios, 1024); if (ibs.hasMoreToRead() == false) { cout << "Unexpected end of stream" << endl; return 1; } if (ibs.hasMoreToRead() == false) { cout << "Repeated probe lost buffered data" << endl; return 2; } if (ibs.readBits(8) != 0xAB) { cout << "Read failure" << endl; return 3; } if (ibs.hasMoreToRead() == true) { cout << "Unexpected data after end of stream" << endl; return 4; } cout << "Success" << endl; return 0; } #ifdef __GNUG__ int main(int argc, const char* argv[]) #else int TestDefaultBitStream_main(int argc, const char* argv[]) #endif { bool doPerf = true; if (argc <= 1) { cout << "Missing temp output file" << endl; exit(1); } if (argc > 2) { string str = argv[2]; transform(str.begin(), str.end(), str.begin(), ::toupper); doPerf = str != "-NOPERF"; } int res = 0; try { string fileName = argv[1]; struct STAT buffer; if ((STAT(fileName.c_str(), &buffer) == 0) && S_ISDIR(buffer.st_mode)) { cout << "Temp output path must be a file, not a directory" << endl; return 1; } res |= testBitStreamCorrectnessAligned1(); res |= testBitStreamCorrectnessAligned2(); res |= testBitStreamCorrectnessMisaligned1(); 
res |= testBitStreamCorrectnessMisaligned2(); res |= testHasMoreToRead(); res |= testSeek(fileName); if (doPerf == true) { res |= testBitStreamSpeed1(fileName); res |= testBitStreamSpeed2(fileName); } } catch (const kanzi::IOException& e) { cout << "Exception: " << e.what() << endl; res = 99; } catch (const BitStreamException& e) { cout << "Exception: " << e.what() << endl; res = 99; } return res; } kanzi-cpp-2.5.2/src/test/TestEntropyCodec.cpp000066400000000000000000000274561516423635400211530ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #include #include #include #include #include "../types.hpp" #include "../entropy/HuffmanEncoder.hpp" #include "../entropy/RangeEncoder.hpp" #include "../entropy/ANSRangeEncoder.hpp" #include "../entropy/BinaryEntropyEncoder.hpp" #include "../entropy/ExpGolombEncoder.hpp" #include "../entropy/FPAQEncoder.hpp" #include "../bitstream/DefaultOutputBitStream.hpp" #include "../bitstream/DefaultInputBitStream.hpp" #include "../bitstream/DebugOutputBitStream.hpp" #include "../entropy/HuffmanDecoder.hpp" #include "../entropy/RangeDecoder.hpp" #include "../entropy/ANSRangeDecoder.hpp" #include "../entropy/BinaryEntropyDecoder.hpp" #include "../entropy/ExpGolombDecoder.hpp" #include "../entropy/FPAQDecoder.hpp" #include "../entropy/CMPredictor.hpp" #include "../entropy/TPAQPredictor.hpp" using namespace kanzi; using namespace std; static Predictor* getPredictor(string type) { if (type.compare("TPAQ") == 0) return new TPAQPredictor(); if (type.compare("TPAQX") == 0) return new TPAQPredictor(); if (type.compare("CM") == 0) return new CMPredictor(); return nullptr; } static EntropyEncoder* getEncoder(string name, OutputBitStream& obs, Predictor* predictor) { if (name.compare("HUFFMAN") == 0) return new HuffmanEncoder(obs); if (name.compare("ANS0") == 0) return new ANSRangeEncoder(obs, 0); if (name.compare("ANS1") == 0) return new ANSRangeEncoder(obs, 1); if (name.compare("RANGE") == 0) return new RangeEncoder(obs); if (name.compare("EXPGOLOMB") == 0) return new ExpGolombEncoder(obs); if (name.compare("FPAQ") == 0) return new FPAQEncoder(obs); if (predictor != nullptr) { if (name.compare("TPAQ") == 0) return new BinaryEntropyEncoder(obs, predictor, true); if (name.compare("CM") == 0) return new BinaryEntropyEncoder(obs, predictor, true); } cout << "No such entropy encoder: " << name << endl; return nullptr; } static EntropyDecoder* getDecoder(string name, InputBitStream& ibs, Predictor* predictor) { if (name.compare("HUFFMAN") == 0) return new HuffmanDecoder(ibs); if 
(name.compare("ANS0") == 0) return new ANSRangeDecoder(ibs, 0); if (name.compare("ANS1") == 0) return new ANSRangeDecoder(ibs, 1); if (name.compare("RANGE") == 0) return new RangeDecoder(ibs); if (name.compare("FPAQ") == 0) return new FPAQDecoder(ibs); if (predictor != nullptr) { if (name.compare("TPAQ") == 0) return new BinaryEntropyDecoder(ibs, predictor, true); if (name.compare("CM") == 0) return new BinaryEntropyDecoder(ibs, predictor, true); } if (name.compare("EXPGOLOMB") == 0) return new ExpGolombDecoder(ibs); cout << "No such entropy decoder: " << name << endl; return nullptr; } int testEntropyCodecCorrectness(const string& name) { // Test behavior cout << "=== Correctness test for " << name << " ===" << endl; srand((uint)time(nullptr)); int res = 0; for (int ii = 1; ii < 50; ii++) { cout << endl << endl << "Test " << ii << endl; kanzi::byte val[256]; int size = 40; if (ii == 3) { kanzi::byte val2[] = { (kanzi::byte)0, (kanzi::byte)0, (kanzi::byte)32, (kanzi::byte)15, (kanzi::byte)-4, (kanzi::byte)16, (kanzi::byte)0, (kanzi::byte)16, (kanzi::byte)0, (kanzi::byte)7, (kanzi::byte)-1, (kanzi::byte)-4, (kanzi::byte)-32, (kanzi::byte)0, (kanzi::byte)31, (kanzi::byte)-1 }; size = 16; memcpy(val, &val2[0], size); } else if (ii == 2) { kanzi::byte val2[] = { (kanzi::byte)0x3d, (kanzi::byte)0x4d, (kanzi::byte)0x54, (kanzi::byte)0x47, (kanzi::byte)0x5a, (kanzi::byte)0x36, (kanzi::byte)0x39, (kanzi::byte)0x26, (kanzi::byte)0x72, (kanzi::byte)0x6f, (kanzi::byte)0x6c, (kanzi::byte)0x65, (kanzi::byte)0x3d, (kanzi::byte)0x70, (kanzi::byte)0x72, (kanzi::byte)0x65 }; size = 16; memcpy(val, &val2[0], size); } else if (ii == 1) { for (int i = 0; i < size; i++) val[i] = kanzi::byte(2); // all identical } else if (ii == 4) { for (int i = 0; i < size; i++) val[i] = kanzi::byte(2 + (i & 1)); // 2 symbols } else if (ii == 5) { size = 1; val[0] = kanzi::byte(42); } else if (ii == 6) { size = 2; val[0] = kanzi::byte(42); val[1] = kanzi::byte(42); } else if (ii == 7) { for (int i = 
0; i < 44; i++) val[i] = kanzi::byte(i & 7); } else { size = 256; for (int i = 0; i < 256; i++) val[i] = kanzi::byte(64 + 4 * ii + (rand() % (8*ii + 1))); } kanzi::byte* values = &val[0]; cout << "Original:" << endl; for (int i = 0; i < size; i++) cout << int(values[i]) << " "; cout << endl << endl << "Encoded:" << endl; stringbuf buffer; iostream ios(&buffer); DefaultOutputBitStream obs(ios); DebugOutputBitStream dbgobs(obs); dbgobs.showByte(true); EntropyEncoder* ec = getEncoder(name, dbgobs, getPredictor(name)); if (ec == nullptr) return 1; ec->encode(values, 0, size); ec->dispose(); delete ec; dbgobs.close(); ios.rdbuf()->pubseekpos(0); DefaultInputBitStream ibs(ios); EntropyDecoder* ed = getDecoder(name, ibs, getPredictor(name)); if (ed == nullptr) return 1; cout << endl << endl << "Decoded:" << endl; bool ok = true; kanzi::byte* values2 = new kanzi::byte[size]; ed->decode(values2, 0, size); ed->dispose(); delete ed; ibs.close(); for (int j = 0; j < size; j++) { if (values[j] != values2[j]) ok = false; cout << (int)values2[j] << " "; } cout << endl; cout << (ok ? "Identical" : "Different") << endl; delete[] values2; res = ok ? 
0 : 2; } return res; } int testEntropyCodecSpeed(const string& name) { // Test speed cout << endl << endl << "=== Speed test for " << name << " ===" << endl; int repeats[] = { 3, 1, 4, 1, 5, 9, 2, 6, 5, 3, 5, 8, 9, 7, 9, 3 }; int size = 500000; int iter = 100; int res = 0; srand((uint)time(nullptr)); kanzi::byte values1[500000]; kanzi::byte values2[500000]; for (int jj = 0; jj < 3; jj++) { cout << endl << "Test " << (jj + 1) << endl; double delta1 = 0, delta2 = 0; for (int ii = 0; ii < iter; ii++) { int idx = 0; memset(values1, 0x00, size); memset(values2, 0xAA, size); int n = 0; while (n < size) { int n0 = n; int len = max(min(repeats[idx], size - n), 1); idx = (idx + 1) & 0x0F; kanzi::byte b = (kanzi::byte)(rand() % 255); for (int j = n0; j < n0 + len; j++) { values1[j] = b; n++; } } // Encode stringbuf buffer; iostream ios(&buffer); DefaultOutputBitStream obs(ios, 16384); EntropyEncoder* ec = getEncoder(name, obs, getPredictor(name)); if (ec == nullptr) return 1; clock_t before1 = clock(); if (ec->encode(values1, 0, size) < 0) { cout << "Encoding error" << endl; delete ec; return 1; } ec->dispose(); clock_t after1 = clock(); delta1 += (after1 - before1); delete ec; obs.close(); // Decode ios.rdbuf()->pubseekpos(0); DefaultInputBitStream ibs(ios, 16384); EntropyDecoder* ed = getDecoder(name, ibs, getPredictor(name)); if (ed == nullptr) return 1; clock_t before2 = clock(); if (ed->decode(values2, 0, size) < 0) { cout << "Decoding error" << endl; delete ed; return 1; } ed->dispose(); clock_t after2 = clock(); delta2 += (after2 - before2); delete ed; ibs.close(); // Sanity check for (int i = 0; i < size; i++) { if (values1[i] != values2[i]) { cout << "Error at index " << i << " (" << (int)values1[i] << "<->" << (int)values2[i] << ")" << endl; res = 1; break; } } } // KB = 1000, KiB = 1024 double prod = double(iter) * double(size); double b2KiB = double(1) / double(1024); double d1_sec = delta1 / CLOCKS_PER_SEC; double d2_sec = delta2 / CLOCKS_PER_SEC; cout << 
"Encode [ms] : " << (int)(d1_sec * 1000) << endl; cout << "Throughput [KiB/s] : " << (int)(prod * b2KiB / d1_sec) << endl; cout << "Decode [ms] : " << (int)(d2_sec * 1000) << endl; cout << "Throughput [KiB/s] : " << (int)(prod * b2KiB / d2_sec) << endl; } return res; } #ifdef __GNUG__ int main(int argc, const char* argv[]) #else int TestEntropyCodec_main(int argc, const char* argv[]) #endif { int res = 0; try { vector codecs; bool doPerf = true; if (argc == 1) { #if __cplusplus < 201103L string allCodecs[8] = { "HUFFMAN", "ANS0", "ANS1", "RANGE", "EXPGOLOMB", "CM", "TPAQ" }; for (int i = 0; i < 8; i++) codecs.push_back(allCodecs[i]); #else codecs = { "HUFFMAN", "ANS0", "ANS1", "RANGE", "EXPGOLOMB", "CM", "TPAQ" }; #endif } else { string str = argv[1]; transform(str.begin(), str.end(), str.begin(), ::toupper); if (str == "-TYPE=ALL") { #if __cplusplus < 201103L string allCodecs[] = { "HUFFMAN", "ANS0", "ANS1", "RANGE", "EXPGOLOMB", "CM", "TPAQ" }; for (int i = 0; i < 8; i++) codecs.push_back(allCodecs[i]); #else codecs = { "HUFFMAN", "ANS0", "ANS1", "RANGE", "EXPGOLOMB", "CM", "TPAQ" }; #endif } else { codecs.push_back(str.substr(6)); } if (argc > 2) { str = argv[2]; transform(str.begin(), str.end(), str.begin(), ::toupper); doPerf = str != "-NOPERF"; } } for (vector::iterator it = codecs.begin(); it != codecs.end(); ++it) { cout << endl << endl << "Test" << *it << endl; res |= testEntropyCodecCorrectness(*it); if (doPerf == true) res |= testEntropyCodecSpeed(*it); } } catch (const exception& e) { cout << e.what() << endl; res = 123; } cout << endl; cout << ((res == 0) ? "Success" : "Failure") << endl; return res; } kanzi-cpp-2.5.2/src/test/TestFactories.cpp000066400000000000000000000206721516423635400204650ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include #include #include #include #include #include "../Context.hpp" #include "../bitstream/DefaultInputBitStream.hpp" #include "../bitstream/DefaultOutputBitStream.hpp" #include "../entropy/EntropyDecoderFactory.hpp" #include "../entropy/EntropyEncoderFactory.hpp" #include "../transform/TransformFactory.hpp" using namespace std; using namespace kanzi; #define ASSERT_TRUE(cond, msg) \ do { \ if (!(cond)) { \ cerr << "ASSERT FAILED: " << msg << " (" << __FILE__ \ << ":" << __LINE__ << ")" << endl; \ return 1; \ } \ } while (0) static bool expectInvalidArgument(void (*fn)()) { try { fn(); } catch (const invalid_argument&) { return true; } return false; } static void callUnknownTransformType() { TransformFactory::getType("not-a-transform"); } static void callTooManyTransforms() { TransformFactory::getType("LZ+RLT+TEXT+UTF+EXE+PACK+DNA+MM+SRT"); } static void callUnknownEncoderType() { EntropyEncoderFactory::getType("bad-entropy"); } static void callUnknownDecoderType() { EntropyDecoderFactory::getType("bad-entropy"); } static void callUnknownEncoderName() { EntropyEncoderFactory::getName(42); } static void callUnknownDecoderName() { EntropyDecoderFactory::getName(42); } static int testTransformFactory() { cout << "Test TransformFactory" << endl; ASSERT_TRUE(TransformFactory::getTypeToken("text") == TransformFactory::DICT_TYPE, "TEXT transform must be case insensitive"); ASSERT_TRUE(TransformFactory::getType("NONE") == 0, "NONE transform must encode to zero"); ASSERT_TRUE(TransformFactory::getType("NONE+NONE") == 0, "Null transforms must be skipped"); 
ASSERT_TRUE(TransformFactory::getName( TransformFactory::getType("LZ+NONE+RLT")) == "LZ+RLT", "Transform names must round-trip without null transforms"); ASSERT_TRUE(expectInvalidArgument(callUnknownTransformType), "Unknown transform type must throw"); ASSERT_TRUE(expectInvalidArgument(callTooManyTransforms), "More than eight transforms must throw"); { Context ctx; ctx.putString("entropy", "NONE"); TransformSequence* seq = TransformFactory::newTransform(ctx, TransformFactory::getType("TEXT")); ASSERT_TRUE(seq->getNbTransforms() == 1, "TEXT sequence must contain one transform"); ASSERT_TRUE(ctx.getInt("textcodec", 0) == 2, "TEXT must select codec 2 with NONE entropy"); delete seq; } { Context ctx; ctx.putString("entropy", "FPAQ"); TransformSequence* seq = TransformFactory::newTransform(ctx, TransformFactory::getType("TEXT")); ASSERT_TRUE(ctx.getInt("textcodec", 0) == 1, "TEXT must select codec 1 with FPAQ entropy"); delete seq; } { Context ctx; TransformSequence* seq = TransformFactory::newTransform(ctx, TransformFactory::getType("LZX")); ASSERT_TRUE(ctx.getInt("lz", 0) == TransformFactory::LZX_TYPE, "LZX transform must set lz context"); delete seq; } { Context ctx; TransformSequence* seq = TransformFactory::newTransform(ctx, TransformFactory::getType("LZP")); ASSERT_TRUE(ctx.getInt("lz", 0) == TransformFactory::LZP_TYPE, "LZP transform must set lz context"); delete seq; } { Context ctx; TransformSequence* seq = TransformFactory::newTransform(ctx, TransformFactory::getType("DNA")); ASSERT_TRUE(ctx.getInt("packOnlyDNA", 0) == 1, "DNA transform must set packOnlyDNA context"); delete seq; } return 0; } static int testEntropyFactories() { cout << "Test Entropy factories" << endl; ASSERT_TRUE(EntropyEncoderFactory::getType("none") == EntropyEncoderFactory::NONE_TYPE, "NONE entropy encoder must be case insensitive"); ASSERT_TRUE(EntropyDecoderFactory::getType("ans0") == EntropyDecoderFactory::ANS0_TYPE, "ANS0 entropy decoder must be case insensitive"); 
ASSERT_TRUE(string(EntropyEncoderFactory::getName(EntropyEncoderFactory::HUFFMAN_TYPE)) == "HUFFMAN", "Encoder name must round-trip"); ASSERT_TRUE(string(EntropyDecoderFactory::getName(EntropyDecoderFactory::TPAQX_TYPE)) == "TPAQX", "Decoder name must round-trip"); ASSERT_TRUE(expectInvalidArgument(callUnknownEncoderType), "Unknown encoder type must throw"); ASSERT_TRUE(expectInvalidArgument(callUnknownDecoderType), "Unknown decoder type must throw"); ASSERT_TRUE(expectInvalidArgument(callUnknownEncoderName), "Unknown encoder name must throw"); ASSERT_TRUE(expectInvalidArgument(callUnknownDecoderName), "Unknown decoder name must throw"); stringbuf buffer; iostream io(&buffer); DefaultOutputBitStream obs(io, 16384); Context ctx; EntropyEncoder* encoders[] = { EntropyEncoderFactory::newEncoder(obs, ctx, EntropyEncoderFactory::NONE_TYPE), EntropyEncoderFactory::newEncoder(obs, ctx, EntropyEncoderFactory::HUFFMAN_TYPE), EntropyEncoderFactory::newEncoder(obs, ctx, EntropyEncoderFactory::RANGE_TYPE), EntropyEncoderFactory::newEncoder(obs, ctx, EntropyEncoderFactory::ANS0_TYPE), EntropyEncoderFactory::newEncoder(obs, ctx, EntropyEncoderFactory::ANS1_TYPE), EntropyEncoderFactory::newEncoder(obs, ctx, EntropyEncoderFactory::FPAQ_TYPE), EntropyEncoderFactory::newEncoder(obs, ctx, EntropyEncoderFactory::CM_TYPE), EntropyEncoderFactory::newEncoder(obs, ctx, EntropyEncoderFactory::TPAQ_TYPE), EntropyEncoderFactory::newEncoder(obs, ctx, EntropyEncoderFactory::TPAQX_TYPE) }; for (size_t i = 0; i < sizeof(encoders) / sizeof(encoders[0]); i++) { ASSERT_TRUE(encoders[i] != nullptr, "Entropy encoder must be created"); encoders[i]->dispose(); delete encoders[i]; } obs.close(); io.rdbuf()->pubseekpos(0); DefaultInputBitStream ibs(io, 16384); EntropyDecoder* decoders[] = { EntropyDecoderFactory::newDecoder(ibs, ctx, EntropyDecoderFactory::NONE_TYPE), EntropyDecoderFactory::newDecoder(ibs, ctx, EntropyDecoderFactory::HUFFMAN_TYPE), EntropyDecoderFactory::newDecoder(ibs, ctx, 
EntropyDecoderFactory::RANGE_TYPE), EntropyDecoderFactory::newDecoder(ibs, ctx, EntropyDecoderFactory::ANS0_TYPE), EntropyDecoderFactory::newDecoder(ibs, ctx, EntropyDecoderFactory::ANS1_TYPE), EntropyDecoderFactory::newDecoder(ibs, ctx, EntropyDecoderFactory::FPAQ_TYPE), EntropyDecoderFactory::newDecoder(ibs, ctx, EntropyDecoderFactory::CM_TYPE), EntropyDecoderFactory::newDecoder(ibs, ctx, EntropyDecoderFactory::TPAQ_TYPE), EntropyDecoderFactory::newDecoder(ibs, ctx, EntropyDecoderFactory::TPAQX_TYPE) }; for (size_t i = 0; i < sizeof(decoders) / sizeof(decoders[0]); i++) { ASSERT_TRUE(decoders[i] != nullptr, "Entropy decoder must be created"); decoders[i]->dispose(); delete decoders[i]; } ibs.close(); return 0; } int main() { if (testTransformFactory() != 0) return 1; if (testEntropyFactories() != 0) return 1; cout << "All factory tests passed." << endl; return 0; } kanzi-cpp-2.5.2/src/test/TestMalformedStream.cpp000066400000000000000000000137431516423635400216310ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/

// NOTE(review): the four system header names below are missing in this copy
// (the text between '<' and '>' appears to have been stripped) - restore them
// from the upstream file.
#include
#include
#include
#include
#include "../Error.hpp"
#include "../bitstream/DefaultOutputBitStream.hpp"
#include "../entropy/EntropyDecoderFactory.hpp"
#include "../io/CompressedInputStream.hpp"
#include "../io/IOException.hpp"
#include "../transform/TransformFactory.hpp"

using namespace std;
using namespace kanzi;

// On failure, prints the message with file and line to cerr and makes the
// ENCLOSING function return 1 (so it can only be used in functions returning int).
#define ASSERT_TRUE(cond, msg) \
    do { \
        if (!(cond)) { \
            cerr << "ASSERT FAILED: " << msg << " (" << __FILE__ \
                 << ":" << __LINE__ << ")" << endl; \
            return 1; \
        } \
    } while (0)


// Computes the checksum value that buildHeader() stores at the end of the header.
// The checksum size is only mixed in for bsVersion >= 6 and the output size only
// when szMask != 0. The result is a fold of the 32 bit hash: the caller truncates
// it to the CRC width when writing it.
// NOTE(review): presumably mirrors the computation in CompressedInputStream - confirm.
static uint32 computeHeaderChecksum(int bsVersion, uint64 checksumSize, short entropyType,
                                    uint64 transformType, int blockSize, int szMask,
                                    uint64 outputSize)
{
    uint32 seed = uint32((bsVersion >= 6 ? 0x01030507 : 1) * bsVersion);
    const uint32 hash = 0x1E35A7BD;
    uint32 checksum = hash * seed;

    if (bsVersion >= 6)
        checksum ^= hash * uint32(~checksumSize);

    checksum ^= hash * uint32(~entropyType);
    checksum ^= hash * uint32((~transformType) >> 32);
    checksum ^= hash * uint32(~transformType);
    checksum ^= hash * uint32(~blockSize);

    if (szMask != 0) {
        checksum ^= hash * uint32((~outputSize) >> 32);
        checksum ^= hash * uint32(~outputSize);
    }

    return (checksum >> 23) ^ (checksum >> 3);
}


// Serializes a stream header into a string.
// Fields are written in this order: type (32 bits), version (4 bits), checksum
// size (2 bits for version >= 6, else 1 bit), entropy type (5 bits), transform
// type (48 bits), block size / 16 (28 bits), size mask (2 bits), output size
// (16 * szMask bits, only when szMask != 0), 15 zero bits (version >= 6 only),
// header checksum (16 bits for version <= 5, else 24 bits).
// When validChecksum is false, the lowest bit of the checksum is flipped so that
// the stored value does not match the header content.
static string buildHeader(int type, int bsVersion, uint64 checksumSize, short entropyType,
                          uint64 transformType, int blockSize, int szMask,
                          uint64 outputSize, bool validChecksum)
{
    stringbuf buffer;
    iostream io(&buffer);
    DefaultOutputBitStream obs(io, 16384);
    const int crcSize = bsVersion <= 5 ? 16 : 24;

    obs.writeBits(uint32(type), 32);
    obs.writeBits(uint32(bsVersion), 4);

    if (bsVersion >= 6)
        obs.writeBits(checksumSize, 2);
    else
        obs.writeBit(checksumSize == 0 ? 0 : 1);

    obs.writeBits(uint64(entropyType), 5);
    obs.writeBits(transformType, 48);
    obs.writeBits(uint64(blockSize >> 4), 28);
    obs.writeBits(uint64(szMask), 2);

    if (szMask != 0)
        obs.writeBits(outputSize, 16 * szMask);

    if (bsVersion >= 6)
        obs.writeBits(uint64(0), 15);

    uint32 checksum = computeHeaderChecksum(bsVersion, checksumSize, entropyType,
                                            transformType, blockSize, szMask, outputSize);

    if (validChecksum == false)
        checksum ^= 1;

    obs.writeBits(checksum, crcSize);

    // Closing the bitstream must happen before the buffer content is read back
    obs.close();
    return buffer.str();
}


// Feeds 'data' to a CompressedInputStream and tries to read one byte.
// Returns 0 only when an IOException is thrown whose error code equals
// expectedError and whose message contains expectedText. Returns 1 when no
// exception is thrown, when another exception type is thrown or when the
// IOException does not match.
static int expectHeaderFailure(const string& name, const string& data,
                               int expectedError, const string& expectedText)
{
    cout << "Test malformed header: " << name << endl;
    istringstream is(data);
    CompressedInputStream cis(is, 1);
    char dst[1];

    try {
        cis.read(dst, 1);
    }
    catch (const IOException& e) {
        ASSERT_TRUE(e.error() == expectedError, "Unexpected IOException error code");
        ASSERT_TRUE(string(e.what()).find(expectedText) != string::npos,
                    "Unexpected IOException message");
        return 0;
    }
    catch (const exception& e) {
        cerr << "Unexpected exception: " << e.what() << endl;
        return 1;
    }

    cerr << "Expected an exception for malformed header: " << name << endl;
    return 1;
}


// Each case starts from the same baseline header (version 6, 1024 byte blocks,
// ANS0 entropy, LZ transform) and alters exactly one field.
// The process exits with 1 at the first case that does not fail as expected.
int main()
{
    const int version = 6;
    const int type = 0x4B414E5A;
    const int blockSize = 1024;
    const short entropy = EntropyDecoderFactory::ANS0_TYPE;
    const uint64 transform = uint64(TransformFactory::LZ_TYPE) << 42;

    if (expectHeaderFailure("invalid type",
            buildHeader(type ^ 1, version, 0, entropy, transform, blockSize, 0, 0, true),
            Error::ERR_INVALID_FILE, "Invalid stream type") != 0) {
        return 1;
    }

    if (expectHeaderFailure("unsupported version",
            buildHeader(type, version + 1, 0, entropy, transform, blockSize, 0, 0, true),
            Error::ERR_STREAM_VERSION, "cannot read this version") != 0) {
        return 1;
    }

    if (expectHeaderFailure("invalid checksum size",
            buildHeader(type, version, 3, entropy, transform, blockSize, 0, 0, true),
            Error::ERR_INVALID_FILE, "incorrect block checksum size") != 0) {
        return 1;
    }

    if (expectHeaderFailure("unknown entropy type",
            buildHeader(type, version, 0, 31, transform, blockSize, 0, 0, true),
            Error::ERR_INVALID_CODEC, "unknown entropy type") != 0) {
        return 1;
    }

    if (expectHeaderFailure("unknown transform type",
            buildHeader(type, version, 0, entropy, uint64(63) << 42, blockSize, 0, 0, true),
            Error::ERR_INVALID_CODEC, "unknown transform type") != 0) {
        return 1;
    }

    if (expectHeaderFailure("invalid block size",
            buildHeader(type, version, 0, entropy, transform, blockSize - 16, 0, 0, true),
            Error::ERR_BLOCK_SIZE, "incorrect block size") != 0) {
        return 1;
    }

    if (expectHeaderFailure("header checksum mismatch",
            buildHeader(type, version, 0, entropy, transform, blockSize, 0, 0, false),
            Error::ERR_CRC_CHECK, "header checksum mismatch") != 0) {
        return 1;
    }

    cout << "All malformed stream tests passed." << endl;
    return 0;
}
kanzi-cpp-2.5.2/src/test/TestTransforms.cpp000066400000000000000000000424271516423635400207060ustar00rootroot00000000000000/*
Copyright 2011-2026 Frederic Langlet
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
you may obtain a copy of the License at

                http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/ #include #include #include #include #include "../types.hpp" #include "../transform/AliasCodec.hpp" #include "../transform/FSDCodec.hpp" #include "../transform/LZCodec.hpp" #include "../transform/NullTransform.hpp" #include "../transform/RLT.hpp" #include "../transform/ROLZCodec.hpp" #include "../transform/SBRT.hpp" #include "../transform/SRT.hpp" #include "../transform/TransformFactory.hpp" #include "../transform/ZRLT.hpp" using namespace std; using namespace kanzi; static Transform* getByteTransform(string name, Context& ctx) { if (name.compare("SRT") == 0) return new SRT(ctx); if (name.compare("RLT") == 0) return new RLT(ctx); if (name.compare("ZRLT") == 0) return new ZRLT(ctx); if (name.compare("LZ") == 0) return new LZCodec(ctx); if (name.compare("LZX") == 0){ ctx.putInt("lz", TransformFactory::LZX_TYPE); return new LZCodec(ctx); } if (name.compare("LZP") == 0){ ctx.putInt("lz", TransformFactory::LZP_TYPE); return new LZCodec(ctx); } if (name.compare("ROLZ") == 0) return new ROLZCodec(ctx); if (name.compare("ROLZX") == 0) return new ROLZCodec(ctx); if (name.compare("RANK") == 0) return new SBRT(SBRT::MODE_RANK, ctx); if (name.compare("MTFT") == 0) return new SBRT(SBRT::MODE_MTF, ctx); if (name.compare("MM") == 0) return new FSDCodec(ctx); if (name.compare("NONE") == 0) return new NullTransform(ctx); if (name.compare("ALIAS") == 0) return new AliasCodec(ctx); cout << "No such byte transform: " << name << endl; return nullptr; } int testTransformsCorrectness(const string& name) { srand((uint)time(nullptr)); cout << endl << "Correctness for " << name << endl; int mod = (name == "ZRLT") ? 
5 : 256; int res = 0; for (int ii = 0; ii < 51; ii++) { cout << endl << "Test " << ii << endl; int size = 80000; // Declare size, will be updated in conditions kanzi::byte values[1024 * 1024] = { kanzi::byte(0xAA) }; if (name == "ALIAS") mod = 15 + 12 * ii; if (ii == 0) { size = 32; kanzi::byte arr[32] = { (kanzi::byte)0, (kanzi::byte)1, (kanzi::byte)2, (kanzi::byte)2, (kanzi::byte)2, (kanzi::byte)2, (kanzi::byte)7, (kanzi::byte)9, (kanzi::byte)9, (kanzi::byte)16, (kanzi::byte)16, (kanzi::byte)16, (kanzi::byte)1, (kanzi::byte)3, (kanzi::byte)3, (kanzi::byte)3, (kanzi::byte)3, (kanzi::byte)3, (kanzi::byte)3, (kanzi::byte)3, (kanzi::byte)3, (kanzi::byte)3, (kanzi::byte)3, (kanzi::byte)3, (kanzi::byte)3, (kanzi::byte)3, (kanzi::byte)3, (kanzi::byte)3, (kanzi::byte)3, (kanzi::byte)3, (kanzi::byte)3, (kanzi::byte)3 }; memcpy(values, &arr[0], size); } else if (ii < 10) { size = ii; memset(values, ii, size); } else if (ii == 10) { size = 255; memset(values, ii, size); values[127] = kanzi::byte(255); } else if (ii == 11) { size = 80000; kanzi::byte arr[80000]; arr[0] = kanzi::byte(1); for (int i = 1; i < 80000; i++) arr[i] = kanzi::byte(8); memcpy(values, &arr[0], size); } else if (ii == 12) { size = 8; kanzi::byte arr[8] = { (kanzi::byte)0, (kanzi::byte)0, (kanzi::byte)1, (kanzi::byte)1, (kanzi::byte)2, (kanzi::byte)2, (kanzi::byte)3, (kanzi::byte)3 }; memcpy(values, &arr[0], size); } else if (ii == 13) { // For RLT size = 512; kanzi::byte arr[512]; for (int i = 0; i < 256; i++) { arr[2 * i] = kanzi::byte(i); arr[2 * i + 1] = kanzi::byte(i); } arr[1] = kanzi::byte(255); // force RLT escape to be first symbol memcpy(values, &arr[0], size); } else if (ii == 14) { // Lots of zeros size = 1024; kanzi::byte arr[1024] = { kanzi::byte(0) }; for (int i = 0; i < size; i++) { int val = rand() % 100; if (val >= 33) val = 0; arr[i] = kanzi::byte(val); } memcpy(values, &arr[0], size); } else if (ii == 15) { // Lots of zeros size = 2048; kanzi::byte arr[2048] = { kanzi::byte(0) }; for 
(int i = 0; i < size; i++) { int val = rand() % 100; if (val >= 33) val = 0; arr[i] = kanzi::byte(val); } memcpy(values, &arr[0], size); } else if (ii == 16) { // Totally random size = 512; kanzi::byte arr[512] = { kanzi::byte(0) }; // Leave zeros at the beginning for ZRLT to succeed for (int j = 20; j < 512; j++) arr[j] = kanzi::byte(rand() % mod); memcpy(values, &arr[0], size); } else if (ii < 25) { size = 2048; kanzi::byte arr[2048] = { kanzi::byte(0) }; const int step = max(ii - 5, 2); arr[60] = kanzi::byte(rand() % mod); arr[61] = kanzi::byte(rand() % mod); arr[62] = kanzi::byte(rand() % mod); arr[63] = kanzi::byte(rand() % mod); // Simulate interleaved channels for MM for (int j = 64; j + step < size; j += step) { for (int k = 0; k < step; k++) arr[j + k] = arr[j + k - step]; } memcpy(values, &arr[0], size); } else if (ii == 50) { cout << "Large random data" << endl; size = 1024 * 1024; kanzi::byte* arr = new kanzi::byte[size]; for (int i = 0; i < size; i++) arr[i] = kanzi::byte(rand() % 256); memcpy(values, arr, size); delete[] arr; } else { size = 1024; kanzi::byte arr[1024] = { kanzi::byte(0) }; // Leave zeros at the beginning for ZRLT to succeed int idx = 20; while (idx < 1024) { int len = rand() % 120; // above LZP min match threshold if (len % 3 == 0) len = 1; kanzi::byte val = kanzi::byte(rand() % mod); int end = (idx + len) < size ? 
idx + len : size; for (int j = idx; j < end; j++) arr[j] = val; idx += len; } memcpy(values, &arr[0], size); } Context ctx; ctx.putInt("bsVersion", 6); ctx.putString("transform", name); Transform* ff = getByteTransform(name, ctx); if (ff == nullptr) return 1; Transform* fi = getByteTransform(name, ctx); if (fi == nullptr) { delete ff; return 1; } const int dstSize = ff->getMaxEncodedLength(size); kanzi::byte* input = new kanzi::byte[size]; kanzi::byte* output = new kanzi::byte[dstSize]; kanzi::byte* reverse = new kanzi::byte[size]; SliceArray iba1(input, size, 0); SliceArray iba2(output, dstSize, 0); SliceArray iba3(reverse, size, 0); memset(output, 0xAA, dstSize); memset(reverse, 0xAA, size); int count; for (int i = 0; i < size; i++) input[i] = values[i]; cout << endl << "Original: " << endl; if (ii == 11) { cout << "1 8 (" << (size - 1) << " times)"; } else { if (size > 1024) { cout << "Large data block - not printing all values."; } else { for (int i = 0; i < size; i++) cout << (int(input[i]) & 0xFF) << " "; } } if (ff->forward(iba1, iba2, size) == false) { if ((iba1._index != size) || (iba2._index >= iba1._index)) { cout << endl << "No compression (ratio > 1.0), skip reverse" << endl; delete ff; delete fi; delete[] input; delete[] output; delete[] reverse; continue; } cout << endl << "Encoding error" << endl; res = 1; ff = nullptr; goto End; } if (name != "MM") { // MM can expand if ((iba1._index != size) || (iba1._index < iba2._index)) { cout << endl << "No compression (ratio > 1.0), skip reverse" << endl; delete ff; delete fi; delete[] input; delete[] output; delete[] reverse; continue; } } cout << endl; cout << "Coded: " << endl; if (iba2._index > 1024) { cout << "Large data block - not printing all values."; } else { for (int i = 0; i < iba2._index; i++) cout << (int(output[i]) & 0xFF) << " "; } cout << " (Compression ratio: " << (iba2._index * 100 / size) << "%)" << endl; count = iba2._index; iba1._index = 0; iba2._index = 0; iba3._index = 0; if 
(fi->inverse(iba2, iba3, count) == false) { cout << "Decoding error" << endl; res = 1; goto End; } cout << "Decoded: " << endl; if (ii == 11) { cout << "1 8 (" << (size - 1) << " times)"; } else { if (size > 1024) { cout << "Large data block - not printing all values."; } else { for (int i = 0; i < size; i++) cout << (int(reverse[i]) & 0xFF) << " "; } } cout << endl; for (int i = 0; i < size; i++) { if (input[i] != reverse[i]) { cout << "Different (index " << i << ": "; cout << (int(input[i]) & 0xFF) << " - " << (int(reverse[i]) & 0xFF); cout << ")" << endl; res = 1; goto End; } } cout << endl << "Identical" << endl << endl; End: if (ff != nullptr) delete ff; if (fi != nullptr) delete fi; delete[] input; delete[] output; delete[] reverse; } return res; } int testTransformsSpeed(const string& name) { // Test speed srand((uint)time(nullptr)); int iter = 50000; if ((name == "ROLZ") || (name == "SRT") || (name == "RANK") || (name == "MTFT")) iter = 4000; int size = 30000; int res = 0; cout << endl << endl << "Speed test for " << name << endl; cout << "Iterations: " << iter << endl; cout << endl; kanzi::byte input[50000] = { kanzi::byte(0) }; kanzi::byte output[50000] = { kanzi::byte(0) }; kanzi::byte reverse[50000] = { kanzi::byte(0) }; Context ctx; Transform* f = getByteTransform(name, ctx); if (f == nullptr) return 1; SliceArray iba1(input, size, 0); SliceArray iba2(output, f->getMaxEncodedLength(size), 0); SliceArray iba3(reverse, size, 0); int mod = (name == "ZRLT") ? 
5 : 256; delete f; for (int jj = 0; jj < 3; jj++) { // Generate random data with runs // Leave zeros at the beginning for ZRLT to succeed int n = iter / 20; if (name == "ALIAS") mod = 5 + 80 * jj; while (n < size) { kanzi::byte val = kanzi::byte(rand() % mod); input[n++] = val; int run = rand() % 256; run -= 220; while ((--run > 0) && (n < size)) input[n++] = val; } clock_t before, after; double delta1 = 0; double delta2 = 0; for (int ii = 0; ii < iter; ii++) { Transform* ff = getByteTransform(name, ctx); iba1._index = 0; iba2._index = 0; before = clock(); if (ff->forward(iba1, iba2, size) == false) { if ((iba1._index != size) || (iba2._index >= iba1._index)) { cout << endl << "No compression (ratio > 1.0), skip reverse" << endl; continue; } cout << "Encoding error" << endl; delete ff; continue; } after = clock(); delta1 += (after - before); delete ff; } int count = iba2._index; for (int ii = 0; ii < iter; ii++) { Transform* fi = getByteTransform(name, ctx); iba3._index = 0; iba2._index = 0; before = clock(); if (fi->inverse(iba2, iba3, count) == false) { cout << "Decoding error" << endl; delete fi; return 1; } after = clock(); delta2 += (after - before); delete fi; } int idx = -1; // Sanity check for (int i = 0; i < iba1._index; i++) { if (iba1._array[i] != iba3._array[i]) { idx = i; break; } } if (idx >= 0) { cout << "Failure at index " << idx << " (" << (int)iba1._array[idx]; cout << "<->" << (int)iba3._array[idx] << ")" << endl; res = 1; } // MB = 1000 * 1000, MiB = 1024 * 1024 double prod = double(iter) * double(size); double b2MiB = double(1) / double(1024 * 1024); double d1_sec = delta1 / CLOCKS_PER_SEC; double d2_sec = delta2 / CLOCKS_PER_SEC; cout << name << " encoding [ms]: " << (int)(d1_sec * 1000) << endl; cout << "Throughput [MiB/s]: " << (int)(prod * b2MiB / d1_sec) << endl; cout << name << " decoding [ms]: " << (int)(d2_sec * 1000) << endl; cout << "Throughput [MiB/s]: " << (int)(prod * b2MiB / d2_sec) << endl; } return res; } #ifdef __GNUG__ int 
main(int argc, const char* argv[]) #else int TestTransforms_main(int argc, const char* argv[]) #endif { int res = 0; try { vector codecs; bool doPerf = true; if (argc == 1) { #if __cplusplus < 201103L string allCodecs[13] = { "LZ", "LZX", "LZP", "ROLZ", "ROLZX", "RLT", "ZRLT", "RANK", "SRT", "NONE", "ALIAS", "MM", "MTFT" }; for (int i = 0; i < 13; i++) codecs.push_back(allCodecs[i]); #else codecs = { "LZ", "LZX", "LZP", "ROLZ", "ROLZX", "RLT", "ZRLT", "RANK", "SRT", "NONE", "ALIAS", "MM", "MTFT" }; #endif } else { string str = argv[1]; transform(str.begin(), str.end(), str.begin(), ::toupper); if (str != "-TYPE=ALL") { codecs.push_back(str.substr(6)); if (str.compare(0, 6, "-TYPE=") != 0) { cout << "Missing transform type" << endl; return 1; } } else { #if __cplusplus < 201103L string allCodecs[13] = { "LZ", "LZX", "LZP", "ROLZ", "ROLZX", "RLT", "ZRLT", "RANK", "SRT", "NONE", "ALIAS", "MM", "MTFT" }; for (int i = 0; i < 13; i++) codecs.push_back(allCodecs[i]); #else codecs = { "LZ", "LZX", "LZP", "ROLZ", "ROLZX", "RLT", "ZRLT", "RANK", "SRT", "NONE", "ALIAS", "MM", "MTFT" }; #endif } if (argc > 2) { str = argv[2]; transform(str.begin(), str.end(), str.begin(), ::toupper); doPerf = str != "-NOPERF"; } } for (vector::iterator it = codecs.begin(); it != codecs.end(); ++it) { cout << endl << endl << "Test" << *it << endl; res = testTransformsCorrectness(*it); if (res) break; if ((doPerf == true) && (*it != "LZP") && (*it != "MM")) { // skip codecs with no good data res = testTransformsSpeed(*it); if (res) break; } } } catch (const exception& e) { cout << e.what() << endl; res = 123; } cout << endl; cout << ((res == 0) ? 
"Success" : "Failure") << endl; return res; } kanzi-cpp-2.5.2/src/test/test_api.py000066400000000000000000000132151516423635400173570ustar00rootroot00000000000000import os import tempfile from kanzi import Compressor, Decompressor, KanziError # ----------------------------------------------------------------------------- # Utilities # ----------------------------------------------------------------------------- def make_params(): return dict( transform=b"LZX", entropy=b"HUFFMAN", block_size=1024, jobs=1, checksum=0, headerless=0, ) def fill_buffer(size): return bytes((i * 17 + 3) & 0xFF for i in range(size)) def write_file(path, data: bytes): with open(path, "wb") as f: f.write(data) # ----------------------------------------------------------------------------- # Tests # ----------------------------------------------------------------------------- def test_init_invalid(): print("TEST: initCompressor invalid params...") # Our Python wrapper validates parameters eagerly, # so most invalid cases are raised as Python exceptions. 
try: Compressor( dst_path="/dev/null", transform=None, # invalid ) assert False, "init should fail on invalid transform" except Exception: pass def test_init_dispose(): print("TEST: init + dispose...") with tempfile.NamedTemporaryFile(delete=True) as tmp: with Compressor(tmp.name, **make_params()) as c: assert c is not None def test_compress_small(): print("TEST: small compression...") data = fill_buffer(256) with tempfile.NamedTemporaryFile(delete=True) as tmp: with Compressor(tmp.name, **make_params()) as c: written = c.compress(data) assert written >= 0 def test_compress_too_big(): print("TEST: oversized block handling...") params = make_params() params["block_size"] = 1024 big = fill_buffer(4096) with tempfile.NamedTemporaryFile(delete=True) as tmp: with Compressor(tmp.name, **params) as c: try: c.compress(big) assert False, "compress should fail on oversized input" except KanziError: pass def test_compress_two_blocks(): print("TEST: two-block compression...") params = make_params() params["block_size"] = 1024 a = fill_buffer(300) b = fill_buffer(500) with tempfile.NamedTemporaryFile(delete=True) as tmp: with Compressor(tmp.name, **params) as c: out1 = c.compress(a) out2 = c.compress(b) assert out1 >= 0 assert out2 >= 0 def test_basic_decompression(): print("TEST: basic decompression...") input_data = b"Hello Kanzi! Hello Compression!" 
with tempfile.NamedTemporaryFile(delete=False) as comp: comp_name = comp.name # Compress with Compressor( comp_name, transform=b"LZ", entropy=b"ANS0", block_size=1 << 16, jobs=1, checksum=32, headerless=0, ) as c: c.compress(input_data) # Decompress with Decompressor( comp_name, buffer_size=1 << 16, jobs=1, headerless=0, ) as d: out = d.decompress_block(1024) assert out == input_data, "decompressed data mismatch" os.remove(comp_name) def test_large_multi_block(): print("TEST: large multi blocks") size = 2 * 1024 * 1024 data = bytes((i * 7) & 0xFF for i in range(size)) with tempfile.NamedTemporaryFile(delete=False) as comp: comp_name = comp.name # Compress with Compressor( comp_name, transform=b"LZ", entropy=b"FPAQ", block_size=256 * 1024, jobs=1, checksum=64, headerless=0, ) as c: offset = 0 while offset < size: chunk = min(256 * 1024, size - offset) c.compress(data[offset:offset + chunk]) offset += chunk # Decompress out = bytearray() with Decompressor( comp_name, buffer_size=256 * 1024, jobs=1, headerless=0, ) as d: while True: try: block = d.decompress_block(256 * 1024) if not block: break out.extend(block) except KanziError: break # expected EOF assert bytes(out) == data, "large decompression mismatch" os.remove(comp_name) def test_headerless(): print("TEST: headerless") input_data = b"HEADERLESS MODE IS ACTIVE" with tempfile.NamedTemporaryFile(delete=False) as comp: comp_name = comp.name # Compress (headerless) with Compressor( comp_name, transform=b"LZ", entropy=b"ANS0", block_size=1 << 15, jobs=1, checksum=0, headerless=1, ) as c: c.compress(input_data) # Decompress (headerless) with Decompressor( comp_name, buffer_size=1 << 15, jobs=1, headerless=1, transform=b"LZ", entropy=b"ANS0", blockSize=1 << 15, originalSize=len(input_data), checksum=0, bsVersion=1, ) as d: out = d.decompress_block(256) assert out == input_data, "headerless decompression mismatch" os.remove(comp_name) # ----------------------------------------------------------------------------- # 
Main # ----------------------------------------------------------------------------- def main(): test_init_invalid() test_init_dispose() test_compress_small() test_compress_too_big() test_compress_two_blocks() test_basic_decompression() test_large_multi_block() test_headerless() print("All Python API tests passed.") if __name__ == "__main__": main() kanzi-cpp-2.5.2/src/transform/000077500000000000000000000000001516423635400162275ustar00rootroot00000000000000kanzi-cpp-2.5.2/src/transform/AliasCodec.cpp000066400000000000000000000260641516423635400207320ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #include #include #include #include "AliasCodec.hpp" #include "../Global.hpp" #include "../Memory.hpp" using namespace kanzi; using namespace std; const int AliasCodec::MIN_BLOCK_SIZE = 1024; AliasCodec::AliasCodec(Context& ctx) : _pCtx(&ctx) { _onlyDNA = _pCtx->getInt("packOnlyDNA", 0) != 0; } bool AliasCodec::forward(SliceArray& input, SliceArray& output, int count) { if (count == 0) return true; if (count < MIN_BLOCK_SIZE) return false; if (!SliceArray::isValid(input)) throw invalid_argument("Alias codec: Invalid input block"); if (!SliceArray::isValid(output)) throw invalid_argument("Alias codec: Invalid output block"); if (output._length - output._index < getMaxEncodedLength(count)) return false; Global::DataType dt = Global::UNDEFINED; if (_pCtx != nullptr) { dt = (Global::DataType) _pCtx->getInt("dataType", Global::UNDEFINED); if ((dt == Global::MULTIMEDIA) || (dt == Global::UTF8)) return false; if ((dt == Global::EXE) || (dt == Global::BIN)) return false; if ((_onlyDNA == true) && (dt != Global::UNDEFINED) && (dt != Global::DNA)) return false; } kanzi::byte* dst = &output._array[output._index]; const kanzi::byte* src = &input._array[input._index]; // Find missing 1-kanzi::byte symbols uint freqs0[256] = { 0 }; Global::computeHistogram(&src[0], count, freqs0, true); int n0 = 0; int absent[256] = { 0 }; for (int i = 0; i < 256; i++) { if (freqs0[i] == 0) absent[n0++] = i; } if (n0 < 16) return false; if (dt == Global::UNDEFINED) { dt = Global::detectSimpleType(count, freqs0); if ((_pCtx != nullptr) && (dt != Global::UNDEFINED)) _pCtx->putInt("dataType", dt); if ((dt != Global::DNA) && (_onlyDNA == true)) return false; } int srcIdx, dstIdx; if (n0 >= 240) { // Small alphabet => pack bits dst[0] = kanzi::byte(n0); dstIdx = 1; if (n0 == 255) { // One symbol dst[1] = src[0]; LittleEndian::writeInt32(&dst[2], count); dstIdx = 6; srcIdx = count; } else { srcIdx = 0; kanzi::byte map8[256] = { kanzi::byte(0) }; for (int i = 0, j = 0; i < 256; i++) { if (freqs0[i] 
!= 0) { dst[dstIdx++] = kanzi::byte(i); map8[i] = kanzi::byte(j++); } } if (n0 >= 252) { // 4 symbols or less const int c3 = count & 3; dst[dstIdx++] = kanzi::byte(c3); memcpy(&dst[dstIdx], &src[srcIdx], size_t(c3)); srcIdx += c3; dstIdx += c3; while (srcIdx < count) { dst[dstIdx++] = (map8[int(src[srcIdx + 0])] << 6) | (map8[int(src[srcIdx + 1])] << 4) | (map8[int(src[srcIdx + 2])] << 2) | map8[int(src[srcIdx + 3])]; srcIdx += 4; } } else { // 16 symbols or less dst[dstIdx++] = kanzi::byte(count & 1); if ((count & 1) != 0) dst[dstIdx++] = src[srcIdx++]; while (srcIdx < count) { dst[dstIdx++] = (map8[int(src[srcIdx])] << 4) | map8[int(src[srcIdx + 1])]; srcIdx += 2; } } } } else { // Digram encoding vector v; { // Find missing 2-kanzi::byte symbols uint* freqs1 = new uint[65536]; memset(freqs1, 0, 65536 * sizeof(uint)); Global::computeHistogram(&src[0], count, freqs1, false); int n1 = 0; for (uint32 i = 0; i < 65536; i++) { if (freqs1[i] == 0) continue; #if __cplusplus >= 201103L v.emplace_back(i, freqs1[i]); #else sdAlias a(i, freqs1[i]); v.push_back(a); #endif n1++; } delete[] freqs1; if (n1 < n0) { // Fewer distinct 2-kanzi::byte symbols than 1-kanzi::byte symbols n0 = n1; if (n0 < 16) return false; } // Sort by decreasing order 1 frequencies sort(v.begin(), v.end()); } int16 map16[65536]; // Build map symbol -> alias for (int i = 0; i < 65536; i++) map16[i] = 0x100 | int16(i >> 8); int savings = 0; dst[0] = kanzi::byte(n0); dst[1] = kanzi::byte(0); srcIdx = 0; dstIdx = 2; // Header: emit map data for (int i = 0; i < n0; i++) { const sdAlias& sd = v[i]; savings += sd.freq; // ignore factor 2 const int idx = sd.val; map16[idx] = int16(absent[i]) | 0x200; dst[dstIdx] = kanzi::byte(idx >> 8); dst[dstIdx + 1] = kanzi::byte(idx); dst[dstIdx + 2] = kanzi::byte(absent[i]); dstIdx += 3; } // Worth it? 
if (savings < count / 20) return false; v.clear(); const int srcEnd = count - 1; // Emit aliased data while (srcIdx < srcEnd) { const int16 alias = map16[(int(src[srcIdx]) << 8) | int(src[srcIdx + 1])]; dst[dstIdx++] = kanzi::byte(alias); srcIdx += (alias >> 8); } if (srcIdx != count) { dst[1] = kanzi::byte(1); dst[dstIdx++] = src[srcIdx++]; } } input._index += srcIdx; output._index += dstIdx; return dstIdx < count; } bool AliasCodec::inverse(SliceArray& input, SliceArray& output, int count) { if (count == 0) return true; if (!SliceArray::isValid(input)) throw invalid_argument("Alias codec: Invalid input block"); if (!SliceArray::isValid(output)) throw invalid_argument("Alias codec: Invalid output block"); if (input._index + count > input._length) return false; kanzi::byte* dst = &output._array[output._index]; const kanzi::byte* src = &input._array[input._index]; const int dstEnd = output._length - output._index; int n = int(src[0]); if (n < 16) return false; int srcIdx; int dstIdx = 0; if (n >= 240) { n = 256 - n; srcIdx = 1; if (n == 1) { // One symbol if (count < 6) return false; const kanzi::byte val = src[1]; const int oSize = LittleEndian::readInt32(&src[2]); if ((oSize < 0) || (oSize > dstEnd)) return false; memset(&dst[0], int(val), size_t(oSize)); srcIdx = count; dstIdx = oSize; } else { // Rebuild map alias -> symbol kanzi::byte idx2symb[16] = { kanzi::byte(0) }; if (srcIdx + n + 1 > count) return false; for (int i = 0; i < n; i++) idx2symb[i] = src[srcIdx++]; const int adjust = int(src[srcIdx++]); if ((adjust < 0) || (adjust >= 4)) return false; if (n <= 4) { // 4 symbols or less int32 decodeMap[256] = { 0 }; for (int i = 0; i < 256; i++) { int32 val; val = int32(idx2symb[(i >> 0) & 0x03]); val <<= 8; val |= int32(idx2symb[(i >> 2) & 0x03]); val <<= 8; val |= int32(idx2symb[(i >> 4) & 0x03]); val <<= 8; val |= int32(idx2symb[(i >> 6) & 0x03]); decodeMap[i] = val; } if ((srcIdx + adjust > count) || (dstIdx + adjust > dstEnd)) return false; 
memcpy(&dst[dstIdx], &src[srcIdx], size_t(adjust)); srcIdx += adjust; dstIdx += adjust; if ((count - srcIdx) > ((dstEnd - dstIdx) >> 2)) return false; while (srcIdx < count) { LittleEndian::writeInt32(&dst[dstIdx], decodeMap[int(src[srcIdx++])]); dstIdx += 4; } } else { // 16 symbols or less int16 decodeMap[256] = { 0 }; for (int i = 0; i < 256; i++) { int16 val = int16(idx2symb[i& 0x0F]); val <<= 8; val |= int16(idx2symb[i >> 4]); decodeMap[i] = val; } if (adjust != 0) { if ((srcIdx >= count) || (dstIdx >= dstEnd)) return false; dst[dstIdx++] = src[srcIdx++]; } if ((count - srcIdx) > ((dstEnd - dstIdx) >> 1)) return false; while (srcIdx < count) { LittleEndian::writeInt16(&dst[dstIdx], decodeMap[int(src[srcIdx++])]); dstIdx += 2; } } } } else { if (count < 2) return false; // Rebuild map alias -> symbol int map16[256] = { 0 }; const int adjust = int(src[1]); if ((adjust < 0) || (adjust > 1)) return false; const int srcEnd = count - adjust; srcIdx = 2; for (int i = 0; i < 256; i++) map16[i] = 0x10000 | i; if (srcIdx + 3 * n > srcEnd) return false; for (int i = 0; i < n; i++) { map16[int(src[srcIdx + 2])] = 0x20000 | int(src[srcIdx]) | (int(src[srcIdx + 1]) << 8); srcIdx += 3; } while (srcIdx < srcEnd) { const int val = map16[int(src[srcIdx++])]; dst[dstIdx] = kanzi::byte(val); dst[dstIdx + 1] = kanzi::byte(val >> 8); dstIdx += (val >> 16); } if (adjust != 0) { if ((srcIdx >= count) || (dstIdx >= dstEnd)) return false; dst[dstIdx++] = src[srcIdx++]; } } input._index += srcIdx; output._index += dstIdx; return srcIdx == count; } kanzi-cpp-2.5.2/src/transform/AliasCodec.hpp000066400000000000000000000033201516423635400207250ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #pragma once #ifndef knz_AliasCodec #define knz_AliasCodec #include "../Context.hpp" #include "../Transform.hpp" namespace kanzi { typedef struct ssAlias { uint32 val; uint32 freq; ssAlias(uint32 v, uint32 f) : val(v), freq(f) { } friend bool operator< (ssAlias const& lhs, ssAlias const& rhs) { int r; return ((r = int(lhs.freq - rhs.freq)) != 0) ? r > 0 : lhs.val > rhs.val; } } sdAlias; // Simple codec replacing large symbols with small aliases whenever possible class AliasCodec FINAL : public Transform { public: AliasCodec() { _pCtx = nullptr; _onlyDNA = false; } AliasCodec(Context& ctx); ~AliasCodec() {} bool forward(SliceArray& src, SliceArray& dst, int length); bool inverse(SliceArray& src, SliceArray& dst, int length); // Required encoding output buffer size int getMaxEncodedLength(int srcLen) const { return srcLen + 1024; } private: static const int MIN_BLOCK_SIZE; Context* _pCtx; bool _onlyDNA; }; } #endif kanzi-cpp-2.5.2/src/transform/BWT.cpp000066400000000000000000000431611516423635400173740ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #include #include #include #include "BWT.hpp" #include "../Global.hpp" #include "../Memory.hpp" #ifdef CONCURRENCY_ENABLED #include #endif using namespace kanzi; using namespace std; const int BWT::MAX_BLOCK_SIZE = 1024 * 1024 * 1024; // 1024 MB const int BWT::NB_FASTBITS = 17; const int BWT::MASK_FASTBITS = (1 << NB_FASTBITS) - 1; const int BWT::BLOCK_SIZE_THRESHOLD1 = 256; const int BWT::BLOCK_SIZE_THRESHOLD2 = 2 * 1024 * 1024; BWT::BWT(int jobs) { _buffer = nullptr; _sa = nullptr; _bufferSize = 0; _saSize = 0; #ifdef CONCURRENCY_ENABLED _pool = nullptr; if (jobs < 1) throw invalid_argument("The number of jobs must be at least 1"); #else if (jobs != 1) throw invalid_argument("The number of jobs is limited to 1 in this version"); #endif _jobs = jobs; memset(_primaryIndexes, 0, sizeof(int) * 8); } BWT::BWT(Context& ctx) { _buffer = nullptr; _sa = nullptr; _bufferSize = 0; _saSize = 0; int jobs = ctx.getInt("jobs", 1); #ifdef CONCURRENCY_ENABLED _pool = ctx.getPool(); // can be null if (jobs < 1) throw invalid_argument("The number of jobs must be at least 1"); #else if (jobs != 1) throw invalid_argument("The number of jobs is limited to 1 in this version"); #endif _jobs = jobs; memset(_primaryIndexes, 0, sizeof(int) * 8); } bool BWT::setPrimaryIndex(int n, int primaryIndex) { if ((primaryIndex < 0) || (n < 0) || (n >= 8)) return false; _primaryIndexes[n] = primaryIndex; return true; } bool BWT::forward(SliceArray& input, SliceArray& output, int count) { if (count == 0) return true; if (!SliceArray::isValid(input)) throw invalid_argument("BWT: Invalid input block"); if (!SliceArray::isValid(output)) throw invalid_argument("BWT: Invalid output block"); if (count > MAX_BLOCK_SIZE) return false; if (count == 1) { output._array[output._index++] = input._array[input._index++]; return true; } const kanzi::byte* src = &input._array[input._index]; kanzi::byte* dst = &output._array[output._index]; // Lazy dynamic memory allocation if (_saSize < count) { if (_sa != nullptr) 
delete[] _sa; _saSize = count; _sa = new int[_saSize]; } if (_saAlgo.computeBWT(src, dst, _sa, count, _primaryIndexes, getBWTChunks(count)) == false) return false; input._index += count; output._index += count; return true; } bool BWT::inverse(SliceArray& input, SliceArray& output, int count) { if (count == 0) return true; if (!SliceArray::isValid(input)) throw invalid_argument("BWT: Invalid input block"); if (!SliceArray::isValid(output)) throw invalid_argument("BWT: Invalid output block"); if (count == 1) { output._array[output._index++] = input._array[input._index++]; return true; } // Find the fastest way to implement inverse based on block size if (count <= BLOCK_SIZE_THRESHOLD2) return inverseMergeTPSI(input, output, count); return inverseBiPSIv2(input, output, count); } // When count <= BLOCK_SIZE_THRESHOLD2, mergeTPSI algo bool BWT::inverseMergeTPSI(SliceArray& input, SliceArray& output, int count) { if (count == 0) return true; const int pIdx = getPrimaryIndex(0); if ((pIdx <= 0) || (pIdx > count)) return false; // Lazy dynamic memory allocation if (_bufferSize < count) { if (_buffer != nullptr) delete[] _buffer; _bufferSize = max(count, 256); _buffer = new uint[_bufferSize]; } // Build array of packed index + value (assumes block size < 1<<24) uint buckets[256] = { 0 }; Global::computeHistogram(&input._array[input._index], count, buckets); for (int i = 0, sum = 0; i < 256; i++) { const int tmp = buckets[i]; buckets[i] = sum; sum += tmp; } const kanzi::byte* src = &input._array[input._index]; kanzi::byte* dst = &output._array[output._index]; memset(&_buffer[0], 0, size_t(_bufferSize) * sizeof(uint)); const uint end1 = uint(pIdx); const uint end2 = uint(count); _buffer[buckets[uint8(src[0])]] = uint(src[0]); buckets[uint8(src[0])]++; for (uint i = 1; i < end1; i++) { const uint8 val = uint8(src[i]); _buffer[buckets[val]] = ((i - 1) << 8) | val; buckets[val]++; } for (uint i = end1; i < end2; i++) { const uint8 val = uint8(src[i]); _buffer[buckets[val]] = (i 
<< 8) | val; buckets[val]++; } if (getBWTChunks(count) != 8) { int t = pIdx - 1; int n = 0; while (n < count) { const int ptr = _buffer[t]; dst[n++] = kanzi::byte(ptr); t = ptr >> 8; } } else { const int ckSize = ((count & 7) == 0) ? count >> 3 : (count >> 3) + 1; int t0 = getPrimaryIndex(0) - 1; if ((t0 < 0) || (t0 >= _bufferSize)) return false; int t1 = getPrimaryIndex(1) - 1; if ((t1 < 0) || (t1 >= _bufferSize)) return false; int t2 = getPrimaryIndex(2) - 1; if ((t2 < 0) || (t2 >= _bufferSize)) return false; int t3 = getPrimaryIndex(3) - 1; if ((t3 < 0) || (t3 >= _bufferSize)) return false; int t4 = getPrimaryIndex(4) - 1; if ((t4 < 0) || (t4 >= _bufferSize)) return false; int t5 = getPrimaryIndex(5) - 1; if ((t5 < 0) || (t5 >= _bufferSize)) return false; int t6 = getPrimaryIndex(6) - 1; if ((t6 < 0) || (t6 >= _bufferSize)) return false; int t7 = getPrimaryIndex(7) - 1; if ((t7 < 0) || (t7 >= _bufferSize)) return false; // Last interval [7*chunk:count] smaller when 8*ckSize != count const int end = count - ckSize * 7; kanzi::byte* d0 = &dst[end + ckSize * 0]; kanzi::byte* d1 = &dst[end + ckSize * 1]; kanzi::byte* d2 = &dst[end + ckSize * 2]; kanzi::byte* d3 = &dst[end + ckSize * 3]; kanzi::byte* d4 = &dst[end + ckSize * 4]; kanzi::byte* d5 = &dst[end + ckSize * 5]; kanzi::byte* d6 = &dst[end + ckSize * 6]; kanzi::byte* d7 = &dst[end + ckSize * 7]; int n = -end; int ptr; #define S(t, d) ptr = _buffer[t]; \ d[n] = kanzi::byte(ptr); \ t = ptr >> 8 while (n < 0) { S(t0, d0); S(t1, d1); S(t2, d2); S(t3, d3); S(t4, d4); S(t5, d5); S(t6, d6); S(t7, d7); n++; } while (n < ckSize - end) { S(t0, d0); S(t1, d1); S(t2, d2); S(t3, d3); S(t4, d4); S(t5, d5); S(t6, d6); n++; } } input._index += count; output._index += count; return true; } // When count > BLOCK_SIZE_THRESHOLD2, biPSIv2 algo bool BWT::inverseBiPSIv2(SliceArray& input, SliceArray& output, int count) { // Lazy dynamic memory allocations if (_bufferSize < count + 1) { if (_buffer != nullptr) delete[] _buffer; 
_bufferSize = max(count + 1, 256); _buffer = new uint[_bufferSize]; } const kanzi::byte* src = &input._array[input._index]; kanzi::byte* dst = &output._array[output._index]; const int pIdx = getPrimaryIndex(0); if ((pIdx < 0) || (pIdx > count)) return false; uint* buckets = new uint[65536]; memset(&buckets[0], 0, 65536 * sizeof(uint)); uint freqs[256] = { 0 }; Global::computeHistogram(&input._array[input._index], count, freqs); for (int sum = 1, c = 0; c < 256; c++) { const int f = sum; sum += int(freqs[c]); freqs[c] = f; if (f != sum) { uint* ptr = &buckets[c << 8]; const int hi = min(sum, pIdx); for (int i = f; i < hi; i++) ptr[int(src[i])]++; const int lo = max(f - 1, pIdx); for (int i = lo; i < sum - 1; i++) ptr[int(src[i])]++; } } const int lastc = int(src[0]); uint16* fastBits = new uint16[MASK_FASTBITS + 1]; memset(&fastBits[0], 0, size_t(MASK_FASTBITS + 1) * sizeof(uint16)); int shift = 0; while ((count >> shift) > MASK_FASTBITS) shift++; for (int v = 0, sum = 1, c = 0; c < 256; c++) { if (c == lastc) sum++; uint* ptr = &buckets[c]; for (int d = 0; d < 256; d++) { const int s = sum; sum += ptr[d << 8]; ptr[d << 8] = s; if (s == sum) continue; for (; v <= ((sum - 1) >> shift); v++) fastBits[v] = uint16((c << 8) | d); } } memset(&_buffer[0], 0, size_t(_bufferSize) * sizeof(uint)); int n = 0; while (n < pIdx) { const int c = int(src[n]); const int p = freqs[c]; if (p < pIdx) _buffer[buckets[(c << 8) | int(src[p])]++] = n; else if (p > pIdx) _buffer[buckets[(c << 8) | int(src[p - 1])]++] = n; freqs[c]++; n++; } while (n < count) { const int c = int(src[n]); const int p = freqs[c]; freqs[c]++; n++; if (p < pIdx) _buffer[buckets[(c << 8) | int(src[p])]++] = n; else if (p > pIdx) _buffer[buckets[(c << 8) | int(src[p - 1])]++] = n; } for (int c = 0; c < 256; c++) { for (int d = 0; d < c; d++) { swap(buckets[(d << 8) | c], buckets[(c << 8) | d]); } } const int chunks = getBWTChunks(count); // Build inverse const int st = count / chunks; const int ckSize = (chunks * 
st == count) ? st : st + 1; const int nbTasks = (_jobs < chunks) ? _jobs : chunks; if (nbTasks == 1) { InverseBiPSIv2Task task(_buffer, buckets, fastBits, dst, _primaryIndexes, count, 0, ckSize, 0, chunks); task.run(); } else { #ifdef CONCURRENCY_ENABLED // Several chunks may be decoded concurrently (depending on the availability // of jobs per block). int jobsPerTask[64]; Global::computeJobsPerTask(jobsPerTask, chunks, nbTasks); vector > futures; vector*> tasks; // Create one task per job for (int j = 0, c = 0; j < nbTasks; j++) { // Each task decodes jobsPerTask[j] chunks InverseBiPSIv2Task* task = new InverseBiPSIv2Task(_buffer, buckets, fastBits, dst, _primaryIndexes, count, c * ckSize, ckSize, c, c + jobsPerTask[j]); tasks.push_back(task); if (_pool == nullptr) futures.push_back(async(launch::async, &InverseBiPSIv2Task::run, task)); else futures.push_back(_pool->schedule(&InverseBiPSIv2Task::run, task)); c += jobsPerTask[j]; } // Wait for completion of all concurrent tasks for (int j = 0; j < nbTasks; j++) futures[j].get(); // Cleanup for (InverseBiPSIv2Task* task : tasks) delete task; #else // nbTasks > 1 but concurrency is not enabled (should never happen) delete[] fastBits; delete[] buckets; throw invalid_argument("Error during BWT inverse: concurrency not supported"); #endif } dst[count - 1] = kanzi::byte(lastc); delete[] fastBits; delete[] buckets; input._index += count; output._index += count; return true; } template InverseBiPSIv2Task::InverseBiPSIv2Task(uint* buf, uint* buckets, uint16* fastBits, kanzi::byte* output, int* primaryIndexes, int total, int start, int ckSize, int firstChunk, int lastChunk) : _data(buf) , _buckets(buckets) , _fastBits(fastBits) , _primaryIndexes(primaryIndexes) , _dst(output) , _total(total) , _start(start) , _ckSize(ckSize) , _firstChunk(firstChunk) , _lastChunk(lastChunk) { } template T InverseBiPSIv2Task::run() { uint sh = 0; while ((_total >> sh) > BWT::MASK_FASTBITS) sh++; const uint shift = sh; int c = _firstChunk; 
kanzi::byte* d0 = &_dst[0 * _ckSize]; kanzi::byte* d1 = &_dst[1 * _ckSize]; kanzi::byte* d2 = &_dst[2 * _ckSize]; kanzi::byte* d3 = &_dst[3 * _ckSize]; kanzi::byte* d4 = &_dst[4 * _ckSize]; kanzi::byte* d5 = &_dst[5 * _ckSize]; kanzi::byte* d6 = &_dst[6 * _ckSize]; kanzi::byte* d7 = &_dst[7 * _ckSize]; if (_start + 7 * _ckSize <= _total) { for (; c + 8 <= _lastChunk; c += 8) { const int end = _start + _ckSize; uint p0 = _primaryIndexes[c + 0]; uint p1 = _primaryIndexes[c + 1]; uint p2 = _primaryIndexes[c + 2]; uint p3 = _primaryIndexes[c + 3]; uint p4 = _primaryIndexes[c + 4]; uint p5 = _primaryIndexes[c + 5]; uint p6 = _primaryIndexes[c + 6]; uint p7 = _primaryIndexes[c + 7]; for (int i = _start + 1; i <= end; i += 2) { prefetchRead(&_data[p0]); prefetchRead(&_data[p1]); prefetchRead(&_data[p2]); prefetchRead(&_data[p3]); prefetchRead(&_data[p4]); prefetchRead(&_data[p5]); prefetchRead(&_data[p6]); prefetchRead(&_data[p7]); uint16 s0 = _fastBits[p0 >> shift]; uint16 s1 = _fastBits[p1 >> shift]; uint16 s2 = _fastBits[p2 >> shift]; uint16 s3 = _fastBits[p3 >> shift]; uint16 s4 = _fastBits[p4 >> shift]; uint16 s5 = _fastBits[p5 >> shift]; uint16 s6 = _fastBits[p6 >> shift]; uint16 s7 = _fastBits[p7 >> shift]; if (_buckets[s0] <= p0) { do { s0++; } while (_buckets[s0] <= p0); } if (_buckets[s1] <= p1) { do { s1++; } while (_buckets[s1] <= p1); } if (_buckets[s2] <= p2) { do { s2++; } while (_buckets[s2] <= p2); } if (_buckets[s3] <= p3) { do { s3++; } while (_buckets[s3] <= p3); } if (_buckets[s4] <= p4) { do { s4++; } while (_buckets[s4] <= p4); } if (_buckets[s5] <= p5) { do { s5++; } while (_buckets[s5] <= p5); } if (_buckets[s6] <= p6) { do { s6++; } while (_buckets[s6] <= p6); } if (_buckets[s7] <= p7) { do { s7++; } while (_buckets[s7] <= p7); } d0[i - 1] = kanzi::byte(s0 >> 8); d0[i] = kanzi::byte(s0); d1[i - 1] = kanzi::byte(s1 >> 8); d1[i] = kanzi::byte(s1); d2[i - 1] = kanzi::byte(s2 >> 8); d2[i] = kanzi::byte(s2); d3[i - 1] = kanzi::byte(s3 >> 8); d3[i] = 
kanzi::byte(s3); d4[i - 1] = kanzi::byte(s4 >> 8); d4[i] = kanzi::byte(s4); d5[i - 1] = kanzi::byte(s5 >> 8); d5[i] = kanzi::byte(s5); d6[i - 1] = kanzi::byte(s6 >> 8); d6[i] = kanzi::byte(s6); d7[i - 1] = kanzi::byte(s7 >> 8); d7[i] = kanzi::byte(s7); p0 = _data[p0]; p1 = _data[p1]; p2 = _data[p2]; p3 = _data[p3]; p4 = _data[p4]; p5 = _data[p5]; p6 = _data[p6]; p7 = _data[p7]; } _start += (8 * _ckSize); } } for (; c < _lastChunk; c++) { const int end = min(_start + _ckSize, _total - 1); uint p = _primaryIndexes[c]; for (int i = _start + 1; i <= end; i += 2) { uint16 s = _fastBits[p >> shift]; while (_buckets[s] <= p) s++; _dst[i - 1] = kanzi::byte(s >> 8); _dst[i] = kanzi::byte(s); p = _data[p]; } _start = end; } return T(0); } kanzi-cpp-2.5.2/src/transform/BWT.hpp000066400000000000000000000103101516423635400173670ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #pragma once #ifndef knz_BWT #define knz_BWT #include "../concurrent.hpp" #include "../Context.hpp" #include "../Transform.hpp" #include "DivSufSort.hpp" namespace kanzi { // The Burrows-Wheeler Transform is a reversible transform based on // permutation of the data in the original message to reduce the entropy. 
// The initial text can be found here: // Burrows M and Wheeler D, [A block sorting lossless data compression algorithm] // Technical Report 124, Digital Equipment Corporation, 1994 // See also Peter Fenwick, [Block sorting text compression - final report] // Technical Report 130, 1996 // This implementation replaces the 'slow' sorting of permutation strings // with the construction of a suffix array (faster but more complex). // // E.G. 0123456789A // Source: mississippi\0 // Suffixes: rank sorted // mississippi\0 0 -> 4 i\0 // ississippi\0 1 -> 3 ippi\0 // ssissippi\0 2 -> 10 issippi\0 // sissippi\0 3 -> 8 ississippi\0 // issippi\0 4 -> 2 mississippi\0 // ssippi\0 5 -> 9 pi\0 // sippi\0 6 -> 7 ppi\0 // ippi\0 7 -> 1 sippi\0 // ppi\0 8 -> 6 sissippi\0 // pi\0 9 -> 5 ssippi\0 // i\0 10 -> 0 ssissippi\0 // Suffix array SA : 10 7 4 1 0 9 8 6 3 5 2 // BWT[i] = input[SA[i]-1] => BWT(input) = ipssmpissii (+ primary index 5) // The suffix array and permutation vector are equal when the input is 0 terminated // The insertion of a guard is done internally and is entirely transparent. // // This implementation extends the canonical algorithm to use up to MAX_CHUNKS primary // indexes (based on input block size). Each primary index corresponds to a data chunk. // Chunks may be inverted concurrently. 
template class InverseBiPSIv2Task FINAL : public Task { private: uint* _data; uint* _buckets; uint16* _fastBits; int* _primaryIndexes; byte* _dst; int _total; int _start; int _ckSize; int _firstChunk; int _lastChunk; public: InverseBiPSIv2Task(uint* buf, uint* buckets, uint16* fastBits, byte* output, int* primaryIndexes, int total, int start, int ckSize, int firstChunk, int lastChunk); ~InverseBiPSIv2Task() {} T run(); }; class BWT FINAL : public Transform { private: static const int MAX_BLOCK_SIZE; static const int NB_FASTBITS; static const int BLOCK_SIZE_THRESHOLD1; static const int BLOCK_SIZE_THRESHOLD2; uint* _buffer; int* _sa; int _bufferSize; int _saSize; int _primaryIndexes[8]; DivSufSort _saAlgo; int _jobs; #ifdef CONCURRENCY_ENABLED ThreadPool* _pool; #endif bool inverseBiPSIv2(SliceArray& input, SliceArray& output, int count); bool inverseMergeTPSI(SliceArray& input, SliceArray& output, int count); public: static const int MASK_FASTBITS; BWT(int jobs = 1); BWT(Context& ctx); ~BWT() { if (_buffer != nullptr) delete[] _buffer; if (_sa != nullptr) delete[] _sa; } bool forward(SliceArray& input, SliceArray& output, int length); bool inverse(SliceArray& input, SliceArray& output, int length); int getPrimaryIndex(int n) const { return _primaryIndexes[n]; } bool setPrimaryIndex(int n, int primaryIndex); int getMaxEncodedLength(int srcLen) const { return srcLen; } static int getBWTChunks(int size); }; inline int BWT::getBWTChunks(int size) { return (size < BLOCK_SIZE_THRESHOLD1) ? 1 : 8; } } #endif kanzi-cpp-2.5.2/src/transform/BWTBlockCodec.cpp000066400000000000000000000120321516423635400212760ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include #include #include "BWTBlockCodec.hpp" #include "../Global.hpp" using namespace kanzi; BWTBlockCodec::BWTBlockCodec(Context& ctx) { _pBWT = new BWT(ctx); _bsVersion = ctx.getInt("bsVersion"); } // Return true if the compression chain succeeded. In this case, the input data // may be modified. If the compression failed, the input data is returned unmodified. bool BWTBlockCodec::forward(SliceArray& input, SliceArray& output, int blockSize) { if (blockSize == 0) return true; if (!SliceArray::isValid(input)) throw std::invalid_argument("BWTBlockCodec: Invalid input block"); if (!SliceArray::isValid(output)) throw std::invalid_argument("BWTBlockCodec: Invalid output block"); if (input._array == output._array) return false; if (output._length - output._index < getMaxEncodedLength(blockSize)) return false; int logBlockSize = Global::_log2(uint32(blockSize)); if ((blockSize & (blockSize - 1)) != 0) logBlockSize++; const int pIndexSize = (logBlockSize + 7) >> 3; if ((pIndexSize <= 0) || (pIndexSize >= 5)) return false; const int chunks = BWT::getBWTChunks(blockSize); const int logNbChunks = Global::_log2(uint32(chunks)); if (logNbChunks > 7) return false; kanzi::byte* dst = &output._array[output._index]; output._index += (1 + chunks * pIndexSize); // Apply forward transform if (_pBWT->forward(input, output, blockSize) == false) return false; const kanzi::byte mode = kanzi::byte((logNbChunks << 2) | (pIndexSize - 1)); // Emit header for (int i = 0, idx = 1; i < chunks; i++) { const int primaryIndex = _pBWT->getPrimaryIndex(i) - 1; int shift = (pIndexSize - 1) << 3; while (shift >= 
0) { dst[idx++] = kanzi::byte(primaryIndex >> shift); shift -= 8; } } dst[0] = mode; return true; } bool BWTBlockCodec::inverse(SliceArray& input, SliceArray& output, int blockSize) { if (blockSize <= 1) return blockSize == 0; if (!SliceArray::isValid(input)) throw std::invalid_argument("BWTBlockCodec: Invalid input block"); if (!SliceArray::isValid(output)) throw std::invalid_argument("BWTBlockCodec: Invalid output block"); if (input._array == output._array) return false; if (_bsVersion > 5) { // Number of chunks and primary index size in bitstream since bsVersion 6 kanzi::byte mode = input._array[input._index++]; const uint logNbChunks = uint(mode >> 2) & 0x07; const int pIndexSize = (int(mode) & 0x03) + 1; const int chunks = 1 << logNbChunks; const int headerSize = 1 + chunks * pIndexSize; if ((input._length - input._index < headerSize) || (blockSize < headerSize)) return false; if (chunks != BWT::getBWTChunks(blockSize - headerSize)) return false; // Read header for (int i = 0; i < chunks; i++) { int shift = (pIndexSize - 1) << 3; int primaryIndex = 0; // Extract BWT primary index while (shift >= 0) { primaryIndex = (primaryIndex << 8) | int(input._array[input._index++]); shift -= 8; } if (_pBWT->setPrimaryIndex(i, primaryIndex + 1) == false) return false; } blockSize -= headerSize; } else { const int chunks = BWT::getBWTChunks(blockSize); for (int i = 0; i < chunks; i++) { // Read block header (mode + primary index) const int blockMode = int(input._array[input._index++]); const int pIndexSizeBytes = 1 + ((blockMode >> 6) & 0x03); if ((blockSize < pIndexSizeBytes) || (input._index + (pIndexSizeBytes - 1) > input._length)) return false; blockSize -= pIndexSizeBytes; int shift = (pIndexSizeBytes - 1) << 3; int primaryIndex = (blockMode & 0x3F) << shift; // Extract BWT primary index for (int n = 1; n < pIndexSizeBytes; n++) { shift -= 8; primaryIndex |= (int(input._array[input._index++]) << shift); } if (_pBWT->setPrimaryIndex(i, primaryIndex) == false) return 
false; } } // Apply inverse Transform return _pBWT->inverse(input, output, blockSize); } kanzi-cpp-2.5.2/src/transform/BWTBlockCodec.hpp000066400000000000000000000031321516423635400213040ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #pragma once #ifndef knz_BWTBlockCodec #define knz_BWTBlockCodec #include "../transform/BWT.hpp" #include "../Context.hpp" namespace kanzi { // Utility class to en/de-code a BWT data block and its associated primary index(es) // BWT stream format: Header (mode + primary index(es)) | Data (n bytes) // mode (8 bits): xxxyyyzz // xxx: ignored // yyy: log(chunks) // zz: primary index size - 1 (in bytes) // primary indexes (chunks * (8|16|24|32 bits)) class BWTBlockCodec FINAL : public Transform { public: BWTBlockCodec(Context& ctx); ~BWTBlockCodec() { delete _pBWT; } bool forward(SliceArray& input, SliceArray& output, int length); bool inverse(SliceArray& input, SliceArray& output, int length); // Required encoding output buffer size int getMaxEncodedLength(int srcLen) const { return srcLen + 1 + 32; // mode + 8 indexes } private: BWT* _pBWT; int _bsVersion; }; } #endif kanzi-cpp-2.5.2/src/transform/BWTS.cpp000066400000000000000000000136571516423635400175260ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include #include "BWTS.hpp" #include "../Global.hpp" using namespace kanzi; using namespace std; const int BWTS::MAX_BLOCK_SIZE = 1024 * 1024 * 1024; // 1024 MB bool BWTS::forward(SliceArray& input, SliceArray& output, int count) { if (count == 0) return true; if (!SliceArray::isValid(input)) throw invalid_argument("BWTS: Invalid input block"); if (!SliceArray::isValid(output)) throw invalid_argument("BWTS: Invalid output block"); if (count > MAX_BLOCK_SIZE) { // Not a recoverable error: instead of silently fail the transform, // issue a fatal error. stringstream ss; ss << "The max BWTS block size is " << MAX_BLOCK_SIZE << ", got " << count; throw invalid_argument(ss.str()); } if (count < 2) { if (count == 1) output._array[output._index++] = input._array[input._index++]; return true; } const kanzi::byte* src = &input._array[input._index]; kanzi::byte* dst = &output._array[output._index]; // Lazy dynamic memory allocation if (_bufferSize < count) { _bufferSize = count; if (_buffer1 != nullptr) delete[] _buffer1; _buffer1 = new int[_bufferSize]; if (_buffer2 != nullptr) delete[] _buffer2; _buffer2 = new int[_bufferSize]; } // Aliasing int* sa = _buffer1; int* isa = _buffer2; if (_saAlgo.computeSuffixArray(src, sa, count) == false) return false; for (int i = 0; i < count; i++) isa[sa[i]] = i; int min = isa[0]; int idxMin = 0; for (int i = 1; ((i < count) && (min > 0)); i++) { if (isa[i] >= min) continue; int refRank = moveLyndonWordHead(sa, isa, src, count, idxMin, i - idxMin, min); for (int j = i - 1; j > idxMin; j--) { // Iterate through the new Lyndon word from end to start int 
testRank = isa[j]; int startRank = testRank; while (testRank < count - 1) { int nextRankStart = sa[testRank + 1]; if ((j > nextRankStart) || (src[j] != src[nextRankStart]) || (refRank < isa[nextRankStart + 1])) break; sa[testRank] = nextRankStart; isa[nextRankStart] = testRank; testRank++; } sa[testRank] = j; isa[j] = testRank; refRank = testRank; if (startRank == testRank) break; } min = isa[i]; idxMin = i; } min = count; for (int i = 0; i < count; i++) { if (isa[i] >= min) { dst[isa[i]] = src[i - 1]; continue; } if (min < count) dst[min] = src[i - 1]; min = isa[i]; } dst[0] = src[count - 1]; input._index += count; output._index += count; return true; } int BWTS::moveLyndonWordHead(int sa[], int isa[], const kanzi::byte data[], int count, int start, int size, int rank) const { const int end = start + size; while (rank + 1 < count) { const int nextStart0 = sa[rank + 1]; if (nextStart0 <= end) break; int nextStart = nextStart0; int k = 0; while ((k < size) && (nextStart < count) && (data[start + k] == data[nextStart])) { k++; nextStart++; } if ((k == size) && (rank < isa[nextStart])) break; if ((k < size) && (nextStart < count) && (data[start + k] < data[nextStart])) break; sa[rank] = nextStart0; isa[nextStart0] = rank; rank++; } sa[rank] = start; isa[start] = rank; return rank; } bool BWTS::inverse(SliceArray& input, SliceArray& output, int count) { if (count == 0) return true; if (!SliceArray::isValid(input)) throw invalid_argument("BWTS: Invalid input block"); if (!SliceArray::isValid(output)) throw invalid_argument("BWTS: Invalid output block"); if (count < 2) { if (count == 1) output._array[output._index++] = input._array[input._index++]; return true; } // Lazy dynamic memory allocation if (_bufferSize < count) { _bufferSize = count; if (_buffer1 != nullptr) delete[] _buffer1; _buffer1 = new int[_bufferSize]; } // Initialize histogram uint buckets[256] = { 0 }; Global::computeHistogram(&input._array[input._index], count, buckets, true); // Histogram for (int i 
= 0, sum = 0; i < 256; i++) { sum += buckets[i]; buckets[i] = sum - buckets[i]; } // Aliasing int* lf = _buffer1; const kanzi::byte* src = &input._array[input._index]; kanzi::byte* dst = &output._array[output._index]; for (int i = 0; i < count; i++) lf[i] = buckets[int(src[i])]++; // Build inverse for (int i = 0, j = count - 1; j >= 0; i++) { if (lf[i] < 0) continue; int p = i; do { dst[j] = src[p]; j--; const int t = lf[p]; lf[p] = -1; p = t; } while (lf[p] >= 0); } input._index += count; output._index += count; return true; } kanzi-cpp-2.5.2/src/transform/BWTS.hpp000066400000000000000000000040711516423635400175210ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #pragma once #ifndef knz_BWTS #define knz_BWTS #include "../Context.hpp" #include "../Transform.hpp" #include "DivSufSort.hpp" namespace kanzi { // Bijective version of the Burrows-Wheeler Transform // The main advantage over the regular BWT is that there is no need for a primary // index (hence the bijectivity). BWTS is about 10% slower than BWT. 
// Forward transform based on the code at https://code.google.com/p/mk-bwts/ // by Neal Burns and DivSufSort (port of libDivSufSort by Yuta Mori) class BWTS FINAL : public Transform { private: static const int MAX_BLOCK_SIZE; int* _buffer1; int* _buffer2; int _bufferSize; DivSufSort _saAlgo; int moveLyndonWordHead(int sa[], int isa[], const byte data[], int count, int start, int size, int rank) const; public: BWTS() { _buffer1 = nullptr; _buffer2 = nullptr; _bufferSize = 0; } BWTS(Context&) { _buffer1 = nullptr; _buffer2 = nullptr; _bufferSize = 0; } ~BWTS() { if (_buffer1 != nullptr) delete[] _buffer1; if (_buffer2 != nullptr) delete[] _buffer2; } bool forward(SliceArray& input, SliceArray& output, int length); bool inverse(SliceArray& input, SliceArray& output, int length); int getMaxEncodedLength(int srcLen) const { return srcLen; } }; } #endif kanzi-cpp-2.5.2/src/transform/DivSufSort.cpp000066400000000000000000001632121516423635400210100ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #include #include "DivSufSort.hpp" #include "../Memory.hpp" using namespace kanzi; const int DivSufSort::SS_INSERTIONSORT_THRESHOLD = 16; const int DivSufSort::SS_BLOCKSIZE = 8192; const int DivSufSort::SS_MISORT_STACKSIZE = 16; const int DivSufSort::SS_SMERGE_STACKSIZE = 32; const int DivSufSort::TR_STACKSIZE = 64; const int DivSufSort::TR_INSERTIONSORT_THRESHOLD = 16; const int DivSufSort::SQQ_TABLE[] = { 0, 16, 22, 27, 32, 35, 39, 42, 45, 48, 50, 53, 55, 57, 59, 61, 64, 65, 67, 69, 71, 73, 75, 76, 78, 80, 81, 83, 84, 86, 87, 89, 90, 91, 93, 94, 96, 97, 98, 99, 101, 102, 103, 104, 106, 107, 108, 109, 110, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 128, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 144, 145, 146, 147, 148, 149, 150, 150, 151, 152, 153, 154, 155, 155, 156, 157, 158, 159, 160, 160, 161, 162, 163, 163, 164, 165, 166, 167, 167, 168, 169, 170, 170, 171, 172, 173, 173, 174, 175, 176, 176, 177, 178, 178, 179, 180, 181, 181, 182, 183, 183, 184, 185, 185, 186, 187, 187, 188, 189, 189, 190, 191, 192, 192, 193, 193, 194, 195, 195, 196, 197, 197, 198, 199, 199, 200, 201, 201, 202, 203, 203, 204, 204, 205, 206, 206, 207, 208, 208, 209, 209, 210, 211, 211, 212, 212, 213, 214, 214, 215, 215, 216, 217, 217, 218, 218, 219, 219, 220, 221, 221, 222, 222, 223, 224, 224, 225, 225, 226, 226, 227, 227, 228, 229, 229, 230, 230, 231, 231, 232, 232, 233, 234, 234, 235, 235, 236, 236, 237, 237, 238, 238, 239, 240, 240, 241, 241, 242, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247, 247, 248, 248, 249, 249, 250, 250, 251, 251, 252, 252, 253, 253, 254, 254, 255 }; const int DivSufSort::LOG_TABLE[] = { -1, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7 }; DivSufSort::DivSufSort() { _ssStack = new Stack(SS_MISORT_STACKSIZE); _trStack = new Stack(TR_STACKSIZE); _mergeStack = new Stack(SS_SMERGE_STACKSIZE); _sa = nullptr; _buffer = nullptr; _ssStack->_index = 0; _trStack->_index = 0; _mergeStack->_index = 0; memset(&_bucketA[0], 0, sizeof(int) * 256); memset(&_bucketB[0], 0, sizeof(int) * 65536); } DivSufSort::~DivSufSort() { delete _ssStack; delete _trStack; delete _mergeStack; } void DivSufSort::reset() { _ssStack->_index = 0; _trStack->_index = 0; _mergeStack->_index = 0; memset(&_bucketA[0], 0, sizeof(int) * 256); memset(&_bucketB[0], 0, sizeof(int) * 65536); } bool DivSufSort::computeSuffixArray(const kanzi::byte input[], int sa[], int length) { _buffer = reinterpret_cast(&input[0]); _sa = sa; reset(); const int m = sortTypeBstar(_bucketA, _bucketB, length); if (m < 0) return false; constructSuffixArray(_bucketA, _bucketB, length, m); return true; } void DivSufSort::constructSuffixArray(int bucketA[], int bucketB[], int n, int m) { if (m > 0) { for (int c1 = 254; c1 >= 0; c1--) { const int idx = c1 << 8; const int i = bucketB[idx + c1 + 1]; int k = 0; int c2 = -1; for (int j = bucketA[c1 + 1] - 1; j >= i; j--) { int s = _sa[j]; _sa[j] = ~s; if (s <= 0) continue; s--; const int c0 = _buffer[s]; if ((s > 0) && (_buffer[s - 1] > c0)) s = ~s; if (c0 != c2) { if (c2 >= 0) bucketB[idx + c2] = k; c2 = c0; k = bucketB[idx + c2]; } _sa[k--] = s; } } } int c2 = _buffer[n - 1]; int k = bucketA[c2]; _sa[k++] = (_buffer[n - 2] < c2) ? 
~(n - 1) : (n - 1); // Scan the suffix array from left to right. for (int i = 0; i < n; i++) { int s = _sa[i]; if (s <= 0) { _sa[i] = ~s; continue; } s--; const int c0 = _buffer[s]; if ((s == 0) || (_buffer[s - 1] < c0)) s = ~s; if (c0 != c2) { bucketA[c2] = k; c2 = c0; k = bucketA[c2]; } _sa[k++] = s; } } bool DivSufSort::computeBWT(const kanzi::byte input[], kanzi::byte output[], int bwt[], int length, int indexes[], int idxCount) { _buffer = reinterpret_cast(&input[0]); _sa = bwt; reset(); const int m = sortTypeBstar(_bucketA, _bucketB, length); if (m < 0) return false; const int pIdx = constructBWT(_bucketA, _bucketB, length, m, indexes, idxCount); if (pIdx < 0) return false; output[0] = input[length - 1]; for (int i = 0; i < pIdx; i++) output[i + 1] = kanzi::byte(bwt[i]); for (int i = pIdx + 1; i < length; i++) output[i] = kanzi::byte(bwt[i]); return true; } int DivSufSort::constructBWT(int bucketA[], int bucketB[], int n, int m, int indexes[], int idxCount) { int pIdx = -1; const int st = n / idxCount; const int step = (idxCount * st == n) ? st : st + 1; if (m > 0) { for (int c1 = 254; c1 >= 0; c1--) { const int idx = c1 << 8; const int i = bucketB[idx + c1 + 1]; int k = 0; int c2 = -1; for (int j = bucketA[c1 + 1] - 1; j >= i; j--) { int s = _sa[j]; if (s <= 0) { if (s != 0) _sa[j] = ~s; continue; } if ((s % step) == 0) indexes[s / step] = j + 1; s--; const int c0 = _buffer[s]; _sa[j] = ~c0; if ((s > 0) && (_buffer[s - 1] > c0)) s = ~s; if (c0 != c2) { if (c2 >= 0) bucketB[idx + c2] = k; c2 = c0; k = bucketB[idx + c2]; } _sa[k--] = s; } } } int c2 = _buffer[n - 1]; int k = bucketA[c2]; if (_buffer[n - 2] < c2) { if (((n - 1) % step) == 0) indexes[(n - 1) / step] = n; _sa[k++] = ~_buffer[n - 2]; } else { _sa[k++] = n - 1; } // Scan the suffix array from left to right. 
for (int i = 0; i < n; i++) { int s = _sa[i]; if (s <= 0) { if (s != 0) _sa[i] = ~s; else pIdx = i; continue; } if ((s % step) == 0) indexes[s / step] = i + 1; s--; const int c0 = _buffer[s]; _sa[i] = c0; if (c0 != c2) { bucketA[c2] = k; c2 = c0; k = bucketA[c2]; } if ((s > 0) && (_buffer[s - 1] < c0)) { if ((s % step) == 0) { indexes[s / step] = k + 1; } s = ~_buffer[s - 1]; } _sa[k++] = s; } indexes[0] = pIdx + 1; return pIdx; } int DivSufSort::sortTypeBstar(int bucketA[], int bucketB[], int n) { int m = n; int c0 = _buffer[n - 1]; // Count the number of occurrences of the first one or two characters of each // type A, B and B* suffix. Moreover, store the beginning position of all // type B* suffixes into the array _sa. for (int i = n - 1; i >= 0;) { int c1; do { c1 = c0; bucketA[c1]++; i--; if (i < 0) break; } while ((c0 = _buffer[i]) >= c1); if (i < 0) break; bucketB[(c0 << 8) + c1]++; m--; _sa[m] = i; i--; c1 = c0; while (i >= 0) { if ((c0 = _buffer[i]) > c1) break; bucketB[(c1 << 8) + c0]++; c1 = c0; i--; } } m = n - m; c0 = 0; // A type B* suffix is lexicographically smaller than a type B suffix that // begins with the same first two characters. // Calculate the index of start/end point of each bucket. for (int i = 0, j = 0; c0 < 256; c0++) { const int t = i + bucketA[c0]; bucketA[c0] = i + j; // start point const int idx = c0 << 8; i = t + bucketB[idx + c0]; for (int c1 = c0 + 1; c1 < 256; c1++) { j += bucketB[idx + c1]; bucketB[idx + c1] = j; // end point i += bucketB[(c1 << 8) + c0]; } } if (m > 0) { // Sort the type B* suffixes by their first two characters. const int pab = n - m; for (int i = m - 2; i >= 0; i--) { const int t = _sa[pab + i]; const int idx = (_buffer[t] << 8) + _buffer[t + 1]; bucketB[idx]--; _sa[bucketB[idx]] = i; } const int t = _sa[pab + m - 1]; c0 = (_buffer[t] << 8) + _buffer[t + 1]; bucketB[c0]--; _sa[bucketB[c0]] = m - 1; // Sort the type B* substrings using ssSort. 
const int bufSize = n - m - m; c0 = 254; for (int j = m; j > 0; c0--) { const int idx = c0 << 8; for (int c1 = 255; c1 > c0; c1--) { const int i = bucketB[idx + c1]; if (j > i + 1) ssSort(pab, i, j, m, bufSize, 2, n, _sa[i] == m - 1); j = i; } } // Compute ranks of type B* substrings. for (int i = m - 1; i >= 0; i--) { if (_sa[i] >= 0) { const int j = i; do { _sa[m + _sa[i]] = i; i--; } while ((i >= 0) && (_sa[i] >= 0)); _sa[i + 1] = i - j; if (i <= 0) break; } const int j = i; do { _sa[i] = ~_sa[i]; _sa[m + _sa[i]] = j; i--; } while (_sa[i] < 0); _sa[m + _sa[i]] = j; } // Construct the inverse suffix array of type B* suffixes using trSort. trSort(m, 1); // Set the sorted order of type B* suffixes. c0 = _buffer[n - 1]; for (int i = n - 1, j = m; i >= 0;) { i--; for (int c1 = c0; i >= 0; i--) { if ((c0 = _buffer[i]) < c1) break; c1 = c0; } if (i >= 0) { const int tt = i; i--; for (int c1 = c0; i >= 0; i--) { if ((c0 = _buffer[i]) > c1) break; c1 = c0; } j--; _sa[_sa[m + j]] = ((tt == 0) || (tt - i > 1)) ? tt : ~tt; } } // Calculate the index of start/end point of each bucket. bucketB[65535] = n; // end int k = m - 1; for (c0 = 254; c0 >= 0; c0--) { int i = bucketA[c0 + 1] - 1; const int idx = c0 << 8; for (int c1 = 255; c1 > c0; c1--) { const int tt = i - bucketB[(c1 << 8) + c0]; bucketB[(c1 << 8) + c0] = i; // end point i = tt; const int j = bucketB[idx + c1]; // Move all type B* suffixes to the correct position. for (; k >= j; i--, k--) _sa[i] = _sa[k]; } bucketB[idx + c0 + 1] = i - bucketB[idx + c0] + 1; // start point bucketB[idx + c0] = i; // end point } } return m; } // Sub String Sort void DivSufSort::ssSort(const int pa, int first, int last, int buf, int bufSize, int depth, int n, bool lastSuffix) { if (lastSuffix == true) first++; int limit = 0; int middle = last; if ((bufSize < SS_BLOCKSIZE) && (bufSize < last - first)) { limit = ssIsqrt(last - first); if (bufSize < limit) { limit = limit > SS_BLOCKSIZE ? 
SS_BLOCKSIZE : limit; middle = last - limit; buf = middle; bufSize = limit; } else { limit = 0; } } int a; int i = 0; for (a = first; middle - a > SS_BLOCKSIZE; a += SS_BLOCKSIZE, i++) { ssMultiKeyIntroSort(pa, a, a + SS_BLOCKSIZE, depth); int curBufSize = last - (a + SS_BLOCKSIZE); int curBuf; if (curBufSize > bufSize) { curBuf = a + SS_BLOCKSIZE; } else { curBufSize = bufSize; curBuf = buf; } int k = SS_BLOCKSIZE; int b = a; for (int j = i; (j & 1) != 0; j >>= 1) { ssSwapMerge(pa, b - k, b, b + k, curBuf, curBufSize, depth); b -= k; k <<= 1; } } ssMultiKeyIntroSort(pa, a, middle, depth); for (int k = SS_BLOCKSIZE; i != 0; k <<= 1, i >>= 1) { if ((i & 1) == 0) continue; ssSwapMerge(pa, a - k, a, middle, buf, bufSize, depth); a -= k; } if (limit != 0) { ssMultiKeyIntroSort(pa, middle, last, depth); ssInplaceMerge(pa, first, middle, last, depth); } if (lastSuffix == true) { i = _sa[first - 1]; const int p1 = _sa[pa + i]; const int p11 = n - 2; for (a = first; (a < last) && ((_sa[a] < 0) || (ssCompare(p1, p11, pa + _sa[a], depth) > 0)); a++) _sa[a - 1] = _sa[a]; _sa[a - 1] = i; } } int DivSufSort::ssCompare(int pa, int pb, int p2, const int depth) const { int u1 = depth + pa; int u2 = depth + _sa[p2]; const int u1n = pb + 2; const int u2n = _sa[p2 + 1] + 2; if (u1n - u1 > u2n - u2) { while ((u2 < u2n) && (_buffer[u1] == _buffer[u2])) { u1++; u2++; } } else { while ((u1 < u1n) && (_buffer[u1] == _buffer[u2])) { u1++; u2++; } } return (u1 < u1n) ? ((u2 < u2n) ? _buffer[u1] - _buffer[u2] : 1) : ((u2 < u2n) ? -1 : 0); } int DivSufSort::ssCompare(const int sa1[], const int sa2[], const int depth) const { int u1 = depth + sa1[0]; int u2 = depth + sa2[0]; const int u1n = sa1[1] + 2; const int u2n = sa2[1] + 2; if (u1n - u1 > u2n - u2) { while ((u2 < u2n) && (_buffer[u1] == _buffer[u2])) { u1++; u2++; } } else { while ((u1 < u1n) && (_buffer[u1] == _buffer[u2])) { u1++; u2++; } } return (u1 < u1n) ? ((u2 < u2n) ? _buffer[u1] - _buffer[u2] : 1) : ((u2 < u2n) ? 
-1 : 0); } void DivSufSort::ssInplaceMerge(int pa, int first, int middle, int last, int depth) { while (true) { int p, x; if (_sa[last - 1] < 0) { x = 1; p = pa + ~_sa[last - 1]; } else { x = 0; p = pa + _sa[last - 1]; } int a = first; int r = -1; for (int len = middle - first, half = (len >> 1); len > 0; len = half, half >>= 1) { const int b = a + half; const int q = ssCompare(&_sa[pa + ((_sa[b] >= 0) ? _sa[b] : ~_sa[b])], &_sa[p], depth); if (q < 0) { a = b + 1; half -= ((len & 1) ^ 1); } else r = q; } if (a < middle) { if (r == 0) _sa[a] = ~_sa[a]; ssRotate(a, middle, last); last -= (middle - a); middle = a; if (first == middle) break; } last--; if (x != 0) { last--; while (_sa[last] < 0) last--; } if (middle == last) break; } } void DivSufSort::ssRotate(int first, int middle, int last) { int l = middle - first; int r = last - middle; while ((l > 0) && (r > 0)) { if (l == r) { ssBlockSwap(first, middle, l); break; } if (l < r) { int a = last - 1; int b = middle - 1; int t = _sa[a]; while (true) { _sa[a--] = _sa[b]; _sa[b--] = _sa[a]; if (b < first) { _sa[a] = t; last = a; r -= (l + 1); if (r <= l) break; a--; b = middle - 1; t = _sa[a]; } } } else { int a = first; int b = middle; int t = _sa[a]; while (true) { _sa[a++] = _sa[b]; _sa[b++] = _sa[a]; if (last <= b) { _sa[a] = t; first = a + 1; l -= (r + 1); if (l <= r) break; a++; b = middle; t = _sa[a]; } } } } } void DivSufSort::ssSwapMerge(int pa, int first, int middle, int last, int buf, int bufSize, int depth) { int check = 0; while (true) { if (last - middle <= bufSize) { if ((first < middle) && (middle < last)) ssMergeBackward(pa, first, middle, last, buf, depth); if (((check & 1) != 0) || (((check & 2) != 0) && (ssCompare(&_sa[pa + getIndex(_sa[first - 1])], &_sa[pa + _sa[first]], depth) == 0))) { _sa[first] = ~_sa[first]; } if (((check & 4) != 0) && (ssCompare(&_sa[pa + getIndex(_sa[last - 1])], &_sa[pa + _sa[last]], depth) == 0)) { _sa[last] = ~_sa[last]; } const StackElement* se = _mergeStack->pop(); if 
(se == nullptr) return; first = se->_a; middle = se->_b; last = se->_c; check = se->_d; continue; } if (middle - first <= bufSize) { if (first < middle) ssMergeForward(pa, first, middle, last, buf, depth); if (((check & 1) != 0) || (((check & 2) != 0) && (ssCompare(&_sa[pa + getIndex(_sa[first - 1])], &_sa[pa + _sa[first]], depth) == 0))) { _sa[first] = ~_sa[first]; } if (((check & 4) != 0) && (ssCompare(&_sa[pa + getIndex(_sa[last - 1])], &_sa[pa + _sa[last]], depth) == 0)) { _sa[last] = ~_sa[last]; } const StackElement* se = _mergeStack->pop(); if (se == nullptr) return; first = se->_a; middle = se->_b; last = se->_c; check = se->_d; continue; } int len = (middle - first < last - middle) ? middle - first : last - middle; int m = 0; for (int half = len >> 1; len > 0; len = half, half >>= 1) { if (ssCompare(&_sa[pa + getIndex(_sa[middle + m + half])], &_sa[pa + getIndex(_sa[middle - m - half - 1])], depth) < 0) { m += (half + 1); half -= ((len & 1) ^ 1); } } if (m > 0) { int lm = middle - m; int rm = middle + m; ssBlockSwap(lm, middle, m); int l = middle; int r = l; int next = 0; if (rm < last) { if (_sa[rm] < 0) { _sa[rm] = ~_sa[rm]; if (first < lm) { l--; while (_sa[l] < 0) l--; next |= 4; } next |= 1; } else if (first < lm) { while (_sa[r] < 0) r++; next |= 2; } } if (l - first <= last - r) { _mergeStack->push(r, rm, last, (next & 3) | (check & 4), 0); middle = lm; last = l; check = (check & 3) | (next & 4); } else { if ((r == middle) && ((next & 2) != 0)) next ^= 6; _mergeStack->push(first, lm, l, (check & 3) | (next & 4), 0); first = r; middle = rm; check = (next & 3) | (check & 4); } } else { if (ssCompare(&_sa[pa + getIndex(_sa[middle - 1])], &_sa[pa + _sa[middle]], depth) == 0) { _sa[middle] = ~_sa[middle]; } if (((check & 1) != 0) || (((check & 2) != 0) && (ssCompare(&_sa[pa + getIndex(_sa[first - 1])], &_sa[pa + _sa[first]], depth) == 0))) { _sa[first] = ~_sa[first]; } if (((check & 4) != 0) && (ssCompare(&_sa[pa + getIndex(_sa[last - 1])], &_sa[pa + 
_sa[last]], depth) == 0)) { _sa[last] = ~_sa[last]; } const StackElement* se = _mergeStack->pop(); if (se == nullptr) return; first = se->_a; middle = se->_b; last = se->_c; check = se->_d; } } } void DivSufSort::ssMergeForward(int pa, int first, int middle, int last, int buf, int depth) { const int bufEnd = buf + middle - first - 1; ssBlockSwap(buf, first, middle - first); int a = first; int b = buf; int c = middle; const int t = _sa[a]; while (true) { const int r = ssCompare(&_sa[pa + _sa[b]], &_sa[pa + _sa[c]], depth); if (r < 0) { do { _sa[a++] = _sa[b]; if (bufEnd <= b) { _sa[bufEnd] = t; return; } _sa[b++] = _sa[a]; } while (_sa[b] < 0); } else if (r > 0) { do { _sa[a++] = _sa[c]; _sa[c++] = _sa[a]; if (last <= c) { while (b < bufEnd) { _sa[a++] = _sa[b]; _sa[b++] = _sa[a]; } _sa[a] = _sa[b]; _sa[b] = t; return; } } while (_sa[c] < 0); } else { _sa[c] = ~_sa[c]; do { _sa[a++] = _sa[b]; if (bufEnd <= b) { _sa[bufEnd] = t; return; } _sa[b++] = _sa[a]; } while (_sa[b] < 0); do { _sa[a++] = _sa[c]; _sa[c++] = _sa[a]; if (last <= c) { while (b < bufEnd) { _sa[a++] = _sa[b]; _sa[b++] = _sa[a]; } _sa[a] = _sa[b]; _sa[b] = t; return; } } while (_sa[c] < 0); } } } void DivSufSort::ssMergeBackward(int pa, int first, int middle, int last, int buf, int depth) { const int bufEnd = buf + last - middle - 1; ssBlockSwap(buf, middle, last - middle); int x = 0; int p1, p2; if (_sa[bufEnd] < 0) { p1 = pa + ~_sa[bufEnd]; x |= 1; } else p1 = pa + _sa[bufEnd]; if (_sa[middle - 1] < 0) { p2 = pa + ~_sa[middle - 1]; x |= 2; } else p2 = pa + _sa[middle - 1]; int a = last - 1; int b = bufEnd; int c = middle - 1; const int t = _sa[a]; while (true) { const int r = ssCompare(&_sa[p1], &_sa[p2], depth); if (r > 0) { if ((x & 1) != 0) { do { _sa[a--] = _sa[b]; _sa[b--] = _sa[a]; } while (_sa[b] < 0); x ^= 1; } _sa[a--] = _sa[b]; if (b <= buf) { _sa[buf] = t; break; } _sa[b--] = _sa[a]; if (_sa[b] < 0) { p1 = pa + ~_sa[b]; x |= 1; } else p1 = pa + _sa[b]; } else if (r < 0) { if ((x & 2) != 
0) { do { _sa[a--] = _sa[c]; _sa[c--] = _sa[a]; } while (_sa[c] < 0); x ^= 2; } _sa[a--] = _sa[c]; _sa[c--] = _sa[a]; if (c < first) { while (buf < b) { _sa[a--] = _sa[b]; _sa[b--] = _sa[a]; } _sa[a] = _sa[b]; _sa[b] = t; break; } if (_sa[c] < 0) { p2 = pa + ~_sa[c]; x |= 2; } else p2 = pa + _sa[c]; } else // r = 0 { if ((x & 1) != 0) { do { _sa[a--] = _sa[b]; _sa[b--] = _sa[a]; } while (_sa[b] < 0); x ^= 1; } _sa[a--] = ~_sa[b]; if (b <= buf) { _sa[buf] = t; break; } _sa[b--] = _sa[a]; if ((x & 2) != 0) { do { _sa[a--] = _sa[c]; _sa[c--] = _sa[a]; } while (_sa[c] < 0); x ^= 2; } _sa[a--] = _sa[c]; _sa[c--] = _sa[a]; if (c < first) { while (buf < b) { _sa[a--] = _sa[b]; _sa[b--] = _sa[a]; } _sa[a] = _sa[b]; _sa[b] = t; break; } if (_sa[b] < 0) { p1 = pa + ~_sa[b]; x |= 1; } else p1 = pa + _sa[b]; if (_sa[c] < 0) { p2 = pa + ~_sa[c]; x |= 2; } else p2 = pa + _sa[c]; } } } void DivSufSort::ssInsertionSort(const int pa, int first, int last, int depth) { for (int i = last - 2; i >= first; i--) { const int t = pa + _sa[i]; int j = i + 1; int r; while ((r = ssCompare(&_sa[t], &_sa[pa + _sa[j]], depth)) > 0) { do { _sa[j - 1] = _sa[j]; j++; } while ((j < last) && (_sa[j] < 0)); if (j >= last) break; } _sa[j] = r == 0 ? 
~_sa[j] : _sa[j]; _sa[j - 1] = t - pa; } } void DivSufSort::ssMultiKeyIntroSort(int pa, int first, int last, int depth) { const int* sapa = &_sa[pa]; int limit = ssIlg(last - first); int x = 0; while (true) { if (last - first <= SS_INSERTIONSORT_THRESHOLD) { if (last - first > 1) ssInsertionSort(pa, first, last, depth); const StackElement* se = _ssStack->pop(); if (se == nullptr) return; first = se->_a; last = se->_b; depth = se->_c; limit = se->_d; continue; } const int idx = depth; const uint8* p = &_buffer[idx]; if (limit == 0) ssHeapSort(idx, pa, first, last - first); limit--; int a; if (limit < 0) { int v = p[sapa[_sa[first]]]; for (a = first + 1; a < last; a++) { if ((x = p[sapa[_sa[a]]]) != v) { if (a - first > 1) break; v = x; first = a; } } if (p[sapa[_sa[first]] - 1] < v) first = ssPartition(pa, first, a, depth); if (a - first <= last - a) { if (a - first > 1) { _ssStack->push(a, last, depth, -1, 0); last = a; depth++; limit = ssIlg(a - first); } else { first = a; limit = -1; } } else { if (last - a > 1) { _ssStack->push(first, a, depth + 1, ssIlg(a - first), 0); first = a; limit = -1; } else { last = a; depth++; limit = ssIlg(a - first); } } continue; } // choose pivot a = ssPivot(idx, pa, first, last); const int v = p[sapa[_sa[a]]]; std::swap(_sa[first], _sa[a]); int b = first; // partition while (++b < last) { if ((x = p[sapa[_sa[b]]]) != v) break; } a = b; if ((a < last) && (x < v)) { while (++b < last) { if ((x = p[sapa[_sa[b]]]) > v) break; if (x == v) { std::swap(_sa[b], _sa[a]); a++; } } } int c = last; while (--c > b) { if ((x = p[sapa[_sa[c]]]) != v) break; } int d = c; if ((b < d) && (x > v)) { while (--c > b) { if ((x = p[sapa[_sa[c]]]) < v) break; if (x == v) { std::swap(_sa[c], _sa[d]); d--; } } } while (b < c) { std::swap(_sa[b], _sa[c]); while (++b < c) { if ((x = p[sapa[_sa[b]]]) > v) break; if (x == v) { std::swap(_sa[b], _sa[a]); a++; } } while (--c > b) { if ((x = p[sapa[_sa[c]]]) < v) break; if (x == v) { std::swap(_sa[c], _sa[d]); 
d--; } } } if (a <= d) { c = b - 1; int s = (a - first > b - a) ? b - a : a - first; for (int e = first, f = b - s; s > 0; s--, e++, f++) std::swap(_sa[e], _sa[f]); s = (d - c > last - d - 1) ? last - d - 1 : d - c; for (int e = b, f = last - s; s > 0; s--, e++, f++) std::swap(_sa[e], _sa[f]); a = first + (b - a); c = last - (d - c); b = (v <= p[sapa[_sa[a]] - 1]) ? a : ssPartition(pa, a, c, depth); if (a - first <= last - c) { if (last - c <= c - b) { _ssStack->push(b, c, depth + 1, ssIlg(c - b), 0); _ssStack->push(c, last, depth, limit, 0); last = a; } else if (a - first <= c - b) { _ssStack->push(c, last, depth, limit, 0); _ssStack->push(b, c, depth + 1, ssIlg(c - b), 0); last = a; } else { _ssStack->push(c, last, depth, limit, 0); _ssStack->push(first, a, depth, limit, 0); first = b; last = c; depth++; limit = ssIlg(c - b); } } else { if (a - first <= c - b) { _ssStack->push(b, c, depth + 1, ssIlg(c - b), 0); _ssStack->push(first, a, depth, limit, 0); first = c; } else if (last - c <= c - b) { _ssStack->push(first, a, depth, limit, 0); _ssStack->push(b, c, depth + 1, ssIlg(c - b), 0); first = c; } else { _ssStack->push(first, a, depth, limit, 0); _ssStack->push(c, last, depth, limit, 0); first = b; last = c; depth++; limit = ssIlg(c - b); } } } else { if (p[sapa[_sa[first]] - 1] < v) { first = ssPartition(pa, first, last, depth); limit = ssIlg(last - first); } else { limit++; } depth++; } } } int DivSufSort::ssPivot(int td, int pa, int first, int last) const { int t = last - first; int middle = first + (t >> 1); if (t <= 512) { return (t <= 32) ? 
ssMedian3(&_buffer[td], pa, first, middle, last - 1)
                         : ssMedian5(&_buffer[td], pa, first, first + (t >> 2), middle, last - 1 - (t >> 2), last - 1);
    }

    t >>= 3;
    first = ssMedian3(&_buffer[td], pa, first, first + t, first + (t << 1));
    middle = ssMedian3(&_buffer[td], pa, middle - t, middle, middle + t);
    last = ssMedian3(&_buffer[td], pa, last - 1 - (t << 1), last - 1 - t, last - 1);
    return ssMedian3(&_buffer[td], pa, first, middle, last);
}

// Picks one of the five positions v1..v5 of _sa with the comparison network
// below (median-of-five selection). The key of a position v is
// buf0[_sa[pa + _sa[v]]]. Returns the chosen position, not its key.
int DivSufSort::ssMedian5(const uint8 buf0[], int pa, int v1, int v2, int v3, int v4, int v5) const
{
    const int* buf1 = &_sa[pa];

    if (buf0[buf1[_sa[v2]]] > buf0[buf1[_sa[v3]]]) {
        std::swap(v2, v3);
    }

    if (buf0[buf1[_sa[v4]]] > buf0[buf1[_sa[v5]]]) {
        std::swap(v4, v5);
    }

    if (buf0[buf1[_sa[v2]]] > buf0[buf1[_sa[v4]]]) {
        // Half of the swap is enough: v2 is never read after this point.
        //std::swap(v2, v4);
        v4 = v2;
        std::swap(v3, v5);
    }

    if (buf0[buf1[_sa[v1]]] > buf0[buf1[_sa[v3]]]) {
        std::swap(v1, v3);
    }

    if (buf0[buf1[_sa[v1]]] > buf0[buf1[_sa[v4]]]) {
        // Only v3 and v4 are read below, so plain assignments replace the swaps.
        //std::swap(v1, v4);
        v4 = v1;
        //std::swap(v3, v5);
        v3 = v5;
    }

    return (buf0[buf1[_sa[v3]]] > buf0[buf1[_sa[v4]]]) ? v4 : v3;
}

// Returns whichever of the positions v1, v2, v3 has the median key, where the
// key of a position v is buf0[_sa[pa + _sa[v]]].
int DivSufSort::ssMedian3(const uint8 buf0[], int pa, int v1, int v2, int v3) const
{
    const int* buf1 = &_sa[pa];

    if (buf0[buf1[_sa[v1]]] > buf0[buf1[_sa[v2]]]) {
        std::swap(v1, v2);
    }

    if (buf0[buf1[_sa[v2]]] > buf0[buf1[_sa[v3]]]) {
        return (buf0[buf1[_sa[v1]]] > buf0[buf1[_sa[v3]]]) ? v1 : v3;
    }

    return v2;
}

// Partitions _sa[first..last) in place. An entry s passes the test when
// _sa[pa + s] + depth - 1 >= _sa[pa + 1 + s]. Passing entries are gathered at
// the front and stored bit-complemented; the others end up behind them.
// Returns the position of the first entry that did not pass.
int DivSufSort::ssPartition(int pa, int first, int last, int depth)
{
    int a = first - 1;
    int b = last;
    const int d = depth - 1;
    const int pb = pa + 1;

    while (true) {
        a++;

        // Advance over passing entries from the left, complementing each one.
        while ((a < b) && (_sa[pa + _sa[a]] + d >= _sa[pb + _sa[a]])) {
            _sa[a] = ~_sa[a];
            a++;
        }

        b--;

        // Retreat over failing entries from the right.
        while ((b > a) && (_sa[pa + _sa[b]] + d < _sa[pb + _sa[b]]))
            b--;

        if (b <= a)
            break;

        // Exchange the pair; the entry moved to the front is complemented.
        const int t = ~_sa[b];
        _sa[b] = _sa[a];
        _sa[a] = t;
    }

    // The first entry of a non-empty front part is restored to its plain value.
    if (first < a)
        _sa[first] = ~_sa[first];

    return a;
}

// Heap sort of the 'size' entries starting at _sa[saIdx], ordered by the key
// _buffer[idx + _sa[pa + entry]].
void DivSufSort::ssHeapSort(int idx, int pa, int saIdx, int size)
{
    int m = size;

    // For an even size the heap is built on the first size - 1 entries; the
    // last entry is first ordered against the entry at position m >> 1.
    if ((size & 1) == 0) {
        m--;

        if (_buffer[idx + _sa[pa + _sa[saIdx + (m >> 1)]]] < _buffer[idx + _sa[pa + _sa[saIdx + m]]])
            std::swap(_sa[saIdx + m], _sa[saIdx + (m >> 1)]);
    }

    // Build the heap.
    for (int i = (m >> 1) - 1; i >= 0; i--)
        ssFixDown(idx, pa, saIdx, i, m);

    if ((size & 1) == 0) {
        std::swap(_sa[saIdx], _sa[saIdx + m]);
        ssFixDown(idx, pa, saIdx, 0, m);
    }

    // Repeatedly move the root to the end of the shrinking heap.
    for (int i = m - 1; i > 0; i--) {
        const int t = _sa[saIdx];
        _sa[saIdx] = _sa[saIdx + i];
        ssFixDown(idx, pa, saIdx, 0, i);
        _sa[saIdx + i] = t;
    }
}

// Sift-down step of ssHeapSort: moves the entry at heap position i (relative
// to saIdx) down until neither child has a larger key.
void DivSufSort::ssFixDown(int idx, int pa, int saIdx, int i, int size)
{
    const int v = _sa[saIdx + i];
    const int c = _buffer[idx + _sa[pa + v]];
    int j = (i << 1) + 1;

    while (j < size) {
        int k = j;
        j++;
        int d = _buffer[idx + _sa[pa + _sa[saIdx + k]]];
        // NOTE(review): j is not re-checked against size before this read.
        const int e = _buffer[idx + _sa[pa + _sa[saIdx + j]]];

        if (d < e) {
            k = j;
            d = e;
        }

        if (d <= c)
            break;

        _sa[saIdx + i] = _sa[saIdx + k];
        i = k;
        j = (i << 1) + 1;
    }

    _sa[i + saIdx] = v;
}

// Tandem Repeat Sort
void DivSufSort::trSort(int n, int depth)
{
    TRBudget budget(trIlg(n) * 2 / 3, n);

    for (int isad = n + depth; _sa[0] > -n; isad += (isad - n)) {
        int first = 0;
        int skip = 0;
        int unsorted = 0;

        do {
            const int t = _sa[first];

            if (t < 0) {
                first -= t;
                skip += t;
                continue;
            }

            if (skip != 0) {
                _sa[first + skip] = skip;
                skip = 0;
            }

            const int last = _sa[n + t] + 1;

            if (last - first > 1) {
                budget._count = 0;
                trIntroSort(n, isad, first, last, budget);

                if (budget._count != 0)
                    unsorted +=
budget._count; else skip = first - last; } else if (last - first == 1) skip = -1; first = last; } while (first < n); if (skip != 0) _sa[first + skip] = skip; if (unsorted == 0) break; } } uint64 DivSufSort::trPartition(int isad, int first, int middle, int last, int v) { int x = 0; int b = middle; const int* p = &_sa[isad]; while (b < last) { x = p[ _sa[b]]; if (x != v) break; b++; } int a = b; if ((a < last) && (x < v)) { while (++b < last) { if ((x = p[_sa[b]]) > v) break; if (x == v) { std::swap(_sa[a], _sa[b]); a++; } } } int c = last - 1; while (c > b) { x = p[_sa[c]]; if (x != v) break; c--; } int d = c; if ((b < d) && (x > v)) { while (--c > b) { if ((x = p[_sa[c]]) < v) break; if (x == v) { std::swap(_sa[c], _sa[d]); d--; } } } while (b < c) { std::swap(_sa[c], _sa[b]); while ((++b < c) && ((x = p[_sa[b]]) <= v)) { if (x == v) { std::swap(_sa[a], _sa[b]); a++; } } while ((--c > b) && ((x = p[_sa[c]]) >= v)) { if (x == v) { std::swap(_sa[c], _sa[d]); d--; } } } if (a <= d) { c = b - 1; int s = a - first; if (s > b - a) s = b - a; for (int e = first, f = b - s; s > 0; s--, e++, f++) std::swap(_sa[e], _sa[f]); s = d - c; if (s >= last - d) s = last - d - 1; for (int e = b, f = last - s; s > 0; s--, e++, f++) std::swap(_sa[e], _sa[f]); first += (b - a); last -= (d - c); } return ((uint64(first) << 32) | (uint64(last) & uint64(0xFFFFFFFF))); } void DivSufSort::trIntroSort(int isa, int isad, int first, int last, TRBudget& budget) { const int incr = isad - isa; int limit = trIlg(last - first); int trlink = -1; while (true) { if (limit < 0) { if (limit == -1) { // tandem repeat partition uint64 res = trPartition(isad - incr, first, first, last, last - 1); const int a = int(res >> 32); const int b = int(res); // update ranks if (a < last) { for (int c = first, v = a - 1; c < a; c++) _sa[isa + _sa[c]] = v; } if (b < last) { for (int c = a, v = b - 1; c < b; c++) _sa[isa + _sa[c]] = v; } // push if (b - a > 1) { _trStack->push(0, a, b, 0, 0); _trStack->push(isad - 
incr, first, last, -2, trlink); trlink = _trStack->size() - 2; } if (a - first <= last - b) { if (a - first > 1) { _trStack->push(isad, b, last, trIlg(last - b), trlink); last = a; limit = trIlg(a - first); } else if (last - b > 1) { first = b; limit = trIlg(last - b); } else { const StackElement* se = _trStack->pop(); if (se == nullptr) return; isad = se->_a; first = se->_b; last = se->_c; limit = se->_d; trlink = se->_e; } } else { if (last - b > 1) { _trStack->push(isad, first, a, trIlg(a - first), trlink); first = b; limit = trIlg(last - b); } else if (a - first > 1) { last = a; limit = trIlg(a - first); } else { const StackElement* se = _trStack->pop(); if (se == nullptr) return; isad = se->_a; first = se->_b; last = se->_c; limit = se->_d; trlink = se->_e; } } } else if (limit == -2) { // tandem repeat copy const StackElement* se = _trStack->pop(); if (se == nullptr) return; if (se->_d == 0) { trCopy(isa, first, se->_b, se->_c, last, isad - isa); } else { if (trlink >= 0) _trStack->get(trlink)->_d = -1; trPartialCopy(isa, first, se->_b, se->_c, last, isad - isa); } se = _trStack->pop(); if (se == nullptr) return; isad = se->_a; first = se->_b; last = se->_c; limit = se->_d; trlink = se->_e; } else { // sorted partition if (_sa[first] >= 0) { int a = first; do { _sa[isa + _sa[a]] = a; a++; } while ((a < last) && (_sa[a] >= 0)); first = a; } if (first < last) { int a = first; do { _sa[a] = ~_sa[a]; a++; } while (_sa[a] < 0); int next = (_sa[isa + _sa[a]] != _sa[isad + _sa[a]]) ? 
trIlg(a - first + 1) : -1; a++; if (a < last) { const int v = a - 1; for (int b = first; b < a; b++) _sa[isa + _sa[b]] = v; } // push if (budget.check(a - first) == true) { if (a - first <= last - a) { _trStack->push(isad, a, last, -3, trlink); isad += incr; last = a; limit = next; } else { if (last - a > 1) { _trStack->push(isad + incr, first, a, next, trlink); first = a; limit = -3; } else { isad += incr; last = a; limit = next; } } } else { if (trlink >= 0) _trStack->get(trlink)->_d = -1; if (last - a > 1) { first = a; limit = -3; } else { const StackElement* se = _trStack->pop(); if (se == nullptr) return; isad = se->_a; first = se->_b; last = se->_c; limit = se->_d; trlink = se->_e; } } } else { const StackElement* se = _trStack->pop(); if (se == nullptr) return; isad = se->_a; first = se->_b; last = se->_c; limit = se->_d; trlink = se->_e; } } continue; } if (last - first <= TR_INSERTIONSORT_THRESHOLD) { trInsertionSort(&_sa[isad], first, last); limit = -3; continue; } if (limit == 0) { trHeapSort(isad, first, last - first); int a = last - 1; while (first < a) { int b = a - 1; for (int x = _sa[isad + _sa[a]]; (first <= b) && (_sa[isad + _sa[b]] == x); b--) _sa[b] = ~_sa[b]; a = b; } limit = -3; continue; } limit--; // choose pivot std::swap(_sa[first], _sa[trPivot(_sa, isad, first, last)]); int v = _sa[isad + _sa[first]]; // partition uint64 res = trPartition(isad, first, first + 1, last, v); const int a = int(res >> 32); const int b = int(res & 0xFFFFFFFFL); if (last - first != b - a) { const int next = (_sa[isa + _sa[a]] != v) ? 
trIlg(b - a) : -1; v = a - 1; // update ranks for (int c = first; c < a; c++) _sa[isa + _sa[c]] = v; if (b < last) { v = b - 1; for (int c = a; c < b; c++) _sa[isa + _sa[c]] = v; } // push if ((b - a > 1) && (budget.check(b - a) == true)) { if (a - first <= last - b) { if (last - b <= b - a) { if (a - first > 1) { _trStack->push(isad + incr, a, b, next, trlink); _trStack->push(isad, b, last, limit, trlink); last = a; } else if (last - b > 1) { _trStack->push(isad + incr, a, b, next, trlink); first = b; } else { isad += incr; first = a; last = b; limit = next; } } else if (a - first <= b - a) { if (a - first > 1) { _trStack->push(isad, b, last, limit, trlink); _trStack->push(isad + incr, a, b, next, trlink); last = a; } else { _trStack->push(isad, b, last, limit, trlink); isad += incr; first = a; last = b; limit = next; } } else { _trStack->push(isad, b, last, limit, trlink); _trStack->push(isad, first, a, limit, trlink); isad += incr; first = a; last = b; limit = next; } } else { if (a - first <= b - a) { if (last - b > 1) { _trStack->push(isad + incr, a, b, next, trlink); _trStack->push(isad, first, a, limit, trlink); first = b; } else if (a - first > 1) { _trStack->push(isad + incr, a, b, next, trlink); last = a; } else { isad += incr; first = a; last = b; limit = next; } } else if (last - b <= b - a) { if (last - b > 1) { _trStack->push(isad, first, a, limit, trlink); _trStack->push(isad + incr, a, b, next, trlink); first = b; } else { _trStack->push(isad, first, a, limit, trlink); isad += incr; first = a; last = b; limit = next; } } else { _trStack->push(isad, first, a, limit, trlink); _trStack->push(isad, b, last, limit, trlink); isad += incr; first = a; last = b; limit = next; } } } else { if ((b - a > 1) && (trlink >= 0)) _trStack->get(trlink)->_d = -1; if (a - first <= last - b) { if (a - first > 1) { _trStack->push(isad, b, last, limit, trlink); last = a; } else if (last - b > 1) { first = b; } else { const StackElement* se = _trStack->pop(); if (se == 
nullptr) return; isad = se->_a; first = se->_b; last = se->_c; limit = se->_d; trlink = se->_e; } } else { if (last - b > 1) { _trStack->push(isad, first, a, limit, trlink); first = b; } else if (a - first > 1) { last = a; } else { const StackElement* se = _trStack->pop(); if (se == nullptr) return; isad = se->_a; first = se->_b; last = se->_c; limit = se->_d; trlink = se->_e; } } } } else { if (budget.check(last - first) == true) { limit = trIlg(last - first); isad += incr; } else { if (trlink >= 0) _trStack->get(trlink)->_d = -1; const StackElement* se = _trStack->pop(); if (se == nullptr) return; isad = se->_a; first = se->_b; last = se->_c; limit = se->_d; trlink = se->_e; } } } } int DivSufSort::trPivot(const int arr[], int isad, int first, int last) const { int t = last - first; int middle = first + (t >> 1); if (t <= 512) { if (t <= 32) return trMedian3(arr, isad, first, middle, last - 1); t >>= 2; return trMedian5(arr, isad, first, first + t, middle, last - 1 - t, last - 1); } t >>= 3; first = trMedian3(arr, isad, first, first + t, first + (t << 1)); middle = trMedian3(arr, isad, middle - t, middle, middle + t); last = trMedian3(arr, isad, last - 1 - (t << 1), last - 1 - t, last - 1); return trMedian3(arr, isad, first, middle, last); } void DivSufSort::trHeapSort(int isad, int saIdx, int size) { int m = size; if ((size & 1) == 0) { m--; if (_sa[isad + _sa[saIdx + (m >> 1)]] < _sa[isad + _sa[saIdx + m]]) std::swap(_sa[saIdx + m], _sa[saIdx + (m >> 1)]); } for (int i = (m >> 1) - 1; i >= 0; i--) trFixDown(isad, saIdx, i, m); if ((size & 1) == 0) { std::swap(_sa[saIdx], _sa[saIdx + m]); trFixDown(isad, saIdx, 0, m); } for (int i = m - 1; i > 0; i--) { const int t = _sa[saIdx]; _sa[saIdx] = _sa[saIdx + i]; trFixDown(isad, saIdx, 0, i); _sa[saIdx + i] = t; } } void DivSufSort::trFixDown(int isad, int saIdx, int i, int size) { const int v = _sa[saIdx + i]; const int c = _sa[isad + v]; int j = (i << 1) + 1; while (j < size) { int k = j; j++; int d = _sa[isad + 
_sa[saIdx + k]];
        const int e = _sa[isad + _sa[saIdx + j]];

        if (d < e) {
            k = j;
            d = e;
        }

        if (d <= c)
            break;

        _sa[saIdx + i] = _sa[saIdx + k];
        i = k;
        j = (i << 1) + 1;
    }

    _sa[saIdx + i] = v;
}

// Insertion sort of _sa[first..last) by the key arr[entry].
// While shifting, runs of negative (bit-complemented) entries are moved along
// with the non-negative entry that precedes them. When the inserted key equals
// the key of the entry it stops behind, that entry is complemented.
void DivSufSort::trInsertionSort(const int arr[], int first, int last)
{
    for (int a = first + 1; a < last; a++) {
        int b = a - 1;
        const int t = _sa[a];
        int r;

        while ((r = arr[t] - arr[_sa[b]]) < 0) {
            do {
                _sa[b + 1] = _sa[b];
                b--;
            } while ((b >= first) && (_sa[b] < 0));

            if (b < first)
                break;
        }

        // r is negative whenever the loop left through 'b < first', so _sa[b]
        // is only touched while b is still inside the range.
        if (r == 0)
            _sa[b] = ~_sa[b];

        _sa[b + 1] = t;
    }
}

// Fills the gap between positions a and b of _sa from the entries around it.
// An entry x contributes s = x - depth when s >= 0 and its rank _sa[isa + s]
// equals v = b - 1. Consecutive contributions whose rank at isa + s + depth is
// the same share one new rank (the slot of the first of them).
void DivSufSort::trPartialCopy(int isa, int first, int a, int b, int last, int depth)
{
    const int v = b - 1;
    int lastRank = -1;
    int newRank = -1;
    int d = a - 1;

    // Forward scan: d grows as entries are appended, so the scan also visits
    // the entries it has just written.
    for (int c = first; c <= d; c++) {
        const int s = _sa[c] - depth;

        if ((s >= 0) && (_sa[isa + s] == v)) {
            d++;
            _sa[d] = s;
            const int rank = _sa[isa + s + depth];

            if (lastRank != rank) {
                lastRank = rank;
                newRank = d;
            }

            _sa[isa + s] = newRank;
        }
    }

    lastRank = -1;

    // Backward pass over first..d: each run of equal ranks takes the position
    // of its last entry as rank.
    for (int e = d; first <= e; e--) {
        const int rank = _sa[isa + _sa[e]];

        if (lastRank != rank) {
            lastRank = rank;
            newRank = e;
        }

        if (newRank != rank) {
            _sa[isa + _sa[e]] = newRank;
        }
    }

    lastRank = -1;
    const int e = d + 1;
    d = b;

    // Backward scan from last - 1, filling slots downwards from b - 1 until
    // the slot after the forward-filled region is reached.
    for (int c = last - 1; d > e; c--) {
        const int s = _sa[c] - depth;

        if ((s >= 0) && (_sa[isa + s] == v)) {
            d--;
            _sa[d] = s;
            const int rank = _sa[isa + s + depth];

            if (lastRank != rank) {
                lastRank = rank;
                newRank = d;
            }

            _sa[isa + s] = newRank;
        }
    }
}

// Same gap filling as trPartialCopy, but every copied entry receives its own
// slot index as rank.
void DivSufSort::trCopy(int isa, int first, int a, int b, int last, int depth)
{
    const int v = b - 1;
    int d = a - 1;

    // Forward scan; d grows as entries are appended.
    for (int c = first; c <= d; c++) {
        const int s = _sa[c] - depth;

        if ((s >= 0) && (_sa[isa + s] == v)) {
            d++;
            _sa[d] = s;
            _sa[isa + s] = d;
        }
    }

    const int e = d + 1;
    d = b;

    // Backward scan from last - 1, filling slots downwards from b - 1.
    for (int c = last - 1; d > e; c--) {
        const int s = _sa[c] - depth;

        if ((s >= 0) && (_sa[isa + s] == v)) {
            d--;
            _sa[d] = s;
            _sa[isa + s] = d;
        }
    }
}
kanzi-cpp-2.5.2/src/transform/DivSufSort.hpp000066400000000000000000000213331516423635400210120ustar00rootroot00000000000000/* Copyright
2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #pragma once #ifndef knz_DivSufSort #define knz_DivSufSort #include "../types.hpp" #if __cplusplus >= 201103L #include #else #include #endif namespace kanzi { // DivSufSort is a fast two-stage suffix sorting algorithm by Yuta Mori. // The original C code is here: https://code.google.com/p/libdivsufsort/ // See also https://code.google.com/p/libdivsufsort/source/browse/wiki/SACA_Benchmarks.wiki // for comparison of different suffix array construction algorithms. // It is used to implement the forward stage of the BWT in linear time. struct StackElement { int _a, _b, _c, _d, _e; StackElement() { _a = _b = _c = _d = _e = 0; } }; // A stack of pre-allocated elements class Stack { friend class DivSufSort; private: StackElement* _arr; int _index; Stack(int size) { _arr = new StackElement[size]; _index = 0; } ~Stack() { delete[] _arr; } StackElement* get(int idx) const { return &_arr[idx]; } int size() const { return _index; } void push(int a, int b, int c, int d, int e) { StackElement* elt = &_arr[_index]; elt->_a = a; elt->_b = b; elt->_c = c; elt->_d = d; elt->_e = e; _index++; } StackElement* pop() { return (_index == 0) ? 
nullptr : &_arr[--_index]; } }; struct TRBudget { int _chance; int _remain; int _incVal; int _count; TRBudget(int chance, int incval) : _chance(chance) , _remain(incval) { _incVal = incval; _count = 0; } bool check(int size); }; inline bool TRBudget::check(int size) { if (size <= _remain) { _remain -= size; return true; } if (_chance == 0) { _count += size; return false; } _remain += (_incVal - size); _chance--; return true; } class DivSufSort { private: static const int SS_INSERTIONSORT_THRESHOLD; static const int SS_BLOCKSIZE; static const int SS_MISORT_STACKSIZE; static const int SS_SMERGE_STACKSIZE; static const int TR_STACKSIZE; static const int TR_INSERTIONSORT_THRESHOLD; static const int SQQ_TABLE[]; static const int LOG_TABLE[]; int* _sa; const uint8* _buffer; Stack* _ssStack; Stack* _trStack; Stack* _mergeStack; int _bucketA[256]; int _bucketB[65536]; void constructSuffixArray(int bucketA[], int bucketB[], int n, int m); int constructBWT(int bucketA[], int bucketB[], int n, int m, int indexes[], int idxCount); int sortTypeBstar(int bucketA[], int bucketB[], int n); void ssSort(int pa, int first, int last, int buf, int bufSize, int depth, int n, bool lastSuffix); int ssCompare(int pa, int pb, int p2, const int depth) const; int ssCompare(const int s1[], const int s2[], const int depth) const; void ssInplaceMerge(int pa, int first, int middle, int last, int depth); void ssRotate(int first, int middle, int last); void ssBlockSwap(int a, int b, int n); static int getIndex(int a) { return (a >= 0) ? 
a : ~a; } void ssSwapMerge(int pa, int first, int middle, int last, int buf, int bufSize, int depth); void ssMergeForward(int pa, int first, int middle, int last, int buf, int depth); void ssMergeBackward(int pa, int first, int middle, int last, int buf, int depth); void ssInsertionSort(int pa, int first, int last, int depth); int ssIsqrt(int x) const; void ssMultiKeyIntroSort(int pa, int first, int last, int depth); int ssPivot(int td, int pa, int first, int last) const; int ssMedian5(const uint8 buf[], int pa, int v1, int v2, int v3, int v4, int v5) const; int ssMedian3(const uint8 buf[], int pa, int v1, int v2, int v3) const; int ssPartition(int pa, int first, int last, int depth); void ssHeapSort(int idx, int pa, int saIdx, int size); void ssFixDown(int idx, int pa, int saIdx, int i, int size); static int ssIlg(int n); void trSort(int n, int depth); uint64 trPartition(int isad, int first, int middle, int last, int v); void trIntroSort(int isa, int isad, int first, int last, TRBudget& budget); int trPivot(const int arr[], int isad, int first, int last) const; int trMedian5(const int arr[], int isad, int v1, int v2, int v3, int v4, int v5) const; int trMedian3(const int arr[], int isad, int v1, int v2, int v3) const; void trHeapSort(int isad, int saIdx, int size); void trFixDown(int isad, int saIdx, int i, int size); void trInsertionSort(const int arr[], int first, int last); void trPartialCopy(int isa, int first, int a, int b, int last, int depth); void trCopy(int isa, int first, int a, int b, int last, int depth); void reset(); int trIlg(int n) const; public: DivSufSort(); ~DivSufSort(); bool computeSuffixArray(const byte input[], int sa[], int length); bool computeBWT(const byte input[], byte output[], int sa[], int length, int indexes[], int idxCount = 8); }; inline int DivSufSort::ssIlg(int n) { return (n > 255) ? 
8 + LOG_TABLE[n >> 8] : LOG_TABLE[n & 0xFF]; } inline void DivSufSort::ssBlockSwap(int a, int b, int n) { while (n-- > 0) { std::swap(_sa[a], _sa[b]); a++; b++; } } inline int DivSufSort::trIlg(int n) const { return ((n & 0xFFFF0000) != 0) ? (((n & 0xFF000000) != 0) ? 24 + LOG_TABLE[(n >> 24) & 0xFF] : 16 + LOG_TABLE[(n >> 16) & 0xFF]) : (((n & 0x0000FF00) != 0) ? 8 + LOG_TABLE[(n >> 8) & 0xFF] : LOG_TABLE[n & 0xFF]); } inline int DivSufSort::trMedian5(const int sa[], int isad, int v1, int v2, int v3, int v4, int v5) const { if (sa[isad + sa[v2]] > sa[isad + sa[v3]]) { std::swap(v2, v3); } if (sa[isad + sa[v4]] > sa[isad + sa[v5]]) { const int t = v4; v4 = v5; v5 = t; } if (sa[isad + sa[v2]] > sa[isad + sa[v4]]) { std::swap(v2, v4); std::swap(v3, v5); } if (sa[isad + sa[v1]] > sa[isad + sa[v3]]) { std::swap(v1, v3); } if (sa[isad + sa[v1]] > sa[isad + sa[v4]]) { std::swap(v1, v4); std::swap(v3, v5); } if (sa[isad + sa[v3]] > sa[isad + sa[v4]]) return v4; return v3; } inline int DivSufSort::trMedian3(const int sa[], int isad, int v1, int v2, int v3) const { if (sa[isad + sa[v1]] > sa[isad + sa[v2]]) { std::swap(v1, v2); } if (sa[isad + sa[v2]] > sa[isad + sa[v3]]) { if (sa[isad + sa[v1]] > sa[isad + sa[v3]]) return v1; return v3; } return v2; } inline int DivSufSort::ssIsqrt(int x) const { if (x >= (SS_BLOCKSIZE * SS_BLOCKSIZE)) return SS_BLOCKSIZE; const int e = ((x & 0xFFFF0000) != 0) ? (((x & 0xFF000000) != 0) ? 24 + LOG_TABLE[(x >> 24) & 0xFF] : 16 + LOG_TABLE[(x >> 16) & 0xFF]) : (((x & 0x0000FF00) != 0) ? 8 + LOG_TABLE[(x >> 8) & 0xFF] : LOG_TABLE[x & 0xFF]); if (e < 8) return SQQ_TABLE[x] >> 4; int y; if (e >= 16) { y = SQQ_TABLE[x >> ((e - 6) - (e & 1))] << ((e >> 1) - 7); if (e >= 24) { y = (y + 1 + x / y) >> 1; } y = (y + 1 + x / y) >> 1; } else { y = (SQQ_TABLE[x >> ((e - 6) - (e & 1))] >> (7 - (e >> 1))) + 1; } return (x < y * y) ? 
y - 1 : y; } } #endif kanzi-cpp-2.5.2/src/transform/EXECodec.cpp000066400000000000000000000643751516423635400203310ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include #include "../Global.hpp" #include "../Magic.hpp" #include "EXECodec.hpp" using namespace kanzi; using namespace std; const kanzi::byte EXECodec::X86_MASK_JUMP = kanzi::byte(0xFE); const kanzi::byte EXECodec::X86_INSTRUCTION_JUMP = kanzi::byte(0xE8); const kanzi::byte EXECodec::X86_INSTRUCTION_JCC = kanzi::byte(0x80); const kanzi::byte EXECodec::X86_TWO_BYTE_PREFIX = kanzi::byte(0x0F); const kanzi::byte EXECodec::X86_MASK_JCC = kanzi::byte(0xF0); const kanzi::byte EXECodec::X86_ESCAPE = kanzi::byte(0x9B); const kanzi::byte EXECodec::NOT_EXE = kanzi::byte(0x80); const kanzi::byte EXECodec::X86 = kanzi::byte(0x40); const kanzi::byte EXECodec::ARM64 = kanzi::byte(0x20); const kanzi::byte EXECodec::MASK_DT = kanzi::byte(0x0F); const int EXECodec::X86_ADDR_MASK = (1 << 24) - 1; const int EXECodec::MASK_ADDRESS = 0xF0F0F0F0; const int EXECodec::ARM_B_ADDR_MASK = (1 << 26) - 1; const int EXECodec::ARM_B_OPCODE_MASK = 0xFFFFFFFF ^ ARM_B_ADDR_MASK; const int EXECodec::ARM_B_ADDR_SGN_MASK = 1 << 25; const int EXECodec::ARM_OPCODE_B = 0x14000000; // 6 bit opcode const int EXECodec::ARM_OPCODE_BL = 0x94000000; // 6 bit opcode const int EXECodec::ARM_CB_REG_BITS = 5; // lowest bits for register const int EXECodec::ARM_CB_ADDR_MASK = 0x00FFFFE0; // 18 bit addr mask const int 
EXECodec::ARM_CB_ADDR_SGN_MASK = 1 << 18; const int EXECodec::ARM_CB_OPCODE_MASK = 0x7F000000; const int EXECodec::ARM_OPCODE_CBZ = 0x34000000; // 8 bit opcode const int EXECodec::ARM_OPCODE_CBNZ = 0x35000000; // 8 bit opcode const int EXECodec::WIN_PE = 0x00004550; const uint16 EXECodec::WIN_X86_ARCH = 0x014C; const uint16 EXECodec::WIN_AMD64_ARCH = 0x8664; const uint16 EXECodec::WIN_ARM64_ARCH = 0xAA64; const int EXECodec::ELF_X86_ARCH = 0x03; const int EXECodec::ELF_AMD64_ARCH = 0x3E; const int EXECodec::ELF_ARM64_ARCH = 0xB7; const int EXECodec::MAC_AMD64_ARCH = 0x01000007; const int EXECodec::MAC_ARM64_ARCH = 0x0100000C; const int EXECodec::MAC_MH_EXECUTE = 0x02; const int EXECodec::MAC_LC_SEGMENT = 0x01; const int EXECodec::MAC_LC_SEGMENT64 = 0x19; const int EXECodec::MIN_BLOCK_SIZE = 4096; const int EXECodec::MAX_BLOCK_SIZE = (1 << (26 + 2)) - 1; // max offset << 2 bool EXECodec::forward(SliceArray& input, SliceArray& output, int count) { if (count == 0) return true; if ((count < MIN_BLOCK_SIZE) || (count > MAX_BLOCK_SIZE)) return false; if (!SliceArray::isValid(input)) throw std::invalid_argument("EXECodec: Invalid input block"); if (!SliceArray::isValid(output)) throw std::invalid_argument("EXECodec: Invalid output block"); if (output._length - output._index < getMaxEncodedLength(count)) return false; if (_pCtx != nullptr) { Global::DataType dt = (Global::DataType)_pCtx->getInt("dataType", Global::UNDEFINED); if ((dt != Global::UNDEFINED) && (dt != Global::EXE) && (dt != Global::BIN)) return false; } int codeStart = 0; int codeEnd = count - 8; kanzi::byte mode = detectType(&input._array[input._index], count - 4, codeStart, codeEnd); if ((mode & NOT_EXE) != kanzi::byte(0)) { if (_pCtx != nullptr) _pCtx->putInt("dataType", Global::DataType(mode & MASK_DT)); return false; } mode &= ~MASK_DT; if (_pCtx != nullptr) _pCtx->putInt("dataType", Global::EXE); if (mode == X86) return forwardX86(input, output, count, codeStart, codeEnd); if (mode == ARM64) return 
forwardARM(input, output, count, codeStart, codeEnd); return false; } bool EXECodec::forwardX86(SliceArray& input, SliceArray& output, int count, int codeStart, int codeEnd) { const kanzi::byte* src = &input._array[input._index]; kanzi::byte* dst = &output._array[output._index]; dst[0] = X86; int srcIdx = codeStart; int dstIdx = 9; int matches = 0; const int dstEnd = output._length - 5; bool boundaryReached = false; if ((codeStart < 0) || (codeStart > count) || (dstIdx + codeStart > output._length)) return false; if ((codeEnd < codeStart) || (codeEnd > count)) return false; if (codeStart > 0) { memcpy(&dst[dstIdx], &src[0], size_t(codeStart)); dstIdx += codeStart; } while ((srcIdx < codeEnd) && (dstIdx < dstEnd)) { if (src[srcIdx] == X86_TWO_BYTE_PREFIX) { if (srcIdx + 1 >= codeEnd) { boundaryReached = true; break; } dst[dstIdx++] = src[srcIdx++]; if ((src[srcIdx] & X86_MASK_JCC) != X86_INSTRUCTION_JCC) { // Not a relative jump if (src[srcIdx] == X86_ESCAPE) dst[dstIdx++] = X86_ESCAPE; dst[dstIdx++] = src[srcIdx++]; continue; } if (srcIdx + 4 >= codeEnd) { boundaryReached = true; break; } } else if ((src[srcIdx] & X86_MASK_JUMP) != X86_INSTRUCTION_JUMP) { // Not a relative call if (src[srcIdx] == X86_ESCAPE) dst[dstIdx++] = X86_ESCAPE; dst[dstIdx++] = src[srcIdx++]; continue; } else if (srcIdx + 4 >= codeEnd) { boundaryReached = true; break; } // Current instruction is a jump/call. const int sgn = int(src[srcIdx + 4]); const int offset = LittleEndian::readInt32(&src[srcIdx + 1]); if (((sgn != 0) && (sgn != 0xFF)) || (offset == int(0xFF000000))) { dst[dstIdx++] = X86_ESCAPE; dst[dstIdx++] = src[srcIdx++]; continue; } // Absolute target address = srcIdx + 5 + offset. Let us ignore the +5 const int addr = srcIdx + ((sgn == 0) ? 
offset : -(-offset & X86_ADDR_MASK)); dst[dstIdx++] = src[srcIdx++]; BigEndian::writeInt32(&dst[dstIdx], addr ^ MASK_ADDRESS); srcIdx += 4; dstIdx += 4; matches++; } if ((matches < 16) || ((srcIdx < codeEnd) && (boundaryReached == false))) return false; if (dstIdx + (count - srcIdx) > dstEnd) return false; LittleEndian::writeInt32(&dst[1], codeStart); LittleEndian::writeInt32(&dst[5], dstIdx); memcpy(&dst[dstIdx], &src[srcIdx], size_t(count - srcIdx)); dstIdx += (count - srcIdx); // Cap expansion due to false positives if (dstIdx > count + (count / 50)) return false; input._index += count; output._index += dstIdx; return true; } bool EXECodec::forwardARM(SliceArray& input, SliceArray& output, int count, int codeStart, int codeEnd) { const kanzi::byte* src = &input._array[input._index]; kanzi::byte* dst = &output._array[output._index]; dst[0] = ARM64; int srcIdx = codeStart; int dstIdx = 9; int matches = 0; const int dstEnd = output._length - 8; if ((codeStart < 0) || (codeStart > count) || (dstIdx + codeStart > output._length)) return false; if ((codeEnd < codeStart) || (codeEnd > count)) return false; if (codeStart > 0) { memcpy(&dst[dstIdx], &src[0], size_t(codeStart)); dstIdx += codeStart; } while ((srcIdx < codeEnd) && (dstIdx < dstEnd)) { const int instr = LittleEndian::readInt32(&src[srcIdx]); const int opcode1 = instr & ARM_B_OPCODE_MASK; //const int opcode2 = instr & ARM_CB_OPCODE_MASK; bool isBL = (opcode1 == ARM_OPCODE_B) || (opcode1 == ARM_OPCODE_BL); // unconditional jump bool isCB = false; // disable for now ... 
isCB = (opcode2 == ARM_OPCODE_CBZ) || (opcode2 == ARM_OPCODE_CBNZ); // conditional jump if ((isBL == false) && (isCB == false)) { // Not a relative jump memcpy(&dst[dstIdx], &src[srcIdx], 4); srcIdx += 4; dstIdx += 4; continue; } int addr, val; if (isBL == true) { // opcode(6) + sgn(1) + offset(25) // Absolute target address = srcIdx +/- (offset*4) const int offset = instr & ARM_B_ADDR_MASK; const int sgn = instr & ARM_B_ADDR_SGN_MASK; addr = srcIdx + 4 * ((sgn == 0) ? offset : -(-offset & ARM_B_ADDR_MASK)); if (addr < 0) addr = 0; val = opcode1 | (addr >> 2); } else { // isCB == true // opcode(8) + sgn(1) + offset(18) + register(5) // Absolute target address = srcIdx +/- (offset*4) const int offset = (instr & ARM_CB_ADDR_MASK) >> ARM_CB_REG_BITS; const int sgn = instr & ARM_CB_ADDR_SGN_MASK; addr = srcIdx + 4 * ((sgn == 0) ? offset : -(-offset & ARM_B_ADDR_MASK)); if (addr < 0) addr = 0; val = (instr & ~ARM_CB_ADDR_MASK) | ((addr >> 2) << ARM_CB_REG_BITS); } if (addr == 0) { LittleEndian::writeInt32(&dst[dstIdx], val); // 0 address as escape memcpy(&dst[dstIdx + 4], &src[srcIdx], 4); srcIdx += 4; dstIdx += 8; continue; } LittleEndian::writeInt32(&dst[dstIdx], val); srcIdx += 4; dstIdx += 4; matches++; } if ((srcIdx < codeEnd) || (matches < 16)) return false; if (dstIdx + (count - srcIdx) > dstEnd) return false; LittleEndian::writeInt32(&dst[1], codeStart); LittleEndian::writeInt32(&dst[5], dstIdx); memcpy(&dst[dstIdx], &src[srcIdx], size_t(count - srcIdx)); dstIdx += (count - srcIdx); // Cap expansion due to false positives if (dstIdx > count + (count / 50)) return false; input._index += count; output._index += dstIdx; return true; } bool EXECodec::inverse(SliceArray& input, SliceArray& output, int count) { if (count == 0) return true; if ((count < 9) || (count > input._length - input._index)) return false; if (!SliceArray::isValid(input)) throw std::invalid_argument("EXECodec: Invalid input block"); if (!SliceArray::isValid(output)) throw 
std::invalid_argument("EXECodec: Invalid output block"); if (output._length - output._index < count - 9) return false; kanzi::byte mode = input._array[input._index]; if (mode == X86) return inverseX86(input, output, count); if (mode == ARM64) return inverseARM(input, output, count); return false; } bool EXECodec::inverseX86(SliceArray& input, SliceArray& output, int count) { const kanzi::byte* src = &input._array[input._index]; kanzi::byte* dst = &output._array[output._index]; int srcIdx = 9; int dstIdx = 0; const int dstEnd = output._length - output._index; const int codeStart = LittleEndian::readInt32(&src[1]); const int codeEnd = LittleEndian::readInt32(&src[5]); // Sanity check if ((codeStart < 0) || (codeEnd < srcIdx) || (codeEnd > count) || (codeStart > codeEnd - srcIdx) || (codeStart > dstEnd - dstIdx)) return false; if (codeStart > 0) { memcpy(&dst[dstIdx], &src[srcIdx], size_t(codeStart)); dstIdx += codeStart; srcIdx += codeStart; } while (srcIdx < codeEnd) { if (src[srcIdx] == X86_TWO_BYTE_PREFIX) { if (srcIdx + 1 >= codeEnd) return false; dst[dstIdx++] = src[srcIdx++]; if ((src[srcIdx] & X86_MASK_JCC) != X86_INSTRUCTION_JCC) { // Not a relative jump if (src[srcIdx] == X86_ESCAPE) { srcIdx++; if (srcIdx >= codeEnd) return false; } dst[dstIdx++] = src[srcIdx++]; continue; } } else if ((src[srcIdx] & X86_MASK_JUMP) != X86_INSTRUCTION_JUMP) { // Not a relative call if (src[srcIdx] == X86_ESCAPE) { srcIdx++; if (srcIdx >= codeEnd) return false; } dst[dstIdx++] = src[srcIdx++]; continue; } if (srcIdx + 4 >= codeEnd) return false; // Current instruction is a jump/call. Decode absolute address const int addr = BigEndian::readInt32(&src[srcIdx + 1]) ^ MASK_ADDRESS; const int offset = addr - dstIdx; dst[dstIdx++] = src[srcIdx++]; LittleEndian::writeInt32(&dst[dstIdx], (offset >= 0) ? 
offset : -(-offset & X86_ADDR_MASK)); srcIdx += 4; dstIdx += 4; } if (srcIdx < count) { memcpy(&dst[dstIdx], &src[srcIdx], size_t(count - srcIdx)); dstIdx += (count - srcIdx); } input._index += count; output._index += dstIdx; return true; } bool EXECodec::inverseARM(SliceArray& input, SliceArray& output, int count) { const kanzi::byte* src = &input._array[input._index]; kanzi::byte* dst = &output._array[output._index]; int srcIdx = 9; int dstIdx = 0; const int dstEnd = output._length - output._index; const int codeStart = LittleEndian::readInt32(&src[1]); const int codeEnd = LittleEndian::readInt32(&src[5]); // Sanity check if ((codeStart < 0) || (codeEnd < srcIdx) || (codeEnd > count) || (codeStart > codeEnd - srcIdx) || (codeStart > dstEnd - dstIdx)) return false; if (codeStart > 0) { memcpy(&dst[dstIdx], &src[srcIdx], size_t(codeStart)); dstIdx += codeStart; srcIdx += codeStart; } while (srcIdx < codeEnd) { if (srcIdx + 4 > codeEnd) return false; const int instr = LittleEndian::readInt32(&src[srcIdx]); const int opcode1 = instr & ARM_B_OPCODE_MASK; //const int opcode2 = instr & ARM_CB_OPCODE_MASK; bool isBL = (opcode1 == ARM_OPCODE_B) || (opcode1 == ARM_OPCODE_BL); // unconditional jump bool isCB = false; // disable for now ... 
isCB = (opcode2 == ARM_OPCODE_CBZ) || (opcode2 == ARM_OPCODE_CBNZ); // conditional jump if ((isBL == false) && (isCB == false)) { // Not a relative jump memcpy(&dst[dstIdx], &src[srcIdx], 4); srcIdx += 4; dstIdx += 4; continue; } // Decode absolute address int val, addr; if (isBL == true) { addr = (instr & ARM_B_ADDR_MASK) << 2; const int offset = (addr - dstIdx) >> 2; val = opcode1 | (offset & ARM_B_ADDR_MASK); } else { addr = ((instr & ARM_CB_ADDR_MASK) >> ARM_CB_REG_BITS) << 2; const int offset = (addr - dstIdx) >> 2; val = (instr & ~ARM_CB_ADDR_MASK) | (offset << ARM_CB_REG_BITS); } if (addr == 0) { if (srcIdx + 8 > codeEnd) return false; memcpy(&dst[dstIdx], &src[srcIdx + 4], 4); srcIdx += 8; dstIdx += 4; continue; } LittleEndian::writeInt32(&dst[dstIdx], val); srcIdx += 4; dstIdx += 4; } if (srcIdx < count) { memcpy(&dst[dstIdx], &src[srcIdx], size_t(count - srcIdx)); dstIdx += (count - srcIdx); } input._index += count; output._index += dstIdx; return true; } kanzi::byte EXECodec::detectType(const kanzi::byte src[], int count, int& codeStart, int& codeEnd) { // Let us check the first bytes ... but this may not be the first block // Best effort const uint magic = Magic::getType(src); int arch = 0; if (parseHeader(src, count, magic, arch, codeStart, codeEnd) == true) { switch(arch) { case ELF_X86_ARCH: case ELF_AMD64_ARCH: case WIN_X86_ARCH: case WIN_AMD64_ARCH: case MAC_AMD64_ARCH: return X86; case ELF_ARM64_ARCH: case WIN_ARM64_ARCH: case MAC_ARM64_ARCH: return ARM64; default: count = codeEnd - codeStart; } } int jumpsX86 = 0; int jumpsARM64 = 0; uint histo[256] = { 0 }; for (int i = codeStart; i < codeEnd; i++) { histo[int(src[i])]++; // X86 if ((src[i] & X86_MASK_JUMP) == X86_INSTRUCTION_JUMP) { if ((src[i + 4] == kanzi::byte(0)) || (src[i + 4] == kanzi::byte(0xFF))) { // Count relative jumps (CALL = E8/ JUMP = E9 .. .. .. 
00/FF) jumpsX86++; continue; } } else if (src[i] == X86_TWO_BYTE_PREFIX) { i++; if ((src[i] == kanzi::byte(0x38)) || (src[i] == kanzi::byte(0x3A))) i++; // Count relative conditional jumps (0x0F 0x8?) with 16/32 offsets if ((src[i] & X86_MASK_JCC) == X86_INSTRUCTION_JCC) { jumpsX86++; continue; } } // ARM if ((i & 3) != 0) continue; const int instr = LittleEndian::readInt32(&src[i]); const int opcode1 = instr & ARM_B_OPCODE_MASK; const int opcode2 = instr & ARM_CB_OPCODE_MASK; if ((opcode1 == ARM_OPCODE_B) || (opcode1 == ARM_OPCODE_BL) || (opcode2 == ARM_OPCODE_CBZ) || (opcode2 == ARM_OPCODE_CBNZ)) jumpsARM64++; } Global::DataType dt = Global::detectSimpleType(count, histo); if (dt != Global::BIN) return NOT_EXE | kanzi::byte(dt); // Filter out (some/many) multimedia files if ((histo[0] < uint(count / 10)) || (histo[255] < uint(count / 100))) return NOT_EXE | kanzi::byte(dt); int smallVals = 0; for (int i = 0; i < 16; i++) smallVals += histo[i]; if (smallVals > (count / 2)) return NOT_EXE | kanzi::byte(dt); // Ad-hoc thresholds if (jumpsX86 >= (count / 200)) return X86; if (jumpsARM64 >= (count / 200)) return ARM64; // Number of jump instructions too small => either not an exe or not worth the change, skip. 
return NOT_EXE | kanzi::byte(dt); } // Return true if known header bool EXECodec::parseHeader(const kanzi::byte src[], int count, uint magic, int& arch, int& codeStart, int& codeEnd) { if (magic == Magic::WIN_MAGIC) { if (count >= 64) { const int posPE = LittleEndian::readInt32(&src[60]); if ((posPE > 0) && (posPE <= count - 48) && (LittleEndian::readInt32(&src[posPE]) == WIN_PE)) { const kanzi::byte* pe = &src[posPE]; codeStart = min(LittleEndian::readInt32(&pe[44]), count); codeEnd = min(codeStart + LittleEndian::readInt32(&pe[28]), count); arch = LittleEndian::readInt16(&pe[4]); } return true; } } else if (magic == Magic::ELF_MAGIC) { bool isLittleEndian = src[5] == kanzi::byte(1); if (count >= 64) { codeStart = 0; if (isLittleEndian == true) { if (src[4] == kanzi::byte(2)) { // 64 bits int nbEntries = int(LittleEndian::readInt16(&src[0x3C])); int szEntry = int(LittleEndian::readInt16(&src[0x3A])); int posSection = int(LittleEndian::readLong64(&src[0x28])); for (int i = 0; i < nbEntries; i++) { int startEntry = posSection + i * szEntry; if (startEntry + 0x28 >= count) return false; int typeSection = int(LittleEndian::readInt32(&src[startEntry + 4])); int offSection = int(LittleEndian::readLong64(&src[startEntry + 0x18])); int lenSection = int(LittleEndian::readLong64(&src[startEntry + 0x20])); if ((typeSection == 1) && (lenSection >= 64)) { if (codeStart == 0) codeStart = offSection; codeEnd = offSection + lenSection; } } } else { // 32 bits int nbEntries = int(LittleEndian::readInt16(&src[0x30])); int szEntry = int(LittleEndian::readInt16(&src[0x2E])); int posSection = int(LittleEndian::readInt32(&src[0x20])); for (int i = 0; i < nbEntries; i++) { int startEntry = posSection + i * szEntry; if (startEntry + 0x18 >= count) return false; int typeSection = int(LittleEndian::readInt32(&src[startEntry + 4])); int offSection = int(LittleEndian::readInt32(&src[startEntry + 0x10])); int lenSection = int(LittleEndian::readInt32(&src[startEntry + 0x14])); if ((typeSection 
== 1) && (lenSection >= 64)) { if (codeStart == 0) codeStart = offSection; codeEnd = offSection + lenSection; } } } arch = LittleEndian::readInt16(&src[18]); } else { if (src[4] == kanzi::byte(2)) { // 64 bits int nbEntries = int(BigEndian::readInt16(&src[0x3C])); int szEntry = int(BigEndian::readInt16(&src[0x3A])); int posSection = int(BigEndian::readLong64(&src[0x28])); for (int i = 0; i < nbEntries; i++) { int startEntry = posSection + i * szEntry; if (startEntry + 0x28 >= count) return false; int typeSection = int(BigEndian::readInt32(&src[startEntry + 4])); int offSection = int(BigEndian::readLong64(&src[startEntry + 0x18])); int lenSection = int(BigEndian::readLong64(&src[startEntry + 0x20])); if ((typeSection == 1) && (lenSection >= 64)) { if (codeStart == 0) codeStart = offSection; codeEnd = offSection + lenSection; } } } else { // 32 bits int nbEntries = int(BigEndian::readInt16(&src[0x30])); int szEntry = int(BigEndian::readInt16(&src[0x2E])); int posSection = int(BigEndian::readInt32(&src[0x20])); for (int i = 0; i < nbEntries; i++) { int startEntry = posSection + i * szEntry; if (startEntry + 0x18 >= count) return false; int typeSection = int(BigEndian::readInt32(&src[startEntry + 4])); int offSection = int(BigEndian::readInt32(&src[startEntry + 0x10])); int lenSection = int(BigEndian::readInt32(&src[startEntry + 0x14])); if ((typeSection == 1) && (lenSection >= 64)) { if (codeStart == 0) codeStart = offSection; codeEnd = offSection + lenSection; } } } arch = BigEndian::readInt16(&src[18]); } codeStart = min(codeStart, count); codeEnd = min(codeEnd, count); return true; } } else if ((magic == Magic::MAC_MAGIC32) || (magic == Magic::MAC_CIGAM32) || (magic == Magic::MAC_MAGIC64) || (magic == Magic::MAC_CIGAM64)) { bool is64Bits = (magic == Magic::MAC_MAGIC64) || (magic == Magic::MAC_CIGAM64); codeStart = 0; static char MAC_TEXT_SEGMENT[] = "__TEXT"; static char MAC_TEXT_SECTION[] = "__text"; if (count >= 64) { int type = LittleEndian::readInt32(&src[12]); 
if (type != MAC_MH_EXECUTE) return false; arch = LittleEndian::readInt32(&src[4]); int nbCmds = LittleEndian::readInt32(&src[0x10]); int pos = (is64Bits == true) ? 0x20 : 0x1C; int cmd = 0; while (cmd < nbCmds) { int ldCmd = LittleEndian::readInt32(&src[pos]); int szCmd = LittleEndian::readInt32(&src[pos + 4]); int szSegHdr = (is64Bits == true) ? 0x48 : 0x38; if ((ldCmd == MAC_LC_SEGMENT) || (ldCmd == MAC_LC_SEGMENT64)) { if (pos + 14 >= count) return false; if (memcmp(&src[pos + 8], reinterpret_cast(MAC_TEXT_SEGMENT), 6) == 0) { int posSection = pos + szSegHdr; if (posSection + 0x34 >= count) return false; if (memcmp(&src[posSection], reinterpret_cast(MAC_TEXT_SECTION), 6) == 0) { // Text section in TEXT segment if (is64Bits == true) { codeStart = int(LittleEndian::readLong64(&src[posSection + 0x30])); codeEnd = codeStart + LittleEndian::readInt32(&src[posSection + 0x28]); break; } else { codeStart = LittleEndian::readInt32(&src[posSection + 0x2C]); codeEnd = codeStart + LittleEndian::readInt32(&src[posSection + 0x28]); break; } } } } cmd++; pos += szCmd; } codeStart = min(codeStart, count); codeEnd = min(codeEnd, count); return true; } } return false; } kanzi-cpp-2.5.2/src/transform/EXECodec.hpp000066400000000000000000000066731516423635400203330ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once #ifndef knz_EXECodec #define knz_EXECodec #include "../Context.hpp" #include "../Transform.hpp" namespace kanzi { class EXECodec FINAL : public Transform { public: EXECodec() { _pCtx = nullptr; } EXECodec(Context& ctx) : _pCtx(&ctx) {} ~EXECodec() {} bool forward(SliceArray& source, SliceArray& destination, int length); bool inverse(SliceArray& source, SliceArray& destination, int length); int getMaxEncodedLength(int inputLen) const; private: static const byte X86_MASK_JUMP; static const byte X86_INSTRUCTION_JUMP; static const byte X86_INSTRUCTION_JCC; static const byte X86_TWO_BYTE_PREFIX; static const byte X86_MASK_JCC; static const byte X86_ESCAPE; static const byte NOT_EXE; static const byte X86; static const byte ARM64; static const byte MASK_DT; static const int X86_ADDR_MASK; static const int MASK_ADDRESS; static const int ARM_B_ADDR_MASK; static const int ARM_B_OPCODE_MASK; static const int ARM_B_ADDR_SGN_MASK; static const int ARM_OPCODE_B; static const int ARM_OPCODE_BL; static const int ARM_CB_REG_BITS; static const int ARM_CB_ADDR_MASK; static const int ARM_CB_ADDR_SGN_MASK; static const int ARM_CB_OPCODE_MASK; static const int ARM_OPCODE_CBZ; static const int ARM_OPCODE_CBNZ; static const int WIN_PE; static const uint16 WIN_X86_ARCH; static const uint16 WIN_AMD64_ARCH; static const uint16 WIN_ARM64_ARCH; static const int ELF_X86_ARCH; static const int ELF_AMD64_ARCH; static const int ELF_ARM64_ARCH; static const int MAC_AMD64_ARCH; static const int MAC_ARM64_ARCH; static const int MAC_MH_EXECUTE; static const int MAC_LC_SEGMENT; static const int MAC_LC_SEGMENT64; static const int MIN_BLOCK_SIZE; static const int MAX_BLOCK_SIZE; bool forwardARM(SliceArray& source, SliceArray& destination, int length, int codeStart, int codeEnd); bool forwardX86(SliceArray& source, SliceArray& destination, int length, int codeStart, int codeEnd); bool inverseARM(SliceArray& source, SliceArray& destination, int length); bool inverseX86(SliceArray& source, 
SliceArray& destination, int length); static byte detectType(const byte src[], int count, int& codeStart, int& codeEnd); static bool parseHeader(const byte src[], int count, uint magic, int& arch, int& codeStart, int& codeEnd); Context* _pCtx; }; inline int EXECodec::getMaxEncodedLength(int srcLen) const { // Allocate some extra buffer for incompressible data. return (srcLen <= 256) ? srcLen + 32 : srcLen + srcLen / 8; } } #endif kanzi-cpp-2.5.2/src/transform/FSDCodec.cpp000066400000000000000000000304431516423635400203110ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #include #include "FSDCodec.hpp" #include "../Global.hpp" #include "../Magic.hpp" using namespace kanzi; using namespace std; const int FSDCodec::MIN_LENGTH = 1024; const kanzi::byte FSDCodec::ESCAPE_TOKEN = kanzi::byte(255); const kanzi::byte FSDCodec::DELTA_CODING = kanzi::byte(0); const kanzi::byte FSDCodec::XOR_CODING = kanzi::byte(1); const uint8 FSDCodec::ZIGZAG1[256] = { 253, 251, 249, 247, 245, 243, 241, 239, 237, 235, 233, 231, 229, 227, 225, 223, 221, 219, 217, 215, 213, 211, 209, 207, 205, 203, 201, 199, 197, 195, 193, 191, 189, 187, 185, 183, 181, 179, 177, 175, 173, 171, 169, 167, 165, 163, 161, 159, 157, 155, 153, 151, 149, 147, 145, 143, 141, 139, 137, 135, 133, 131, 129, 127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107, 105, 103, 101, 99, 97, 95, 93, 91, 89, 87, 85, 83, 81, 79, 77, 75, 73, 71, 69, 67, 65, 63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, 31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126, 128, 130, 132, 134, 136, 138, 140, 142, 144, 146, 148, 150, 152, 154, 156, 158, 160, 162, 164, 166, 168, 170, 172, 174, 176, 178, 180, 182, 184, 186, 188, 190, 192, 194, 196, 198, 200, 202, 204, 206, 208, 210, 212, 214, 216, 218, 220, 222, 224, 226, 228, 230, 232, 234, 236, 238, 240, 242, 244, 246, 248, 250, 252, 254, 255, }; const int8 FSDCodec::ZIGZAG2[256] = { 0, -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7, -8, 8, -9, 9, -10, 10, -11, 11, -12, 12, -13, 13, -14, 14, -15, 15, -16, 16, -17, 17, -18, 18, -19, 19, -20, 20, -21, 21, -22, 22, -23, 23, -24, 24, -25, 25, -26, 26, -27, 27, -28, 28, -29, 29, -30, 30, -31, 31, -32, 32, -33, 33, -34, 34, -35, 35, -36, 36, -37, 37, -38, 38, -39, 39, -40, 40, -41, 41, -42, 42, -43, 43, -44, 44, -45, 45, -46, 46, -47, 
47, -48, 48, -49, 49, -50, 50, -51, 51, -52, 52, -53, 53, -54, 54, -55, 55, -56, 56, -57, 57, -58, 58, -59, 59, -60, 60, -61, 61, -62, 62, -63, 63, -64, 64, -65, 65, -66, 66, -67, 67, -68, 68, -69, 69, -70, 70, -71, 71, -72, 72, -73, 73, -74, 74, -75, 75, -76, 76, -77, 77, -78, 78, -79, 79, -80, 80, -81, 81, -82, 82, -83, 83, -84, 84, -85, 85, -86, 86, -87, 87, -88, 88, -89, 89, -90, 90, -91, 91, -92, 92, -93, 93, -94, 94, -95, 95, -96, 96, -97, 97, -98, 98, -99, 99, -100, 100, -101, 101, -102, 102, -103, 103, -104, 104, -105, 105, -106, 106, -107, 107, -108, 108, -109, 109, -110, 110, -111, 111, -112, 112, -113, 113, -114, 114, -115, 115, -116, 116, -117, 117, -118, 118, -119, 119, -120, 120, -121, 121, -122, 122, -123, 123, -124, 124, -125, 125, -126, 126, -127, 127, -128, }; bool FSDCodec::forward(SliceArray& input, SliceArray& output, int count) { if (count == 0) return true; if (!SliceArray::isValid(input)) throw invalid_argument("FSD codec: Invalid input block"); if (!SliceArray::isValid(output)) throw invalid_argument("FSD codec: Invalid output block"); if (input._array == output._array) return false; if (output._length < getMaxEncodedLength(count)) return false; // If too small, skip if (count < MIN_LENGTH) return false; if (_pCtx != nullptr) { Global::DataType dt = (Global::DataType) _pCtx->getInt("dataType", Global::UNDEFINED); if ((dt != Global::UNDEFINED) && (dt != Global::MULTIMEDIA) && (dt != Global::BIN)) return false; } const kanzi::byte* src = &input._array[input._index]; kanzi::byte* dst = &output._array[output._index]; uint magic = Magic::getType(src); // Skip detection except for a few candidate types switch (magic) { case Magic::BMP_MAGIC: case Magic::RIFF_MAGIC: case Magic::PBM_MAGIC: case Magic::PGM_MAGIC: case Magic::PPM_MAGIC: case Magic::NO_MAGIC: break; default: return false; } const int srcEnd = count; const int dstEnd = getMaxEncodedLength(count); const int count10 = count / 10; const int count5 = 2 * count10; // count5=count/5 does not 
guarantee count5=2*count10 ! uint histo[7][256]; memset(&histo[0][0], 0, sizeof(histo)); // Check several step values on a few sub-blocks (no memory allocation) const kanzi::byte* in0 = &src[count5 * 0]; const kanzi::byte* in1 = &src[count5 * 2]; const kanzi::byte* in2 = &src[count5 * 4]; for (int i = count10; i < count5; i++) { const kanzi::byte b0 = in0[i]; histo[0][int(b0)]++; histo[1][int(b0 ^ in0[i - 1])]++; histo[2][int(b0 ^ in0[i - 2])]++; histo[3][int(b0 ^ in0[i - 3])]++; histo[4][int(b0 ^ in0[i - 4])]++; histo[5][int(b0 ^ in0[i - 8])]++; histo[6][int(b0 ^ in0[i - 16])]++; const kanzi::byte b1 = in1[i]; histo[0][int(b1)]++; histo[1][int(b1 ^ in1[i - 1])]++; histo[2][int(b1 ^ in1[i - 2])]++; histo[3][int(b1 ^ in1[i - 3])]++; histo[4][int(b1 ^ in1[i - 4])]++; histo[5][int(b1 ^ in1[i - 8])]++; histo[6][int(b1 ^ in1[i - 16])]++; const kanzi::byte b2 = in2[i]; histo[0][int(b2)]++; histo[1][int(b2 ^ in2[i - 1])]++; histo[2][int(b2 ^ in2[i - 2])]++; histo[3][int(b2 ^ in2[i - 3])]++; histo[4][int(b2 ^ in2[i - 4])]++; histo[5][int(b2 ^ in2[i - 8])]++; histo[6][int(b2 ^ in2[i - 16])]++; } // Find if entropy is lower post transform int minIdx = 0; int ent[7]; for (int i = 0; i < 7; i++) { ent[i] = Global::computeFirstOrderEntropy1024(3 * count10, histo[i]); if (ent[i] < ent[minIdx]) minIdx = i; } // If not better, quick exit if (ent[minIdx] >= ent[0]) { if (_pCtx != nullptr) _pCtx->putInt("dataType", Global::detectSimpleType(3 * count10, histo[0])); return false; } if (_pCtx != nullptr) _pCtx->putInt("dataType", Global::MULTIMEDIA); const int distances[7] = { 0, 1, 2, 3, 4, 8, 16 }; const int dist = distances[minIdx]; int largeDeltas = 0; // Detect best coding by sampling for large deltas for (int i = 2 * count5; i < 3 * count5; i++) { const int delta = int(src[i]) - int(src[i - dist]); if ((delta < -127) || (delta > 127)) largeDeltas++; } // Delta coding works better for pictures & xor coding better for wav files // Select xor coding if large deltas are over 3% 
(ad-hoc threshold) const kanzi::byte mode = (largeDeltas > (count5 >> 5)) ? XOR_CODING : DELTA_CODING; dst[0] = mode; dst[1] = kanzi::byte(dist); int srcIdx = 0; int dstIdx = 2; // Emit first bytes for (int i = 0; i < dist; i++) dst[dstIdx++] = src[srcIdx++]; // Emit modified bytes if (mode == DELTA_CODING) { while ((srcIdx < srcEnd) && (dstIdx < dstEnd - 1)) { const int delta = 127 + int(src[srcIdx]) - int(src[srcIdx - dist]); if ((delta >= 0) && (delta < 255)) { dst[dstIdx++] = kanzi::byte(ZIGZAG1[delta]); // zigzag encode delta srcIdx++; continue; } // Skip delta, encode with escape dst[dstIdx++] = ESCAPE_TOKEN; dst[dstIdx++] = src[srcIdx] ^ src[srcIdx - dist]; srcIdx++; } } else { // mode == XOR_CODING while (srcIdx < srcEnd) { dst[dstIdx++] = src[srcIdx] ^ src[srcIdx - dist]; srcIdx++; } } if (srcIdx != srcEnd) return false; // Extra check that the transform makes sense memset(&histo[0][0], 0, sizeof(uint) * 256); const kanzi::byte* out1 = &dst[count5 * 1]; const kanzi::byte* out2 = &dst[count5 * 3]; for (int i = 0; i < count10; i++) { histo[0][int(out1[i])]++; histo[0][int(out2[i])]++; } const int entropy = Global::computeFirstOrderEntropy1024(count5, histo[0]); if (entropy >= ent[0]) return false; input._index += srcIdx; output._index += dstIdx; return true; // Allowed to expand } bool FSDCodec::inverse(SliceArray& input, SliceArray& output, int count) { if (count == 0) return true; if (!SliceArray::isValid(input)) throw invalid_argument("FSD codec: Invalid input block"); if (!SliceArray::isValid(output)) throw invalid_argument("FSD codec: Invalid output block"); if (input._array == output._array) return false; if (count < 4) return false; if (input._index + count > input._length) return false; const int srcEnd = count; const int dstEnd = output._length - output._index; const kanzi::byte* src = &input._array[input._index]; kanzi::byte* dst = &output._array[output._index]; // Retrieve mode & step value const kanzi::byte mode = src[0]; const int dist = 
int(src[1]); // Sanity check if ((dist < 1) || ((dist > 4) && (dist != 8) && (dist != 16))) return false; if ((count < dist + 2) || (dist > dstEnd)) return false; // Emit first bytes memcpy(&dst[0], &src[2], size_t(dist)); int srcIdx = dist + 2; int dstIdx = dist; // Recover original bytes if (mode == DELTA_CODING) { while ((srcIdx < srcEnd) && (dstIdx < dstEnd)) { if (src[srcIdx] != ESCAPE_TOKEN) { dst[dstIdx] = kanzi::byte(int(dst[dstIdx - dist]) + ZIGZAG2[int(src[srcIdx])]); srcIdx++; dstIdx++; continue; } srcIdx++; if (srcIdx == srcEnd) return false; dst[dstIdx] = src[srcIdx] ^ dst[dstIdx - dist]; srcIdx++; dstIdx++; } } else if (mode == XOR_CODING) { while ((srcIdx < srcEnd) && (dstIdx < dstEnd)) { dst[dstIdx] = src[srcIdx] ^ dst[dstIdx - dist]; srcIdx++; dstIdx++; } } else { // Invalid mode return false; } input._index += srcIdx; output._index += dstIdx; return srcIdx == srcEnd; } kanzi-cpp-2.5.2/src/transform/FSDCodec.hpp000066400000000000000000000031431516423635400203130ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once #ifndef knz_FSDCodec #define knz_FSDCodec #include "../Context.hpp" #include "../Transform.hpp" // Fixed Step Delta codec // Decorrelate values separated by a constant distance (step) and encode residuals namespace kanzi { class FSDCodec FINAL : public Transform { public: FSDCodec() { _pCtx = nullptr; } FSDCodec(Context& ctx) : _pCtx(&ctx) {} ~FSDCodec() {} bool forward(SliceArray& src, SliceArray& dst, int length); bool inverse(SliceArray& src, SliceArray& dst, int length); // Required encoding output buffer size int getMaxEncodedLength(int srcLen) const { return srcLen + ((srcLen < 1024) ? 64 : srcLen >> 4); // limit expansion } private: static const int MIN_LENGTH; static const byte ESCAPE_TOKEN; static const byte DELTA_CODING; static const byte XOR_CODING; static const uint8 ZIGZAG1[256]; static const int8 ZIGZAG2[256]; Context* _pCtx; }; } #endif kanzi-cpp-2.5.2/src/transform/LZCodec.cpp000066400000000000000000000653251516423635400202310ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #include "LZCodec.hpp" #include "../Memory.hpp" #include "TransformFactory.hpp" using namespace kanzi; using namespace std; LZCodec::LZCodec() { _delegate = new LZXCodec(); } LZCodec::LZCodec(Context& ctx) { const int lzType = ctx.getInt("lz", TransformFactory::LZ_TYPE); if (lzType == TransformFactory::LZP_TYPE) { _delegate = new LZPCodec(ctx); } else if (lzType == TransformFactory::LZX_TYPE) { _delegate = new LZXCodec(ctx); } else { _delegate = new LZXCodec(ctx); } } bool LZCodec::forward(SliceArray& input, SliceArray& output, int count) { if (count == 0) return true; if (input._array == output._array) return false; return _delegate->forward(input, output, count); } bool LZCodec::inverse(SliceArray& input, SliceArray& output, int count) { if (count == 0) return true; if (input._array == output._array) return false; return _delegate->inverse(input, output, count); } template<> const uint LZXCodec::HASH_SEED = 0x1E35A7BD; template<> const uint LZXCodec::HASH_LOG = 16; template<> const uint LZXCodec::HASH_RSHIFT = 64 - HASH_LOG; template<> const uint LZXCodec::HASH_LSHIFT = 24; template<> const int LZXCodec::MAX_DISTANCE1 = (1 << 16) - 2; template<> const int LZXCodec::MAX_DISTANCE2 = (1 << 24) - 2; template<> const int LZXCodec::MIN_MATCH4 = 4; template<> const int LZXCodec::MIN_MATCH6 = 6; template<> const int LZXCodec::MIN_MATCH9 = 9; template<> const int LZXCodec::MAX_MATCH = 65535 + 254 + MIN_MATCH4; template<> const int LZXCodec::MIN_BLOCK_LENGTH = 24; template<> const uint LZXCodec::HASH_SEED = 0x1E35A7BD; template<> const uint LZXCodec::HASH_LOG = 19; template<> const uint LZXCodec::HASH_RSHIFT = 64 - HASH_LOG; template<> const uint LZXCodec::HASH_LSHIFT = 24; template<> const int LZXCodec::MAX_DISTANCE1 = (1 << 16) - 2; template<> const int LZXCodec::MAX_DISTANCE2 = (1 << 24) - 2; template<> const int LZXCodec::MIN_MATCH4 = 4; template<> const int LZXCodec::MIN_MATCH6 = 6; template<> const int LZXCodec::MIN_MATCH9 = 9; template<> const int 
LZXCodec::MAX_MATCH = 65535 + 254 + MIN_MATCH4; template<> const int LZXCodec::MIN_BLOCK_LENGTH = 24; template bool LZXCodec::forward(SliceArray& input, SliceArray& output, int count) { if (count == 0) return true; if (!SliceArray::isValid(input)) throw invalid_argument("LZ codec: Invalid input block"); if (!SliceArray::isValid(output)) throw invalid_argument("LZ codec: Invalid output block"); if (output._length - output._index < getMaxEncodedLength(count)) return false; // If too small, skip if (count < MIN_BLOCK_LENGTH) return false; if (_hashSize == 0) { _hashSize = 1 << HASH_LOG; if (_hashes != nullptr) delete[] _hashes; _hashes = new int32[_hashSize]; } if (_bufferSize < max(count / 5, 256)) { _bufferSize = max(count / 5, 256); if (_mLenBuf != nullptr) delete[] _mLenBuf; _mLenBuf = new kanzi::byte[_bufferSize]; if (_mBuf != nullptr) delete[] _mBuf; _mBuf = new kanzi::byte[_bufferSize]; if (_tkBuf != nullptr) delete[] _tkBuf; _tkBuf = new kanzi::byte[_bufferSize]; } memset(_hashes, 0, sizeof(int32) * _hashSize); const int srcEnd = count - 16 - 2; const kanzi::byte* src = &input._array[input._index]; kanzi::byte* dst = &output._array[output._index]; const int maxDist = (srcEnd < 4 * MAX_DISTANCE1) ? MAX_DISTANCE1 : MAX_DISTANCE2; dst[12] = (maxDist == MAX_DISTANCE1) ? 
kanzi::byte(0) : kanzi::byte(1); int mm = MIN_MATCH4; if (_pCtx != nullptr) { Global::DataType dt = (Global::DataType)_pCtx->getInt("dataType", Global::UNDEFINED); if (dt == Global::DNA) { // Longer min match for DNA input mm = MIN_MATCH6; } else if (dt == Global::SMALL_ALPHABET) { return false; } } // dst[12] = 0000MMMD (4 bits + 3 bits minMatch + 1 bit max distance) dst[12] |= kanzi::byte(((mm - 2) & 0x07) << 1); // minMatch in [2..9] const int minMatch = mm; int srcIdx = 0; int dstIdx = 13; int anchor = 0; int mIdx = 0; int mLenIdx = 0; int tkIdx = 0; int repd[] = { count, count }; int repIdx = 0; int srcInc = 0; while (srcIdx < srcEnd) { int bestLen = 0; const int32 h0 = hash(&src[srcIdx]); const int ref0 = _hashes[h0]; _hashes[h0] = srcIdx; const int srcIdx1 = srcIdx + 1; int ref = srcIdx1 - repd[repIdx]; const int minRef = max(srcIdx - maxDist, 0); if ((ref > minRef) && KANZI_MEM_EQ4(&src[srcIdx1], &src[ref])) { // Check repd0 first bestLen = findMatch(src, srcIdx1, ref, min(srcEnd - srcIdx1, MAX_MATCH)); } else { ref = srcIdx1 - repd[repIdx ^ 1]; if ((ref > minRef) && KANZI_MEM_EQ4(&src[srcIdx1], &src[ref])) { // Check repd1 first bestLen = findMatch(src, srcIdx1, ref, min(srcEnd - srcIdx1, MAX_MATCH)); } } if (bestLen < minMatch) { // Check match at position in hash table ref = ref0; if ((ref > minRef) && KANZI_MEM_EQ4(&src[srcIdx], &src[ref])) { bestLen = findMatch(src, srcIdx, ref, min(srcEnd - srcIdx, MAX_MATCH)); } // No good match ? 
if (bestLen < minMatch) { srcIdx = srcIdx1 + (srcInc >> 6); srcInc++; repIdx = 0; continue; } if ((srcIdx - ref != repd[0]) && (srcIdx - ref != repd[1])) { // Check if better match at next position const int32 h1 = hash(&src[srcIdx1]); const int ref1 = _hashes[h1]; _hashes[h1] = srcIdx1; if ((ref1 > minRef + 1) && KANZI_MEM_EQ4(&src[srcIdx1 + bestLen - 3], &src[ref1 + bestLen - 3])) { const int bestLen1 = findMatch(src, srcIdx1, ref1, min(srcEnd - srcIdx1, MAX_MATCH)); // Select best match if (bestLen1 >= bestLen) { ref = ref1; bestLen = bestLen1; srcIdx = srcIdx1; } } if (T == true) { const int srcIdx2 = srcIdx1 + 1; const int32 h2 = hash(&src[srcIdx2]); const int ref2 = _hashes[h2]; _hashes[h2] = srcIdx2; if ((ref2 > minRef + 2) && KANZI_MEM_EQ4(&src[srcIdx2 + bestLen - 3], &src[ref2 + bestLen - 3])) { const int bestLen2 = findMatch(src, srcIdx2, ref2, min(srcEnd - srcIdx2, MAX_MATCH)); // Select best match if (bestLen2 >= bestLen) { ref = ref2; bestLen = bestLen2; srcIdx = srcIdx2; } } } } // Extend backwards while ((srcIdx > anchor) && (ref > minRef) && (src[srcIdx - 1] == src[ref - 1])) { bestLen++; ref--; srcIdx--; } if (bestLen > MAX_MATCH) { ref += (bestLen - MAX_MATCH); srcIdx += (bestLen - MAX_MATCH); bestLen = MAX_MATCH; } } else { if ((bestLen >= MAX_MATCH) || (src[srcIdx] != src[ref - 1])) { srcIdx++; const int32 h1 = hash(&src[srcIdx]); _hashes[h1] = srcIdx; } else { bestLen++; ref--; } } // Emit match srcInc = 0; // Token: 3 bits litLen + 2 bits flag + 3 bits mLen (LLLFFMMM) // or 3 bits litLen + 3 bits flag + 2 bits mLen (LLLFFFMM) // LLL : <= 7 --> LLL == literal length (if 7, remainder encoded outside of token) // MMM : <= 7 --> MMM == match length (if 7, remainder encoded outside of token) // MM : <= 3 --> MM == match length (if 3, remainder encoded outside of token) // FF = 01 --> 1 byte dist // FF = 10 --> 2 byte dist // FF = 11 --> 3 byte dist // FFF = 000 --> dist == repd0 // FFF = 001 --> dist == repd1 const int dist = srcIdx - ref; int 
token, mLenTh; if (dist == repd[0]) { token = 0x00; mLenTh = 3; } else if (dist == repd[1]) { token = 0x04; mLenTh = 3; } else { // Emit distance (since not repeat) _mBuf[mIdx] = kanzi::byte(dist >> 16); const int inc1 = dist >= 65536 ? 1 : 0; mIdx += inc1; _mBuf[mIdx] = kanzi::byte(dist >> 8); const int inc2 = dist >= 256 ? 1 : 0; mIdx += inc2; _mBuf[mIdx++] = kanzi::byte(dist); token = (inc1 + inc2 + 1) << 3; mLenTh = 7; } const int mLen = bestLen - minMatch; // Emit match length if (mLen >= mLenTh) { token += mLenTh; mLenIdx += emitLength(&_mLenBuf[mLenIdx], mLen - mLenTh); } else { token += mLen; } repd[1] = repd[0]; repd[0] = dist; repIdx = 1; const int litLen = srcIdx - anchor; // Emit token // Literals to process ? if (litLen == 0) { _tkBuf[tkIdx++] = kanzi::byte(token); } else { // Emit literal length if (litLen >= 7) { if (litLen >= (1 << 24)) return false; _tkBuf[tkIdx++] = kanzi::byte((7 << 5) | token); dstIdx += emitLength(&dst[dstIdx], litLen - 7); } else { _tkBuf[tkIdx++] = kanzi::byte((litLen << 5) | token); } // Emit literals emitLiterals(&src[anchor], &dst[dstIdx], litLen); dstIdx += litLen; } if (mIdx >= _bufferSize - 8) { // Expand match buffer kanzi::byte* mBuf = new kanzi::byte[(_bufferSize * 3) / 2]; memcpy(&mBuf[0], &_mBuf[0], _bufferSize); if ( _mBuf != nullptr) delete[] _mBuf; _mBuf = mBuf; if (mLenIdx >= _bufferSize - 8) { kanzi::byte* mLenBuf = new kanzi::byte[(_bufferSize * 3) / 2]; memcpy(&mLenBuf[0], &_mLenBuf[0], _bufferSize); if (_mLenBuf != nullptr) delete[] _mLenBuf; _mLenBuf = mLenBuf; } _bufferSize = (_bufferSize * 3) / 2; } // Fill _hashes and update positions anchor = srcIdx + bestLen; while (srcIdx + 4 < anchor) { srcIdx += 4; const int32 hh0 = hash(&src[srcIdx - 3]); const int32 hh1 = hash(&src[srcIdx - 2]); const int32 hh2 = hash(&src[srcIdx - 1]); const int32 hh3 = hash(&src[srcIdx - 0]); _hashes[hh0] = srcIdx - 3; _hashes[hh1] = srcIdx - 2; _hashes[hh2] = srcIdx - 1; _hashes[hh3] = srcIdx - 0; } while (++srcIdx < anchor) { 
const int32 h = hash(&src[srcIdx]); _hashes[h] = srcIdx; } } // Emit last literals const int litLen = count - anchor; if (dstIdx + litLen + tkIdx + mIdx >= output._index + count) return false; if (litLen >= 7) { _tkBuf[tkIdx++] = kanzi::byte(7 << 5); dstIdx += emitLength(&dst[dstIdx], litLen - 7); } else { _tkBuf[tkIdx++] = kanzi::byte(litLen << 5); } memcpy(&dst[dstIdx], &src[anchor], litLen); dstIdx += litLen; // Emit buffers: literals + tokens + matches LittleEndian::writeInt32(&dst[0], dstIdx); LittleEndian::writeInt32(&dst[4], tkIdx); LittleEndian::writeInt32(&dst[8], mIdx); memcpy(&dst[dstIdx], &_tkBuf[0], tkIdx); dstIdx += tkIdx; memcpy(&dst[dstIdx], &_mBuf[0], mIdx); dstIdx += mIdx; memcpy(&dst[dstIdx], &_mLenBuf[0], mLenIdx); dstIdx += mLenIdx; input._index += count; output._index += dstIdx; return dstIdx <= count - (count / 100); } template bool LZXCodec::inverse(SliceArray& input, SliceArray& output, int count) { int bsVersion = _pCtx == nullptr ? 6 : _pCtx->getInt("bsVersion", 6); if (bsVersion < 6) return inverseV5(input, output, count); return inverseV6(input, output, count); } template bool LZXCodec::inverseV6(SliceArray& input, SliceArray& output, int count) { if (count == 0) return true; if (count < 13) return false; if (count > input._length - input._index) return false; if (!SliceArray::isValid(input)) throw invalid_argument("LZ codec: Invalid input block"); if (!SliceArray::isValid(output)) throw invalid_argument("LZ codec: Invalid output block"); const int dstEnd = output._length - output._index; kanzi::byte* dst = &output._array[output._index]; const kanzi::byte* src = &input._array[input._index]; int tkIdx = LittleEndian::readInt32(&src[0]); int mIdx = LittleEndian::readInt32(&src[4]); int mLenIdx = LittleEndian::readInt32(&src[8]); // Sanity checks if ((tkIdx < 0) || (mIdx < 0) || (mLenIdx < 0)) return false; if ((tkIdx < 13) || (tkIdx > count) || (mIdx > count - tkIdx) || (mLenIdx > count - tkIdx - mIdx)) return false; mIdx += tkIdx; 
mLenIdx += mIdx; const int srcEnd = tkIdx - 13; const int maxDist = ((int(src[12]) & 1) == 0) ? MAX_DISTANCE1 : MAX_DISTANCE2; const int minMatch = ((int(src[12]) >> 1) & 0x07) + 2; bool res = true; int srcIdx = 13; int dstIdx = 0; int repd0 = count; int repd1 = count; while (true) { const int token = int(src[tkIdx++]); if (token >= 32) { // Get literal length const int litLen = (token >= 0xE0) ? 7 + readLength(src, srcIdx) : token >> 5; // Emit literals const kanzi::byte* s = &src[srcIdx]; kanzi::byte* d = &dst[dstIdx]; srcIdx += litLen; dstIdx += litLen; if (srcIdx >= srcEnd) { memcpy(d, s, litLen); break; } emitLiterals(s, d, litLen); } // Get match length and distance int mLen, dist; if ((token & 0x18) == 0) { // Repetition distance, read mLen remainder (if any) outside of token mLen = token & 0x03; mLen += (mLen == 3 ? minMatch + readLength(src, mLenIdx) : minMatch); dist = (token & 0x04) == 0 ? repd0 : repd1; } else { // Read mLen remainder (if any) outside of token mLen = token & 0x07; mLen += (mLen == 7 ? 
minMatch + readLength(src, mLenIdx) : minMatch); dist = int(src[mIdx++]); const int f1 = (token >> 4) & 1; const int f2 = (token >> 3) & f1; dist = (dist << (8 * f1)) | (-f1 & int(src[mIdx])); mIdx += f1; dist = (dist << (8 * f2)) | (-f2 & int(src[mIdx])); mIdx += f2; } repd1 = repd0; repd0 = dist; const int mEnd = dstIdx + mLen; int ref = dstIdx - dist; // Sanity check if ((ref < 0) || (dist > maxDist) || (mEnd > dstEnd)) { res = false; goto exit; } prefetchWrite(&dst[dstIdx]); // Copy match if (dist >= 16) { do { // No overlap memcpy(&dst[dstIdx], &dst[ref], 16); ref += 16; dstIdx += 16; } while (dstIdx < mEnd); } else if (dist != 1) { const kanzi::byte* s = &dst[ref]; kanzi::byte* p = &dst[dstIdx]; const kanzi::byte* pend = &p[mLen]; while (p < pend) *p++ = *s++; } else { // dist = 1 memset(&dst[dstIdx], int(dst[ref]), mLen); } dstIdx = mEnd; } exit: output._index += dstIdx; input._index += count; return res && (srcIdx == srcEnd + 13); } template bool LZXCodec::inverseV5(SliceArray& input, SliceArray& output, int count) { if (count == 0) return true; if (count < 13) return false; if (count > input._length - input._index) return false; if (!SliceArray::isValid(input)) throw invalid_argument("LZ codec: Invalid input block"); if (!SliceArray::isValid(output)) throw invalid_argument("LZ codec: Invalid output block"); const int dstEnd = output._length - output._index; kanzi::byte* dst = &output._array[output._index]; const kanzi::byte* src = &input._array[input._index]; int tkIdx = LittleEndian::readInt32(&src[0]); int mIdx = LittleEndian::readInt32(&src[4]); int mLenIdx = LittleEndian::readInt32(&src[8]); // Sanity checks if ((tkIdx < 0) || (mIdx < 0) || (mLenIdx < 0)) return false; if ((tkIdx < 13) || (tkIdx > count) || (mIdx > count - tkIdx) || (mLenIdx > count - tkIdx - mIdx)) return false; mIdx += tkIdx; mLenIdx += mIdx; const int srcEnd = tkIdx - 13; const int mFlag = int(src[12]) & 1; const int maxDist = (mFlag == 0) ? 
MAX_DISTANCE1 : MAX_DISTANCE2; const int mmIdx = (int(src[12]) >> 1) & 0x03; const int MIN_MATCHES[4] = { MIN_MATCH4, MIN_MATCH9, MIN_MATCH6, MIN_MATCH6 }; const int minMatch = MIN_MATCHES[mmIdx]; bool res = true; int srcIdx = 13; int dstIdx = 0; int repd0 = 0; int repd1 = 0; while (true) { const int token = int(src[tkIdx++]); if (token >= 32) { // Get literal length const int litLen = (token >= 0xE0) ? 7 + readLength(src, srcIdx) : token >> 5; // Emit literals const kanzi::byte* s = &src[srcIdx]; kanzi::byte* d = &dst[dstIdx]; srcIdx += litLen; dstIdx += litLen; if (srcIdx >= srcEnd) { memcpy(d, s, litLen); break; } emitLiterals(s, d, litLen); } // Get match length and distance int mLen = token & 0x0F; int dist; if (mLen == 15) { // Repetition distance, read mLen fully outside of token mLen = minMatch + readLength(src, mLenIdx); dist = ((token & 0x10) == 0) ? repd0 : repd1; } else { // Read mLen remainder (if any) outside of token mLen = (mLen == 14) ? 14 + minMatch + readLength(src, mLenIdx) : mLen + minMatch; dist = int(src[mIdx++]); if (mFlag != 0) dist = (dist << 8) | int(src[mIdx++]); //if ((token & 0x10) != 0) { // dist = (dist << 8) | int(src[mIdx++]); //} const int t = (token >> 4) & 1; dist = (dist << (8 * t)) | (-t & int(src[mIdx])); mIdx += t; } prefetchRead(&src[mLenIdx]); repd1 = repd0; repd0 = dist; const int mEnd = dstIdx + mLen; int ref = dstIdx - dist; // Sanity check if ((ref < 0) || (dist > maxDist) || (mEnd > dstEnd)) { res = false; goto exit; } prefetchWrite(&dst[dstIdx]); // Copy match if (dist >= 16) { do { // No overlap memcpy(&dst[dstIdx], &dst[ref], 16); ref += 16; dstIdx += 16; } while (dstIdx < mEnd); } else if (dist != 1) { const kanzi::byte* s = &dst[ref]; kanzi::byte* p = &dst[dstIdx]; const kanzi::byte* pend = &p[mLen]; while (p < pend) *p++ = *s++; } else { // dist = 1 memset(&dst[dstIdx], int(dst[ref]), mLen); } dstIdx = mEnd; } exit: output._index += dstIdx; input._index += count; return res && (srcIdx == srcEnd + 13); } const 
uint LZPCodec::HASH_SEED = 0x7FEB352D; const uint LZPCodec::HASH_LOG = 16; const uint LZPCodec::HASH_SHIFT = 32 - HASH_LOG; const int LZPCodec::MIN_MATCH = 64; const int LZPCodec::MIN_BLOCK_LENGTH = 128; const int LZPCodec::MATCH_FLAG = 0xFC; bool LZPCodec::forward(SliceArray& input, SliceArray& output, int count) { if (count == 0) return true; if (count < 4) return false; if (!SliceArray::isValid(input)) throw invalid_argument("LZP codec: Invalid input block"); if (!SliceArray::isValid(output)) throw invalid_argument("LZP codec: Invalid output block"); if (output._length < getMaxEncodedLength(count)) return false; // If too small, skip if (count < MIN_BLOCK_LENGTH) return false; const kanzi::byte* src = &input._array[input._index]; kanzi::byte* dst = &output._array[output._index]; const int srcEnd = count; const int dstEnd = count - (count >> 6); if (_hashSize == 0) { _hashSize = 1 << HASH_LOG; if (_hashes != nullptr) delete[] _hashes; _hashes = new int32[_hashSize]; } memset(_hashes, 0, sizeof(int32) * _hashSize); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; uint ctx = LittleEndian::readInt32(&src[0]); int srcIdx = 4; int dstIdx = 4; while ((srcIdx < srcEnd - MIN_MATCH) && (dstIdx < dstEnd)) { const uint32 h = (HASH_SEED * ctx) >> HASH_SHIFT; const int32 ref = _hashes[h]; _hashes[h] = srcIdx; int bestLen = 0; // Find a match if ((ref != 0) && KANZI_MEM_EQ8(&src[ref + MIN_MATCH - 8], &src[srcIdx + MIN_MATCH - 8])) bestLen = findMatch(src, srcIdx, ref, srcEnd - srcIdx); // No good match ? 
if (bestLen < MIN_MATCH) { const uint val = uint(src[srcIdx]); ctx = (ctx << 8) | val; dst[dstIdx++] = src[srcIdx++]; if ((ref != 0) && (val == MATCH_FLAG)) dst[dstIdx++] = kanzi::byte(0xFF); continue; } srcIdx += bestLen; ctx = LittleEndian::readInt32(&src[srcIdx - 4]); dst[dstIdx++] = kanzi::byte(MATCH_FLAG); bestLen -= MIN_MATCH; // Emit match length while (bestLen >= 254) { bestLen -= 254; dst[dstIdx++] = kanzi::byte(0xFE); if (dstIdx >= dstEnd) break; } dst[dstIdx++] = kanzi::byte(bestLen); } while ((srcIdx < srcEnd) && (dstIdx < dstEnd)) { const uint32 h = (HASH_SEED * ctx) >> HASH_SHIFT; const int ref = _hashes[h]; _hashes[h] = srcIdx; const uint val = uint(src[srcIdx]); ctx = (ctx << 8) | val; dst[dstIdx++] = src[srcIdx++]; if ((ref != 0) && (val == MATCH_FLAG)) dst[dstIdx++] = kanzi::byte(0xFF); } input._index += srcIdx; output._index += dstIdx; return (srcIdx == count) && (dstIdx < dstEnd); } bool LZPCodec::inverse(SliceArray& input, SliceArray& output, int count) { if (count == 0) return true; if (count > input._length - input._index) return false; if (!SliceArray::isValid(input)) throw invalid_argument("LZP codec: Invalid input block"); if (!SliceArray::isValid(output)) throw invalid_argument("LZP codec: Invalid output block"); if (count < 4) return false; const int srcEnd = count; const int dstEnd = output._length - output._index; const kanzi::byte* src = &input._array[input._index]; kanzi::byte* dst = &output._array[output._index]; if (_hashSize == 0) { _hashSize = 1 << HASH_LOG; delete[] _hashes; _hashes = new int32[_hashSize]; } memset(_hashes, 0, sizeof(int32) * _hashSize); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; uint32 ctx = LittleEndian::readInt32(&dst[0]); int srcIdx = 4; int dstIdx = 4; while (srcIdx < srcEnd) { const int32 h = (HASH_SEED * ctx) >> HASH_SHIFT; int ref = _hashes[h]; _hashes[h] = dstIdx; if ((src[srcIdx] != kanzi::byte(MATCH_FLAG)) || (ref == 0)) { ctx = (ctx << 8) | uint32(src[srcIdx]); dst[dstIdx++] 
= src[srcIdx++]; continue; } srcIdx++; if (src[srcIdx] == kanzi::byte(0xFF)) { ctx = (ctx << 8) | uint32(MATCH_FLAG); dst[dstIdx++] = kanzi::byte(MATCH_FLAG); srcIdx++; continue; } int mLen = MIN_MATCH; if (src[srcIdx] == kanzi::byte(0xFE)) { while ((srcIdx < srcEnd) && (src[srcIdx] == kanzi::byte(0xFE))) { srcIdx++; mLen += 254; } if (srcIdx >= srcEnd) return false; } mLen += int(src[srcIdx++]); const int mEnd = dstIdx + mLen; if (mEnd > dstEnd) return false; if (dstIdx >= ref + 16) { do { // No overlap memcpy(&dst[dstIdx], &dst[ref], 16); ref += 16; dstIdx += 16; } while (dstIdx < mEnd); } else { for (int i = 0; i < mLen; i++) dst[dstIdx + i] = dst[ref + i]; } dstIdx = mEnd; ctx = LittleEndian::readInt32(&dst[dstIdx - 4]); } input._index += srcIdx; output._index += dstIdx; return srcIdx == srcEnd; } kanzi-cpp-2.5.2/src/transform/LZCodec.hpp000066400000000000000000000157521516423635400202350ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once #ifndef knz_LZCodec #define knz_LZCodec #include "../Context.hpp" #include "../Global.hpp" #include "../Transform.hpp" #include "../Memory.hpp" namespace kanzi { class LZCodec FINAL : public Transform { public: LZCodec(); LZCodec(Context& ctx); ~LZCodec() { delete _delegate; } bool forward(SliceArray& src, SliceArray& dst, int length); bool inverse(SliceArray& src, SliceArray& dst, int length); // Required encoding output buffer size int getMaxEncodedLength(int srcLen) const { return _delegate->getMaxEncodedLength(srcLen); } private: Transform* _delegate; }; // Simple byte oriented LZ77 implementation. template class LZXCodec FINAL : public Transform { public: LZXCodec() { _hashes = nullptr; _hashSize = 0; _tkBuf = nullptr; _mLenBuf = nullptr; _mBuf = nullptr; _bufferSize = 0; _pCtx = nullptr; } LZXCodec(Context& ctx) : _pCtx(&ctx) { _hashes = nullptr; _hashSize = 0; _tkBuf = nullptr; _mLenBuf = nullptr; _mBuf = nullptr; _bufferSize = 0; } ~LZXCodec() { _bufferSize = 0; _hashSize = 0; if (_hashes != nullptr) delete[] _hashes; if (_mLenBuf != nullptr) delete[] _mLenBuf; if (_mBuf != nullptr) delete[] _mBuf; if (_tkBuf != nullptr) delete[] _tkBuf; } bool forward(SliceArray& src, SliceArray& dst, int length); bool inverse(SliceArray& src, SliceArray& dst, int length); // Required encoding output buffer size int getMaxEncodedLength(int srcLen) const { return (srcLen <= 1024) ? 
srcLen + 16 : srcLen + (srcLen / 64); } private: static const uint HASH_SEED; static const uint HASH_LOG; static const uint HASH_LSHIFT; static const uint HASH_RSHIFT; static const int MAX_DISTANCE1; static const int MAX_DISTANCE2; static const int MIN_MATCH4; static const int MIN_MATCH6; static const int MIN_MATCH9; static const int MAX_MATCH; static const int MIN_BLOCK_LENGTH; int32* _hashes; int _hashSize; byte* _mLenBuf; byte* _mBuf; byte* _tkBuf; int _bufferSize; Context* _pCtx; bool inverseV6(SliceArray& src, SliceArray& dst, int length); bool inverseV5(SliceArray& src, SliceArray& dst, int length); static int emitLength(byte block[], int len); static void emitLiterals(const byte src[], byte dst[], int len); static int findMatch(const byte block[], const int pos, const int ref, const int maxMatch); static int readLength(const byte block[], int& pos); static int32 hash(const byte* p); }; class LZPCodec FINAL : public Transform { public: LZPCodec() { _hashes = nullptr; _hashSize = 0; } LZPCodec(Context&) { _hashes = nullptr; _hashSize = 0; } ~LZPCodec() { delete[] _hashes; } bool forward(SliceArray& src, SliceArray& dst, int length); bool inverse(SliceArray& src, SliceArray& dst, int length); // Required encoding output buffer size int getMaxEncodedLength(int srcLen) const { return (srcLen <= 1024) ? 
srcLen + 16 : srcLen + (srcLen / 64); } private: static const uint HASH_SEED; static const uint HASH_LOG; static const uint HASH_SHIFT; static const int MIN_MATCH; static const int MIN_BLOCK_LENGTH; static const int MATCH_FLAG; int32* _hashes; int _hashSize; static int findMatch(const byte block[], const int pos, const int ref, const int maxMatch); }; template inline void LZXCodec::emitLiterals(const byte src[], byte dst[], int len) { for (int i = 0; i < len; i += 16) memcpy(&dst[i], &src[i], 16); } template inline int32 LZXCodec::hash(const byte* p) { return ((uint64(LittleEndian::readLong64(p)) << HASH_LSHIFT) * HASH_SEED) >> HASH_RSHIFT; } template inline int LZXCodec::emitLength(byte block[], int length) { if (length < 254) { block[0] = byte(length); return 1; } if (length < 65536 + 254) { const uint32 l = (length - 254) | 0x00FE0000; kanzi::BigEndian::writeInt32(&block[0], l << 8); return 3; } const uint32 l = (length - 255) | 0xFF000000; kanzi::BigEndian::writeInt32(&block[0], l); return 4; } template inline int LZXCodec::readLength(const byte block[], int& pos) { int res = int(block[pos++]); if (res < 254) return res; if (res == 254) { res += ((kanzi::BigEndian::readInt16(&block[pos])) & 0xFFFF); pos += 2; return res; } res += ((kanzi::BigEndian::readInt32(&block[pos])) >> 8); pos += 3; return res; } template inline int LZXCodec::findMatch(const byte src[], const int srcIdx, const int ref, const int maxMatch) { int n = 0; while (n + 8 <= maxMatch) { const int64 diff = LittleEndian::readLong64(&src[srcIdx + n]) ^ LittleEndian::readLong64(&src[ref + n]); if (diff != 0) { n += (Global::trailingZeros(uint64(diff)) >> 3); break; } n += 8; } return n; } inline int LZPCodec::findMatch(const byte src[], const int srcIdx, const int ref, const int maxMatch) { int n = 0; while (n + 8 <= maxMatch) { const int64 diff = LittleEndian::readLong64(&src[srcIdx + n]) ^ LittleEndian::readLong64(&src[ref + n]); if (diff != 0) { n += (Global::trailingZeros(uint64(diff)) >> 3); 
break; } n += 8; } return n; } } #endif kanzi-cpp-2.5.2/src/transform/NullTransform.hpp000066400000000000000000000040441516423635400215500ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #pragma once #ifndef knz_NullTransform #define knz_NullTransform #include "../Context.hpp" #include "../Transform.hpp" namespace kanzi { class NullTransform FINAL : public Transform { public: NullTransform() {} NullTransform(Context&) {} ~NullTransform() {} bool forward(SliceArray& input, SliceArray& output, int length) { return doCopy(input, output, length); } bool inverse(SliceArray& input, SliceArray& output, int length) { return doCopy(input, output, length); } // Required encoding output buffer size int getMaxEncodedLength(int inputLen) const { return inputLen; } private: bool doCopy(SliceArray& input, SliceArray& output, int length) const; }; inline bool NullTransform::doCopy(SliceArray& input, SliceArray& output, int length) const { if (length == 0) return true; if (!SliceArray::isValid(input)) throw std::invalid_argument("Invalid input block"); if (!SliceArray::isValid(output)) throw std::invalid_argument("Invalid output block"); if (input._index + length > input._length) return false; if (output._index + length > output._length) return false; memcpy(&output._array[output._index], &input._array[input._index], size_t(length)); input._index += length; output._index += length; return true; } } #endif 
kanzi-cpp-2.5.2/src/transform/RLT.cpp000066400000000000000000000231301516423635400173730ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include #include #include #include "RLT.hpp" #include "../Global.hpp" #include "../Memory.hpp" using namespace kanzi; using namespace std; const int RLT::RUN_LEN_ENCODE1 = 224; // used to encode run length const int RLT::RUN_LEN_ENCODE2 = (255 - RUN_LEN_ENCODE1) << 8; // used to encode run length const int RLT::RUN_THRESHOLD = 3; const int RLT::MAX_RUN = 0xFFFF + RUN_LEN_ENCODE2 + RUN_THRESHOLD - 1; const int RLT::MAX_RUN4 = MAX_RUN - 4; const int RLT::MIN_BLOCK_LENGTH = 16; const kanzi::byte RLT::DEFAULT_ESCAPE = kanzi::byte(0xFB); bool RLT::forward(SliceArray& input, SliceArray& output, int count) { if (count == 0) return true; if (count < MIN_BLOCK_LENGTH) return false; if (!SliceArray::isValid(input)) throw invalid_argument("RLT: Invalid input block"); if (!SliceArray::isValid(output)) throw invalid_argument("RLT: Invalid output block"); if (output._length - output._index < getMaxEncodedLength(count)) return false; const kanzi::byte* src = &input._array[input._index]; kanzi::byte* dst = &output._array[output._index]; Global::DataType dt = Global::UNDEFINED; bool findBestEscape = true; if (_pCtx != nullptr) { dt = (Global::DataType) _pCtx->getInt("dataType", Global::UNDEFINED); if ((dt == Global::DNA) || (dt == Global::BASE64) || (dt == Global::UTF8)) return false; std::string entropyType = 
_pCtx->getString("entropy"); transform(entropyType.begin(), entropyType.end(), entropyType.begin(), ::toupper); // Fast track if fast entropy coder is used if ((entropyType == "NONE") || (entropyType == "ANS0") || (entropyType == "HUFFMAN") || (entropyType == "RANGE")) findBestEscape = false; } kanzi::byte escape = DEFAULT_ESCAPE; if (findBestEscape == true) { uint freqs[256] = { 0 }; Global::computeHistogram(&src[0], count, freqs); if (dt == Global::UNDEFINED) { dt = Global::detectSimpleType(count, freqs); if ((_pCtx != nullptr) && (dt != Global::UNDEFINED)) _pCtx->putInt("dataType", dt); if ((dt == Global::DNA) || (dt == Global::BASE64) || (dt == Global::UTF8)) return false; } int minIdx = 0; if (freqs[minIdx] > 0) { for (int i = 1; i < 256; i++) { if (freqs[i] < freqs[minIdx]) { minIdx = i; if (freqs[i] == 0) break; } } } escape = kanzi::byte(minIdx); } int srcIdx = 0; int dstIdx = 0; const int srcEnd = count; const int srcEnd4 = srcEnd - 4; const int dstEnd = output._length; bool res = true; int run = 0; kanzi::byte prev = src[srcIdx++]; dst[dstIdx++] = escape; dst[dstIdx++] = prev; if (prev == escape) dst[dstIdx++] = kanzi::byte(0); // Main loop while (true) { if (prev == src[srcIdx]) { const uint32 v = 0x01010101 * uint32(prev); if (KANZI_MEM_EQ4(&v, &src[srcIdx])) { srcIdx += 4; run += 4; if ((run < MAX_RUN4) && (srcIdx < srcEnd4)) continue; } else { srcIdx++; run++; if (prev == src[srcIdx]) { srcIdx++; run++; if (prev == src[srcIdx]) { srcIdx++; run++; if ((run < MAX_RUN4) && (srcIdx < srcEnd4)) continue; } } } } if (run > RUN_THRESHOLD) { if (dstIdx + 6 >= dstEnd) { res = false; break; } dstIdx += emitRunLength(&dst[dstIdx], run, escape, prev); } else if (prev != escape) { if (dstIdx + run >= dstEnd) { res = false; break; } if (run-- > 0) dst[dstIdx++] = prev; while (run-- > 0) dst[dstIdx++] = prev; } else { // escape literal if (dstIdx + (2 * run) >= dstEnd) { res = false; break; } while (run-- > 0) { dst[dstIdx++] = escape; dst[dstIdx++] = 
kanzi::byte(0); } } prev = src[srcIdx]; srcIdx++; run = 1; if (srcIdx >= srcEnd4) break; } if (res == true) { // run == 1 if (prev != escape) { if (dstIdx + run < dstEnd) { while (run-- > 0) dst[dstIdx++] = prev; } } else { // escape literal if (dstIdx + (2 * run) < dstEnd) { while (run-- > 0) { dst[dstIdx++] = escape; dst[dstIdx++] = kanzi::byte(0); } } } // Emit the last few bytes while ((srcIdx < srcEnd) && (dstIdx < dstEnd)) { if (src[srcIdx] == escape) { if (dstIdx + 2 >= dstEnd) { res = false; break; } dst[dstIdx++] = escape; dst[dstIdx++] = kanzi::byte(0); srcIdx++; continue; } dst[dstIdx++] = src[srcIdx++]; } res &= (srcIdx == srcEnd); } input._index += srcIdx; output._index += dstIdx; return res && (dstIdx < srcIdx); } int RLT::emitRunLength(kanzi::byte dst[], int run, kanzi::byte escape, kanzi::byte val) { dst[0] = val; dst[1] = kanzi::byte(0); int dstIdx = (val == escape) ? 2 : 1; dst[dstIdx++] = escape; run -= RUN_THRESHOLD; // Encode run length if (run >= RUN_LEN_ENCODE1) { if (run < RUN_LEN_ENCODE2) { run -= RUN_LEN_ENCODE1; dst[dstIdx++] = kanzi::byte(RUN_LEN_ENCODE1 + (run >> 8)); } else { run -= RUN_LEN_ENCODE2; dst[dstIdx++] = kanzi::byte(0xFF); dst[dstIdx++] = kanzi::byte(run >> 8); } } dst[dstIdx] = kanzi::byte(run); return dstIdx + 1; } bool RLT::inverse(SliceArray& input, SliceArray& output, int count) { if (count == 0) return true; if (!SliceArray::isValid(input)) throw invalid_argument("RLT: Invalid input block"); if (!SliceArray::isValid(output)) throw invalid_argument("RLT: Invalid output block"); if (input._index + count > input._length) return false; const kanzi::byte* src = &input._array[input._index]; kanzi::byte* dst = &output._array[output._index]; int srcIdx = 0; int dstIdx = 0; const int srcEnd = srcIdx + count; const int dstEnd = output._length - output._index; bool res = true; const kanzi::byte escape = src[srcIdx++]; if ((srcIdx < srcEnd) && (src[srcIdx] == escape)) { srcIdx++; // The data cannot start with a run but may start 
with an escape literal if ((srcIdx < srcEnd) && (src[srcIdx] != kanzi::byte(0))) return false; if (dstIdx >= dstEnd) return false; dst[dstIdx++] = escape; srcIdx++; } // Main loop while (srcIdx < srcEnd) { if (src[srcIdx] != escape) { // Literal if (dstIdx >= dstEnd) { res = false; break; } dst[dstIdx++] = src[srcIdx++]; continue; } srcIdx++; if (srcIdx >= srcEnd) { res = false; break; } int run = int(src[srcIdx++]); if (run == 0) { // Just an escape symbol, not a run if (dstIdx >= dstEnd) { res = false; break; } dst[dstIdx++] = escape; continue; } // Decode run length if (run == 0xFF) { if (srcIdx + 1 >= srcEnd) { res = false; break; } run = (int(src[srcIdx]) << 8) | int(src[srcIdx + 1]); srcIdx += 2; run += RUN_LEN_ENCODE2; } else if (run >= RUN_LEN_ENCODE1) { if (srcIdx >= srcEnd) { res = false; break; } run = ((run - RUN_LEN_ENCODE1) << 8) | int(src[srcIdx]); srcIdx++; run += RUN_LEN_ENCODE1; } run += (RUN_THRESHOLD - 1); if ((dstIdx + run >= dstEnd) || (run > MAX_RUN)) { res = false; break; } if (dstIdx == 0) { res = false; break; } memset(&dst[dstIdx], int(dst[dstIdx - 1]), size_t(run)); dstIdx += run; } input._index += srcIdx; output._index += dstIdx; return res && (srcIdx == srcEnd); } kanzi-cpp-2.5.2/src/transform/RLT.hpp000066400000000000000000000033711516423635400174050ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once #ifndef knz_RLT #define knz_RLT #include "../Context.hpp" #include "../Transform.hpp" namespace kanzi { // Implementation of an escaped RLE // Run length encoding: // RUN_LEN_ENCODE1 = 224 => RUN_LEN_ENCODE2 = 31*224 = 6944 // 4 <= runLen < 224+4 -> 1 byte // 228 <= runLen < 6944+228 -> 2 bytes // 7172 <= runLen < 65535+7172 -> 3 bytes class RLT FINAL : public Transform { public: RLT() { _pCtx = nullptr; } RLT(Context& ctx) : _pCtx(&ctx) {} ~RLT() {} bool forward(SliceArray& pSrc, SliceArray& pDst, int length); bool inverse(SliceArray& pSrc, SliceArray& pDst, int length); int getMaxEncodedLength(int srcLen) const { return (srcLen <= 512) ? srcLen + 32 : srcLen; } private: static const int RUN_LEN_ENCODE1; static const int RUN_LEN_ENCODE2; static const int RUN_THRESHOLD; static const int MAX_RUN; static const int MAX_RUN4; static const int MIN_BLOCK_LENGTH; static const byte DEFAULT_ESCAPE; static int emitRunLength(byte dst[], int run, byte escape, byte val); Context* _pCtx; }; } #endif kanzi-cpp-2.5.2/src/transform/ROLZCodec.cpp000066400000000000000000001017601516423635400204640ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #include #include #include #include #include "ROLZCodec.hpp" #include "../Global.hpp" #include "../Memory.hpp" #include "../bitstream/DefaultInputBitStream.hpp" #include "../bitstream/DefaultOutputBitStream.hpp" #include "../entropy/ANSRangeDecoder.hpp" #include "../entropy/ANSRangeEncoder.hpp" #include "../util/fixedbuf.hpp" using namespace kanzi; using namespace std; const int ROLZCodec::HASH_SIZE = 65536; const int ROLZCodec::CHUNK_SIZE = 16 * 1024 * 1024; const int32 ROLZCodec::HASH = 200002979; const int32 ROLZCodec::HASH_MASK = ~(ROLZCodec::CHUNK_SIZE - 1); const int ROLZCodec::MAX_BLOCK_SIZE = 1024 * 1024 * 1024; const int ROLZCodec::MIN_BLOCK_SIZE = 64; ROLZCodec::ROLZCodec(uint logPosChecks) { _delegate = new ROLZCodec1(logPosChecks); } ROLZCodec::ROLZCodec(Context& ctx) { string transform = ctx.getString("transform", "NONE"); _delegate = (transform.find("ROLZX") != string::npos) ? static_cast*>(new ROLZCodec2(ctx)) : static_cast*>(new ROLZCodec1(ctx)); } bool ROLZCodec::forward(SliceArray& input, SliceArray& output, int count) { if (count == 0) return true; if (count < MIN_BLOCK_SIZE) return false; if (!SliceArray::isValid(input)) throw invalid_argument("ROLZ codec: Invalid input block"); if (!SliceArray::isValid(output)) throw invalid_argument("ROLZ codec: Invalid output block"); if (input._array == output._array) return false; if (count > MAX_BLOCK_SIZE) return false; return _delegate->forward(input, output, count); } bool ROLZCodec::inverse(SliceArray& input, SliceArray& output, int count) { if (count == 0) return true; if (!SliceArray::isValid(input)) throw invalid_argument("ROLZ codec: Invalid input block"); if (!SliceArray::isValid(output)) throw invalid_argument("ROLZ codec: Invalid output block"); if (input._array == output._array) return false; if ((count < 5) || (input._index + count > input._length)) return false; if (count > MAX_BLOCK_SIZE) return false; return _delegate->inverse(input, output, count); } const int ROLZCodec1::MIN_MATCH3 = 
3; const int ROLZCodec1::MIN_MATCH4 = 4; const int ROLZCodec1::MIN_MATCH7 = 7; const int ROLZCodec1::MAX_MATCH = MIN_MATCH3 + 65535; const int ROLZCodec1::LOG_POS_CHECKS = 4; ROLZCodec1::ROLZCodec1(uint logPosChecks) : _logPosChecks(logPosChecks) { if ((logPosChecks < 2) || (logPosChecks > 8)) { stringstream ss; ss << "ROLZ codec: Invalid logPosChecks parameter: " << logPosChecks << " (must be in [2..8])"; throw invalid_argument(ss.str()); } _pCtx = nullptr; _posChecks = 1 << _logPosChecks; _maskChecks = uint8(_posChecks - 1); _minMatch = MIN_MATCH3; _mSize = 0; _matches = nullptr; memset(&_counters[0], 0, sizeof(_counters)); } ROLZCodec1::ROLZCodec1(Context& ctx) : _pCtx(&ctx) { _logPosChecks = LOG_POS_CHECKS; _posChecks = 1 << _logPosChecks; _maskChecks = uint8(_posChecks - 1); _minMatch = MIN_MATCH3; _mSize = 0; _matches = nullptr; memset(&_counters[0], 0, sizeof(_counters)); } // return position index (_logPosChecks bits) + length (16 bits) or -1 int ROLZCodec1::findMatch(const kanzi::byte buf[], int pos, int end, uint32 hash32, const uint32* matches, const uint8* counter) const { const int s = int(*counter); const int e = s - _posChecks; prefetchRead(matches); const kanzi::byte* curBuf = &buf[pos]; int bestLen = 0; int bestIdx = -1; const int maxMatch = min(ROLZCodec1::MAX_MATCH, end - pos) - 8; // Check all recorded positions for (int i = s; i > e; i--) { uint32 ref = matches[i & _maskChecks]; // Hash check may save a memory access ... if ((ref & ROLZCodec::HASH_MASK) != hash32) continue; ref &= ~ROLZCodec::HASH_MASK; if (buf[ref + bestLen] != curBuf[bestLen]) continue; int n = 0; while (n < maxMatch) { const int64 diff = LittleEndian::readLong64(&buf[ref + n]) ^ LittleEndian::readLong64(&curBuf[n]); if (diff != 0) { n += (Global::trailingZeros(uint64(diff)) >> 3); break; } n += 8; } if (n > bestLen) { bestIdx = i; bestLen = n; } } return (bestLen < _minMatch) ? 
-1 : ((s - bestIdx) << 16) | (bestLen - _minMatch); } bool ROLZCodec1::forward(SliceArray& input, SliceArray& output, int count) { if (output._length < getMaxEncodedLength(count)) return false; const int srcEnd = count - 4; const kanzi::byte* src = &input._array[input._index]; kanzi::byte* dst = &output._array[output._index]; BigEndian::writeInt32(&dst[0], count); int dstIdx = 5; int sizeChunk = min(count, ROLZCodec::CHUNK_SIZE); int startChunk = 0; SliceArray litBuf(new kanzi::byte[getMaxEncodedLength(sizeChunk)], getMaxEncodedLength(sizeChunk)); SliceArray lenBuf(new kanzi::byte[sizeChunk / 5], sizeChunk / 5); SliceArray mIdxBuf(new kanzi::byte[sizeChunk / 4], sizeChunk / 4); SliceArray tkBuf(new kanzi::byte[sizeChunk / 4], sizeChunk / 4); memset(&_counters[0], 0, sizeof(_counters)); bool success = true; const int litOrder = (count < (1 << 17)) ? 0 : 1; int flags = litOrder; stringbuf buffer; iostream ios(&buffer); _minMatch = MIN_MATCH3; int delta = 2; if (_pCtx != nullptr) { Global::DataType dt = (Global::DataType) _pCtx->getInt("dataType", Global::UNDEFINED); if (dt == Global::UNDEFINED) { uint freqs0[256] = { 0 }; Global::computeHistogram(&src[0], count, freqs0); dt = Global::detectSimpleType(count, freqs0); if (dt != Global::UNDEFINED) _pCtx->putInt("dataType", dt); } if (dt == Global::EXE) { delta = 3; flags |= 8; } else if (dt == Global::DNA) { delta = 8; _minMatch = MIN_MATCH7; flags |= 4; } else if (dt == Global::MULTIMEDIA) { delta = 8; _minMatch = MIN_MATCH4; flags |= 2; } } if (_mSize == 0) { _mSize = size_t(ROLZCodec::HASH_SIZE << _logPosChecks); if (_matches != nullptr) delete[] _matches; _matches = new uint32[_mSize]; } flags |= (_logPosChecks << 4); dst[4] = kanzi::byte(flags); const bool cond = _minMatch == MIN_MATCH3; // Main loop while (startChunk < srcEnd) { litBuf._index = 0; lenBuf._index = 0; mIdxBuf._index = 0; tkBuf._index = 0; memset(&_matches[0], 0, sizeof(uint32) * size_t(ROLZCodec::HASH_SIZE << _logPosChecks)); const int endChunk = 
min(startChunk + sizeChunk, srcEnd); sizeChunk = endChunk - startChunk; const kanzi::byte* buf = &src[startChunk]; const kanzi::byte* ref = &src[startChunk - delta]; int srcIdx = 0; const int n = min(srcEnd - startChunk, 8); for (int j = 0; j < n; j++) litBuf._array[litBuf._index++] = buf[srcIdx++]; int firstLitIdx = srcIdx; int srcInc = 0; while (srcIdx < sizeChunk) { const uint32 key = (cond == true) ? ROLZCodec::getKey1(&ref[srcIdx]): ROLZCodec::getKey2(&ref[srcIdx]); uint8* counter = &_counters[key]; uint32* matches = &_matches[key << _logPosChecks]; uint32 hash32 = ROLZCodec::hash(&buf[srcIdx]); int match = findMatch(buf, srcIdx, sizeChunk, hash32, matches, counter); // Register current position *counter = (*counter + 1) & _maskChecks; matches[*counter] = hash32 | int32(srcIdx); if (match < 0) { srcIdx++; srcIdx += (srcInc >> 6); srcInc++; continue; } // Check if better match at next position const int srcIdx1 = srcIdx + 1; const uint32 key2 = (cond == true) ? ROLZCodec::getKey1(&ref[srcIdx1]) : ROLZCodec::getKey2(&ref[srcIdx1]); counter = &_counters[key2]; matches = &_matches[key2 << _logPosChecks]; hash32 = ROLZCodec::hash(&buf[srcIdx1]); const int match2 = findMatch(buf, srcIdx1, sizeChunk, hash32, matches, counter); if ((match2 >= 0) && ((match2 & 0xFFFF) > (match & 0xFFFF))) { // New match is better match = match2; srcIdx = srcIdx1; // Register current position *counter = (*counter + 1) & _maskChecks; matches[*counter] = hash32 | int32(srcIdx); } // token LLLLLMMM -> L lit length, M match length const int litLen = srcIdx - firstLitIdx; const int token = (litLen < 31) ? litLen << 3 : 0xF8; const int mLen = match & 0xFFFF; if (mLen >= 7) { tkBuf._array[tkBuf._index++] = kanzi::byte(token | 0x07); lenBuf._index += emitLength(&lenBuf._array[lenBuf._index], mLen - 7); } else { tkBuf._array[tkBuf._index++] = kanzi::byte(token | mLen); } // Emit literals if (litLen > 0) { lenBuf._index += (litLen >= 31 ? 
emitLength(&lenBuf._array[lenBuf._index], litLen - 31) : 0); memcpy(&litBuf._array[litBuf._index], &buf[firstLitIdx], size_t(litLen)); litBuf._index += litLen; } // Emit match index mIdxBuf._array[mIdxBuf._index++] = kanzi::byte(match >> 16); srcIdx += (mLen + _minMatch); firstLitIdx = srcIdx; srcInc = 0; } // Emit last chunk literals const int litLen = sizeChunk - firstLitIdx; if (tkBuf._index != 0) { // At least one match to emit const int token = (litLen < 31) ? litLen << 3 : 0xF8; tkBuf._array[tkBuf._index++] = kanzi::byte(token); } lenBuf._index += (litLen >= 31 ? emitLength(&lenBuf._array[lenBuf._index], litLen - 31) : 0); memcpy(&litBuf._array[litBuf._index], &buf[firstLitIdx], size_t(litLen)); litBuf._index += litLen; try { // Encode literal, match length and match index buffers DefaultOutputBitStream obs(ios, 65536); obs.writeBits(litBuf._index, 32); obs.writeBits(tkBuf._index, 32); obs.writeBits(lenBuf._index, 32); obs.writeBits(mIdxBuf._index, 32); ANSRangeEncoder litEnc(obs, litOrder); litEnc.encode(litBuf._array, 0, litBuf._index); litEnc.dispose(); ANSRangeEncoder mEnc(obs, 0, 32768); mEnc.encode(tkBuf._array, 0, tkBuf._index); mEnc.encode(lenBuf._array, 0, lenBuf._index); mEnc.encode(mIdxBuf._array, 0, mIdxBuf._index); mEnc.dispose(); } catch (const BitStreamException&) { delete[] litBuf._array; delete[] lenBuf._array; delete[] mIdxBuf._array; delete[] tkBuf._array; throw; } // Copy bitstream array to output const int bufSize = int(ios.tellp()); if (dstIdx + bufSize > output._length) { input._index = startChunk + srcIdx; success = false; goto End; } buffer.pubseekpos(0); ios.read(reinterpret_cast(&dst[dstIdx]), streamsize(bufSize)); dstIdx += bufSize; startChunk = endChunk; } End: if (success == true) { if (dstIdx + 4 > output._length) { input._index = srcEnd; } else { // Emit last literals memcpy(&dst[dstIdx], &src[srcEnd], 4); dstIdx += 4; input._index = srcEnd + 4; } } output._index += dstIdx; delete[] litBuf._array; delete[] lenBuf._array; 
delete[] mIdxBuf._array; delete[] tkBuf._array; return (input._index == count) && (dstIdx < count); } bool ROLZCodec1::inverse(SliceArray& input, SliceArray& output, int count) { kanzi::byte* src = &input._array[input._index]; const int end = BigEndian::readInt32(&src[0]); if ((end <= 4) || (end - 4 > output._length - output._index)) return false; const int dstEnd = end - 4; int srcIdx = 5; int sizeChunk = min(dstEnd, ROLZCodec::CHUNK_SIZE); int startChunk = 0; const int flags = int(src[4]); const int litOrder = flags & 1; _minMatch = MIN_MATCH3; int delta = 2; switch (flags & 0x0E) { case 2: // MULTIMEDIA _minMatch = MIN_MATCH4; delta = 8; break; case 4: // DNA _minMatch = MIN_MATCH7; delta = 8; break; case 8: // EXE delta = 3; break; default: break; } _logPosChecks = flags >> 4; if ((_logPosChecks < 2) || (_logPosChecks > 8)) return false; if (_mSize < size_t(ROLZCodec::HASH_SIZE << _logPosChecks)) { _mSize = size_t(ROLZCodec::HASH_SIZE << _logPosChecks); if (_matches != nullptr) delete[] _matches; _matches = new uint32[_mSize]; } _posChecks = 1 << _logPosChecks; _maskChecks = uint8(_posChecks - 1); kanzi::byte* arena = new kanzi::byte[sizeChunk + sizeChunk / 5 + 2 * sizeChunk / 4]; SliceArray litBuf(&arena[0], sizeChunk); SliceArray mIdxBuf(&arena[sizeChunk], sizeChunk / 4); SliceArray tkBuf(&arena[sizeChunk + sizeChunk / 4], sizeChunk / 4); SliceArray lenBuf(&arena[sizeChunk + sizeChunk / 2], sizeChunk / 5); memset(&_counters[0], 0, sizeof(_counters)); bool success = true; const int litBufSize = litBuf._length; // Main loop while (startChunk < dstEnd) { litBuf._index = 0; lenBuf._index = 0; mIdxBuf._index = 0; tkBuf._index = 0; memset(&_matches[0], 0, sizeof(uint32) * size_t(ROLZCodec::HASH_SIZE << _logPosChecks)); const int endChunk = min(startChunk + sizeChunk, dstEnd); sizeChunk = endChunk - startChunk; bool onlyLiterals = false; int litLenDecoded = 0; try { // Decode literal, length and match index buffers ifixedbuf buffer(reinterpret_cast(&src[srcIdx]), 
max(min(count - srcIdx, sizeChunk + 16), 65536)); istream is(&buffer); DefaultInputBitStream ibs(is, 65536); const int litLen = int(ibs.readBits(32)); const int tkLen = int(ibs.readBits(32)); const int mLenLen = int(ibs.readBits(32)); const int mIdxLen = int(ibs.readBits(32)); const int firstLitLen = min(sizeChunk, 8); if ((litLen < 0) || (tkLen < 0) || (mLenLen < 0) || (mIdxLen < 0)) { input._index += srcIdx; output._index += startChunk; success = false; goto End; } if ((litLen > litBuf._length) || (tkLen > tkBuf._length) || (mLenLen > lenBuf._length) || (mIdxLen > mIdxBuf._length) || (litLen < firstLitLen) || (litLen > sizeChunk) || ((tkLen == 0) && (mIdxLen != 0)) || ((tkLen > 0) && (mIdxLen + 1 != tkLen))) { input._index += srcIdx; output._index += startChunk; success = false; goto End; } litLenDecoded = litLen; ANSRangeDecoder litDec(ibs, litOrder); litDec.decode(litBuf._array, 0, litLen); litDec.dispose(); ANSRangeDecoder mDec(ibs, 0, 32768); mDec.decode(tkBuf._array, 0, tkLen); mDec.decode(lenBuf._array, 0, mLenLen); mDec.decode(mIdxBuf._array, 0, mIdxLen); mDec.dispose(); onlyLiterals = tkLen == 0; srcIdx += int((ibs.read() + 7) >> 3); } catch (const BitStreamException&) { delete[] arena; throw; } if (onlyLiterals == true) { // Shortcut when no match if (litLenDecoded != sizeChunk) { success = false; goto End; } memcpy(&output._array[output._index], &litBuf._array[0], size_t(sizeChunk)); startChunk = endChunk; output._index += sizeChunk; continue; } const bool cond = _minMatch == MIN_MATCH3; kanzi::byte* buf = &output._array[output._index]; const kanzi::byte* refBuf = &output._array[output._index - delta]; int dstIdx = 0; const int n = min(dstEnd - output._index, 8); for (int j = 0; j < n; j++) buf[dstIdx++] = litBuf._array[litBuf._index++]; // Next chunk while (dstIdx < sizeChunk) { // token LLLLLMMM -> L lit length, M match length const int token = int(tkBuf._array[tkBuf._index++]); int mLen = token & 0x07; mLen += (mLen == 7 ? 
_minMatch + readLength(lenBuf._array, lenBuf._index) : _minMatch); // Emit literals const int litLen = (token < 0xF8) ? token >> 3 : readLength(lenBuf._array, lenBuf._index) + 31; if (litLen > 0) { if (dstIdx + litLen > litBufSize) { success = false; goto End; } memcpy(&buf[dstIdx], &litBuf._array[litBuf._index], size_t(litLen)); int srcInc = 0; if (cond == true) { for (int k = 0; k < litLen; k++) { const uint32 key = ROLZCodec::getKey1(&refBuf[dstIdx + k]); uint8* counter = &_counters[key]; uint32* matches = &_matches[key << _logPosChecks]; *counter = (*counter + 1) & _maskChecks; matches[*counter] = dstIdx + k; k += (srcInc >> 6); srcInc++; } } else { for (int k = 0; k < litLen; k++) { const uint32 key = ROLZCodec::getKey2(&refBuf[dstIdx + k]); uint8* counter = &_counters[key]; uint32* matches = &_matches[key << _logPosChecks]; *counter = (*counter + 1) & _maskChecks; matches[*counter] = dstIdx + k; k += (srcInc >> 6); srcInc++; } } litBuf._index += litLen; dstIdx += litLen; prefetchRead(&litBuf._array[litBuf._index]); if (dstIdx >= sizeChunk) { // Last chunk literals not followed by match if (dstIdx == sizeChunk) break; output._index += dstIdx; success = false; goto End; } } // Sanity check if (output._index + dstIdx + mLen > dstEnd) { success = false; goto End; } const uint8 mIdx = uint8(mIdxBuf._array[mIdxBuf._index++]); const uint32 key = (cond == true) ? 
ROLZCodec::getKey1(&refBuf[dstIdx]) : ROLZCodec::getKey2(&refBuf[dstIdx]); uint32* matches = &_matches[key << _logPosChecks]; const int32 ref = matches[(_counters[key] - mIdx) & _maskChecks]; _counters[key] = (_counters[key] + 1) & _maskChecks; matches[_counters[key]] = dstIdx; dstIdx = ROLZCodec::emitCopy(buf, dstIdx, ref, mLen); } startChunk = endChunk; output._index += dstIdx; } End: if (success == true) { // Emit last chunk literals if ((output._index + 4 > output._length) || (srcIdx + 4 > input._length)) { success = false; } else { memcpy(&output._array[output._index], &src[srcIdx], 4); output._index += 4; srcIdx += 4; } } input._index += srcIdx; delete[] arena; return (success == true) && (srcIdx == count); } const uint64 ROLZEncoder::TOP = 0x00FFFFFFFFFFFFFF; const uint64 ROLZEncoder::MASK_0_32 = 0x00000000FFFFFFFF; const int ROLZEncoder::MATCH_FLAG = 0; const int ROLZEncoder::LITERAL_FLAG = 1; const int ROLZEncoder::PSCALE = 0xFFFF; ROLZEncoder::ROLZEncoder(uint litLogSize, uint mLogSize, kanzi::byte buf[], int& idx) : _idx(idx) , _low(0) , _high(TOP) , _c1(1) , _ctx(0) , _pIdx(LITERAL_FLAG) { _buf = buf; _logSizes[MATCH_FLAG] = mLogSize; _logSizes[LITERAL_FLAG] = litLogSize; _probs[MATCH_FLAG] = new uint16[256 << mLogSize]; _probs[LITERAL_FLAG] = new uint16[256 << litLogSize]; reset(); } void ROLZEncoder::reset() { const int mLogSize = _logSizes[MATCH_FLAG]; for (int i = 0; i < (256 << mLogSize); i++) _probs[MATCH_FLAG][i] = PSCALE >> 1; const int litLogSize = _logSizes[LITERAL_FLAG]; for (int i = 0; i < (256 << litLogSize); i++) _probs[LITERAL_FLAG][i] = PSCALE >> 1; } void ROLZEncoder::encodeBits(int val, int n) { _c1 = 1; do { n--; encodeBit(val & (1 << n)); } while (n != 0); } void ROLZEncoder::encode9Bits(int val) { _c1 = 1; encodeBit(val & 0x100); encodeBit(val & 0x80); encodeBit(val & 0x40); encodeBit(val & 0x20); encodeBit(val & 0x10); encodeBit(val & 0x08); encodeBit(val & 0x04); encodeBit(val & 0x02); encodeBit(val & 0x01); } void 
ROLZEncoder::dispose() { for (int i = 0; i < 8; i++) { _buf[_idx + i] = kanzi::byte(_low >> 56); _low <<= 8; } _idx += 8; } const uint64 ROLZDecoder::TOP = 0x00FFFFFFFFFFFFFF; const uint64 ROLZDecoder::MASK_0_56 = 0x00FFFFFFFFFFFFFF; const uint64 ROLZDecoder::MASK_0_32 = 0x00000000FFFFFFFF; const int ROLZDecoder::MATCH_FLAG = 0; const int ROLZDecoder::LITERAL_FLAG = 1; const int ROLZDecoder::PSCALE = 0xFFFF; ROLZDecoder::ROLZDecoder(uint litLogSize, uint mLogSize, kanzi::byte buf[], int& idx) : _idx(idx) , _low(0) , _high(TOP) , _current(0) , _buf(buf) , _c1(1) , _ctx(0) , _pIdx(LITERAL_FLAG) { for (int i = 0; i < 8; i++) _current = (_current << 8) | (uint64(_buf[_idx + i]) & 0xFF); _idx += 8; _logSizes[MATCH_FLAG] = mLogSize; _logSizes[LITERAL_FLAG] = litLogSize; _probs[MATCH_FLAG] = new uint16[256 << mLogSize]; _probs[LITERAL_FLAG] = new uint16[256 << litLogSize]; reset(); } void ROLZDecoder::reset() { const int mLogSize = _logSizes[MATCH_FLAG]; for (int i = 0; i < (256 << mLogSize); i++) _probs[MATCH_FLAG][i] = PSCALE >> 1; const int litLogSize = _logSizes[LITERAL_FLAG]; for (int i = 0; i < (256 << litLogSize); i++) _probs[LITERAL_FLAG][i] = PSCALE >> 1; } int ROLZDecoder::decodeBits(int n) { _c1 = 1; const int mask = (1 << n) - 1; do { decodeBit(); n--; } while (n != 0); return _c1 & mask; } int ROLZDecoder::decode9Bits() { _c1 = 1; decodeBit(); decodeBit(); decodeBit(); decodeBit(); decodeBit(); decodeBit(); decodeBit(); decodeBit(); decodeBit(); return _c1 & 0x1FF; } const int ROLZCodec2::MATCH_FLAG = 0; const int ROLZCodec2::LITERAL_FLAG = 1; const int ROLZCodec2::MATCH_CTX = 0; const int ROLZCodec2::LITERAL_CTX = 1; const int ROLZCodec2::MIN_MATCH3 = 3; const int ROLZCodec2::MIN_MATCH7 = 7; const int ROLZCodec2::MAX_MATCH = MIN_MATCH3 + 255; const int ROLZCodec2::LOG_POS_CHECKS = 5; ROLZCodec2::ROLZCodec2(uint logPosChecks) : _logPosChecks(logPosChecks) { if ((logPosChecks < 2) || (logPosChecks > 8)) { stringstream ss; ss << "ROLZX codec: Invalid 
logPosChecks parameter: " << logPosChecks << " (must be in [2..8])"; throw invalid_argument(ss.str()); } _pCtx = nullptr; _posChecks = 1 << _logPosChecks; _maskChecks = uint8(_posChecks - 1); _minMatch = MIN_MATCH3; _matches = new uint32[ROLZCodec::HASH_SIZE << _logPosChecks]; memset(&_counters[0], 0, sizeof(_counters)); } ROLZCodec2::ROLZCodec2(Context& ctx) : _pCtx(&ctx) { _logPosChecks = LOG_POS_CHECKS; _posChecks = 1 << _logPosChecks; _maskChecks = uint8(_posChecks - 1); _minMatch = MIN_MATCH3; _matches = new uint32[ROLZCodec::HASH_SIZE << _logPosChecks]; memset(&_counters[0], 0, sizeof(_counters)); } // return position index (_logPosChecks bits) + length (16 bits) or -1 int ROLZCodec2::findMatch(const kanzi::byte buf[], int pos, int end, uint32 key) { const int counter = _counters[key]; uint32* matches = &_matches[key << _logPosChecks]; prefetchRead(matches); const kanzi::byte* curBuf = &buf[pos]; const uint32 hash32 = ROLZCodec::hash(curBuf); int bestLen = 0; int bestIdx = -1; const int maxMatch = min(ROLZCodec2::MAX_MATCH, end - pos) - 8; // Check all recorded positions for (int i = counter; i > counter - _posChecks; i--) { uint32 ref = matches[i & _maskChecks]; // Hash check may save a memory access ... if ((ref & ROLZCodec::HASH_MASK) != hash32) continue; ref &= ~ROLZCodec::HASH_MASK; if (buf[ref + bestLen] != curBuf[bestLen]) continue; int n = 0; while (n < maxMatch) { const int64 diff = LittleEndian::readLong64(&buf[ref + n]) ^ LittleEndian::readLong64(&curBuf[n]); if (diff != 0) { n += (Global::trailingZeros(uint64(diff)) >> 3); break; } n += 8; } if (n > bestLen) { bestIdx = counter - i; bestLen = n; if (bestLen == maxMatch) break; } } // Register current position _counters[key] = (_counters[key] + 1) & _maskChecks; matches[_counters[key]] = hash32 | int32(pos); return (bestLen < _minMatch) ? 
-1 : (bestIdx << 16) | (bestLen - _minMatch); } bool ROLZCodec2::forward(SliceArray& input, SliceArray& output, int count) { if (output._length < getMaxEncodedLength(count)) return false; const int srcEnd = count - 4; const kanzi::byte* src = &input._array[input._index]; kanzi::byte* dst = &output._array[output._index]; BigEndian::writeInt32(&dst[0], count); _minMatch = MIN_MATCH3; int flags = 0; int delta = 2; if (_pCtx != nullptr) { Global::DataType dt = (Global::DataType) _pCtx->getInt("dataType", Global::UNDEFINED); if (dt == Global::UNDEFINED) { uint freqs0[256] = { 0 }; Global::computeHistogram(&src[0], count, freqs0); dt = Global::detectSimpleType(count, freqs0); if (dt != Global::UNDEFINED) _pCtx->putInt("dataType", dt); } if (dt == Global::EXE) { delta = 3; flags |= 8; } else if (dt == Global::DNA) { delta = 8; _minMatch = MIN_MATCH7; flags |= 4; } } const int dt = delta; const bool cond = _minMatch == MIN_MATCH3; dst[4] = kanzi::byte(flags); int srcIdx = 0; int dstIdx = 5; int sizeChunk = min(count, ROLZCodec::CHUNK_SIZE); int startChunk = 0; ROLZEncoder re(9, _logPosChecks, &dst[0], dstIdx); memset(&_counters[0], 0, sizeof(_counters)); while (startChunk < srcEnd) { memset(&_matches[0], 0, sizeof(uint32) * size_t(ROLZCodec::HASH_SIZE << _logPosChecks)); const int endChunk = min(startChunk + sizeChunk, srcEnd); sizeChunk = endChunk - startChunk; re.reset(); src = &input._array[startChunk]; srcIdx = 0; // First literals const int n = min(srcEnd - startChunk, 8); re.setContext(LITERAL_CTX, kanzi::byte(0)); for (int j = 0; j < n; j++) { re.encode9Bits((LITERAL_FLAG << 8) | int(src[srcIdx])); srcIdx++; } while (srcIdx < sizeChunk) { re.setContext(LITERAL_CTX, src[srcIdx - 1]); uint32 key = (cond == true) ? 
ROLZCodec::getKey1(&src[srcIdx - dt]) : ROLZCodec::getKey2(&src[srcIdx - dt]); const int match = findMatch(src, srcIdx, sizeChunk, key); if (match < 0) { // Emit one literal re.encode9Bits((LITERAL_FLAG << 8) | int(src[srcIdx])); srcIdx++; continue; } // Emit one match length and index const int matchLen = match & 0xFFFF; re.encode9Bits((MATCH_FLAG << 8) | matchLen); const int matchIdx = match >> 16; re.setContext(MATCH_CTX, src[srcIdx - 1]); re.encodeBits(matchIdx, _logPosChecks); srcIdx += (matchLen + _minMatch); } startChunk = endChunk; } // Emit last literals for (int i = 0; i < 4; i++, srcIdx++) { re.setContext(LITERAL_CTX, src[srcIdx - 1]); re.encode9Bits((LITERAL_FLAG << 8) | int(src[srcIdx])); } re.dispose(); input._index = startChunk - sizeChunk + srcIdx; output._index = dstIdx; return (input._index == count) && (output._index < count); } bool ROLZCodec2::inverse(SliceArray& input, SliceArray& output, int count) { if (count == 0) return true; if (input._array == output._array) return false; kanzi::byte* src = &input._array[input._index]; const int dstEnd = BigEndian::readInt32(&src[0]); if ((dstEnd <= 0) || (dstEnd > output._length - output._index)) return false; int srcIdx = 5; int sizeChunk = min(dstEnd, ROLZCodec::CHUNK_SIZE); int startChunk = 0; _minMatch = MIN_MATCH3; const int flags = int(src[4]); int delta = 2; if ((flags & 0x0E) == 8) { delta = 3; } else if ((flags & 0x0E) == 4) { delta = 8; _minMatch = MIN_MATCH7; } const bool cond = _minMatch == MIN_MATCH3; ROLZDecoder rd(9, _logPosChecks, &src[0], srcIdx); memset(&_counters[0], 0, sizeof(_counters)); while (startChunk < dstEnd) { memset(&_matches[0], 0, sizeof(uint32) * (ROLZCodec::HASH_SIZE << _logPosChecks)); const int endChunk = min(startChunk + sizeChunk, dstEnd); sizeChunk = endChunk - startChunk; rd.reset(); kanzi::byte* dst = &output._array[output._index]; kanzi::byte* refBuf = &output._array[output._index - delta]; int dstIdx = 0; // First literals rd.setContext(LITERAL_CTX, 
kanzi::byte(0)); const int n = min(dstEnd - output._index, 8); for (int j = 0; j < n; j++) { int val = rd.decode9Bits(); // Sanity check if ((val >> 8) == MATCH_FLAG) { output._index += dstIdx; return false; } dst[dstIdx++] = kanzi::byte(val); } // Next chunk while (dstIdx < sizeChunk) { const int savedIdx = dstIdx; const uint32 key = (cond == true) ? ROLZCodec::getKey1(&refBuf[dstIdx]) : ROLZCodec::getKey2(&refBuf[dstIdx]); uint32* matches = &_matches[key << _logPosChecks]; rd.setContext(LITERAL_CTX, dst[dstIdx - 1]); int val = rd.decode9Bits(); if ((val >> 8) == LITERAL_FLAG) { dst[dstIdx++] = kanzi::byte(val); } else { // Read one match length and index const int matchLen = val & 0xFF; prefetchRead(&_counters[key]); // Sanity check if (dstIdx + matchLen + 3 > dstEnd) { output._index += dstIdx; return false; } rd.setContext(MATCH_CTX, dst[dstIdx - 1]); const int32 matchIdx = int32(rd.decodeBits(_logPosChecks)); const int32 ref = matches[(_counters[key] - matchIdx) & _maskChecks]; dstIdx = ROLZCodec::emitCopy(dst, dstIdx, ref, matchLen + _minMatch); } // Update map _counters[key]++; matches[_counters[key] & _maskChecks] = savedIdx; } startChunk = endChunk; output._index += dstIdx; } rd.dispose(); input._index = srcIdx; return srcIdx == count; } kanzi-cpp-2.5.2/src/transform/ROLZCodec.hpp000066400000000000000000000233761516423635400204770ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once #ifndef knz_ROLZCodec #define knz_ROLZCodec #include "../Context.hpp" #include "../Memory.hpp" #include "../Transform.hpp" // Implementation of a Reduced Offset Lempel Ziv transform // More information about ROLZ at http://ezcodesample.com/rolz/rolz_article.html namespace kanzi { class ROLZEncoder { private: static const uint64 TOP; static const uint64 MASK_0_32; static const int MATCH_FLAG; static const int LITERAL_FLAG; static const int PSCALE; uint16* _probs[2]; uint _logSizes[2]; int& _idx; uint64 _low; uint64 _high; byte* _buf; int32 _c1; int32 _ctx; int _pIdx; void encodeBit(int bit); public: ROLZEncoder(uint litLogSize, uint mLogSize, byte buf[], int& idx); ~ROLZEncoder() { delete[] _probs[LITERAL_FLAG]; delete[] _probs[MATCH_FLAG]; } void encodeBits(int val, int n); void encode9Bits(int val); void dispose(); void reset(); void setContext(int n, byte ctx) { _pIdx = n; _ctx = int32(ctx) << _logSizes[_pIdx]; } }; class ROLZDecoder { private: static const uint64 TOP; static const uint64 MASK_0_56; static const uint64 MASK_0_32; static const int MATCH_FLAG; static const int LITERAL_FLAG; static const int PSCALE; uint16* _probs[2]; uint _logSizes[2]; int& _idx; uint64 _low; uint64 _high; uint64 _current; byte* _buf; int32 _c1; int32 _ctx; int _pIdx; int decodeBit(); public: ROLZDecoder(uint litLogSize, uint mLogSize, byte buf[], int& idx); ~ROLZDecoder() { delete[] _probs[LITERAL_FLAG]; delete[] _probs[MATCH_FLAG]; } int decodeBits(int n); int decode9Bits(); void dispose() const {} void reset(); void setContext(int n, byte ctx) { _pIdx = n; _ctx = int32(ctx) << _logSizes[_pIdx]; } }; // Use ANS to encode/decode literals and matches class ROLZCodec1 FINAL : public Transform { public: ROLZCodec1(uint logPosChecks); ROLZCodec1(Context& ctx); ~ROLZCodec1() { if (_matches != nullptr) delete[] _matches; } bool forward(SliceArray& src, SliceArray& dst, int length); bool inverse(SliceArray& src, SliceArray& dst, int length); // Required encoding output 
buffer size int getMaxEncodedLength(int srcLen) const { return (srcLen <= 512) ? srcLen + 64 : srcLen; } private: static const int MIN_MATCH3; static const int MIN_MATCH4; static const int MIN_MATCH7; static const int MAX_MATCH; static const int LOG_POS_CHECKS; uint32* _matches; size_t _mSize; uint8 _counters[65536]; int _logPosChecks; int _posChecks; Context* _pCtx; int _minMatch; uint8 _maskChecks; int findMatch(const byte buf[], int pos, int end, uint32 hash32, const uint32* matches, const uint8* counter) const; int emitLength(byte block[], int length) const; int readLength(const byte block[], int& idx) const; }; // Use CM (ROLZEncoder/ROLZDecoder) to encode/decode literals and matches // Code loosely based on 'balz' by Ilya Muravyov class ROLZCodec2 FINAL : public Transform { public: ROLZCodec2(uint logPosChecks); ROLZCodec2(Context& ctx); ~ROLZCodec2() { if (_matches != nullptr) delete[] _matches; } bool forward(SliceArray& src, SliceArray& dst, int length); bool inverse(SliceArray& src, SliceArray& dst, int length); // Required encoding output buffer size int getMaxEncodedLength(int srcLen) const { // Since we do not check the dst index for each byte (for speed purpose) // allocate some extra buffer for incompressible data. return srcLen + ((srcLen < 32768) ? 
1024 : srcLen >> 5); } private: static const int MATCH_FLAG; static const int LITERAL_FLAG; static const int MATCH_CTX; static const int LITERAL_CTX; static const int MIN_MATCH3; static const int MIN_MATCH7; static const int MAX_MATCH; static const int LOG_POS_CHECKS; uint32* _matches; uint8 _counters[65536]; int _logPosChecks; uint8 _maskChecks; Context* _pCtx; int _minMatch; int _posChecks; int findMatch(const byte buf[], int pos, int end, uint32 key); }; class ROLZCodec FINAL : public Transform { friend class ROLZCodec1; friend class ROLZCodec2; public: ROLZCodec(uint logPosChecks = 4); ROLZCodec(Context& ctx); ~ROLZCodec() { delete _delegate; } bool forward(SliceArray& src, SliceArray& dst, int length); bool inverse(SliceArray& src, SliceArray& dst, int length); // Required encoding output buffer size int getMaxEncodedLength(int srcLen) const { return _delegate->getMaxEncodedLength(srcLen); } private: static const int HASH_SIZE; static const int CHUNK_SIZE; static const int32 HASH; static const int32 HASH_MASK; static const int MAX_BLOCK_SIZE; static const int MIN_BLOCK_SIZE; Transform* _delegate; static uint32 getKey1(const byte* p) { return uint32(LittleEndian::readInt16(p)) & (HASH_SIZE - 1); } static uint32 getKey2(const byte* p) { return uint32((uint64(LittleEndian::readLong64(p)) * HASH) >> 40) & (HASH_SIZE - 1); } static uint32 hash(const byte* p) { return ((uint32(LittleEndian::readInt32(p)) << 8) * HASH) & HASH_MASK; } static int emitCopy(byte dst[], int dstIdx, int ref, int matchLen); }; inline int ROLZCodec1::emitLength(byte block[], int length) const { if (length < 1 << 7) { block[0] = byte(length); return 1; } int idx = 0; if (length >= 1 << 14) { block[idx] = byte(0x80 | (length >> 21)); idx += ((length >= 1 << 21) ? 
1 : 0); block[idx++] = byte(0x80 | (length >> 14)); } block[idx++] = byte(0x80 | (length >> 7)); block[idx++] = byte(length & 0x7F); return idx; } inline int ROLZCodec1::readLength(const byte block[], int& pos) const { int next = int(block[pos++]); if (next < 128) return next; int length = next & 0x7F; next = int(block[pos++]); length = (length << 7) | (next & 0x7F); if (next >= 128) { next = int(block[pos++]); length = (length << 7) | (next & 0x7F); if (next >= 128) { next = int(block[pos++]); length = (length << 7) | (next & 0x7F); } } return length; } inline int ROLZCodec::emitCopy(byte buf[], int dstIdx, int ref, int matchLen) { const int res = dstIdx + matchLen; if (dstIdx - ref >= 8) { while (matchLen > 0) { memcpy(&buf[dstIdx], &buf[ref], 8); ref += 8; dstIdx += 8; matchLen -= 8; } } else { while (matchLen != 0) { buf[dstIdx++] = buf[ref++]; matchLen--; } } return res; } inline void ROLZEncoder::encodeBit(int bit) { const uint64 split = ((_high - _low) >> 4) * uint64(_probs[_pIdx][_ctx + _c1] >> 4) >> 8; // Update fields with new interval bounds if (bit == 0) { _low += (split + 1); _probs[_pIdx][_ctx + _c1] -= (_probs[_pIdx][_ctx + _c1] >> 5); _c1 += _c1; } else { _high = _low + split; _probs[_pIdx][_ctx + _c1] -= ((_probs[_pIdx][_ctx + _c1] - PSCALE + 32) >> 5); _c1 += (_c1 + 1); } // Emit unchanged first 32 bits while (((_low ^ _high) >> 24) == 0) { BigEndian::writeInt32(&_buf[_idx], int32(_high >> 32)); _idx += 4; _low <<= 32; _high = (_high << 32) | MASK_0_32; } } inline int ROLZDecoder::decodeBit() { const uint64 mid = _low + (((_high - _low) >> 4) * uint64(_probs[_pIdx][_ctx + _c1] >> 4) >> 8); int bit; // Update bounds and predictor if (mid >= _current) { bit = 1; _high = mid; _probs[_pIdx][_ctx + _c1] -= ((_probs[_pIdx][_ctx + _c1] - PSCALE + 32) >> 5); _c1 += (_c1 + 1); } else { bit = 0; _low = mid + 1; _probs[_pIdx][_ctx + _c1] -= (_probs[_pIdx][_ctx + _c1] >> 5); _c1 += _c1; } // Read 32 bits while (((_low ^ _high) >> 24) == 0) { _low = (_low << 
32) & MASK_0_56; _high = ((_high << 32) | MASK_0_32) & MASK_0_56; const uint64 val = uint64(BigEndian::readInt32(&_buf[_idx])) & MASK_0_32; _current = ((_current << 32) | val) & MASK_0_56; _idx += 4; } return bit; } } #endif kanzi-cpp-2.5.2/src/transform/SBRT.cpp000066400000000000000000000073011516423635400175060ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include #include "SBRT.hpp" using namespace kanzi; const int SBRT::MODE_MTF = 1; // alpha = 0 const int SBRT::MODE_RANK = 2; // alpha = 1/2 const int SBRT::MODE_TIMESTAMP = 3; // alpha = 1 SBRT::SBRT(int mode) : _mask1((mode == MODE_TIMESTAMP) ? 0 : -1) , _mask2((mode == MODE_MTF) ? 0 : -1) , _shift((mode == MODE_RANK) ? 1 : 0) { if ((mode != MODE_MTF) && (mode != MODE_RANK) && (mode != MODE_TIMESTAMP)) throw std::invalid_argument("Invalid mode parameter"); } SBRT::SBRT(int mode, Context&) : _mask1((mode == MODE_TIMESTAMP) ? 0 : -1) , _mask2((mode == MODE_MTF) ? 0 : -1) , _shift((mode == MODE_RANK) ? 
1 : 0) { if ((mode != MODE_MTF) && (mode != MODE_RANK) && (mode != MODE_TIMESTAMP)) throw std::invalid_argument("Invalid mode parameter"); } bool SBRT::forward(SliceArray& input, SliceArray& output, int count) { if (count == 0) return true; if (!SliceArray::isValid(input)) throw std::invalid_argument("SBRT: Invalid input block"); if (!SliceArray::isValid(output)) throw std::invalid_argument("SBRT: Invalid output block"); // Aliasing const kanzi::byte* src = &input._array[input._index]; kanzi::byte* dst = &output._array[output._index]; int p[256] = { 0 }; int q[256] = { 0 }; uint8 s2r[256]; uint8 r2s[256]; for (int i = 0; i < 256; i++) { s2r[i] = uint8(i); r2s[i] = uint8(i); } for (int i = 0; i < count; i++) { const uint8 c = uint8(src[i]); int r = int(s2r[c]); dst[i] = kanzi::byte(r); const int qc = ((i & _mask1) + (p[c] & _mask2)) >> _shift; p[c] = i; q[c] = qc; // Move up symbol to correct rank while ((r > 0) && (q[r2s[r - 1]] <= qc)) { r2s[r] = r2s[r - 1]; s2r[r2s[r]] = uint8(r); r--; } r2s[r] = c; s2r[c] = uint8(r); } input._index += count; output._index += count; return true; } bool SBRT::inverse(SliceArray& input, SliceArray& output, int count) { if (count == 0) return true; if (!SliceArray::isValid(input)) throw std::invalid_argument("SBRT: Invalid input block"); if (!SliceArray::isValid(output)) throw std::invalid_argument("SBRT: Invalid output block"); // Aliasing const kanzi::byte* src = &input._array[input._index]; kanzi::byte* dst = &output._array[output._index]; int p[256] = { 0 }; int q[256] = { 0 }; uint8 r2s[256]; for (int i = 0; i < 256; i++) r2s[i] = uint8(i); for (int i = 0; i < count; i++) { int r = int(src[i]); const int c = int(r2s[r]); dst[i] = kanzi::byte(r2s[r]); const int qc = ((i & _mask1) + (p[c] & _mask2)) >> _shift; p[c] = i; q[c] = qc; // Move up symbol to correct rank while ((r > 0) && (q[r2s[r - 1]] <= qc)) { r2s[r] = r2s[r - 1]; r--; } r2s[r] = uint8(c); } input._index += count; output._index += count; return true; } 
kanzi-cpp-2.5.2/src/transform/SBRT.hpp000066400000000000000000000036011516423635400175120ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #pragma once #ifndef knz_SBRT #define knz_SBRT #include "../Context.hpp" #include "../Transform.hpp" namespace kanzi { // Sort by Rank Transform is a family of transforms typically used after // a BWT to reduce the variance of the data prior to entropy coding. // SBR(alpha) is defined by sbr(x, alpha) = (1-alpha)*(t-w1(x,t)) + alpha*(t-w2(x,t)) // where x is an item in the data list, t is the current access time and wk(x,t) is // the k-th access time to x at time t (with 0 <= alpha <= 1). // See [Two new families of list update algorithms] by Frank Schulz for details. // SBR(0)= Move to Front Transform // SBR(1)= Time Stamp Transform // This code implements SBR(0), SBR(1/2) and SBR(1). 
Code derived from openBWT class SBRT FINAL : public Transform { public: static const int MODE_MTF; static const int MODE_RANK; static const int MODE_TIMESTAMP; SBRT(int mode); SBRT(int mode, Context&); ~SBRT() {} bool forward(SliceArray& input, SliceArray& output, int length); bool inverse(SliceArray& input, SliceArray& output, int length); int getMaxEncodedLength(int srcLen) const { return srcLen; } private: const int _mask1; const int _mask2; const int _shift; }; } #endif kanzi-cpp-2.5.2/src/transform/SRT.cpp000066400000000000000000000164411516423635400174110ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #include #include #include "SRT.hpp" using namespace kanzi; bool SRT::forward(SliceArray& input, SliceArray& output, int length) { if (length == 0) return true; if (!SliceArray::isValid(input)) throw std::invalid_argument("SRT: Invalid input block"); if (!SliceArray::isValid(output)) throw std::invalid_argument("SRT: Invalid output block"); if (output._length - output._index < getMaxEncodedLength(length)) return false; uint freqs[256] = { 0 }; uint8 s2r[256] = { 0 }; uint8 r2s[256] = { 0 }; const kanzi::byte* src = &input._array[input._index]; // find first symbols and count occurrences for (int i = 0, b = 0; i < length;) { uint8 c = uint8(src[i]); int j = i + 1; while ((j < length) && (src[j] == kanzi::byte(c))) j++; if (freqs[c] == 0) { r2s[b] = c; s2r[c] = uint8(b); b++; } freqs[c] += (j - i); i = j; } // init arrays uint8 symbols[256]; int buckets[256] = { 0 }; const int nbSymbols = preprocess(freqs, symbols); for (int i = 0, bucketPos = 0; i < nbSymbols; i++) { const uint8 c = symbols[i]; buckets[c] = bucketPos; bucketPos += freqs[c]; } const int headerSize = encodeHeader(freqs, &output._array[output._index]); output._index += headerSize; kanzi::byte* dst = &output._array[output._index]; // encoding for (int i = 0; i < length;) { uint8 c = uint8(src[i]); int r = s2r[c]; int p = buckets[c]; dst[p] = kanzi::byte(r); p++; if (r != 0) { do { const uint8 t = r2s[r - 1]; r2s[r] = t; s2r[t] = uint8(r); r--; } while (r != 0); r2s[0] = c; s2r[c] = 0; } i++; while ((i < length) && (src[i] == kanzi::byte(c))) { dst[p] = kanzi::byte(0); p++; i++; } buckets[c] = p; } input._index += length; output._index += length; return true; } bool SRT::inverse(SliceArray& input, SliceArray& output, int length) { if (length == 0) return true; if (!SliceArray::isValid(input)) throw std::invalid_argument("SRT: Invalid input block"); if (!SliceArray::isValid(output)) throw std::invalid_argument("SRT: Invalid output block"); if ((length < 256) || (input._index + length > input._length)) 
return false; uint freqs[256] = { 0 }; const int headerSize = decodeHeader(&input._array[input._index], length, freqs); if (headerSize < 0) return false; input._index += headerSize; length -= headerSize; if (length < 0) return false; if (length > output._length - output._index) return false; const kanzi::byte* src = &input._array[input._index]; uint8 symbols[256] = { 0 }; // init arrays int nbSymbols = preprocess(freqs, symbols); int buckets[256] = { 0 }; int bucketEnds[256] = { 0 }; uint8 r2s[256] = { 0 }; for (int i = 0, bucketPos = 0; i < nbSymbols; i++) { const uint8 c = symbols[i]; if ((bucketPos < 0) || (bucketPos >= length)) return false; r2s[int(src[bucketPos])] = c; buckets[c] = bucketPos + 1; bucketPos += freqs[c]; bucketEnds[c] = bucketPos; } uint8 c = r2s[0]; kanzi::byte* dst = &output._array[output._index]; // decoding for (int i = 0; i < length; i++) { dst[i] = kanzi::byte(c); if (buckets[c] < bucketEnds[c]) { const uint8 r = uint8(src[buckets[c]]); buckets[c]++; if (r == 0) continue; if (r <= 8) { if (r >= 1) r2s[0] = r2s[1]; if (r >= 2) r2s[1] = r2s[2]; if (r >= 3) r2s[2] = r2s[3]; if (r >= 4) r2s[3] = r2s[4]; if (r >= 5) r2s[4] = r2s[5]; if (r >= 6) r2s[5] = r2s[6]; if (r >= 7) r2s[6] = r2s[7]; if (r >= 8) r2s[7] = r2s[8]; } else { memmove(&r2s[0], &r2s[1], size_t(r)); } r2s[r] = c; c = r2s[0]; } else { if (nbSymbols == 1) continue; nbSymbols--; memmove(&r2s[0], &r2s[1], size_t(nbSymbols)); c = r2s[0]; } } input._index += length; output._index += length; return true; } int SRT::preprocess(const uint freqs[], uint8 symbols[]) { int nbSymbols = 0; for (int i = 0; i < 256; i++) { if (freqs[i] == 0) continue; symbols[nbSymbols] = uint8(i); nbSymbols++; } int h = 4; while (h < nbSymbols) h = h * 3 + 1; do { h /= 3; for (int i = h; i < nbSymbols; i++) { uint8 t = symbols[i]; int b; for (b = i - h; b >= 0; b -= h) { const int val = freqs[symbols[b]] - freqs[t]; if ((val >= 0) && ((val != 0) || (t >= symbols[b]))) break; symbols[b + h] = symbols[b]; } 
symbols[b + h] = t; } } while (h != 1); return nbSymbols; } int SRT::encodeHeader(const uint freqs[], kanzi::byte dst[]) { int dstIdx = 0; for (int i = 0; i < 256; i++) { uint f = freqs[i]; if (f >= 128) { dst[dstIdx++] = kanzi::byte(0x80 | f); f >>= 7; if (f >= 128) { dst[dstIdx++] = kanzi::byte(0x80 | f); f >>= 7; if (f >= 128) { dst[dstIdx++] = kanzi::byte(0x80 | f); f >>= 7; if (f >= 128) { dst[dstIdx++] = kanzi::byte(0x80 | f); f >>= 7; } } } } dst[dstIdx++] = kanzi::byte(f); } return dstIdx; } int SRT::decodeHeader(const kanzi::byte src[], int srcEnd, uint freqs[]) { int srcIdx = 0; for (int i = 0; i < 256; i++) { uint res = 0; int shift = 0; // Frequencies are encoded as varints with up to 5 bytes. for (int j = 0; j < 5; j++) { if (srcIdx >= srcEnd) return -1; const uint val = uint(src[srcIdx++]); res |= ((val & 0x7F) << shift); if ((val & 0x80) == 0) break; if (j == 4) return -1; shift += 7; } freqs[i] = res; } return srcIdx; } kanzi-cpp-2.5.2/src/transform/SRT.hpp000066400000000000000000000026361516423635400174170ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #pragma once #ifndef knz_SRT #define knz_SRT #include "../Context.hpp" #include "../Transform.hpp" namespace kanzi { // Sorted Rank Transform is typically used after a BWT to reduce the variance // of the data prior to entropy coding. 
class SRT FINAL : public Transform { public: SRT() {} SRT(Context&) {} ~SRT() {} bool forward(SliceArray& pSrc, SliceArray& pDst, int length); bool inverse(SliceArray& pSrc, SliceArray& pDst, int length); int getMaxEncodedLength(int srcLen) const { return srcLen + 1024 /* max header size */; } private: static int preprocess(const uint freqs[], uint8 symbols[]); static int encodeHeader(const uint freqs[], byte dst[]); static int decodeHeader(const byte src[], int srcEnd, uint freqs[]); }; } #endif kanzi-cpp-2.5.2/src/transform/TextCodec.cpp000066400000000000000000001504601516423635400206230ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #include #include #include "TextCodec.hpp" #include "../Global.hpp" #include "../Magic.hpp" using namespace kanzi; using namespace std; const int TextCodec::MAX_DICT_SIZE = 1 << 19; // must be less than 1<<24 const int TextCodec::MAX_WORD_LENGTH = 31; // must be less than 128 const int TextCodec::MIN_BLOCK_SIZE = 1024; const int TextCodec::MAX_BLOCK_SIZE = 1 << 30; // 1 GB const kanzi::byte TextCodec::ESCAPE_TOKEN1 = kanzi::byte(0x0F); // dictionary word preceded by space symbol const kanzi::byte TextCodec::ESCAPE_TOKEN2 = kanzi::byte(0x0E); // toggle upper/lower case of first word char const kanzi::byte TextCodec::MASK_1F = kanzi::byte(0x1F); const kanzi::byte TextCodec::MASK_3F = kanzi::byte(0x3F); const kanzi::byte TextCodec::MASK_20 = kanzi::byte(0x20); const kanzi::byte TextCodec::MASK_40 = kanzi::byte(0x40); const kanzi::byte TextCodec::MASK_80 = kanzi::byte(0x80); const kanzi::byte TextCodec::MASK_FLIP_CASE = kanzi::byte(0x80); const int TextCodec::HASH1 = 0x7FEB352D; const int TextCodec::HASH2 = 0x846CA68B; const kanzi::byte TextCodec::CR = kanzi::byte(0x0D); const kanzi::byte TextCodec::LF = kanzi::byte(0x0A); const kanzi::byte TextCodec::SP = kanzi::byte(0x20); const int TextCodec::THRESHOLD1 = 128; const int TextCodec::THRESHOLD2 = TextCodec::THRESHOLD1 * TextCodec::THRESHOLD1; const int TextCodec::THRESHOLD3 = 64; const int TextCodec::THRESHOLD4 = TextCodec::THRESHOLD3 * 128; const int TextCodec::LOG_HASHES_SIZE = 24; // 16 MB const kanzi::byte TextCodec::MASK_NOT_TEXT = kanzi::byte(0x80); const kanzi::byte TextCodec::MASK_CRLF = kanzi::byte(0x40); const kanzi::byte TextCodec::MASK_XML_HTML = kanzi::byte(0x20); const kanzi::byte TextCodec::MASK_DT = kanzi::byte(0x0F); const int TextCodec::MASK_LENGTH = 0x0007FFFF; // 19 bits // 1024 of the most common English words with at least 2 chars. 
char TextCodec::DICT_EN_1024[] = "TheBeAndOfInToWithItThatForYouHeHaveOnSaidSayAtButWeByHadTheyAsW\ ouldWhoOrCanMayDoThisWasIsMuchAnyFromNotSheWhatTheirWhichGetGive\ HasAreHimHerComeMyOurWereWillSomeBecauseThereThroughTellWhenWork\ ThemYetUpOwnOutIntoJustCouldOverOldThinkDayWayThanLikeOtherHowTh\ enItsPeopleTwoMoreTheseBeenNowWantFirstNewUseSeeTimeManManyThing\ MakeHereWellOnlyHisVeryAfterWithoutAnotherNoAllBelieveBeforeOffT\ houghSoAgainstWhileLastTooDownTodaySameBackTakeEachDifferentWher\ eBetweenThoseEvenSeenUnderAboutOneAlsoFactMustActuallyPreventExp\ ectContainConcernIfSchoolYearGoingCannotDueEverTowardGirlFirmGla\ ssGasKeepWorldStillWentShouldSpendStageDoctorMightJobGoContinueE\ veryoneNeverAnswerFewMeanDifferenceTendNeedLeaveTryNiceHoldSomet\ hingAskWarmLipCoverIssueHappenTurnLookSureDiscoverFightMadDirect\ ionAgreeSomeoneFailRespectNoticeChoiceBeginThreeSystemLevelFeelM\ eetCompanyBoxShowPlayLiveLetterEggNumberOpenProblemFatHandMeasur\ eQuestionCallRememberCertainPutNextChairStartRunRaiseGoalReallyH\ omeTeaCandidateMoneyBusinessYoungGoodCourtFindKnowKindHelpNightC\ hildLotYourUsEyeYesWordBitVanMonthHalfLowMillionHighOrganization\ RedGreenBlueWhiteBlackYourselfEightBothLittleHouseLetDespiteProv\ ideServiceHimselfFriendDescribeFatherDevelopmentAwayKillTripHour\ GameOftenPlantPlaceEndAmongSinceStandDesignParticularSuddenlyMem\ berPayLawBookSilenceAlmostIncludeAgainEitherToolFourOnceLeastExp\ lainIdentifyUntilSiteMinuteCoupleWeekMatterBringDetailInformatio\ nNothingAnythingEverythingAgoLeadSometimesUnderstandWhetherNatur\ eTogetherFollowParentStopIndeedDifficultPublicAlreadySpeakMainta\ inRemainHearAllowMediaOfficeBenefitDoorHugPersonLaterDuringWarHi\ storyArgueWithinSetArticleStationMorningWalkEventWinChooseBehavi\ orShootFireFoodTitleAroundAirTeacherGapSubjectEnoughProveAcrossA\ lthoughHeadFootSecondBoyMainLieAbleCivilTableLoveProcessOfferStu\ dentConsiderAppearStudyBuyNearlyHumanEvidenceTextMethodIncluding\ 
SendRealizeSenseBuildControlAudienceSeveralCutCollegeInterestSuc\ cessSpecialRiskExperienceBehindBetterResultTreatFiveRelationship\ AnimalImproveHairStayTopReducePerhapsLateWriterPickElseSignifica\ ntChanceHotelGeneralRockRequireAlongFitThemselvesReportCondition\ ReachTruthEffortDecideRateEducationForceGardenDrugLeaderVoiceQui\ teWholeSeemMindFinallySirReturnFreeStoryRespondPushAccordingBrot\ herLearnSonHopeDevelopFeelingReadCarryDiseaseRoadVariousBallCase\ OperationCloseVisitReceiveBuildingValueResearchFullModelJoinSeas\ onKnownDirectorPositionPlayerSportErrorRecordRowDataPaperTheoryS\ paceEveryFormSupportActionOfficialWhoseIdeaHappyHeartBestTeamPro\ jectHitBaseRepresentTownPullBusMapDryMomCatDadRoomSmileFieldImpa\ ctFundLargeDogHugePrepareEnvironmentalProduceHerselfTeachOilSuch\ SituationTieCostIndustrySkinStreetImageItselfPhonePriceWearMostS\ unSoonClearPracticePieceWaitRecentImportantProductLeftWallSeries\ NewsShareMovieKidNorSimplyWifeOntoCatchMyselfFineComputerSongAtt\ entionDrawFilmRepublicanSecurityScoreTestStockPositiveCauseCentu\ ryWindowMemoryExistListenStraightCultureBillionFormerDecisionEne\ rgyMoveSummerWonderRelateAvailableLineLikelyOutsideShotShortCoun\ tryRoleAreaSingleRuleDaughterMarketIndicatePresentLandCampaignMa\ terialPopulationEconomyMedicalHospitalChurchGroundThousandAuthor\ ityInsteadRecentlyFutureWrongInvolveLifeHeightIncreaseRightBankC\ ulturalCertainlyWestExecutiveBoardSeekLongOfficerStatementRestBa\ yDealWorkerResourceThrowForwardPolicyScienceEyesBedItemWeaponFil\ lPlanMilitaryGunHotHeatAddressColdFocusForeignTreatmentBloodUpon\ CourseThirdWatchAffectEarlyStoreThusSoundEverywhereBabyAdministr\ ationMouthPageEnterProbablyPointSeatNaturalRaceFarChallengePassA\ pplyMailUsuallyMixToughClearlyGrowFactorStateLocalGuyEastSaveSou\ thSceneMotherCareerQuicklyCentralFaceIceAboveBeyondPictureNetwor\ kManagementIndividualWomanSizeSpeedBusySeriousOccurAddReadySignC\ ollectionListApproachChargeQualityPressureVoteNotePartRealWebCur\ 
rentDetermineTrueSadWhateverBreakWorryCupParticularlyAmountAbili\ tyEatRecognizeSitCharacterSomebodyLossDegreeEffectAttackStaffMid\ dleTelevisionWhyLegalCapitalTradeElectionEverybodyDropMajorViewS\ tandardBillEmployeeDiscussionOpportunityAnalysisTenSuggestLawyer\ HusbandSectionBecomeSkillSisterStyleCrimeProgramCompareCapMissBa\ dSortTrainingEasyNearRegionStrategyPurposePerformTechnologyEcono\ micBudgetExampleCheckEnvironmentDoneDarkTermRatherLaughGuessCarL\ owerHangPastSocialForgetHundredRemoveManagerEnjoyExactlyDieFinal\ MaybeHealthFloorChangeAmericanPoorFunEstablishTrialSpringDinnerB\ igThankProtectAvoidImagineTonightStarArmFinishMusicOwnerCryArtPr\ ivateOthersSimplePopularReflectEspeciallySmallLightMessageStepKe\ yPeaceProgressMadeSideGreatFixInterviewManageNationalFishLoseCam\ eraDiscussEqualWeightPerformanceSevenWaterProductionPersonalCell\ PowerEveningColorInsideBarUnitLessAdultWideRangeMentionDeepEdgeS\ trongHardTroubleNecessarySafeCommonFearFamilySeaDreamConferenceR\ eplyPropertyMeetingAlwaysStuffAgencyDeathGrowthSellSoldierActHea\ vyWetBagMarriageDeadSingRiseDecadeWhomFigurePoliceBodyMachineCat\ egoryAheadFrontCareOrderRealityPartnerYardBeatViolenceTotalDefen\ seWriteConsumerCenterGroupThoughtModernTaskCoachReasonAgeFingerS\ pecificConnectionWishResponsePrettyMovementCardLogNumberSumTreeE\ ntireCitizenThroughoutPetSimilarVictimNewspaperThreatClassShakeS\ ourceAccountPainFallRichPossibleAcceptSolidTravelTalkSaidCreateN\ onePlentyPeriodDefineNormalRevealDrinkAuthorServeNameMomentAgent\ DocumentActivityAnywayAfraidTypeActiveTrainInterestingRadioDange\ rGenerationLeafCopyMatchClaimAnyoneSoftwarePartyDeviceCodeLangua\ geLinkHoweverConfirmCommentCityAnywhereSomewhereDebateDriveHighe\ rBeautifulOnlineFanPriorityTraditionalSixUnited"; DictEntry TextCodec::STATIC_DICTIONARY[1024] = {}; int8 TextCodec::CHAR_TYPE[256] = {}; const bool TextCodec::INIT = TextCodec::init(TextCodec::CHAR_TYPE); const int TextCodec::STATIC_DICT_WORDS = 
TextCodec::createDictionary(DICT_EN_1024, sizeof(DICT_EN_1024), STATIC_DICTIONARY, 1024, 0); bool TextCodec::init(int8 cType[256]) { for (int i = 0; i < 256; i++) { if ((i >= ' ') && (i <= '/')) // [ !"#$%&'()*+,-./] cType[i] = 1; else if ((i >= ':') && (i <= '?')) // [:;<=>?] cType[i] = 1; else { switch (i) { case '\n': case '\r': case '\t': case '_': case '|': case '{': case '}': case '[': case ']': cType[i] = 1; break; default: cType[i] = (isUpperCase(byte(i)) || isLowerCase(byte(i))) == true ? 0 : -1; } } } return true; } // Create dictionary from array of words int TextCodec::createDictionary(char words[], int dictSize, DictEntry dict[], int maxWords, int startWord) { int delimAnchor = 0; uint h = HASH1; int nbWords = startWord; byte* src = reinterpret_cast(words); for (int i = 0; ((i < dictSize) && (nbWords < maxWords)); i++) { if (isText(src[i]) == false) continue; if (isUpperCase(src[i])) { if (i > delimAnchor) { dict[nbWords] = DictEntry(&src[delimAnchor], h, nbWords, i - delimAnchor); nbWords++; delimAnchor = i; h = HASH1; } src[i] ^= byte(0x20); } h = h * HASH1 ^ uint(src[i]) * HASH2; } if (nbWords < maxWords) { dict[nbWords] = DictEntry(&src[delimAnchor], h, nbWords, dictSize - 1 - delimAnchor); nbWords++; } return nbWords; } // Analyze the block and return an 8-bit status (see MASK flags constants) // The goal is to detect text data amenable to pre-processing. byte TextCodec::computeStats(const byte block[], int count, uint freqs0[], bool strict) { if ((strict == false) && (Magic::getType(block) != Magic::NO_MAGIC)) { // This is going to fail if the block is not the first of the file. // But this is a cheap test, good enough for fast mode. 
return TextCodec::MASK_NOT_TEXT; } uint* freqs1 = new uint[65536]; memset(&freqs1[0], 0, 65536 * sizeof(uint)); uint f0[256] = { 0 }; uint f1[256] = { 0 }; uint f3[256] = { 0 }; uint f2[256] = { 0 }; uint8 prv = 0; const uint8* data = reinterpret_cast(&block[0]); const int count4 = count & -4; // Unroll loop for (int i = 0; i < count4; i += 4) { const uint8 cur0 = data[i]; const uint8 cur1 = data[i + 1]; const uint8 cur2 = data[i + 2]; const uint8 cur3 = data[i + 3]; f0[cur0]++; f1[cur1]++; f2[cur2]++; f3[cur3]++; freqs1[(prv * 256) + cur0]++; freqs1[(cur0 * 256) + cur1]++; freqs1[(cur1 * 256) + cur2]++; freqs1[(cur2 * 256) + cur3]++; prv = cur3; } for (int i = count4; i < count; i++) { freqs0[data[i]]++; freqs1[(prv * 256) + data[i]]++; prv = data[i]; } for (int i = 0; i < 256; i++) { freqs0[i] += (f0[i] + f1[i] + f2[i] + f3[i]); } const int cr = int(CR); const int lf = int(LF); int nbTextChars = freqs0[cr] + freqs0[lf]; int nbASCII = 0; for (int i = 0; i < 128; i++) { if (isText(byte(i)) == true) nbTextChars += freqs0[i]; nbASCII += freqs0[i]; } // Not text (crude thresholds) const int nbBinChars = count - nbASCII; bool notText = nbBinChars > (count >> 2); if (notText == false) { notText = nbTextChars < (count >> 2); if (strict == true) { notText |= ((freqs0[0] >= uint(count / 100)) || ((nbASCII / 95) < (count / 100))); } else { notText |= (freqs0[32] < uint(count / 50)); } } byte res = byte(0); if (notText == true) { res = detectType(freqs0, freqs1, count); delete[] freqs1; return res; } if (nbBinChars <= count - count / 10) { // Check if likely XML/HTML // Another crude test: check that the frequencies of < and > are similar // and 'high enough'. Also check it is worth to attempt replacing ampersand sequences. // Getting this flag wrong results in a very small compression speed degradation. 
const int f60 = freqs0[60]; // '<'
        const int f62 = freqs0[62]; // '>'
        // Number of "&a", "&g", "&l" and "&q" byte pairs (start of the usual ampersand sequences)
        const int f38 = freqs1[38 * 256 + 97] + freqs1[38 * 256 + 103] + freqs1[38 * 256 + 108] + freqs1[38 * 256 + 113]; // '&a', '&g', '&l', '&q'
        // Minimum count required for '<' and '>': 1/512 of the ASCII bytes, at least 2
        const int minFreq = max((count - nbBinChars) >> 9, 2);

        if ((f60 >= minFreq) && (f62 >= minFreq) && (f38 > 0)) {
            // Set the flag when the smaller of the two counts is within 1% of the bigger one
            if (f60 < f62) {
                if (f60 >= (f62 - f62 / 100))
                    res |= TextCodec::MASK_XML_HTML;
            }
            else if (f62 < f60) {
                if (f62 >= (f60 - f60 / 100))
                    res |= TextCodec::MASK_XML_HTML;
            }
            else {
                res |= TextCodec::MASK_XML_HTML;
            }
        }
    }

    // Check CR+LF matches
    // The flag is set when CR and LF have the same non-zero count, then cleared
    // as soon as a byte pair shows a CR not followed by LF or a LF not preceded by CR.
    if ((freqs0[cr] != 0) && (freqs0[cr] == freqs0[lf])) {
        res |= TextCodec::MASK_CRLF;

        for (int i = 0; i < 256; i++) {
            if ((i != lf) && (freqs1[(cr * 256) + i]) != 0) {
                res &= ~TextCodec::MASK_CRLF;
                break;
            }

            if ((i != cr) && (freqs1[(i * 256) + lf]) != 0) {
                res &= ~TextCodec::MASK_CRLF;
                break;
            }
        }
    }

    delete[] freqs1;
    return res;
}

// Classify a block already identified as not being text.
// freqs0 holds the byte frequencies and freqs1 the frequencies of byte pairs
// (indexed as first * 256 + second). The returned value always contains
// MASK_NOT_TEXT, combined with the type reported by Global::detectSimpleType
// when there is one, or with Global::UTF8 when no invalid UTF-8 lead byte or
// invalid (lead byte, second byte) pair was counted and enough continuation
// bytes are present.
byte TextCodec::detectType(const uint freqs0[], const uint freqs1[], int count)
{
    Global::DataType dt = Global::detectSimpleType(count, freqs0);

    if (dt != Global::UNDEFINED)
        return TextCodec::MASK_NOT_TEXT | byte(dt);

    // Valid UTF-8 sequences
    // See Unicode 16 Standard - Table 3-7 (Well-Formed UTF-8 Byte Sequences)
    // U+0000..U+007F          00..7F
    // U+0080..U+07FF          C2..DF 80..BF
    // U+0800..U+0FFF          E0     A0..BF 80..BF
    // U+1000..U+CFFF          E1..EC 80..BF 80..BF
    // U+D000..U+D7FF          ED     80..9F 80..BF
    // U+E000..U+FFFF          EE..EF 80..BF 80..BF
    // U+10000..U+3FFFF        F0     90..BF 80..BF 80..BF
    // U+40000..U+FFFFF        F1..F3 80..BF 80..BF 80..BF
    // U+100000..U+10FFFF      F4     80..8F 80..BF 80..BF

    // Check rules for 1 byte
    // C0, C1 and F5..FF never appear in the table above
    uint sum = freqs0[0xC0] + freqs0[0xC1];
    uint sum2 = 0;
    bool res = true;

    for (int i = 0xF5; i <= 0xFF; i++)
        sum += freqs0[i];

    if (sum != 0) {
        res = false;
        goto end;
    }

    // Check rules for first 2 bytes
    // 'sum' accumulates the count of every invalid (lead byte, second byte) pair
    for (int i = 0; i < 256; i++) {
        // Exclude < 0xE0A0 || > 0xE0BF
        if ((i < 0xA0) || (i > 0xBF))
            sum += freqs1[0xE0 * 256 + i];

        // Exclude < 0xED80 || > 0xED9F
        if ((i < 0x80) || (i > 0x9F))
            sum += freqs1[0xED * 256 + i];

        // Exclude < 0xF090 || >
0xF0BF if ((i < 0x90) || (i > 0xBF)) sum += freqs1[0xF0 * 256 + i]; // Exclude < 0xF480 || > 0xF48F if ((i < 0x80) || (i > 0x8F)) sum += freqs1[0xF4 * 256 + i]; if ((i < 0x80) || (i > 0xBF)) { // Exclude < 0x??80 || > 0x??BF with ?? in [C2..DF] for (int j = 0xC2; j <= 0xDF; j++) sum += freqs1[j * 256 + i]; // Exclude < 0x??80 || > 0x??BF with ?? in [E1..EC] for (int j = 0xE1; j <= 0xEC; j++) sum += freqs1[j * 256 + i]; // Exclude < 0x??80 || > 0x??BF with ?? in [F1..F3] sum += freqs1[0xF1 * 256 + i]; sum += freqs1[0xF2 * 256 + i]; sum += freqs1[0xF3 * 256 + i]; // Exclude < 0xEE80 || > 0xEEBF sum += freqs1[0xEE * 256 + i]; // Exclude < 0xEF80 || > 0xEFBF sum += freqs1[0xEF * 256 + i]; } else { // Count non-primary bytes sum2 += freqs0[i]; } if (sum != 0) { res = false; break; } } end: // Ad-hoc threshold res &= (sum2 >= uint(count / 8)); return res == true ? TextCodec::MASK_NOT_TEXT | byte(Global::UTF8) : TextCodec::MASK_NOT_TEXT; } TextCodec::TextCodec() { _delegate = new TextCodec1(); } TextCodec::TextCodec(Context& ctx) { int encodingType = ctx.getInt("textcodec", 1); _delegate = (encodingType == 1) ? static_cast*>(new TextCodec1(ctx)) : static_cast*>(new TextCodec2(ctx)); } bool TextCodec::forward(SliceArray& input, SliceArray& output, int count) { if (count == 0) return true; if ((count < MIN_BLOCK_SIZE) || (count > MAX_BLOCK_SIZE)) return false; if (!SliceArray::isValid(input)) throw invalid_argument("TextCodec: Invalid input block"); if (!SliceArray::isValid(output)) throw invalid_argument("TextCodec: Invalid output block"); if (input._array == output._array) return false; return _delegate->forward(input, output, count); } bool TextCodec::inverse(SliceArray& input, SliceArray& output, int count) { if (count == 0) return true; if (count > MAX_BLOCK_SIZE) // ! 
no min return false; if (!SliceArray::isValid(input)) throw invalid_argument("TextCodec: Invalid input block"); if (!SliceArray::isValid(output)) throw invalid_argument("TextCodec: Invalid output block"); if (input._array == output._array) return false; if ((count < 2) || (input._index + count > input._length)) return false; return _delegate->inverse(input, output, count); } TextCodec1::TextCodec1() { _logHashSize = TextCodec::LOG_HASHES_SIZE; _dictSize = 1 << 13; _dictMap = nullptr; _dictList = nullptr; _hashMask = (1 << _logHashSize) - 1; _staticDictSize = TextCodec::STATIC_DICT_WORDS; _isCRLF = false; _escapes[0] = TextCodec::ESCAPE_TOKEN2; _escapes[1] = TextCodec::ESCAPE_TOKEN1; _pCtx = nullptr; } TextCodec1::TextCodec1(Context& ctx) { // Actual block size const int blockSize = ctx.getInt("blockSize", 0); const int log = blockSize >= 8 ? max(min(Global::log2(uint32(blockSize / 8)), 26), 13) : 13; _logHashSize = ctx.getString("entropy") == "TPAQX" ? log + 1 : log; _dictSize = 1 << 13; _dictMap = nullptr; _dictList = nullptr; _hashMask = (1 << _logHashSize) - 1; _staticDictSize = TextCodec::STATIC_DICT_WORDS; _isCRLF = false; _escapes[0] = TextCodec::ESCAPE_TOKEN2; _escapes[1] = TextCodec::ESCAPE_TOKEN1; _pCtx = &ctx; } void TextCodec1::reset(int count) { // Select an appropriate initial dictionary size const int log = count < 1024 ? 
13 : max(min(Global::log2(uint32(count / 128)), 18), 13); _dictSize = max(TextCodec::STATIC_DICT_WORDS + 2, 1 << log); const int mapSize = 1 << _logHashSize; if (_dictMap == nullptr) _dictMap = new DictEntry*[mapSize]; for (int i = 0; i < mapSize; i++) _dictMap[i] = nullptr; if (_dictList == nullptr) { _dictList = new DictEntry[_dictSize]; #if __cplusplus >= 201103L memcpy(&_dictList[0], &TextCodec::STATIC_DICTIONARY[0], sizeof(TextCodec::STATIC_DICTIONARY)); #else for (int i = 0; i < TextCodec::STATIC_DICT_WORDS; i++) _dictList[i] = TextCodec::STATIC_DICTIONARY[i]; #endif // Add special entries at end of static dictionary _staticDictSize = TextCodec::STATIC_DICT_WORDS; _dictList[_staticDictSize] = DictEntry(&_escapes[0], 0, _staticDictSize, 1); _dictList[_staticDictSize + 1] = DictEntry(&_escapes[1], 0, _staticDictSize + 1, 1); _staticDictSize += 2; } for (int i = 0; i < _staticDictSize; i++) _dictMap[_dictList[i]._hash & _hashMask] = &_dictList[i]; // Pre-allocate all dictionary entries for (int i = _staticDictSize; i < _dictSize; i++) _dictList[i] = DictEntry(nullptr, 0, i); } bool TextCodec1::forward(SliceArray& input, SliceArray& output, int count) { if (output._length - output._index < getMaxEncodedLength(count)) return false; const byte* src = &input._array[input._index]; byte* dst = &output._array[output._index]; int srcIdx = 0; int dstIdx = 0; if (_pCtx != nullptr) { Global::DataType dt = (Global::DataType) _pCtx->getInt("dataType", Global::UNDEFINED); // Filter out most types. Still check binaries which may contain significant parts of text if ((dt != Global::UNDEFINED) && (dt != Global::TEXT) && (dt != Global::BIN)) return false; } uint freqs[256] = { 0 }; byte mode = TextCodec::computeStats(&src[srcIdx], count, freqs, true); // Not text ? 
if ((mode & TextCodec::MASK_NOT_TEXT) != byte(0)) { if (_pCtx != nullptr) _pCtx->putInt("dataType", Global::DataType(mode & TextCodec::MASK_DT)); return false; } if (_pCtx != nullptr) _pCtx->putInt("dataType", Global::TEXT); reset(count); const int srcEnd = count; const int dstEnd = getMaxEncodedLength(count); const int dstEnd4 = dstEnd - 4; int emitAnchor = 0; // never less than 0 int words = _staticDictSize; // DOS encoded end of line (CR+LF) ? _isCRLF = int(mode & TextCodec::MASK_CRLF) != 0; dst[dstIdx++] = mode; bool res = true; while ((srcIdx < srcEnd) && (src[srcIdx] == TextCodec::SP)) { dst[dstIdx++] = TextCodec::SP; srcIdx++; emitAnchor++; } int delimAnchor = TextCodec::isText(src[srcIdx]) ? srcIdx - 1 : srcIdx; // previous delimiter while (srcIdx < srcEnd) { const int8 cType = TextCodec::getType(src[srcIdx]); if (cType == 0) { srcIdx++; continue; } if ((srcIdx > delimAnchor + 2) && (cType > 0)) { // At least 2 letters const byte val = src[delimAnchor + 1]; const int length = srcIdx - delimAnchor - 1; if (length <= TextCodec::MAX_WORD_LENGTH) { // Compute hashes // h1 -> hash of word chars // h2 -> hash of word chars with first char case flipped uint h1 = TextCodec::HASH1; h1 = h1 * TextCodec::HASH1 ^ uint(val) * TextCodec::HASH2; uint h2 = TextCodec::HASH1; h2 = h2 * TextCodec::HASH1 ^ (uint(val) ^ 0x20) * TextCodec::HASH2; for (int i = delimAnchor + 2; i < srcIdx; i++) { h1 = h1 * TextCodec::HASH1 ^ uint(src[i]) * TextCodec::HASH2; h2 = h2 * TextCodec::HASH1 ^ uint(src[i]) * TextCodec::HASH2; } // Check word in dictionary DictEntry* pe = nullptr; prefetchRead(&_dictMap[h1 & _hashMask]); DictEntry* pe1 = _dictMap[h1 & _hashMask]; if ((pe1 != nullptr) && (pe1->_hash == h1) && ((pe1->_data >> 24) == length)) pe = pe1; else { prefetchRead(&_dictMap[h2 & _hashMask]); DictEntry* pe2 = _dictMap[h2 & _hashMask]; if ((pe2 != nullptr) && (pe2->_hash == h2) && ((pe2->_data >> 24) == length)) pe = pe2; } // Check for hash collisions if ((pe != nullptr) && 
(!TextCodec::sameWords(&pe->_ptr[1], &src[delimAnchor + 2], length - 1))) pe = nullptr; if (pe == nullptr) { // Word not found in the dictionary or hash collision. // Replace entry if not in static dictionary if (((length > 3) || ((length == 3) && (words < TextCodec::THRESHOLD2))) && (pe1 == nullptr)) { DictEntry* pe3 = &_dictList[words]; if ((pe3->_data & TextCodec::MASK_LENGTH) >= _staticDictSize) { // Reuse old entry _dictMap[pe3->_hash & _hashMask] = nullptr; pe3->_ptr = &src[delimAnchor + 1]; pe3->_hash = h1; pe3->_data = (length << 24) | words; } // Update hash map _dictMap[h1 & _hashMask] = pe3; words++; // Dictionary full ? Expand or reset index to end of static dictionary if (words >= _dictSize) { if (expandDictionary() == false) words = _staticDictSize; } } } else { // Word found in the dictionary // Skip space if only delimiter between 2 word references if ((emitAnchor != delimAnchor) || (src[delimAnchor] != byte(' '))) { const int dIdx = emitSymbols(&src[emitAnchor], &dst[dstIdx], delimAnchor + 1 - emitAnchor, dstEnd - dstIdx); if (dIdx < 0) { res = false; break; } dstIdx += dIdx; } if (dstIdx >= dstEnd4) { res = false; break; } dst[dstIdx++] = (pe == pe1) ? 
TextCodec::ESCAPE_TOKEN1 : TextCodec::ESCAPE_TOKEN2; dstIdx += emitWordIndex(&dst[dstIdx], pe->_data & TextCodec::MASK_LENGTH); emitAnchor = delimAnchor + 1 + int(pe->_data >> 24); } } } // Reset delimiter position delimAnchor = srcIdx; srcIdx++; } if (res == true) { // Emit last symbols const int dIdx = emitSymbols(&src[emitAnchor], &dst[dstIdx], srcEnd - emitAnchor, dstEnd - dstIdx); if (dIdx < 0) res = false; else dstIdx += dIdx; res &= (srcIdx == srcEnd); } output._index += dstIdx; input._index += srcIdx; return res; } bool TextCodec1::expandDictionary() { if (_dictSize >= TextCodec::MAX_DICT_SIZE) return false; DictEntry* newDict = new DictEntry[_dictSize * 2]; memcpy(static_cast(&newDict[0]), &_dictList[0], sizeof(DictEntry) * _dictSize); for (int i = _dictSize; i < _dictSize * 2; i++) newDict[i] = DictEntry(nullptr, 0, i); delete[] _dictList; _dictList = newDict; // Reset map (values must point to addresses of new DictEntry items) for (int i = 0; i < _dictSize; i++) { _dictMap[_dictList[i]._hash & _hashMask] = &_dictList[i]; } _dictSize <<= 1; return true; } int TextCodec1::emitSymbols(const byte src[], byte dst[], const int srcEnd, const int dstEnd) const { int dstIdx = 0; for (int i = 0; i < srcEnd; i++) { if (dstIdx >= dstEnd) return -1; // Work around incorrect warning by GCC 7.x.x with C++17 #ifdef __GNUC__ #if (__GNUC__ == 7) && (__cplusplus > 201402L) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wswitch" #endif #endif const byte cur = src[i]; switch (cur) { case TextCodec::ESCAPE_TOKEN1: case TextCodec::ESCAPE_TOKEN2: { // Emit special word dst[dstIdx++] = TextCodec::ESCAPE_TOKEN1; const int idx = (cur == TextCodec::ESCAPE_TOKEN1) ? _staticDictSize - 1 : _staticDictSize - 2; int lenIdx = 1; if (idx >= TextCodec::THRESHOLD1) lenIdx = (idx >= TextCodec::THRESHOLD2) ? 
3 : 2; if (dstIdx + lenIdx >= dstEnd) return -1; dstIdx += emitWordIndex(&dst[dstIdx], idx); break; } case TextCodec::CR: if (_isCRLF == false) dst[dstIdx++] = cur; break; default: dst[dstIdx++] = cur; } } // Work around incorrect warning by GCC 7.x.x with C++17 #ifdef __GNUC__ #if (__GNUC__ == 7) && (__cplusplus > 201402L) #pragma GCC diagnostic pop #endif #endif return dstIdx; } int TextCodec1::emitWordIndex(byte dst[], int val) { // Emit word index (varint 5 bits + 7 bits + 7 bits) if (val >= TextCodec::THRESHOLD1) { if (val >= TextCodec::THRESHOLD2) { dst[0] = byte(0xE0 | (val >> 14)); dst[1] = byte(0x80 | (val >> 7)); dst[2] = byte(0x7F & val); return 3; } dst[0] = byte(0x80 | (val >> 7)); dst[1] = byte(0x7F & val); return 2; } dst[0] = byte(val); return 1; } bool TextCodec1::inverse(SliceArray& input, SliceArray& output, int count) { if ((count < 2) || (input._index + count > input._length)) return false; reset(output._length); const byte* src = &input._array[input._index]; byte* dst = &output._array[output._index]; _isCRLF = int(src[0] & TextCodec::MASK_CRLF) != 0; const bool isCRLF = _isCRLF; int srcIdx = 1; int dstIdx = 0; const int srcEnd = count; const int dstEnd = output._length - output._index; int delimAnchor = TextCodec::isText(src[srcIdx]) ? 
srcIdx - 1 : srcIdx; // previous delimiter int words = _staticDictSize; bool wordRun = false; bool res = true; while ((srcIdx < srcEnd) && (dstIdx < dstEnd)) { const byte cur = src[srcIdx]; const int8 cType = TextCodec::getType(cur); if (cType == 0) { dst[dstIdx] = src[srcIdx]; srcIdx++; dstIdx++; continue; } if ((srcIdx > delimAnchor + 3) && (cType > 0)) { const int length = srcIdx - delimAnchor - 1; // length > 2 if (length <= TextCodec::MAX_WORD_LENGTH) { uint h1 = TextCodec::HASH1; h1 = h1 * TextCodec::HASH1 ^ uint(src[delimAnchor + 1]) * TextCodec::HASH2; h1 = h1 * TextCodec::HASH1 ^ uint(src[delimAnchor + 2]) * TextCodec::HASH2; for (int i = delimAnchor + 3; i < srcIdx; i++) h1 = h1 * TextCodec::HASH1 ^ uint(src[i]) * TextCodec::HASH2; // Lookup word in dictionary DictEntry* pe = nullptr; DictEntry* pe1 = _dictMap[h1 & _hashMask]; // Check for hash collisions if ((pe1 != nullptr) && (pe1->_hash == h1) && ((pe1->_data >> 24) == length)) { if (TextCodec::sameWords(&pe1->_ptr[1], &src[delimAnchor + 2], length - 1)) pe = pe1; } if (pe == nullptr) { // Word not found in the dictionary or hash collision. // Replace entry if not in static dictionary if (((length > 3) || (words < TextCodec::THRESHOLD2)) && (pe1 == nullptr)) { DictEntry& e = _dictList[words]; if ((e._data & TextCodec::MASK_LENGTH) >= _staticDictSize) { // Reuse old entry _dictMap[e._hash & _hashMask] = nullptr; e._ptr = &src[delimAnchor + 1]; e._hash = h1; e._data = (length << 24) | words; } _dictMap[h1 & _hashMask] = &e; words++; // Dictionary full ? 
Expand or reset index to end of static dictionary if (words >= _dictSize) { if (expandDictionary() == false) words = _staticDictSize; } } } } } srcIdx++; if ((cur == TextCodec::ESCAPE_TOKEN1) || (cur == TextCodec::ESCAPE_TOKEN2)) { // Word in dictionary // Read word index (varint 5 bits + 7 bits + 7 bits) int idx = int(src[srcIdx++]); if (idx >= 128) { const int idx2 = int(src[srcIdx++]); if (idx2 >= 128) { idx = ((idx & 0x1F) << 14) | ((idx2 & 0x7F) << 7) | int(src[srcIdx]); srcIdx++; } else { idx = ((idx & 0x7F) << 7) | idx2; } if (idx >= _dictSize) { res = false; break; } } const int length = (_dictList[idx]._data >> 24) & 0xFF; // Emit word if (length > 1) { // Add space if only delimiter between 2 words (not an escaped delimiter) if (wordRun == true) dst[dstIdx++] = TextCodec::SP; // Regular word entry wordRun = true; delimAnchor = srcIdx; } else { if (length == 0) { res = false; break; } // Escape entry wordRun = false; delimAnchor = srcIdx - 1; } // Sanity check if (dstIdx + length > dstEnd) { res = false; break; } memcpy(&dst[dstIdx], _dictList[idx]._ptr, length); // Flip case of first character ? if (cur == TextCodec::ESCAPE_TOKEN2) dst[dstIdx] ^= byte(0x20); dstIdx += length; } else { wordRun = false; delimAnchor = srcIdx - 1; if ((isCRLF == true) && (cur == TextCodec::LF)) { dst[dstIdx++] = TextCodec::CR; if (dstIdx >= dstEnd) { res = false; break; } } dst[dstIdx++] = cur; } } output._index += dstIdx; input._index += srcIdx; return (res == true) && (srcIdx == srcEnd); } TextCodec2::TextCodec2() { _logHashSize = TextCodec::LOG_HASHES_SIZE; _dictSize = 1 << 13; _dictMap = nullptr; _dictList = nullptr; _hashMask = (1 << _logHashSize) - 1; _staticDictSize = TextCodec::STATIC_DICT_WORDS; _isCRLF = false; _pCtx = nullptr; _bsVersion = 6; } TextCodec2::TextCodec2(Context& ctx) { const int blockSize = ctx.getInt("blockSize", 0); const int log = blockSize >= 32 ? 
max(min(Global::log2(uint32(blockSize / 32)), 24), 13) : 13; _logHashSize = ctx.getString("entropy") == "TPAQX" ? log + 1 : log; _dictSize = 1 << 13; _dictMap = nullptr; _dictList = nullptr; _hashMask = (1 << _logHashSize) - 1; _staticDictSize = TextCodec::STATIC_DICT_WORDS; _isCRLF = false; _pCtx = &ctx; _bsVersion = ctx.getInt("bsVersion"); } void TextCodec2::reset(int count) { // Select an appropriate initial dictionary size const int log = count < 1024 ? 13 : max(min(Global::log2(uint32(count / 128)), 18), 13); _dictSize = max(TextCodec::STATIC_DICT_WORDS, 1 << log); const int mapSize = 1 << _logHashSize; if (_dictMap == nullptr) _dictMap = new DictEntry*[mapSize]; for (int i = 0; i < mapSize; i++) _dictMap[i] = nullptr; if (_dictList == nullptr) { _dictList = new DictEntry[_dictSize]; #if __cplusplus >= 201103L memcpy(&_dictList[0], &TextCodec::STATIC_DICTIONARY[0], sizeof(TextCodec::STATIC_DICTIONARY)); #else for (int i = 0; i < TextCodec::STATIC_DICT_WORDS; i++) _dictList[i] = TextCodec::STATIC_DICTIONARY[i]; #endif } for (int i = 0; i < _staticDictSize; i++) _dictMap[_dictList[i]._hash & _hashMask] = &_dictList[i]; // Pre-allocate all dictionary entries for (int i = _staticDictSize; i < _dictSize; i++) _dictList[i] = DictEntry(nullptr, 0, i); } bool TextCodec2::forward(SliceArray& input, SliceArray& output, int count) { if (output._length - output._index < getMaxEncodedLength(count)) return false; const byte* src = &input._array[input._index]; byte* dst = &output._array[output._index]; if (_pCtx != nullptr) { Global::DataType dt = (Global::DataType) _pCtx->getInt("dataType", Global::UNDEFINED); // Filter out most types. Still check binaries which may contain significant parts of text if ((dt != Global::UNDEFINED) && (dt != Global::TEXT) && (dt != Global::BIN)) return false; } uint freqs[256] = { 0 }; byte mode = TextCodec::computeStats(&src[0], count, freqs, false); // Not text ? 
if ((mode & TextCodec::MASK_NOT_TEXT) != byte(0)) { if (_pCtx != nullptr) _pCtx->putInt("dataType", Global::DataType(mode & TextCodec::MASK_DT)); return false; } if (_pCtx != nullptr) _pCtx->putInt("dataType", Global::TEXT); reset(count); const int srcEnd = count; const int dstEnd = getMaxEncodedLength(count); const int dstEnd3 = dstEnd - 3; int emitAnchor = 0; int words = _staticDictSize; // DOS encoded end of line (CR+LF) ? _isCRLF = (mode & TextCodec::MASK_CRLF) != byte(0); dst[0] = mode; bool res = true; int srcIdx = 0; int dstIdx = 1; while ((srcIdx < srcEnd) && (src[srcIdx] == TextCodec::SP)) { dst[dstIdx++] = TextCodec::SP; srcIdx++; emitAnchor++; } int delimAnchor = TextCodec::isText(src[srcIdx]) ? srcIdx - 1 : srcIdx; // previous delimiter while (srcIdx < srcEnd) { const int8 cType = TextCodec::getType(src[srcIdx]); if (cType == 0) { srcIdx++; continue; } if ((srcIdx > delimAnchor + 2) && (cType > 0)) { const byte val = src[delimAnchor + 1]; const int length = srcIdx - delimAnchor - 1; if (length <= TextCodec::MAX_WORD_LENGTH) { // Compute hashes // h1 -> hash of word chars // h2 -> hash of word chars with first char case flipped uint h1 = TextCodec::HASH1; h1 = h1 * TextCodec::HASH1 ^ uint(val) * TextCodec::HASH2; uint h2 = TextCodec::HASH1; h2 = h2 * TextCodec::HASH1 ^ (uint(val) ^ 0x20) * TextCodec::HASH2; for (int i = delimAnchor + 2; i < srcIdx; i++) { h1 = h1 * TextCodec::HASH1 ^ uint(src[i]) * TextCodec::HASH2; h2 = h2 * TextCodec::HASH1 ^ uint(src[i]) * TextCodec::HASH2; } // Check word in dictionary DictEntry* pe = nullptr; prefetchRead(&_dictMap[h1 & _hashMask]); DictEntry* pe1 = _dictMap[h1 & _hashMask]; if ((pe1 != nullptr) && (pe1->_hash == h1) && ((pe1->_data >> 24) == length)) pe = pe1; else { prefetchRead(&_dictMap[h2 & _hashMask]); DictEntry* pe2 = _dictMap[h2 & _hashMask]; if ((pe2 != nullptr) && (pe2->_hash == h2) && ((pe2->_data >> 24) == length)) pe = pe2; } // Check for hash collisions if ((pe != nullptr) && 
(!TextCodec::sameWords(&pe->_ptr[1], &src[delimAnchor + 2], length - 1))) pe = nullptr; if (pe == nullptr) { // Word not found in the dictionary or hash collision. // Replace entry if not in static dictionary if (((length > 3) || ((length == 3) && (words < TextCodec::THRESHOLD2))) && (pe1 == nullptr)) { DictEntry* pe3 = &_dictList[words]; if ((pe3->_data & TextCodec::MASK_LENGTH) >= _staticDictSize) { // Reuse old entry _dictMap[pe3->_hash & _hashMask] = nullptr; pe3->_ptr = &src[delimAnchor + 1]; pe3->_hash = h1; pe3->_data = (length << 24) | words; } // Update hash map _dictMap[h1 & _hashMask] = pe3; words++; // Dictionary full ? Expand or reset index to end of static dictionary if (words >= _dictSize) { if (expandDictionary() == false) words = _staticDictSize; } } } else { // Word found in the dictionary // Skip space if only delimiter between 2 word references if ((emitAnchor != delimAnchor) || (src[delimAnchor] != TextCodec::SP)) { const int dIdx = emitSymbols(&src[emitAnchor], &dst[dstIdx], delimAnchor + 1 - emitAnchor, dstEnd - dstIdx); if (dIdx < 0) { res = false; break; } dstIdx += dIdx; } if (dstIdx >= dstEnd3) { res = false; break; } // Case flip is encoded as 0x80 dst[dstIdx] = TextCodec::MASK_FLIP_CASE; dstIdx += (pe == pe1 ? 
0 : 1); dstIdx += emitWordIndex(&dst[dstIdx], pe->_data & TextCodec::MASK_LENGTH); emitAnchor = delimAnchor + 1 + (pe->_data >> 24); } } } // Reset delimiter position delimAnchor = srcIdx; srcIdx++; } if (res == true) { // Emit last symbols const int dIdx = emitSymbols(&src[emitAnchor], &dst[dstIdx], srcEnd - emitAnchor, dstEnd - dstIdx); if (dIdx < 0) res = false; else dstIdx += dIdx; res &= (srcIdx == srcEnd); } output._index += dstIdx; input._index += srcIdx; return res; } bool TextCodec2::expandDictionary() { if (_dictSize >= TextCodec::MAX_DICT_SIZE) return false; DictEntry* newDict = new DictEntry[_dictSize * 2]; memcpy(static_cast(&newDict[0]), &_dictList[0], sizeof(DictEntry) * _dictSize); for (int i = _dictSize; i < _dictSize * 2; i++) newDict[i] = DictEntry(nullptr, 0, i); delete[] _dictList; _dictList = newDict; // Reset map (values must point to addresses of new DictEntry items) for (int i = 0; i < _dictSize; i++) { _dictMap[_dictList[i]._hash & _hashMask] = &_dictList[i]; } _dictSize <<= 1; return true; } int TextCodec2::emitSymbols(const byte src[], byte dst[], const int srcEnd, const int dstEnd) const { // Work around incorrect warning by GCC 7.x.x with C++17 #ifdef __GNUC__ #if (__GNUC__ == 7) && (__cplusplus > 201402L) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wswitch" #endif #endif int dstIdx = 0; if (2 * srcEnd < dstEnd) { for (int i = 0; i < srcEnd; i++) { switch (src[i]) { case TextCodec::ESCAPE_TOKEN1: dst[dstIdx++] = TextCodec::ESCAPE_TOKEN1; dst[dstIdx++] = TextCodec::ESCAPE_TOKEN1; break; case TextCodec::CR: if (_isCRLF == false) dst[dstIdx++] = src[i]; break; default: dst[dstIdx] = TextCodec::ESCAPE_TOKEN1; dstIdx += int(src[i] >> 7); dst[dstIdx++] = src[i]; } } } else { for (int i = 0; i < srcEnd; i++) { switch (src[i]) { case TextCodec::ESCAPE_TOKEN1: if (dstIdx >= dstEnd - 1) return -1; dst[dstIdx++] = TextCodec::ESCAPE_TOKEN1; dst[dstIdx++] = TextCodec::ESCAPE_TOKEN1; break; case TextCodec::CR: if (_isCRLF == false) 
{ if (dstIdx >= dstEnd) return -1; dst[dstIdx++] = src[i]; } break; default: if (src[i] >= kanzi::byte(128)) { if (dstIdx >= dstEnd) return -1; dst[dstIdx++] = TextCodec::ESCAPE_TOKEN1; } if (dstIdx >= dstEnd) return -1; dst[dstIdx++] = src[i]; } } } // Work around incorrect warning by GCC 7.x.x with C++17 #ifdef __GNUC__ #if (__GNUC__ == 7) && (__cplusplus > 201402L) #pragma GCC diagnostic pop #endif #endif return dstIdx; } int TextCodec2::emitWordIndex(kanzi::byte dst[], int wIdx) { // 0x80 is reserved to first symbol case flip wIdx++; if (wIdx >= TextCodec::THRESHOLD3) { if (wIdx >= TextCodec::THRESHOLD4) { // 3 kanzi::byte index (1111xxxx xxxxxxxx xxxxxxxx) dst[0] = kanzi::byte(0xF0 | (wIdx >> 16)); dst[1] = kanzi::byte(wIdx >> 8); dst[2] = kanzi::byte(wIdx); return 3; } // 2 kanzi::byte index (110xxxxx xxxxxxxx) dst[0] = kanzi::byte(0xC0 | (wIdx >> 8)); dst[1] = kanzi::byte(wIdx); return 2; } // 1 kanzi::byte index (10xxxxxx) with 0x80 excluded dst[0] = kanzi::byte(0x80 | wIdx); return 1; } bool TextCodec2::inverse(SliceArray& input, SliceArray& output, int count) { if ((count < 2) || (input._index + count > input._length)) return false; reset(output._length); const kanzi::byte* src = &input._array[input._index]; kanzi::byte* dst = &output._array[output._index]; _isCRLF = (src[0] & TextCodec::MASK_CRLF) != kanzi::byte(0); const bool isCRLF = _isCRLF; int srcIdx = 1; int dstIdx = 0; const int srcEnd = count; const int dstEnd = output._length - output._index; int delimAnchor = TextCodec::isText(src[srcIdx]) ? 
srcIdx - 1 : srcIdx; // previous delimiter int words = _staticDictSize; bool wordRun = false; bool res = true; const bool oldEncoding = _bsVersion < 6; while ((srcIdx < srcEnd) && (dstIdx < dstEnd)) { kanzi::byte cur = src[srcIdx]; const int8 cType = TextCodec::getType(cur); if (cType == 0) { dst[dstIdx] = src[srcIdx]; srcIdx++; dstIdx++; continue; } if ((srcIdx > delimAnchor + 3) && (cType > 0)) { const int length = srcIdx - delimAnchor - 1; // length > 2 if (length <= TextCodec::MAX_WORD_LENGTH) { uint h1 = TextCodec::HASH1; h1 = h1 * TextCodec::HASH1 ^ uint(src[delimAnchor + 1]) * TextCodec::HASH2; h1 = h1 * TextCodec::HASH1 ^ uint(src[delimAnchor + 2]) * TextCodec::HASH2; for (int i = delimAnchor + 3; i < srcIdx; i++) h1 = h1 * TextCodec::HASH1 ^ uint(src[i]) * TextCodec::HASH2; // Lookup word in dictionary DictEntry* pe = nullptr; DictEntry* pe1 = _dictMap[h1 & _hashMask]; // Check for hash collisions if ((pe1 != nullptr) && (pe1->_hash == h1) && ((pe1->_data >> 24) == length)) { if (TextCodec::sameWords(&pe1->_ptr[1], &src[delimAnchor + 2], length - 1)) pe = pe1; } if (pe == nullptr) { // Word not found in the dictionary or hash collision. // Replace entry if not in static dictionary if (((length > 3) || (words < TextCodec::THRESHOLD2)) && (pe1 == nullptr)) { DictEntry& e = _dictList[words]; if ((e._data & TextCodec::MASK_LENGTH) >= _staticDictSize) { // Reuse old entry _dictMap[e._hash & _hashMask] = nullptr; e._ptr = &src[delimAnchor + 1]; e._hash = h1; e._data = (length << 24) | words; } _dictMap[h1 & _hashMask] = &e; words++; // Dictionary full ? 
Expand or reset index to end of static dictionary if (words >= _dictSize) { if (expandDictionary() == false) words = _staticDictSize; } } } } } srcIdx++; kanzi::byte flipMask = kanzi::byte(0); if (cur >= TextCodec::MASK_80) { // Word in dictionary int idx; if (oldEncoding == true) { // Read word index (varint 5 bits + 7 bits + 7 bits) flipMask = cur & TextCodec::MASK_20; idx = int(cur & TextCodec::MASK_1F); if ((cur & TextCodec::MASK_40) != kanzi::byte(0)) { const int idx2 = int(src[srcIdx++]); if (idx2 >= 128) { idx = (idx << 14) | ((idx2 & 0x7F) << 7) | int(src[srcIdx]); srcIdx++; } else { idx = (idx << 7) | idx2; } // Sanity check if (idx >= _dictSize) { res = false; break; } } } else { if (cur == TextCodec::MASK_80) { // Flip first char case flipMask = TextCodec::MASK_20; cur = src[srcIdx++]; } // Read word index // 10xxxxxx => 1 kanzi::byte // 110xxxxx => 2 bytes // 1111xxxx => 3 bytes idx = int(cur) & 0x7F; if (idx >= 64) { if (idx >= 112) { idx = ((idx & 0x0F) << 16) | (int(src[srcIdx]) << 8) | int(src[srcIdx + 1]); srcIdx += 2; } else { idx = ((idx & 0x1F) << 8) | int(src[srcIdx]); srcIdx++; } // Sanity check before adjusting index if (idx > _dictSize) { res = false; break; } } else if (idx == 0) { res = false; break; } // Adjust index idx--; } const int length = (_dictList[idx]._data >> 24) & 0xFF; // Emit word if (length > 1) { // Add space if only delimiter between 2 words (not an escaped delimiter) if (wordRun == true) dst[dstIdx++] = TextCodec::SP; // Regular word entry wordRun = true; delimAnchor = srcIdx; } else { if (length == 0) { res = false; break; } // Escape entry wordRun = false; delimAnchor = srcIdx - 1; } // Sanity check if (dstIdx + length > dstEnd) { res = false; break; } memcpy(&dst[dstIdx], _dictList[idx]._ptr, length); // Flip case of first character ? 
dst[dstIdx] ^= flipMask; dstIdx += length; } else { if (cur == TextCodec::ESCAPE_TOKEN1) { dst[dstIdx++] = src[srcIdx++]; } else { if ((isCRLF == true) && (cur == TextCodec::LF)) { dst[dstIdx++] = TextCodec::CR; if (dstIdx >= dstEnd) { res = false; break; } } dst[dstIdx++] = cur; } wordRun = false; delimAnchor = srcIdx - 1; } } output._index += dstIdx; input._index += srcIdx; return (res == true) && (srcIdx == srcEnd); } kanzi-cpp-2.5.2/src/transform/TextCodec.hpp000066400000000000000000000164041516423635400206270ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once #ifndef knz_TextCodec #define knz_TextCodec #include "../Context.hpp" #include "../Transform.hpp" namespace kanzi { class DictEntry FINAL { public: const byte* _ptr; // text data uint _hash; // full word hash int _data; // packed word length (8 MSB) + index in dictionary (24 LSB) DictEntry(); DictEntry(const byte* ptr, int hash, int idx, int length); #if __cplusplus < 201103L DictEntry(const DictEntry& de); DictEntry& operator=(const DictEntry& de); ~DictEntry() {} #else DictEntry(const DictEntry& de) = delete; DictEntry& operator=(const DictEntry& de) = delete; DictEntry(DictEntry&& de) noexcept = default; DictEntry& operator=(DictEntry&& de) noexcept = default; ~DictEntry() noexcept = default; #endif }; // Encode word indexes using a token class TextCodec1 FINAL : public Transform { public: TextCodec1(); TextCodec1(Context&); ~TextCodec1() { if (_dictList != nullptr) delete[] _dictList; if (_dictMap != nullptr) delete[] _dictMap; } bool forward(SliceArray& src, SliceArray& dst, int length); bool inverse(SliceArray& src, SliceArray& dst, int length); // Limit to 1 x srcLength and let the caller deal with // a failure when the output is too small int getMaxEncodedLength(int srcLen) const { return srcLen; } private: DictEntry** _dictMap; DictEntry* _dictList; byte _escapes[2]; int _staticDictSize; int _dictSize; int _logHashSize; int _hashMask; bool _isCRLF; // EOL = CR + LF Context* _pCtx; bool expandDictionary(); void reset(int count); static int emitWordIndex(byte dst[], int val); int emitSymbols(const byte src[], byte dst[], const int srcEnd, const int dstEnd) const; }; // Encode word indexes using a mask (0x80) class TextCodec2 FINAL : public Transform { public: TextCodec2(); TextCodec2(Context&); ~TextCodec2() { if (_dictList != nullptr) delete[] _dictList; if (_dictMap != nullptr) delete[] _dictMap; } bool forward(SliceArray& src, SliceArray& dst, int length); bool inverse(SliceArray& src, SliceArray& dst, int length); // Limit to 1 x 
srcLength and let the caller deal with // a failure when the output is too small int getMaxEncodedLength(int srcLen) const { return srcLen; } private: DictEntry** _dictMap; DictEntry* _dictList; int _staticDictSize; int _dictSize; int _logHashSize; int _hashMask; int _bsVersion; bool _isCRLF; // EOL = CR + LF Context* _pCtx; bool expandDictionary(); void reset(int count); static int emitWordIndex(byte dst[], int val); int emitSymbols(const byte src[], byte dst[], const int srcEnd, const int dstEnd) const; }; // Simple one-pass text codec that replaces words with indexes. // Generates a dynamic dictionary. class TextCodec FINAL : public Transform { friend class TextCodec1; friend class TextCodec2; public: static const int MAX_DICT_SIZE; static const int MAX_WORD_LENGTH; static const int MIN_BLOCK_SIZE; static const int MAX_BLOCK_SIZE; static const byte ESCAPE_TOKEN1; static const byte ESCAPE_TOKEN2; static const byte MASK_1F; static const byte MASK_3F; static const byte MASK_20; static const byte MASK_40; static const byte MASK_80; static const byte MASK_FLIP_CASE; TextCodec(); TextCodec(Context& ctx); ~TextCodec() { delete _delegate; } bool forward(SliceArray& src, SliceArray& dst, int length); bool inverse(SliceArray& src, SliceArray& dst, int length); int getMaxEncodedLength(int srcLen) const { return _delegate->getMaxEncodedLength(srcLen); } static int8 getType(byte val) { return CHAR_TYPE[uint8(val)]; } static bool isText(byte val) { return getType(val) == 0; } static bool isLowerCase(byte val) { return (val >= byte('a')) && (val <= byte('z')); } static bool isUpperCase(byte val) { return (val >= byte('A')) && (val <= byte('Z')); } static bool isDelimiter(byte val) { return getType(val) > 0; } private: static const int HASH1; static const int HASH2; static const byte CR; static const byte LF; static const byte SP; static const int THRESHOLD1; static const int THRESHOLD2; static const int THRESHOLD3; static const int THRESHOLD4; static const int LOG_HASHES_SIZE; 
static const byte MASK_NOT_TEXT; static const byte MASK_CRLF; static const byte MASK_XML_HTML; static const byte MASK_DT; static const int MASK_LENGTH; static bool init(int8 cType[256]); static int8 CHAR_TYPE[256]; static const bool INIT; static bool sameWords(const byte src[], const byte dst[], int length); static byte computeStats(const byte block[], int count, uint freqs[], bool strict); static byte detectType(const uint freqs0[], const uint freqs1[], int count); // Common English words. static char DICT_EN_1024[]; // Static dictionary of 1024 entries. static DictEntry STATIC_DICTIONARY[1024]; static int createDictionary(char words[], int dictSize, DictEntry dict[], int maxWords, int startWord); static const int STATIC_DICT_WORDS; Transform* _delegate; }; inline DictEntry::DictEntry() : _ptr(nullptr) , _hash(0) , _data(0) { } inline DictEntry::DictEntry(const byte* ptr, int hash, int idx, int length = 0) : _ptr(ptr) , _hash(hash) , _data((length << 24) | idx) { } #if __cplusplus < 201103L inline DictEntry::DictEntry(const DictEntry& de) { _ptr = de._ptr; _hash = de._hash; _data = de._data; } inline DictEntry& DictEntry::operator=(const DictEntry& de) { _ptr = de._ptr; _hash = de._hash; _data = de._data; return *this; } #endif inline bool TextCodec::sameWords(const byte src[], const byte dst[], int length) { while (length >= 4) { length -= 4; if (memcmp(&src[length], &dst[length], 4) != 0) return false; } while (length > 0) { length--; if (dst[length] != src[length]) return false; } return true; } } #endif kanzi-cpp-2.5.2/src/transform/TransformFactory.hpp000066400000000000000000000236121516423635400222470ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #pragma once #ifndef knz_TransformFactory #define knz_TransformFactory #include #include #include #include "../types.hpp" #include "../Context.hpp" #include "AliasCodec.hpp" #include "BWTBlockCodec.hpp" #include "BWTS.hpp" #include "EXECodec.hpp" #include "FSDCodec.hpp" #include "LZCodec.hpp" #include "NullTransform.hpp" #include "ROLZCodec.hpp" #include "RLT.hpp" #include "SBRT.hpp" #include "SRT.hpp" #include "TextCodec.hpp" #include "TransformSequence.hpp" #include "UTFCodec.hpp" #include "ZRLT.hpp" namespace kanzi { template class TransformFactory { public: // Up to 64 transforms can be declared (6 bit index) enum TransformType { NONE_TYPE = 0, // Copy BWT_TYPE = 1, // Burrows Wheeler BWTS_TYPE = 2, // Burrows Wheeler Scott LZ_TYPE = 3, // Lempel Ziv SNAPPY_TYPE = 4, // Snappy (obsolete) RLT_TYPE = 5, // Run Length ZRLT_TYPE = 6, // Zero Run Length MTFT_TYPE = 7, // Move To Front RANK_TYPE = 8, // Rank EXE_TYPE = 9, // EXE codec DICT_TYPE = 10, // Text codec ROLZ_TYPE = 11, // ROLZ codec ROLZX_TYPE = 12, // ROLZ Extra codec SRT_TYPE = 13, // Sorted Rank LZP_TYPE = 14, // Lempel Ziv Predict MM_TYPE = 15, // Multimedia (FSD) codec LZX_TYPE = 16, // Lempel Ziv Extra UTF_TYPE = 17, // UTF Codec PACK_TYPE = 18, // Alias Codec DNA_TYPE = 19, // DNA Alias Codec RESERVED3 = 20, // Reserved RESERVED4 = 21, // Reserved RESERVED5 = 22 // Reserved }; static uint64 getType(const char* tName); static uint64 getTypeToken(const char* tName); static std::string getName(uint64 functionType); static TransformSequence* newTransform(Context& ctx, uint64 functionType); private: TransformFactory() 
{} ~TransformFactory() {} static const int ONE_SHIFT = 6; // bits per transform static const int MAX_SHIFT = (8 - 1) * ONE_SHIFT; // 8 transforms static const int MASK = (1 << ONE_SHIFT) - 1; static Transform* newToken(Context& ctx, uint64 functionType); static const char* getNameToken(uint64 functionType); }; // The returned type contains 8 transform values template uint64 TransformFactory::getType(const char* tName) { std::string name(tName); size_t pos = name.find('+'); if (pos == std::string::npos) return getTypeToken(name.c_str()) << MAX_SHIFT; size_t prv = 0; int n = 0; uint64 res = 0; int shift = MAX_SHIFT; name += '+'; while (pos != std::string::npos) { n++; if (n > 8) { std::stringstream ss; ss << "Only 8 transforms allowed: " << name; throw std::invalid_argument(ss.str()); } std::string token = name.substr(prv, pos - prv); uint64 typeTk = getTypeToken(token.c_str()); // Skip null transform if (typeTk != NONE_TYPE) { res |= (typeTk << shift); shift -= ONE_SHIFT; } prv = pos + 1; pos = name.find('+', prv); } return res; } template uint64 TransformFactory::getTypeToken(const char* tName) { std::string name(tName); transform(name.begin(), name.end(), name.begin(), ::toupper); if (name == "TEXT") return DICT_TYPE; if (name == "BWT") return BWT_TYPE; if (name == "BWTS") return BWTS_TYPE; if (name == "ROLZ") return ROLZ_TYPE; if (name == "ROLZX") return ROLZX_TYPE; if (name == "MTFT") return MTFT_TYPE; if (name == "ZRLT") return ZRLT_TYPE; if (name == "RLT") return RLT_TYPE; if (name == "SRT") return SRT_TYPE; if (name == "RANK") return RANK_TYPE; if (name == "LZ") return LZ_TYPE; if (name == "LZX") return LZX_TYPE; if (name == "LZP") return LZP_TYPE; if (name == "EXE") return EXE_TYPE; if (name == "UTF") return UTF_TYPE; if (name == "PACK") return PACK_TYPE; if (name == "DNA") return DNA_TYPE; if (name == "MM") return MM_TYPE; if (name == "NONE") return NONE_TYPE; std::stringstream ss; ss << "Unknown transform type: '" << name << "'"; throw 
std::invalid_argument(ss.str()); } template TransformSequence* TransformFactory::newTransform(Context& ctx, uint64 functionType) { Transform* transforms[8]; int nbtr = 0; for (int i = 0; i < 8; i++) { transforms[i] = nullptr; const uint64 t = (functionType >> (MAX_SHIFT - ONE_SHIFT * i)) & MASK; if ((t != NONE_TYPE) || (i == 0)) transforms[nbtr++] = newToken(ctx, t); } return new TransformSequence(transforms, true); } template Transform* TransformFactory::newToken(Context& ctx, uint64 functionType) { switch (functionType) { case DICT_TYPE: { int textCodecType = 1; if (ctx.has("entropy")) { std::string entropyType = ctx.getString("entropy"); transform(entropyType.begin(), entropyType.end(), entropyType.begin(), ::toupper); // Select text encoding based on entropy codec. if ((entropyType == "NONE") || (entropyType == "ANS0") || (entropyType == "HUFFMAN") || (entropyType == "RANGE")) textCodecType = 2; } ctx.putInt("textcodec", textCodecType); return new TextCodec(ctx); } case ROLZ_TYPE: return new ROLZCodec(ctx); case ROLZX_TYPE: return new ROLZCodec(ctx); case BWT_TYPE: return new BWTBlockCodec(ctx); case BWTS_TYPE: return new BWTS(ctx); case LZX_TYPE: ctx.putInt("lz", LZX_TYPE); return new LZCodec(ctx); case LZ_TYPE: ctx.putInt("lz", LZ_TYPE); return new LZCodec(ctx); case LZP_TYPE: ctx.putInt("lz", LZP_TYPE); return new LZCodec(ctx); case RANK_TYPE: return new SBRT(SBRT::MODE_RANK, ctx); case SRT_TYPE: return new SRT(ctx); case MTFT_TYPE: return new SBRT(SBRT::MODE_MTF, ctx); case ZRLT_TYPE: return new ZRLT(ctx); case RLT_TYPE: return new RLT(ctx); case EXE_TYPE: return new EXECodec(ctx); case UTF_TYPE: return new UTFCodec(ctx); case PACK_TYPE: return new AliasCodec(ctx); case DNA_TYPE: ctx.putInt("packOnlyDNA", 1); return new AliasCodec(ctx); case MM_TYPE: return new FSDCodec(ctx); case NONE_TYPE: return new NullTransform(ctx); default: std::stringstream ss; ss << "Unknown transform type: '" << functionType << "'"; throw std::invalid_argument(ss.str()); } } 
template std::string TransformFactory::getName(uint64 functionType) { std::stringstream res; bool first = true; for (int i = 0; i < 8; i++) { const uint64 t = (functionType >> (MAX_SHIFT - ONE_SHIFT * i)) & MASK; if (t == NONE_TYPE) continue; if (first == false) res << '+'; res << getNameToken(t); first = false; } return (first == true) ? getNameToken(NONE_TYPE) : res.str(); } template const char* TransformFactory::getNameToken(uint64 functionType) { switch (functionType) { case DICT_TYPE: return "TEXT"; case BWT_TYPE: return "BWT"; case BWTS_TYPE: return "BWTS"; case ROLZ_TYPE: return "ROLZ"; case ROLZX_TYPE: return "ROLZX"; case LZ_TYPE: return "LZ"; case LZX_TYPE: return "LZX"; case LZP_TYPE: return "LZP"; case ZRLT_TYPE: return "ZRLT"; case RLT_TYPE: return "RLT"; case SRT_TYPE: return "SRT"; case RANK_TYPE: return "RANK"; case MTFT_TYPE: return "MTFT"; case EXE_TYPE: return "EXE"; case PACK_TYPE: return "PACK"; case DNA_TYPE: return "DNA"; case UTF_TYPE: return "UTF"; case MM_TYPE: return "MM"; case NONE_TYPE: return "NONE"; default: std::stringstream ss; ss << "Unknown transform type: '" << functionType << "'"; throw std::invalid_argument(ss.str()); } } } #endif kanzi-cpp-2.5.2/src/transform/TransformSequence.hpp000066400000000000000000000214661516423635400224150ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once #ifndef knz_TransformSequence #define knz_TransformSequence #include #include #include "../Transform.hpp" #define SKIP_MASK byte(0xFF) namespace kanzi { // Encapsulates a sequence of transforms in a transform template class TransformSequence FINAL : public Transform { public: TransformSequence(Transform* transforms[8], bool deallocate = true); ~TransformSequence(); bool forward(SliceArray& input, SliceArray& output, int length); bool inverse(SliceArray& input, SliceArray& output, int length); // Required encoding output buffer size int getMaxEncodedLength(int srcLen) const; byte getSkipFlags() const { return _skipFlags; } void setSkipFlags(byte flags) { _skipFlags = flags; } int getNbTransforms() const { return _length; } private: Transform* _transforms[8]; // transforms or functions bool _deallocate; // deallocate memory for transforms ? int _length; // number of transforms byte _skipFlags; // skip transforms }; template TransformSequence::TransformSequence(Transform* transforms[8], bool deallocate) { _deallocate = deallocate; _length = 8; _skipFlags = byte(0); for (int i = 7; i >= 0; i--) { _transforms[i] = transforms[i]; if (_transforms[i] == nullptr) _length = i; } if (_length == 0) throw std::invalid_argument("At least one transform required"); } template TransformSequence::~TransformSequence() { if (_deallocate == true) { for (int i = 0; i < 8; i++) { if (_transforms[i] != nullptr) delete _transforms[i]; } } } template bool TransformSequence::forward(SliceArray& input, SliceArray& output, int count) { if (!SliceArray::isValid(input)) throw std::invalid_argument("Invalid input block"); if (!SliceArray::isValid(output)) throw std::invalid_argument("Invalid output block"); if ((count < 0) || (count + input._index > input._length)) return false; _skipFlags = SKIP_MASK; if (count == 0) return true; SliceArray* in = &input; SliceArray* out = &output; SliceArray buffer(nullptr, 0, 0); const int blockSize = count; const int requiredSize = 
getMaxEncodedLength(blockSize); int swaps = 0; // Process transforms sequentially for (int i = 0; i < _length; i++) { if (_transforms[i] == nullptr) continue; // Check that the output buffer has enough room. If not, allocate a new one. if (out->_length < requiredSize) { if ((out == &input) || (out == &output)) out = &buffer; if (out->_length < requiredSize) { delete[] out->_array; out->_array = new T[requiredSize]; out->_length = requiredSize; } } const int savedIIdx = in->_index; const int savedOIdx = out->_index; // Apply forward transform if (_transforms[i]->forward(*in, *out, count) == false) { // Transform failed. Either it does not apply to this type // of data or a recoverable error occurred => revert in->_index = savedIIdx; out->_index = savedOIdx; continue; } _skipFlags &= ~byte(1 << (7 - i)); count = out->_index - savedOIdx; in->_index = savedIIdx; out->_index = savedOIdx; std::swap(in, out); swaps++; } if ((swaps & 1) == 0) { if ((output._index + count > output._length) || (in->_index + count > in->_length)) { _skipFlags = SKIP_MASK; } else { const byte* inPtr = &in->_array[in->_index]; byte* outPtr = &output._array[output._index]; if ((inPtr + count >= outPtr) && (outPtr + count >= inPtr)) { std::memmove(&output._array[output._index], &in->_array[in->_index], size_t(count)); } else { std::memcpy(&output._array[output._index], &in->_array[in->_index], size_t(count)); } } } input._index += blockSize; output._index += count; delete[] buffer._array; return _skipFlags != SKIP_MASK; } template bool TransformSequence::inverse(SliceArray& input, SliceArray& output, int count) { if (!SliceArray::isValid(input)) throw std::invalid_argument("Invalid input block"); if (!SliceArray::isValid(output)) throw std::invalid_argument("Invalid output block"); if ((count < 0) || (count + input._index > input._length)) return false; if (count == 0) return true; if (_skipFlags == SKIP_MASK) { const byte* inPtr = &input._array[input._index]; byte* outPtr = 
&output._array[output._index]; if ((inPtr + count >= outPtr) && (outPtr + count >= inPtr)) { std::memmove(&output._array[output._index], &input._array[input._index], size_t(count)); } else { std::memcpy(&output._array[output._index], &input._array[input._index], size_t(count)); } input._index += count; output._index += count; return true; } const int blockSize = count; bool res = true; SliceArray* in = &input; SliceArray* out = &output; SliceArray buffer(nullptr, 0, 0); int swaps = 0; // Process transforms sequentially in reverse order for (int i = _length - 1; i >= 0; i--) { if ((_skipFlags & byte(1 << (7 - i))) != byte(0)) continue; if (_transforms[i] == nullptr) continue; // Check that the output buffer has enough room. If not, allocate a new one. if (out->_length < output._length) { if ((out == &input) || (out == &output)) out = &buffer; if (out->_length < output._length) { delete[] out->_array; out->_array = new T[output._length]; out->_length = output._length; } } const int savedIIdx = in->_index; const int savedOIdx = out->_index; // Apply inverse transform res = _transforms[i]->inverse(*in, *out, count); // All inverse transforms must succeed if (res == false) break; count = out->_index - savedOIdx; in->_index = savedIIdx; out->_index = savedOIdx; std::swap(in, out); swaps++; } if ((res == true) && ((swaps & 1) == 0)) { if ((output._index + count > output._length) || (input._index + count > input._length)) res = false; else { const byte* inPtr = &in->_array[in->_index]; byte* outPtr = &output._array[output._index]; if ((inPtr + count >= outPtr) && (outPtr + count >= inPtr)) { std::memmove(&output._array[output._index], &input._array[input._index], size_t(count)); } else { std::memcpy(&output._array[output._index], &input._array[input._index], size_t(count)); } } } input._index += blockSize; output._index += count; delete[] buffer._array; return res; } template int TransformSequence::getMaxEncodedLength(int srcLength) const { int requiredSize = srcLength; 
for (int i = 0; i < _length; i++) { if (_transforms[i] == nullptr) continue; const int nxtSize = _transforms[i]->getMaxEncodedLength(requiredSize); if (nxtSize > requiredSize) requiredSize = nxtSize; } return requiredSize; } } #endif kanzi-cpp-2.5.2/src/transform/UTFCodec.cpp000066400000000000000000000300351516423635400203300ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include #include #include #include #include "UTFCodec.hpp" #include "../Global.hpp" #include "../types.hpp" using namespace kanzi; using namespace std; const int UTFCodec::MIN_BLOCK_SIZE = 1024; const int UTFCodec::LEN_SEQ[256] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; bool UTFCodec::forward(SliceArray& input, SliceArray& output, int count) { if 
(count == 0) return true; if (count < MIN_BLOCK_SIZE) return false; if (!SliceArray::isValid(input)) throw invalid_argument("UTFCodec: Invalid input block"); if (!SliceArray::isValid(output)) throw invalid_argument("UTFCodec: Invalid output block"); if (output._length - output._index < getMaxEncodedLength(count)) return false; const kanzi::byte* src = &input._array[input._index]; kanzi::byte* dst = &output._array[output._index]; bool mustValidate = true; if (_pCtx != nullptr) { Global::DataType dt = (Global::DataType)_pCtx->getInt("dataType", Global::UNDEFINED); if ((dt != Global::UNDEFINED) && (dt != Global::UTF8)) return false; mustValidate = dt != Global::UTF8; } int start = 0; if ((count >= 3) && (src[0] == kanzi::byte(0xEF)) && (src[1] == kanzi::byte(0xBB)) && (src[2] == kanzi::byte(0xBF))) { // Byte Order Mark (BOM) start = 3; } else { // First (possibly) invalid symbols (due to block truncation). while ((start < 4) && (LEN_SEQ[uint8(src[start])] == 0)) start++; } if ((mustValidate == true) && (validate(&src[start], count - start - 4)) == false) return false; if (_pCtx != nullptr) _pCtx->putInt("dataType", Global::UTF8); // 1-3 bit size + (7 or 11 or 16 or 21) bit payload // 3 MSBs indicate symbol size (limit map size to 22 bits) // 000 -> 7 bits // 001 -> 11 bits // 010 -> 16 bits // 1xx -> 21 bits if (_aliasMap == nullptr) _aliasMap = new uint32[1 << 22]; memset(_aliasMap, 0, size_t(1 << 22) * sizeof(uint32)); vector v; v.reserve(max(count >> 9, 256)); int n = 0; bool res = true; for (int i = start; i < (count - 4); ) { uint32 val; const int s = pack(&src[i], val); res = s != 0; // Validation of longer sequences // Third kanzi::byte in [0x80..0xBF] res &= ((s != 3) || ((src[i + 2] & kanzi::byte(0xC0)) == kanzi::byte(0x80))); // Third and fourth bytes in [0x80..0xBF] res &= ((s != 4) || ((((uint16(src[i + 2]) << 8) | uint16(src[i + 3])) & 0xC0C0) == 0x8080)); // Add to map ? 
if (_aliasMap[val] == 0) { n++; res &= (n < 32768); #if __cplusplus >= 201103L v.emplace_back(val, 0); #else sdUTF u(val, 0); v.push_back(u); #endif } if (res == false) break; _aliasMap[val]++; i += s; } const int maxTarget = count - (count / 10); if ((res == false) || (n == 0) || ((3 * n + 6) >= maxTarget)) { return false; } for (int i = 0; i < n; i++) v[i].freq = _aliasMap[v[i].val]; // Sort ranks by decreasing frequencies; sort(v.begin(), v.end()); int dstIdx = 2; // Emit map length then map data dst[dstIdx++] = kanzi::byte(n >> 8); dst[dstIdx++] = kanzi::byte(n); int estimate = dstIdx + 6; for (int i = 0; i < n; i++) { estimate += int((i < 128) ? v[i].freq : 2 * v[i].freq); const uint32 s = v[i].val; _aliasMap[s] = (i < 128) ? i : 0x10080 | ((i << 1) & 0xFF00) | (i & 0x7F); dst[dstIdx] = kanzi::byte(s >> 16); dst[dstIdx + 1] = kanzi::byte(s >> 8); dst[dstIdx + 2] = kanzi::byte(s); dstIdx += 3; } if (estimate >= maxTarget) { // Not worth it return false; } // Emit first (possibly invalid) symbols (due to block truncation) for (int i = 0; i < start; i++) dst[dstIdx++] = src[i]; v.clear(); int srcIdx = start; // Emit aliases while (srcIdx < count - 4) { uint32 val; srcIdx += pack(&src[srcIdx], val); const uint32 alias = _aliasMap[val]; dst[dstIdx++] = kanzi::byte(alias); dst[dstIdx] = kanzi::byte(alias >> 8); dstIdx += (alias >> 16); } dst[0] = kanzi::byte(start); dst[1] = kanzi::byte(srcIdx - (count - 4)); // Emit last (possibly invalid) symbols (due to block truncation) while (srcIdx < count) dst[dstIdx++] = src[srcIdx++]; input._index += srcIdx; output._index += dstIdx; return dstIdx < maxTarget; } bool UTFCodec::inverse(SliceArray& input, SliceArray& output, int count) { if (count == 0) return true; if (count < 4) return false; if (!SliceArray::isValid(input)) throw invalid_argument("UTFCodec: Invalid input block"); if (!SliceArray::isValid(output)) throw invalid_argument("UTFCodec: Invalid output block"); if (input._index + count > input._length) return 
false; const kanzi::byte* src = &input._array[input._index]; kanzi::byte* dst = &output._array[output._index]; const int start = int(src[0]) & 0x03; const int adjust = int(src[1]) & 0x03; // adjust end of regular processing const int n = (int(src[2]) << 8) + int(src[3]); // Protect against invalid map size value if ((n == 0) || (n >= 32768) || (3 * n > count - 4)) return false; struct symb { uint32 val; uint8 len; }; symb m[32768]; int srcIdx = 4; // Build inverse mapping for (int i = 0; i < n; i++) { if (srcIdx + 3 > count) return false; int s = (uint32(src[srcIdx]) << 16) | (uint32(src[srcIdx + 1]) << 8) | uint32(src[srcIdx + 2]); const int sl = unpack(s, reinterpret_cast(&m[i].val)); if (sl == 0) return false; m[i].len = uint8(sl); srcIdx += 3; } int dstIdx = 0; const int srcEnd = count - 4 + adjust; const int dstCap = output._length - output._index; const int dstEnd = dstCap - 4; if (dstEnd < 0) return false; if ((srcEnd > count) || (srcIdx + start > srcEnd) || (dstIdx + start > dstCap)) return false; for (int i = 0; i < start; i++) dst[dstIdx++] = src[srcIdx++]; // Emit data while (srcIdx < srcEnd) { uint alias = uint(src[srcIdx++]); alias = alias >= 128 ? 
(uint(src[srcIdx++]) << 7) + (alias & 0x7F) : alias; if (alias >= uint(n)) return false; const symb& s = m[alias]; if (dstIdx + int(s.len) > dstCap) return false; memcpy(&dst[dstIdx], &s.val, 4); dstIdx += s.len; } if ((srcIdx == srcEnd) && (dstIdx < dstEnd + adjust)) { if ((srcIdx + 4 - adjust > count) || (dstIdx + 4 - adjust > dstCap)) return false; for (int i = 0; i < 4 - adjust; i++) dst[dstIdx++] = src[srcIdx++]; } input._index += srcIdx; output._index += dstIdx; return srcIdx == count; } // A quick partial validation // A more complete validation is done during processing for the remaining cases // (rules for 3 and 4 kanzi::byte sequences) bool UTFCodec::validate(const kanzi::byte block[], int count) { uint freqs0[256] = { 0 }; uint* freqs1 = new uint[65536]; memset(&freqs1[0], 0, 65536 * sizeof(uint)); uint f0[256] = { 0 }; uint f1[256] = { 0 }; uint f3[256] = { 0 }; uint f2[256] = { 0 }; uint8 prv = 0; const uint8* data = reinterpret_cast(&block[0]); const int count4 = count & -4; bool res = true; // Unroll loop for (int i = 0; i < count4; i += 4) { const uint8 cur0 = data[i]; const uint8 cur1 = data[i + 1]; const uint8 cur2 = data[i + 2]; const uint8 cur3 = data[i + 3]; f0[cur0]++; f1[cur1]++; f2[cur2]++; f3[cur3]++; freqs1[(prv * 256) + cur0]++; freqs1[(cur0 * 256) + cur1]++; freqs1[(cur1 * 256) + cur2]++; freqs1[(cur2 * 256) + cur3]++; prv = cur3; } for (int i = count4; i < count; i++) { freqs0[data[i]]++; freqs1[(prv * 256) + data[i]]++; prv = data[i]; } for (int i = 0; i < 256; i++) { freqs0[i] += (f0[i] + f1[i] + f2[i] + f3[i]); } // Valid UTF-8 sequences // See Unicode 16 Standard - UTF-8 Table 3.7 // U+0000..U+007F 00..7F // U+0080..U+07FF C2..DF 80..BF // U+0800..U+0FFF E0 A0..BF 80..BF // U+1000..U+CFFF E1..EC 80..BF 80..BF // U+D000..U+D7FF ED 80..9F 80..BF 80..BF // U+E000..U+FFFF EE..EF 80..BF 80..BF // U+10000..U+3FFFF F0 90..BF 80..BF 80..BF // U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF // U+100000..U+10FFFF F4 80..8F 80..BF 80..BF // Check 
rules for 1 kanzi::byte uint sum = freqs0[0xC0] + freqs0[0xC1]; uint sum2 = 0; for (int i = 0xF5; i <= 0xFF; i++) sum += freqs0[i]; if (sum != 0) { res = false; goto end; } // Check rules for first 2 bytes for (int i = 0; i < 256; i++) { // Exclude < 0xE0A0 || > 0xE0BF if ((i < 0xA0) || (i > 0xBF)) sum += freqs1[0xE0 * 256 + i]; // Exclude < 0xED80 || > 0xEDE9F if ((i < 0x80) || (i > 0x9F)) sum += freqs1[0xED * 256 + i]; // Exclude < 0xF090 || > 0xF0BF if ((i < 0x90) || (i > 0xBF)) sum += freqs1[0xF0 * 256 + i]; // Exclude < 0xF480 || > 0xF48F if ((i < 0x80) || (i > 0x8F)) sum += freqs1[0xF4 * 256 + i]; if ((i < 0x80) || (i > 0xBF)) { // Exclude < 0x??80 || > 0x??BF with ?? in [C2..DF] for (int j = 0xC2; j <= 0xDF; j++) sum += freqs1[j * 256 + i]; // Exclude < 0x??80 || > 0x??BF with ?? in [E1..EC] for (int j = 0xE1; j <= 0xEC; j++) sum += freqs1[j * 256 + i]; // Exclude < 0x??80 || > 0x??BF with ?? in [F1..F3] sum += freqs1[0xF1 * 256 + i]; sum += freqs1[0xF2 * 256 + i]; sum += freqs1[0xF3 * 256 + i]; // Exclude < 0xEE80 || > 0xEEBF sum += freqs1[0xEE * 256 + i]; // Exclude < 0xEF80 || > 0xEFBF sum += freqs1[0xEF * 256 + i]; } else { // Count non-primary bytes sum2 += freqs0[i]; } if (sum != 0) { res = false; break; } } end: delete[] freqs1; // Ad-hoc threshold return (res == true) && (sum2 >= uint(count / 8)); } kanzi-cpp-2.5.2/src/transform/UTFCodec.hpp000066400000000000000000000073151516423635400203420ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ #pragma once #ifndef knz_UTFCodec #define knz_UTFCodec #include "../Context.hpp" #include "../Transform.hpp" namespace kanzi { typedef struct ssUTF { uint32 val; uint32 freq; ssUTF(uint32 v, uint32 f) : val(v), freq(f) {} friend bool operator< (ssUTF const& lhs, ssUTF const& rhs) { int r; return ((r = int(lhs.freq - rhs.freq)) != 0) ? r > 0 : lhs.val > rhs.val; } } sdUTF; // UTF8 encoder/decoder class UTFCodec FINAL : public Transform { public: UTFCodec() : _pCtx(nullptr), _aliasMap(nullptr) {} UTFCodec(Context& ctx) : _pCtx(&ctx), _aliasMap(nullptr) {} ~UTFCodec() { delete[] _aliasMap; } bool forward(SliceArray& source, SliceArray& destination, int length); bool inverse(SliceArray& source, SliceArray& destination, int length); int getMaxEncodedLength(int srcLen) const { return srcLen + 8192; } private: static const int MIN_BLOCK_SIZE; static const int LEN_SEQ[256]; Context* _pCtx; uint32* _aliasMap; static bool validate(const byte block[], int count); static int pack(const byte in[], uint32& out); static int unpack(uint32 in, byte out[]); }; inline int UTFCodec::pack(const byte in[], uint32& out) { int s; switch (int(in[0]) >> 4) { case 0: case 1: case 2: case 3: case 4: case 5: case 6: case 7: out = uint32(in[0]); s = 1; break; case 12: case 13: out = (1 << 19) | (uint32(in[0]) << 8) | uint32(in[1]); s = 2; break; case 14: out = (2 << 19) | ((uint32(in[0]) & 0x0F) << 12) | ((uint32(in[1]) & 0x3F) << 6) | (uint32(in[2]) & 0x3F); s = 3; break; case 15: out = (4 << 19) | ((uint32(in[0]) & 0x07) << 18) | ((uint32(in[1]) & 0x3F) << 12) | ((uint32(in[2]) & 0x3F) << 6) | (uint32(in[3]) & 0x3F); s = 4; break; default: out = 0; s = 0; // signal invalid value break; } return s; } inline int UTFCodec::unpack(uint32 in, byte out[]) { int s; switch (in >> 19) { case 0: out[0] = byte(in); s = 1; break; case 1: out[0] = byte(in >> 8); out[1] = byte(in); s = 2; break; case 2: 
out[0] = byte(((in >> 12) & 0x0F) | 0xE0); out[1] = byte(((in >> 6) & 0x3F) | 0x80); out[2] = byte((in & 0x3F) | 0x80); s = 3; break; case 4: case 5: case 6: case 7: out[0] = byte(((in >> 18) & 0x07) | 0xF0); out[1] = byte(((in >> 12) & 0x3F) | 0x80); out[2] = byte(((in >> 6) & 0x3F) | 0x80); out[3] = byte((in & 0x3F) | 0x80); s = 4; break; default: s = 0; // signal invalid value break; } return s; } } #endif kanzi-cpp-2.5.2/src/transform/ZRLT.cpp000066400000000000000000000127201516423635400175300ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #include #include #include #include "../Global.hpp" #include "../Memory.hpp" #include "ZRLT.hpp" using namespace kanzi; using namespace std; bool ZRLT::forward(SliceArray& input, SliceArray& output, int length) { if (length == 0) return true; if (!SliceArray::isValid(input)) throw invalid_argument("ZRLT: Invalid input block"); if (!SliceArray::isValid(output)) throw invalid_argument("ZRLT: Invalid output block"); if (output._length - output._index < getMaxEncodedLength(length)) return false; const kanzi::byte* src = &input._array[input._index]; kanzi::byte* dst = &output._array[output._index]; uint srcIdx = 0; uint dstIdx = 0; const uint srcEnd = length; const uint dstEnd = length - 16; // do not expand const uint srcEnd4 = length - 4; bool res = true; kanzi::byte zeros[4] = { kanzi::byte(0) }; while (srcIdx < srcEnd) { if (src[srcIdx] == kanzi::byte(0)) { uint runLength = 1; while ((srcIdx + runLength < srcEnd4) && (KANZI_MEM_EQ4(&src[srcIdx + runLength], &zeros[0]))) runLength += 4; while ((srcIdx + runLength < srcEnd) && src[srcIdx + runLength] == kanzi::byte(0)) runLength++; srcIdx += runLength; // Encode length runLength++; if (dstIdx >= dstEnd) { res = false; break; } int log = Global::_log2(uint32(runLength)); // Write every bit as a kanzi::byte except the most significant one while (log >= 4) { const uint32 w = (uint32(((runLength >> (log - 1)) & 1) << 24) | uint32(((runLength >> (log - 2)) & 1) << 16) | uint32(((runLength >> (log - 3)) & 1) << 8) | uint32( (runLength >> (log - 4)) & 1)); BigEndian::writeInt32(&dst[dstIdx], int32(w)); dstIdx += 4; log -= 4; } while (log > 0) { log--; dst[dstIdx++] = kanzi::byte((runLength >> log) & 1); } continue; } if (dstIdx >= dstEnd) { res = false; break; } const int val = int(src[srcIdx]); if (val >= 0xFE) { dst[dstIdx] = kanzi::byte(0xFF); dst[dstIdx + 1] = kanzi::byte(val - 0xFE); dstIdx++; } else { dst[dstIdx] = kanzi::byte(val + 1); } srcIdx++; dstIdx++; } input._index += srcIdx; output._index += dstIdx; return 
res && (srcIdx == srcEnd); } bool ZRLT::inverse(SliceArray& input, SliceArray& output, int length) { if (length == 0) return true; if (!SliceArray::isValid(input)) throw invalid_argument("ZRLT: Invalid input block"); if (!SliceArray::isValid(output)) throw invalid_argument("ZRLT: Invalid output block"); const kanzi::byte* src = &input._array[input._index]; kanzi::byte* dst = &output._array[output._index]; uint srcIdx = 0; uint dstIdx = 0; const uint srcEnd = length; const uint dstEnd = output._length; uint runLength = 0; while (true) { uint val = uint(src[srcIdx]); if (val <= 1) { // Generate the run length bit by bit (but force MSB) runLength = 1; do { runLength += (runLength + val); srcIdx++; if (srcIdx >= srcEnd) goto End; val = uint(src[srcIdx]); } while (val <= 1); runLength--; if (runLength > 0) { if (runLength >= dstEnd - dstIdx) goto End; memset(&dst[dstIdx], 0, size_t(runLength)); dstIdx += runLength; runLength = 0; continue; } } // Regular data processing if (val == 0xFF) { srcIdx++; if (srcIdx >= srcEnd) goto End; dst[dstIdx] = kanzi::byte(0xFE + int(src[srcIdx])); } else { dst[dstIdx] = kanzi::byte(val - 1); } srcIdx++; dstIdx++; if ((srcIdx >= srcEnd) || (dstIdx >= dstEnd)) break; } End: if (runLength > 0) { runLength--; // If runLength is not 1, add trailing 0s if (runLength > dstEnd - dstIdx) return false; if (runLength > 0) { memset(&dst[dstIdx], 0, size_t(runLength)); dstIdx += runLength; } } input._index += srcIdx; output._index += dstIdx; return srcIdx == srcEnd; } kanzi-cpp-2.5.2/src/transform/ZRLT.hpp000066400000000000000000000027111516423635400175340ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #pragma once #ifndef knz_ZRLT #define knz_ZRLT #include "../Context.hpp" #include "../Transform.hpp" namespace kanzi { // Zero Run Length Encoding is a simple encoding algorithm by Wheeler // closely related to Run Length Encoding. The main difference is // that only runs of 0 values are processed. Also, the length is // encoded in a different way (each digit in a different byte) // This algorithm is well adapted to process post BWT/MTFT data. class ZRLT FINAL : public Transform { public: ZRLT() {} ZRLT(Context&) {} ~ZRLT() {} bool forward(SliceArray& pSrc, SliceArray& pDst, int length); bool inverse(SliceArray& pSrc, SliceArray& pDst, int length); // Required encoding output buffer size unknown => guess int getMaxEncodedLength(int srcLen) const { return srcLen; } }; } #endif kanzi-cpp-2.5.2/src/types.hpp000066400000000000000000000223451516423635400160770ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
// Fixed width integer types
#if defined(_MSC_VER) && _MSC_VER < 1600
    // Visual Studio < 2010: no stdint.h
    typedef unsigned char uint8_t;
    typedef signed char int8_t;
    typedef unsigned short uint16_t;
    typedef short int16_t;
    typedef unsigned int uint32_t;
    typedef int int32_t;
    typedef unsigned __int64 uint64_t;
    typedef __int64 int64_t;
#else
    #if __cplusplus >= 201103L
        // C++11 or later
        #include <cstddef>
        #include <cstdint>
    #else
        // C++98 / C++03
        #include <stdint.h>
    #endif
#endif

#if defined(_MSC_VER)
    #if _MSC_VER < 1900
        // snprintf macro for MSVC < 2015
        #define snprintf _snprintf
    #endif

    // Expose the GCC style architecture macros, but only for the architecture
    // actually targeted. Defining them unconditionally made
    // defined(__x86_64__) true on 32 bit and ARM builds (and
    // defined(__i386__) true on 64 bit builds).
    #if defined(_M_X64) && !defined(__x86_64__)
        #define __x86_64__ _M_X64
    #endif

    #if defined(_M_IX86) && !defined(__i386__)
        #define __i386__ _M_IX86
    #endif
#endif

// Population count
// NOTE(review): 'popcount' is a macro and will clash with std::popcount if
// <bit> is included after this header in C++20 builds.
#if defined(_MSC_VER)
    #include <intrin.h>
    #define popcount __popcnt
#else
    #if defined(__INTEL_COMPILER)
        #include <immintrin.h>
        #define popcount _popcnt32
    #else
        #define popcount __builtin_popcount
    #endif
#endif

// SIMD intrinsics, according to the enabled instruction sets
#ifdef __SSE__
    #include <xmmintrin.h>
#endif

#ifdef __SSE2__
    #include <emmintrin.h>
#endif

#ifdef __SSE3__
    #include <pmmintrin.h>
#endif

#ifdef __SSE4_1__
    #include <smmintrin.h>
#endif

#ifdef __AVX__
    #include <immintrin.h>
#endif

#ifdef __AVX2__
    #include <immintrin.h>
#endif

/*
Visual Studio 2022 17.2 MSVC++ 14.28 _MSC_VER == 1932
Visual Studio 2022 17.1 MSVC++ 14.28 _MSC_VER == 1931
Visual Studio 2022 17.0 MSVC++ 14.28 _MSC_VER == 1930
Visual Studio 2019 version 16.10, 16.11 MSVC++ 14.28 _MSC_VER == 1929
Visual Studio 2019 version 16.8, 16.9 MSVC++ 14.28 _MSC_VER == 1928
Visual Studio 2019 version 16.7 MSVC++ 14.27 _MSC_VER == 1927
Visual Studio 2019 version 16.6 MSVC++ 14.26 _MSC_VER == 1926
Visual Studio 2019 version 16.5 MSVC++ 14.25 _MSC_VER == 1925
Visual Studio 2019 Update 4 MSVC++ 14.24 _MSC_VER == 1924
Visual Studio 2019 Update 3 MSVC++ 14.21 _MSC_VER == 1923
Visual Studio 2019 Update 2 MSVC++ 14.21 _MSC_VER == 1922
Visual Studio 2019 Update 1 MSVC++ 14.21 _MSC_VER == 1921
Visual Studio 2019 MSVC++ 14.20 _MSC_VER == 1920
Visual Studio 2017 Update 9 MSVC++ 14.16 _MSC_VER == 1916
Visual Studio 2017 Update 8 MSVC++ 14.15 _MSC_VER == 1915
Visual Studio 2017 Update 7 MSVC++ 14.14 _MSC_VER == 1914
Visual Studio 2017 Update 6 MSVC++ 14.13 _MSC_VER == 1913
Visual Studio 2017 Update 5 MSVC++ 14.12 _MSC_VER == 1912
Visual Studio 2017 Update 3&4 MSVC++ 14.11 _MSC_VER == 1911
Visual Studio 2017 MSVC++ 14.10 _MSC_VER == 1910
Visual Studio 2015 MSVC++ 14 _MSC_VER == 1900
Visual Studio 2013 MSVC++ 12 _MSC_VER == 1800
Visual Studio 2012 MSVC++ 11 _MSC_VER == 1700
Visual Studio 2010 MSVC++ 10 _MSC_VER == 1600
Visual Studio 2008 MSVC++ 9 _MSC_VER == 1500
Visual Studio 2005 MSVC++ 8 _MSC_VER == 1400
Visual Studio 2003 Beta MSVC++ 7.1 _MSC_VER == 1310
Visual Studio 2002 MSVC++ 7 _MSC_VER == 1300
Visual Studio MSVC++ 6.0 _MSC_VER == 1200
Visual Studio MSVC++ 5 _MSC_VER == 1100
Visual Studio MSVC++ 4.2 _MSC_VER == 1020
Visual Studio MSVC++ 4.1 _MSC_VER == 1010
Visual Studio MSVC++ 4 _MSC_VER == 1000
Visual Studio MSVC++ 2 _MSC_VER == 900
Visual Studio MSVC++ 1 _MSC_VER == 800
*/

// Visual Studio release year for the running compiler.
// NOTE(review): the two oldest entries (1300 => 2003, 1200 => 2002) do not
// match the table above (1310 => 2003, 1300 => 2002); left unchanged pending
// confirmation that nothing relies on the current values.
#ifdef _MSC_VER
    #if _MSC_VER >= 1930
        #define _MSC_VER_STR 2022
    #elif _MSC_VER >= 1920
        #define _MSC_VER_STR 2019
    #elif _MSC_VER >= 1910
        #define _MSC_VER_STR 2017
    #elif _MSC_VER == 1900
        #define _MSC_VER_STR 2015
    #elif _MSC_VER == 1800
        #define _MSC_VER_STR 2013
    #elif _MSC_VER == 1700
        #define _MSC_VER_STR 2012
    #elif _MSC_VER == 1600
        #define _MSC_VER_STR 2010
    #elif _MSC_VER == 1500
        #define _MSC_VER_STR 2008
    #elif _MSC_VER == 1400
        #define _MSC_VER_STR 2005
    #elif _MSC_VER == 1300
        #define _MSC_VER_STR 2003
    #elif _MSC_VER == 1200
        #define _MSC_VER_STR 2002
    #endif
#endif

// Notice: in Visual Studio (prior to VS2017 version 15.7)
// __cplusplus always defaults to 199711L (aka C++98) !!! (unless
// the extra option /Zc:__cplusplus is added to the command line).
// Otherwise, using the _MSVC_LANG macro returns the proper C++ version.
#if (__cplusplus >= 201103L) || (defined(_MSVC_LANG) && _MSVC_LANG >= 201103L)
    // C++ 11 or higher
    #define FINAL final
    #define NOEXCEPT noexcept
#else
    #define FINAL
    #define NOEXCEPT throw()

    #if defined(_MSC_VER)
        #if _MSC_VER < 1300
            typedef signed char int8_t;
            typedef signed short int16_t;
            typedef signed int int32_t;
            typedef unsigned char uint8_t;
            typedef unsigned short uint16_t;
            typedef unsigned int uint32_t;
        #else
            typedef signed __int8 int8_t;
            typedef signed __int16 int16_t;
            typedef signed __int32 int32_t;
            typedef unsigned __int8 uint8_t;
            typedef unsigned __int16 uint16_t;
            typedef unsigned __int32 uint32_t;
            typedef signed __int64 int64_t;
            typedef unsigned __int64 uint64_t;
        #endif
    #else
        // If stdint.h did not provide fixed-width types, define them here.
        #ifndef INT64_MAX
            typedef signed char int8_t;
            typedef signed short int16_t;
            typedef signed int int32_t;
            typedef unsigned char uint8_t;
            typedef unsigned short uint16_t;
            typedef unsigned int uint32_t;
            typedef signed long long int64_t;
            typedef unsigned long long uint64_t;
        #endif
    #endif

    // Old compilers without the nullptr keyword
    #if !defined(nullptr) && \
        ((defined(_MSC_VER) && (_MSC_VER < 1600)) || \
         (defined(__GNUC__) && !defined(__clang__) && \
          ((__GNUC__ < 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ < 6)))) || \
         (defined(__clang__) && \
          ((__clang_major__ < 3) || ((__clang_major__ == 3) && (__clang_minor__ < 1)))))
        #define nullptr NULL
    #endif
#endif

// Short aliases used throughout the code base
namespace kanzi
{
#if __cplusplus >= 201703L
    using byte = std::byte;
#else
    typedef uint8_t byte;
#endif

    typedef int8_t int8;
    typedef uint8_t uint8;
    typedef int16_t int16;
    typedef uint16_t uint16;
    typedef int32_t int32;
    typedef uint32_t uint32;
    typedef uint32_t uint;
    typedef int64_t int64;
    typedef uint64_t uint64;
}

#if defined(__MINGW32__)
    #define PATH_SEPARATOR '/'
#elif defined(WIN32) || defined(_WIN32) || defined(_WIN64)
    #define PATH_SEPARATOR '\\'
#else
    #define PATH_SEPARATOR '/'
#endif

// Likely / unlikely macros
#if defined(__GNUC__) || defined(__clang__)
    #ifndef KANZI_LIKELY
        #define KANZI_LIKELY(x) __builtin_expect(!!(x), 1)
    #endif

    #ifndef KANZI_UNLIKELY
        #define KANZI_UNLIKELY(x) __builtin_expect(!!(x), 0)
    #endif
#else
    #ifndef KANZI_LIKELY
        #define KANZI_LIKELY(x) (x)
    #endif

    #ifndef KANZI_UNLIKELY
        #define KANZI_UNLIKELY(x) (x)
    #endif
#endif

// Force inline macro
#if defined(__GNUC__) || defined(__clang__)
    #define KANZI_ALWAYS_INLINE inline __attribute__((always_inline))
#elif defined(_MSC_VER)
    #define KANZI_ALWAYS_INLINE __forceinline
#else
    #define KANZI_ALWAYS_INLINE inline
#endif

// Alignment attribute (only defined for MSVC and GCC compatible compilers)
#if defined(_MSC_VER)
    #define KANZI_ALIGNED_(x) __declspec(align(x))
#elif defined(__GNUC__)
    #define KANZI_ALIGNED_(x) __attribute__ ((aligned(x)))
#endif
*/ #pragma once #ifndef knz_Clock #define knz_Clock #include "WallTimer.hpp" namespace kanzi { class Clock { private: WallTimer _timer; WallTimer::TimeData _start; WallTimer::TimeData _stop; public: Clock() { start(); _stop = _start; } void start() { _start = _timer.getCurrentTime(); } void stop() { _stop = _timer.getCurrentTime(); } double elapsed() const { // In millisec return WallTimer::calculateDifference(_start, _stop); } }; } #endif kanzi-cpp-2.5.2/src/util/Printer.hpp000066400000000000000000000043411516423635400173270ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
namespace kanzi
{
    // Thread safe printer
    // Writes messages to the stream given at construction. When
    // CONCURRENCY_ENABLED is defined, every write is serialized through one
    // mutex shared by all Printer instances.
    class Printer
    {
    public:
        // The stream is not owned and must outlive the printer.
        Printer(std::ostream& os) { _os = &os; }

        ~Printer()
        {
            try {
                _os->flush();
            }
            catch (const std::exception&) {
                // Ignore: best effort
            }
        }

        // Writes msg when 'print' is true; a null msg is ignored.
        void print(const char* msg, bool print)
        {
            if ((print == true) && (msg != nullptr)) {
#ifdef CONCURRENCY_ENABLED
                std::lock_guard<std::mutex> lock(getMutex());
#endif
                (*_os) << msg;
            }
        }

        // Same as print(), followed by an end of line (flushes the stream).
        void println(const char* msg, bool print)
        {
            if ((print == true) && (msg != nullptr)) {
#ifdef CONCURRENCY_ENABLED
                std::lock_guard<std::mutex> lock(getMutex());
#endif
                (*_os) << msg << std::endl;
            }
        }

        // Writes msg when 'print' is true.
        void print(const std::string& msg, bool print)
        {
            if (print == true) {
#ifdef CONCURRENCY_ENABLED
                std::lock_guard<std::mutex> lock(getMutex());
#endif
                (*_os) << msg;
            }
        }

        // Same as print(), followed by an end of line (flushes the stream).
        void println(const std::string& msg, bool print)
        {
            if (print == true) {
#ifdef CONCURRENCY_ENABLED
                std::lock_guard<std::mutex> lock(getMutex());
#endif
                (*_os) << msg << std::endl;
            }
        }

    private:
#ifdef CONCURRENCY_ENABLED
        // Function local static: one mutex for every printer
        static std::mutex& getMutex()
        {
            static std::mutex mtx;
            return mtx;
        }
#endif
        std::ostream* _os;
    };
}
*/ #include "WallTimer.hpp" #if defined(KNZ_USE_WINDOWS_QPC) LARGE_INTEGER WallTimer::_frequency = {}; bool WallTimer::_initialized = false; #endif kanzi-cpp-2.5.2/src/util/WallTimer.hpp000066400000000000000000000067351516423635400176150ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #pragma once #ifndef knz_WallTimer #define knz_WallTimer #include "../types.hpp" // Portable wall timer // 1. Detect Standard and Platform #if __cplusplus >= 201103L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201103L) #include #define KNZ_USE_CHRONO #elif defined(_WIN32) || defined(_WIN64) #include #define KNZ_USE_WINDOWS_QPC #else #include #define KNZ_USE_POSIX_GETTIMEOFDAY #endif class WallTimer { public: struct TimeData { #if defined(KNZ_USE_CHRONO) std::chrono::steady_clock::time_point value; #elif defined(KNZ_USE_WINDOWS_QPC) LARGE_INTEGER value; #elif defined(KNZ_USE_POSIX_GETTIMEOFDAY) struct timeval value; #endif // Converts the internal timestamp into total milliseconds. 
#if defined(KNZ_USE_CHRONO) kanzi::uint64 to_ms() const { return static_cast(std::chrono::duration(value.time_since_epoch()).count()); } #elif defined(KNZ_USE_WINDOWS_QPC) double to_ms(long long win_freq = 1) const { return static_cast(value.QuadPart) * 1000.0 / win_freq; } #elif defined(KNZ_USE_POSIX_GETTIMEOFDAY) double to_ms() const { return (static_cast(value.tv_sec) * 1000.0) + (value.tv_usec / 1000.0); } #endif }; WallTimer() { #if defined(KNZ_USE_WINDOWS_QPC) if (_initialized == false) { QueryPerformanceFrequency(&_frequency); _initialized = true; } #endif _start = getCurrentTime(); } TimeData getCurrentTime() const { TimeData now; #if defined(KNZ_USE_CHRONO) now.value = std::chrono::steady_clock::now(); #elif defined(KNZ_USE_WINDOWS_QPC) QueryPerformanceCounter(&now.value); #elif defined(KNZ_USE_POSIX_GETTIMEOFDAY) gettimeofday(&now.value, 0); #endif return now; } static double calculateDifference(const TimeData& start, const TimeData& end) { #if defined(KNZ_USE_CHRONO) return std::chrono::duration(end.value - start.value).count(); #elif defined(KNZ_USE_WINDOWS_QPC) if (_initialized == false) { QueryPerformanceFrequency(&_frequency); _initialized = true; } return static_cast(end.value.QuadPart - start.value.QuadPart) * 1000.0 / static_cast(_frequency.QuadPart); #elif defined(KNZ_USE_POSIX_GETTIMEOFDAY) const double sec = double(end.value.tv_sec - start.value.tv_sec); const double usec = double(end.value.tv_usec - start.value.tv_usec); return (sec * 1000.0) + (usec / 1000.0); #endif } double elapsed_ms() const { return calculateDifference(_start, getCurrentTime()); } private: TimeData _start; #if defined(KNZ_USE_WINDOWS_QPC) static LARGE_INTEGER _frequency; static bool _initialized; #endif }; #endif kanzi-cpp-2.5.2/src/util/XXHash.hpp000066400000000000000000000156101516423635400170500ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in 
compliance with the License. you may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #pragma once #ifndef knz_XXHash32 #define knz_XXHash32 #include #include "../Memory.hpp" namespace kanzi { // XXHash is an extremely fast hash algorithm. It was written by Yann Collet. // Original source code: https://github.com/Cyan4973/xxHash class XXHash32 { private: #if __cplusplus >= 201103L static constexpr uint32 PRIME32_1 = uint32(-1640531535); static constexpr uint32 PRIME32_2 = uint32(-2048144777); static constexpr uint32 PRIME32_3 = uint32(-1028477379); static constexpr uint32 PRIME32_4 = uint32(668265263); static constexpr uint32 PRIME32_5 = uint32(374761393); #else static const uint32 PRIME32_1 = uint32(-1640531535); static const uint32 PRIME32_2 = uint32(-2048144777); static const uint32 PRIME32_3 = uint32(-1028477379); static const uint32 PRIME32_4 = uint32(668265263); static const uint32 PRIME32_5 = uint32(374761393); #endif uint32 _seed; uint32 round(uint32 acc, int32 val) const; public: XXHash32() { _seed = uint32(time(nullptr)); } XXHash32(uint32 seed) : _seed(seed) {} ~XXHash32(){} void setSeed(uint32 seed) { _seed = seed; } uint32 hash(const byte data[], int length) const; }; inline uint32 XXHash32::hash(const byte data[], int length) const { uint32 h32; int idx = 0; if (length >= 16) { const int end16 = length - 16; uint32 v1 = _seed + PRIME32_1 + PRIME32_2; uint32 v2 = _seed + PRIME32_2; uint32 v3 = _seed; uint32 v4 = _seed - PRIME32_1; do { v1 = round(v1, LittleEndian::readInt32(&data[idx])); v2 = round(v2, LittleEndian::readInt32(&data[idx + 4])); v3 = round(v3, LittleEndian::readInt32(&data[idx + 8])); v4 = round(v4, 
LittleEndian::readInt32(&data[idx + 12])); idx += 16; } while (idx <= end16); h32 = ((v1 << 1) | (v1 >> 31)); h32 += ((v2 << 7) | (v2 >> 25)); h32 += ((v3 << 12) | (v3 >> 20)); h32 += ((v4 << 18) | (v4 >> 14)); } else { h32 = _seed + PRIME32_5; } h32 += uint32(length); while (idx <= length - 4) { h32 += (uint32(LittleEndian::readInt32(&data[idx])) * PRIME32_3); h32 = ((h32 << 17) | (h32 >> 15)) * PRIME32_4; idx += 4; } while (idx < length) { h32 += ((uint32(data[idx]) & 0xFF) * PRIME32_5); h32 = ((h32 << 11) | (h32 >> 21)) * PRIME32_1; idx++; } h32 ^= (h32 >> 15); h32 *= PRIME32_2; h32 ^= (h32 >> 13); h32 *= PRIME32_3; return h32 ^ (h32 >> 16); } inline uint32 XXHash32::round(uint32 acc, int32 val) const { acc += (uint32(val) * PRIME32_2); return ((acc << 13) | (acc >> 19)) * PRIME32_1; } class XXHash64 { private: #if __cplusplus >= 201103L static constexpr uint64 PRIME64_1 = uint64(0x9E3779B185EBCA87); static constexpr uint64 PRIME64_2 = uint64(0xC2B2AE3D27D4EB4F); static constexpr uint64 PRIME64_3 = uint64(0x165667B19E3779F9); static constexpr uint64 PRIME64_4 = uint64(0x85EBCA77C2b2AE63); static constexpr uint64 PRIME64_5 = uint64(0x27D4EB2F165667C5); #else static const uint64 PRIME64_1 = uint64(0x9E3779B185EBCA87); static const uint64 PRIME64_2 = uint64(0xC2B2AE3D27D4EB4F); static const uint64 PRIME64_3 = uint64(0x165667B19E3779F9); static const uint64 PRIME64_4 = uint64(0x85EBCA77C2b2AE63); static const uint64 PRIME64_5 = uint64(0x27D4EB2F165667C5); #endif int64 _seed; uint64 round(uint64 acc, uint64 val) const; uint64 mergeRound(uint64 acc, uint64 val) const; public: XXHash64() { _seed = int64(time(nullptr)); } XXHash64(int64 seed) : _seed(seed) {} ~XXHash64(){} void setSeed(int64 seed) { _seed = seed; } uint64 hash(const byte data[], int length) const; }; inline uint64 XXHash64::hash(const byte data[], int length) const { uint64 h64; int idx = 0; if (length >= 32) { const int length32 = length - 32; uint64 v1 = _seed + PRIME64_1 + PRIME64_2; uint64 v2 = 
_seed + PRIME64_2; uint64 v3 = _seed; uint64 v4 = _seed - PRIME64_1; do { v1 = round(v1, uint64(LittleEndian::readLong64(&data[idx]))); v2 = round(v2, uint64(LittleEndian::readLong64(&data[idx + 8]))); v3 = round(v3, uint64(LittleEndian::readLong64(&data[idx + 16]))); v4 = round(v4, uint64(LittleEndian::readLong64(&data[idx + 24]))); idx += 32; } while (idx <= length32); h64 = ((v1 << 1) | (v1 >> 31)) + ((v2 << 7) | (v2 >> 25)) + ((v3 << 12) | (v3 >> 20)) + ((v4 << 18) | (v4 >> 14)); h64 = mergeRound(h64, v1); h64 = mergeRound(h64, v2); h64 = mergeRound(h64, v3); h64 = mergeRound(h64, v4); } else { h64 = _seed + PRIME64_5; } h64 += length; while (idx+8 <= length) { h64 ^= round(0, uint64(LittleEndian::readLong64(&data[idx]))); h64 = ((h64 << 27) | (h64 >> 37)) * PRIME64_1 + PRIME64_4; idx += 8; } while (idx+4 <= length) { h64 ^= (uint32(LittleEndian::readInt32(&data[idx])) * PRIME64_1); h64 = ((h64 << 23) | (h64 >> 41)) * PRIME64_2 + PRIME64_3; idx += 4; } while (idx < length) { h64 ^= (uint64(data[idx] & byte(0xFF)) * PRIME64_5); h64 = ((h64 << 11) | (h64 >> 53)) * PRIME64_1; idx++; } // Finalize h64 ^= (h64 >> 33); h64 *= PRIME64_2; h64 ^= (h64 >> 29); h64 *= PRIME64_3; return h64 ^ (h64 >> 32); } inline uint64 XXHash64::round(uint64 acc, uint64 val) const { acc += (val*PRIME64_2); return ((acc << 31) | (acc >> 33)) * PRIME64_1; } inline uint64 XXHash64::mergeRound(uint64 acc, uint64 val) const { acc ^= round(0, val); return acc*PRIME64_1 + PRIME64_4; } } #endif kanzi-cpp-2.5.2/src/util/fixedbuf.hpp000066400000000000000000000032341516423635400175000ustar00rootroot00000000000000/* Copyright 2011-2026 Frederic Langlet Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
// Ahem ... Visual Studio
// This code is required because Microsoft cannot bother to implement streambuf::pubsetbuf().
// Also On libstdc++, pubsetbuf() silently ignores the supplied buffer and leaves internal pointers null.

// Read only stream buffer over a caller supplied memory area.
class ifixedbuf : public std::streambuf
{
public:
    ifixedbuf(char* data, std::size_t size)
    {
        // Install the get area by hand: pubsetbuf() cannot be relied on
        // (ignored by libstdc++, not implemented by MSVC).
        char* const last = data + size;
        setg(data, data, last);
    }
};


// Write only stream buffer over a caller supplied memory area.
class ofixedbuf : public std::streambuf
{
public:
    ofixedbuf(char* data, std::size_t size)
    {
        // Install the put area by hand, pubsetbuf() is of no help here
        char* const last = data + size;
        setp(data, last);
    }

    // Number of characters stored in the buffer so far
    std::size_t written() const
    {
        const char* const first = pbase();
        const char* const cur = pptr();
        return cur - first;
    }
};


// Stream buffer whose get and put areas both cover the same memory area.
class iofixedbuf : public std::streambuf
{
public:
    iofixedbuf(char* data, std::size_t size)
    {
        char* const last = data + size;
        setg(data, data, last);
        setp(data, last);
    }
};
#if __cplusplus < 201103L
    // to_string() not available before C++ 11
    template <typename T>
    std::string to_string(T value)
    {
        std::ostringstream os;
        os << value;
        return os.str();
    }

    #define TOSTR(v) to_string(v)
#else
    #define TOSTR(v) std::to_string(v)
#endif


// Writes the (length - 1) least significant bits of 'num' as '0'/'1'
// characters, most significant first, followed by a terminating NUL.
// 'buffer' must provide at least 'length' characters.
// Does nothing when buffer is null or length is not positive (the
// terminator used to be written at buffer[length - 1] unconditionally).
inline void to_binary(int num, char* buffer, int length)
{
    if ((buffer == nullptr) || (length <= 0))
        return;

    for (int i = length - 2; i >= 0; i--) {
        buffer[i] = (num & 1) ? '1' : '0';
        num >>= 1;
    }

    buffer[length - 1] = '\0';
}


// trim from end of string (right)
// The string is modified in place and returned.
inline std::string& rtrim(std::string& s)
{
    static const char* whitespaces = " \t\f\v\n\r";
    std::size_t pos = s.find_last_not_of(whitespaces);

    if (pos == std::string::npos)
        s.clear(); // only whitespaces (or empty)
    else
        s.erase(pos + 1);

    return s;
}


// trim from beginning of string (left)
// The string is modified in place and returned.
inline std::string& ltrim(std::string& s)
{
    static const char* whitespaces = " \t\f\v\n\r";
    std::size_t pos = s.find_first_not_of(whitespaces);

    if (pos == std::string::npos)
        s.clear(); // only whitespaces (or empty)
    else
        s.erase(0, pos);

    return s;
}


// Ensures that the function works on platforms where char is signed
// (toupper() requires a value representable as unsigned char)
inline char safeToUpper(char c)
{
    return static_cast<char>(::toupper(static_cast<unsigned char>(c)));
}


// trim from both ends of string (right then left)
inline std::string& trim(std::string& s)
{
    return ltrim(rtrim(s));
}


// Splits 'str' on every occurrence of 'token' and appends the pieces to 'v'.
// Empty pieces between consecutive separators are kept.
inline void tokenize(const std::string& str, std::vector<std::string>& v, char token)
{
    std::istringstream ss(str);
    std::string s;

    while (getline(ss, s, token))
        v.push_back(s);
}


// Returns 'src' with the characters that cannot appear verbatim in a JSON
// string escaped: quote, backslash and control characters below 0x20.
// Bytes >= 0x20 (including non ASCII bytes) are copied unchanged.
inline std::string escapeJSONString(const std::string& src)
{
    std::stringstream ss;

    for (size_t i = 0; i < src.size(); i++) {
        const unsigned char c = static_cast<unsigned char>(src[i]);

        switch (c) {
            case '"':  ss << "\\\""; break;
            case '\\': ss << "\\\\"; break;
            case '\b': ss << "\\b";  break;
            case '\f': ss << "\\f";  break;
            case '\n': ss << "\\n";  break;
            case '\r': ss << "\\r";  break;
            case '\t': ss << "\\t";  break;

            default:
                if (c < 0x20) {
                    // Other control characters: \u00XX
                    ss << "\\u00";
                    ss << std::uppercase << std::hex << std::setw(2) << std::setfill('0') << int(c);
                    ss << std::dec;
                }
                else {
                    ss << char(c);
                }
        }
    }

    return ss.str();
}


// Formats a size in bytes with a binary unit and 2 decimals
// ("x.xx KiB", "x.xx MiB", "x.xx GiB"). Sizes below 1024 are printed
// with the default stream formatting and no unit.
inline std::string formatSize(double size)
{
    std::stringstream ss;

    if (size >= double(1 << 30)) {
        size /= double(1024 * 1024 * 1024);
        ss << std::fixed << std::setprecision(2) << size << " GiB";
    }
    else if (size >= double(1 << 20)) {
        size /= double(1024 * 1024);
        ss << std::fixed << std::setprecision(2) << size << " MiB";
    }
    else if (size >= double(1 << 10)) {
        size /= double(1024);
        ss << std::fixed << std::setprecision(2) << size << " KiB";
    }
    else {
        ss << size;
    }

    return ss.str();
}


// Same as formatSize(double); the input is parsed with atof(), so text
// that is not a number is formatted as 0.
inline std::string formatSize(const std::string& input)
{
    return formatSize(atof(input.c_str()));
}